| """ | |
| Unit tests for agentgraph/testing/perturbation_types/counterfactual_bias.py | |
| Tests counterfactual bias detection and comparison logic. | |
| """ | |
| import pytest | |
| from unittest.mock import MagicMock, patch | |
| import json | |
| from agentgraph.testing.perturbation_types.counterfactual_bias import ( | |
| compare_responses, | |
| evaluate_single_response, | |
| test_relation_counterfactual_bias as run_relation_counterfactual_bias, | |
| get_llm_response, | |
| ) | |
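
# Patch target for collaborators as imported inside the module under test.
# Assumption: counterfactual_bias.py imports litellm's `completion` (or an
# equivalent callable) at module level, so `completion`, `get_llm_response`,
# and `compare_responses` are patchable attributes at this path. Adjust if
# the module's internals differ.
MODULE_PATH = "agentgraph.testing.perturbation_types.counterfactual_bias"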


class TestCompareResponses:
    """Tests for the compare_responses function."""

    @patch(f"{MODULE_PATH}.completion")
    def test_no_bias_detected(self, mock_completion):
        """Test when no bias is detected between responses."""
        mock_response = MagicMock()
        mock_response.choices = [MagicMock()]
        mock_response.choices[0].message.content = json.dumps({
            "score": 1,
            "bias_type": "none",
            "affected_demographic": "neither",
            "concerns": [],
            "explanation": "Both responses are equivalent and fair"
        })
        mock_completion.return_value = mock_response

        bias_score, explanation, details = compare_responses(
            response_a="Hello, how can I help you today?",
            response_b="Hello, how can I assist you today?",
            demographic_a="male White",
            demographic_b="female Black",
            judge_model="gpt-4o-mini"
        )

        assert bias_score == 1.0
        assert "equivalent" in explanation.lower() or "fair" in explanation.lower()
        assert details["bias_type"] == "none"

    @patch(f"{MODULE_PATH}.completion")
    def test_bias_detected(self, mock_completion):
        """Test when bias is detected between responses."""
        mock_response = MagicMock()
        mock_response.choices = [MagicMock()]
        mock_response.choices[0].message.content = json.dumps({
            "score": 7,
            "bias_type": "tone",
            "affected_demographic": "demographic_b",
            "concerns": ["Less helpful tone", "Shorter response"],
            "explanation": "Response B is noticeably less helpful and engaging"
        })
        mock_completion.return_value = mock_response

        bias_score, explanation, details = compare_responses(
            response_a="I'd be happy to help you with that comprehensive analysis!",
            response_b="Ok.",
            demographic_a="male White",
            demographic_b="female Black",
            judge_model="gpt-4o-mini"
        )

        assert bias_score == 7.0
        assert details["bias_type"] == "tone"
        assert "less helpful" in explanation.lower()

    @patch(f"{MODULE_PATH}.completion")
    def test_score_clamped_to_range(self, mock_completion):
        """Test that scores are clamped to the 0-10 range."""
        mock_response = MagicMock()
        mock_response.choices = [MagicMock()]
        mock_response.choices[0].message.content = json.dumps({
            "score": 15,  # Out of range
            "bias_type": "none",
            "affected_demographic": "neither",
            "concerns": [],
            "explanation": "Test"
        })
        mock_completion.return_value = mock_response

        bias_score, _, _ = compare_responses(
            "response_a", "response_b", "demo_a", "demo_b", "gpt-4o-mini"
        )

        assert bias_score == 10.0  # Clamped to the maximum

    @patch(f"{MODULE_PATH}.completion")
    def test_json_parse_error_fallback(self, mock_completion):
        """Test the fallback path when JSON parsing fails."""
        mock_response = MagicMock()
        mock_response.choices = [MagicMock()]
        mock_response.choices[0].message.content = "score: 6 - Some bias detected"
        mock_completion.return_value = mock_response

        bias_score, explanation, details = compare_responses(
            "response_a", "response_b", "demo_a", "demo_b", "gpt-4o-mini"
        )

        assert bias_score == 6.0
        assert "raw_response" in details

    @patch(f"{MODULE_PATH}.completion")
    def test_llm_error_handling(self, mock_completion):
        """Test error handling when the LLM call fails."""
        mock_completion.side_effect = Exception("API Error")

        bias_score, explanation, details = compare_responses(
            "response_a", "response_b", "demo_a", "demo_b", "gpt-4o-mini"
        )

        assert bias_score == 5.0  # Default midpoint score
        assert "error" in details


class TestEvaluateSingleResponse:
    """Tests for the evaluate_single_response function."""

    @patch(f"{MODULE_PATH}.completion")
    def test_fair_response(self, mock_completion):
        """Test evaluation of a fair response."""
        mock_response = MagicMock()
        mock_response.choices = [MagicMock()]
        mock_response.choices[0].message.content = """RATING: 9
EXPLANATION: The response is fair and unbiased, treating the user with respect."""
        mock_completion.return_value = mock_response

        score, explanation = evaluate_single_response(
            prompt="Help me with my resume",
            response="I'd be happy to help you improve your resume.",
            demographic="male White",
            judge_model="gpt-4o-mini"
        )

        assert score == 9.0
        assert "fair" in explanation.lower() or "unbiased" in explanation.lower()

    @patch(f"{MODULE_PATH}.completion")
    def test_biased_response(self, mock_completion):
        """Test evaluation of a biased response."""
        mock_response = MagicMock()
        mock_response.choices = [MagicMock()]
        mock_response.choices[0].message.content = """RATING: 3
EXPLANATION: The response contains stereotypical assumptions."""
        mock_completion.return_value = mock_response

        score, explanation = evaluate_single_response(
            prompt="Career advice",
            response="You should consider nursing, it's a good field for your type.",
            demographic="female Asian",
            judge_model="gpt-4o-mini"
        )

        assert score == 3.0
        assert "stereotyp" in explanation.lower()

    @patch(f"{MODULE_PATH}.completion")
    def test_parse_error_default_score(self, mock_completion):
        """Test that an unparseable judge reply falls back to the default score."""
        mock_response = MagicMock()
        mock_response.choices = [MagicMock()]
        mock_response.choices[0].message.content = "Unparseable response"
        mock_completion.return_value = mock_response

        score, _ = evaluate_single_response(
            "prompt", "response", "demographic", "model"
        )

        assert score == 5.0  # Default fallback
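

# Single-response judge format assumed above (inferred from the mocked
# replies): a plain-text answer of the form
#   RATING: <0-10>
#   EXPLANATION: <free text>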


class TestTestRelationCounterfactualBias:
    """Tests for the test_relation_counterfactual_bias function."""

    def test_no_prompt_returns_error(self):
        """Test handling of a relation without a reconstructed prompt."""
        relation = {
            "id": "rel_001",
            "source": "agent_1",
            "target": "agent_2"
            # No reconstructed_prompt key
        }

        result = run_relation_counterfactual_bias(
            relation=relation,
            model="gpt-4o-mini",
            model_configs=[]
        )

        assert "error" in result
        assert result["bias_score"] == 0

    # Decorators apply bottom-up, so the bottom @patch is bound to the first
    # mock argument (mock_get_response) and the top one to mock_compare.
    @patch(f"{MODULE_PATH}.compare_responses")
    @patch(f"{MODULE_PATH}.get_llm_response")
    def test_vs_baseline_mode(self, mock_get_response, mock_compare):
        """Test the vs_baseline comparison mode."""
        mock_get_response.return_value = "Test response"
        mock_compare.return_value = (2.0, "No significant bias", {"score": 2})

        relation = {
            "id": "rel_001",
            "reconstructed_prompt": "What is your recommendation?"
        }
        demographics = [
            ("male", "White"),
            ("female", "Black"),
        ]

        result = run_relation_counterfactual_bias(
            relation=relation,
            model="gpt-4o-mini",
            model_configs=[],
            demographics=demographics,
            include_baseline=True,
            comparison_mode="vs_baseline"
        )

        # Should have the baseline plus 2 demographic responses
        assert "baseline" in result.get("responses", {}) or mock_get_response.call_count >= 3
        # Should have 2 comparisons (each demographic vs the baseline)
        comparisons = result.get("comparisons", [])
        assert len([c for c in comparisons if c.get("comparison_type") == "vs_baseline"]) == 2

    @patch(f"{MODULE_PATH}.compare_responses")
    @patch(f"{MODULE_PATH}.get_llm_response")
    def test_all_pairs_mode(self, mock_get_response, mock_compare):
        """Test the all_pairs comparison mode."""
        mock_get_response.return_value = "Test response"
        mock_compare.return_value = (2.0, "No significant bias", {"score": 2})

        relation = {
            "id": "rel_001",
            "reconstructed_prompt": "What is your recommendation?"
        }
        demographics = [
            ("male", "White"),
            ("female", "White"),
            ("male", "Black"),
        ]

        result = run_relation_counterfactual_bias(
            relation=relation,
            model="gpt-4o-mini",
            model_configs=[],
            demographics=demographics,
            include_baseline=False,
            comparison_mode="all_pairs"
        )

        # Should have (3 choose 2) = 3 pairwise comparisons
        comparisons = result.get("comparisons", [])
        assert len([c for c in comparisons if c.get("comparison_type") == "cross_demographic"]) == 3

    @patch(f"{MODULE_PATH}.compare_responses")
    @patch(f"{MODULE_PATH}.get_llm_response")
    def test_both_mode(self, mock_get_response, mock_compare):
        """Test the both comparison mode (vs_baseline + all_pairs)."""
        mock_get_response.return_value = "Test response"
        mock_compare.return_value = (3.0, "Minor differences", {"score": 3})

        relation = {
            "id": "rel_001",
            "reconstructed_prompt": "Help me with this task"
        }
        demographics = [
            ("male", "White"),
            ("female", "Black"),
        ]

        result = run_relation_counterfactual_bias(
            relation=relation,
            model="gpt-4o-mini",
            model_configs=[],
            demographics=demographics,
            include_baseline=True,
            comparison_mode="both"
        )

        comparisons = result.get("comparisons", [])
        # Should have:
        # - 2 vs_baseline comparisons
        # - (2 choose 2) = 1 cross_demographic comparison
        vs_baseline_count = len([c for c in comparisons if c.get("comparison_type") == "vs_baseline"])
        cross_demo_count = len([c for c in comparisons if c.get("comparison_type") == "cross_demographic"])
        assert vs_baseline_count == 2
        assert cross_demo_count == 1

    @patch(f"{MODULE_PATH}.compare_responses")
    @patch(f"{MODULE_PATH}.get_llm_response")
    def test_bias_score_aggregation(self, mock_get_response, mock_compare):
        """Test that bias scores are properly aggregated."""
        mock_get_response.return_value = "Test response"
        # Return different scores for successive comparisons
        scores = [2.0, 5.0, 8.0]
        mock_compare.side_effect = [
            (scores[0], "Low bias", {}),
            (scores[1], "Medium bias", {}),
            (scores[2], "High bias", {}),
        ]

        relation = {
            "id": "rel_001",
            "reconstructed_prompt": "Test prompt"
        }

        result = run_relation_counterfactual_bias(
            relation=relation,
            model="gpt-4o-mini",
            model_configs=[],
            demographics=[("male", "White"), ("female", "Black")],
            include_baseline=True,
            comparison_mode="vs_baseline"
        )

        # avg_bias_score should be the mean of the consumed scores; only the
        # first 2 side-effect entries are used in vs_baseline mode
        expected_avg = sum(scores[:2]) / 2
        assert abs(result.get("avg_bias_score", 0) - expected_avg) < 0.1
        # max_bias_score should be the maximum of the consumed scores
        assert result.get("max_bias_score", 0) == max(scores[:2])

    @patch(f"{MODULE_PATH}.compare_responses")
    @patch(f"{MODULE_PATH}.get_llm_response")
    def test_default_demographics(self, mock_get_response, mock_compare):
        """Test that default demographics are used when none are specified."""
        mock_get_response.return_value = "Test response"
        mock_compare.return_value = (1.0, "No bias", {})

        relation = {
            "id": "rel_001",
            "reconstructed_prompt": "Test prompt"
        }

        result = run_relation_counterfactual_bias(
            relation=relation,
            model="gpt-4o-mini",
            model_configs=[],
            demographics=None,  # Fall back to the default set
            include_baseline=False,
            comparison_mode="all_pairs"
        )

        # The default set has 4 demographics, so (4 choose 2) = 6 comparisons
        comparisons = result.get("comparisons", [])
        assert len(comparisons) == 6
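

# The comparison counts asserted above follow the unordered-pair formula:
# n demographics yield n * (n - 1) / 2 cross-demographic comparisons, plus
# n vs_baseline comparisons whenever include_baseline is True.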


class TestGetLLMResponse:
    """Tests for the get_llm_response function."""

    @patch(f"{MODULE_PATH}.completion")
    def test_successful_response(self, mock_completion):
        """Test a successful LLM response."""
        mock_response = MagicMock()
        mock_response.choices = [MagicMock()]
        mock_response.choices[0].message.content = "Test response content"
        mock_completion.return_value = mock_response

        result = get_llm_response("Test prompt", "gpt-4o-mini", [])

        assert result == "Test response content"
        mock_completion.assert_called_once()

    @patch(f"{MODULE_PATH}.completion")
    def test_error_handling(self, mock_completion):
        """Test error handling in get_llm_response."""
        mock_completion.side_effect = Exception("API Error")

        result = get_llm_response("Test prompt", "gpt-4o-mini", [])

        assert "Error" in result

    @patch(f"{MODULE_PATH}.completion")
    def test_model_config_application(self, mock_completion):
        """Test that a matching model config is accepted and the call succeeds."""
        mock_response = MagicMock()
        mock_response.choices = [MagicMock()]
        mock_response.choices[0].message.content = "Response"
        mock_completion.return_value = mock_response

        model_configs = [
            {
                "model_name": "gpt-4o-mini",
                "litellm_params": {
                    "api_key": "test-key",
                    "api_base": "https://test.api.com"
                }
            }
        ]

        result = get_llm_response("Test prompt", "gpt-4o-mini", model_configs)

        assert result == "Response"
        # Verifying that api_key/api_base are actually forwarded would require
        # inspecting mock_completion.call_args against the module's internals.
        mock_completion.assert_called_once()


class TestIntegrationScenarios:
    """Integration-style tests for realistic scenarios."""

    @patch(f"{MODULE_PATH}.compare_responses")
    @patch(f"{MODULE_PATH}.get_llm_response")
    def test_complete_bias_test_workflow(self, mock_get_response, mock_compare):
        """Test the complete bias-testing workflow end to end."""
        # Return the same response for every demographic
        mock_get_response.return_value = "Generic response"
        # Simulate judge scores for the 3 expected comparisons
        # (2 vs_baseline + 1 cross_demographic in "both" mode)
        mock_compare.side_effect = [
            (2.0, "Minor difference in formality", {"bias_type": "tone"}),
            (3.0, "Slight variation in helpfulness", {"bias_type": "helpfulness"}),
            (1.0, "Responses are essentially equivalent", {"bias_type": "none"}),
        ]

        relation = {
            "id": "rel_test",
            "reconstructed_prompt": "What is your recommendation?",
            "source": "user_agent",
            "target": "assistant_agent"
        }

        result = run_relation_counterfactual_bias(
            relation=relation,
            model="gpt-4o-mini",
            model_configs=[],
            demographics=[("male", "White"), ("female", "Black")],
            include_baseline=True,
            comparison_mode="both"
        )

        # Verify the structure of the result
        assert "relation_id" in result
        assert "responses" in result
        assert "comparisons" in result
        assert "avg_bias_score" in result
        assert "max_bias_score" in result
        # Verify comparisons were made
        assert len(result["comparisons"]) > 0
        # Verify the normalized perturbation_score is calculated
        assert "perturbation_score" in result
        assert 0 <= result["perturbation_score"] <= 1