| """ |
| Integration tests for the feature extraction pipeline. |
| |
| Tests the combined functionality of dataset loading, text processing, |
| and feature extraction working together. |
| """ |
| import pytest |
| import numpy as np |
| import pandas as pd |
| import tempfile |
| import sqlite3 |
| from pathlib import Path |
|
|
| from hopcroft_skill_classification_tool_competition.features import ( |
| load_data_from_db, |
| create_feature_dataset, |
| extract_tfidf_features, |
| prepare_labels, |
| get_text_columns, |
| get_label_columns, |
| ) |
|
|
|
|
@pytest.mark.integration
class TestFeatureExtractionPipeline:
    """End-to-end checks that feature extraction and label preparation cooperate."""

    def test_full_pipeline_from_dataframe_to_features(self, sample_dataframe):
        """DataFrame in, aligned features/labels out: shapes, types, finiteness."""
        features, vectorizer = extract_tfidf_features(sample_dataframe, max_features=50)
        labels = prepare_labels(sample_dataframe)

        # Row counts must match across input frame, feature matrix, and labels.
        assert features.shape[0] == len(labels)
        assert features.shape[0] == len(sample_dataframe)

        # Container types produced by the pipeline.
        assert isinstance(features, np.ndarray)
        assert isinstance(labels, pd.DataFrame)

        # Features contain no NaN/inf; labels have no missing entries.
        assert np.isfinite(features).all()
        assert not labels.isnull().any().any()

    def test_pipeline_with_database_to_features(self, temp_db):
        """A database load feeds straight into feature extraction."""
        loaded = load_data_from_db(temp_db)
        features, fitted_vectorizer = extract_tfidf_features(loaded, max_features=50)
        labels = prepare_labels(loaded)

        row_count = len(loaded)
        assert features.shape[0] == row_count
        assert labels.shape[0] == row_count
        assert features.shape[0] == labels.shape[0]

    def test_create_feature_dataset_integration(self, temp_db):
        """The one-shot create_feature_dataset helper returns a coherent bundle."""
        features, labels, feature_names, label_names = create_feature_dataset(
            db_path=temp_db,
            save_processed=False,
        )

        # Return types of the four-tuple.
        assert isinstance(features, np.ndarray)
        assert isinstance(labels, pd.DataFrame)
        assert isinstance(feature_names, np.ndarray)
        assert isinstance(label_names, list)

        # Features/labels are row-aligned; name lists match the column counts.
        assert features.shape[0] == labels.shape[0]
        assert features.shape[1] == len(feature_names)
        assert labels.shape[1] == len(label_names)

    def test_pipeline_preserves_sample_count(self, sample_dataframe):
        """No rows are dropped anywhere in the pipeline."""
        expected_rows = len(sample_dataframe)

        tfidf_matrix, _ = extract_tfidf_features(sample_dataframe, max_features=50)
        label_frame = prepare_labels(sample_dataframe)

        assert tfidf_matrix.shape[0] == expected_rows
        assert label_frame.shape[0] == expected_rows

    def test_pipeline_with_various_text_lengths(self):
        """Documents of wildly different lengths all make it through."""
        docs = pd.DataFrame({
            'issue text': [
                'short',
                'This is a medium length text with several words',
                'This is a very long text ' * 50,
            ],
            'issue description': ['desc1', 'desc2', 'desc3'],
            'Label1': [1, 0, 1],
            'Label2': [0, 1, 1],
        })

        matrix, _ = extract_tfidf_features(docs, max_features=50)
        targets = prepare_labels(docs)

        assert matrix.shape[0] == 3
        assert targets.shape[0] == 3

        # The feature matrix must carry some signal (not all zeros).
        assert np.any(matrix != 0)
|
|
|
|
@pytest.mark.integration
class TestDataFlowConsistency:
    """Consistency checks on data as it flows through the pipeline."""

    def test_text_cleaning_affects_features(self, sample_dataframe):
        """Extraction handles both raw (noisy) and pre-cleaned text inputs."""
        noisy = sample_dataframe.copy()
        noisy['issue text'] = [
            "Bug https://example.com with <b>HTML</b>",
            "Feature with ```code block```",
            "Update with extra spaces",
            "Test with 😀 emoji",
            "Normal clean text",
        ]
        noisy_features, _ = extract_tfidf_features(
            noisy, max_features=50, min_df=1, max_df=1.0
        )

        cleaned = sample_dataframe.copy()
        cleaned['issue text'] = [
            "Bug with HTML",
            "Feature with",
            "Update with extra spaces",
            "Test with emoji",
            "Normal clean text",
        ]
        cleaned_features, _ = extract_tfidf_features(
            cleaned, max_features=50, min_df=1, max_df=1.0
        )

        # Both variants yield matrices of identical shape.
        assert noisy_features.shape == cleaned_features.shape

    def test_label_binarization_consistency(self):
        """Non-zero label counts binarize to 1, zeros stay 0."""
        frame = pd.DataFrame({
            'issue text': ['text1', 'text2', 'text3'],
            'issue description': ['desc1', 'desc2', 'desc3'],
            'Label1': [0, 5, 10],
            'Label2': [1, 0, 100],
        })

        binarized = prepare_labels(frame)

        # Only 0/1 values may survive binarization.
        assert set(binarized.values.flatten()).issubset({0, 1})

        # Spot-check each cell against its expected binarized value.
        expected = {
            (0, 'Label1'): 0, (1, 'Label1'): 1, (2, 'Label1'): 1,
            (0, 'Label2'): 1, (1, 'Label2'): 0, (2, 'Label2'): 1,
        }
        for (row, col), value in expected.items():
            assert binarized.loc[row, col] == value

    def test_feature_label_alignment(self, sample_dataframe):
        """Every row has a non-empty feature vector and a non-empty label row."""
        matrix, _ = extract_tfidf_features(sample_dataframe, max_features=50)
        targets = prepare_labels(sample_dataframe)

        for row_idx in range(len(sample_dataframe)):
            assert matrix[row_idx].shape[0] > 0
            assert targets.iloc[row_idx].shape[0] > 0
|
|
|
|
@pytest.mark.integration
@pytest.mark.slow
class TestLargeDatasetHandling:
    """Integration tests with larger datasets (marked as slow).

    Random label values are drawn from a seeded generator so that any
    data-dependent failure is reproducible across runs, instead of relying
    on the unseeded global ``np.random`` state.
    """

    def test_pipeline_with_large_dataset(self):
        """Test pipeline with a larger number of samples."""
        n_samples = 1000
        # Seeded RNG for reproducibility (was: unseeded np.random.randint).
        rng = np.random.default_rng(42)
        df = pd.DataFrame({
            'issue text': [f'Issue number {i} with some text' for i in range(n_samples)],
            'issue description': [f'Description for issue {i}' for i in range(n_samples)],
            'Label1': rng.integers(0, 2, n_samples),
            'Label2': rng.integers(0, 2, n_samples),
            'Label3': rng.integers(0, 2, n_samples),
        })

        features, _ = extract_tfidf_features(df, max_features=500)
        labels = prepare_labels(df)

        # All samples survive; vocabulary size is capped by max_features.
        assert features.shape[0] == n_samples
        assert labels.shape[0] == n_samples
        assert features.shape[1] <= 500

    def test_pipeline_with_many_labels(self):
        """Test pipeline with many label columns."""
        n_labels = 50
        # Seeded RNG for reproducibility (was: unseeded np.random.randint).
        rng = np.random.default_rng(42)
        df = pd.DataFrame({
            'issue text': ['text1', 'text2', 'text3'],
            'issue description': ['desc1', 'desc2', 'desc3'],
        })

        for i in range(n_labels):
            df[f'Label_{i}'] = rng.integers(0, 2, 3)

        labels = prepare_labels(df)

        # Every label column is preserved and binarized to {0, 1}.
        assert labels.shape[1] == n_labels
        assert set(labels.values.flatten()).issubset({0, 1})
|
|
|
|
@pytest.mark.integration
class TestSaveAndLoadIntegration:
    """Integration tests for saving and loading processed data."""

    def test_save_and_load_features(self, temp_db):
        """Round-trip features and labels through ``.npy`` files in a temp dir.

        Fixes over the previous version:
        - removed a ``pytest.MonkeyPatch.context()`` block whose handle was
          never used (it patched nothing and had no effect);
        - removed a redundant local re-import of ``create_feature_dataset``
          (already imported at module level) and an unused import of
          ``load_processed_data``;
        - uses ``save_processed=False`` so the test does not write to the
          project's default output location; persistence is exercised
          explicitly against the temporary directory instead.
        """
        with tempfile.TemporaryDirectory() as tmpdir:
            tfidf_dir = Path(tmpdir) / "tfidf"
            tfidf_dir.mkdir(parents=True)

            # Build the dataset without triggering the library's own save path.
            features_orig, labels_orig, _, _ = create_feature_dataset(
                db_path=temp_db,
                save_processed=False,
            )

            # Persist to the temp dir, then read back.
            np.save(tfidf_dir / "features_tfidf.npy", features_orig)
            np.save(tfidf_dir / "labels_tfidf.npy", labels_orig.values)

            features_loaded = np.load(tfidf_dir / "features_tfidf.npy")
            labels_loaded = np.load(tfidf_dir / "labels_tfidf.npy")

            # The round-trip must be lossless.
            np.testing.assert_array_equal(features_orig, features_loaded)
            np.testing.assert_array_equal(labels_orig.values, labels_loaded)
|
|
|
|
@pytest.mark.integration
class TestErrorHandlingInPipeline:
    """Error-handling behavior of the pipeline on malformed inputs."""

    def test_pipeline_with_missing_columns(self):
        """Unrecognized column names: no text columns found, extraction raises."""
        frame = pd.DataFrame({
            'wrong_col_1': ['text1', 'text2'],
            'wrong_col_2': ['desc1', 'desc2'],
            'Label1': [1, 0],
        })

        # Column detection finds nothing usable as text.
        detected = get_text_columns(frame)
        assert len(detected) == 0

        # Extraction on such a frame must fail loudly.
        with pytest.raises(ValueError, match="No text columns found"):
            extract_tfidf_features(frame)

    def test_pipeline_with_all_nan_text(self):
        """All-NaN text values raise an appropriate error.

        TF-IDF cannot build a vocabulary from empty/NaN documents,
        so a ValueError with a descriptive message is expected.
        """
        frame = pd.DataFrame({
            'issue text': [None, None, None],
            'issue description': [None, None, None],
            'Label1': [1, 0, 1],
        })

        with pytest.raises(ValueError, match="empty vocabulary"):
            extract_tfidf_features(frame, max_features=50)

    def test_pipeline_with_empty_labels(self):
        """A frame with no label columns yields an empty label-column list."""
        frame = pd.DataFrame({
            'issue text': ['text1', 'text2'],
            'issue description': ['desc1', 'desc2'],
        })

        detected_labels = get_label_columns(frame)
        assert len(detected_labels) == 0
|
|