| """ |
| Root pytest configuration and shared fixtures. |
| |
| This module provides fixtures that are available to all test modules. |
| """ |
| import pytest |
| import numpy as np |
| import pandas as pd |
| import tempfile |
| import sqlite3 |
| from pathlib import Path |
| from sklearn.feature_extraction.text import TfidfVectorizer |
|
|
|
|
@pytest.fixture
def sample_text_data():
    """Return five short, clean commit/issue-style sentences.

    A new list is built on every call, so a test may mutate the result
    without affecting other tests.
    """
    messages = [
        "Fixed bug in authentication system using OAuth2",
        "Implemented REST API endpoint for user data retrieval",
        "Added unit tests for data processing pipeline",
        "Refactored code to improve performance and reduce memory usage",
        "Updated database schema with new migration scripts",
    ]
    return messages
|
|
|
|
@pytest.fixture
def sample_dirty_text():
    """Return five noisy text samples of the kind found on GitHub.

    In order, the samples contain: a URL, HTML tags plus inline code, an
    emoji, a fenced code block, and embedded newlines.
    """
    with_url = "Fixed bug https://github.com/repo/issues/123 in auth system"
    with_markup = "Added feature with <b>HTML tags</b> and `inline code`"
    with_emoji = "Removed emoji 😀 and special characters"
    # Multi-line literal kept verbatim: its exact whitespace is part of the
    # sample, so it is deliberately not reformatted or dedented.
    with_code_block = """Updated docs with code block:
```python
def foo():
pass
```
"""
    with_newlines = "Fixed multiple spaces and\n\nnewlines"

    return [
        with_url,
        with_markup,
        with_emoji,
        with_code_block,
        with_newlines,
    ]
|
|
|
|
@pytest.fixture
def sample_labels():
    """Return a 5x5 binary multi-label indicator DataFrame.

    Each row is one sample and each column one label; a 1 means the label
    applies to that sample.
    """
    label_names = ['Language', 'Data Structure', 'Testing', 'API', 'DevOps']
    # One row per sample, values ordered as in ``label_names``.
    indicator_rows = [
        [1, 1, 0, 1, 0],
        [1, 0, 0, 1, 0],
        [1, 0, 1, 0, 0],
        [0, 1, 0, 0, 1],
        [1, 1, 0, 0, 1],
    ]
    return pd.DataFrame(indicator_rows, columns=label_names)
|
|
|
|
@pytest.fixture
def sample_dataframe(sample_text_data, sample_labels):
    """Return a five-row issue DataFrame with label columns appended.

    The frame starts with the metadata columns 'Repo Name', 'PR #',
    'issue text' and 'issue description'; every column of
    ``sample_labels`` is then added, in order, by position (the index of
    ``sample_labels`` is ignored).

    Args:
        sample_text_data: Sequence whose first five items become the
            'issue text' column.
        sample_labels: DataFrame with five rows of label indicators.
    """
    metadata = {
        'Repo Name': ['repo1', 'repo2', 'repo1', 'repo3', 'repo2'],
        'PR #': [1, 2, 3, 4, 5],
        # Explicit indexing so that a too-short input fails loudly.
        'issue text': [sample_text_data[position] for position in range(5)],
        'issue description': [
            'Description for issue 1',
            'Description for issue 2',
            'Description for issue 3',
            'Description for issue 4',
            'Description for issue 5',
        ],
    }
    frame = pd.DataFrame(metadata)

    for label_name, label_column in sample_labels.items():
        frame[label_name] = label_column.values

    return frame
|
|
|
|
@pytest.fixture
def temp_db(sample_dataframe):
    """Yield the path of a temporary SQLite database seeded with sample data.

    The database holds a single table,
    ``nlbse_tool_competition_data_by_issue``, written from
    ``sample_dataframe`` without its index. The file is deleted on
    teardown, and also when populating it fails, so no temporary file is
    left behind.

    Args:
        sample_dataframe: DataFrame whose rows are written to the table.

    Yields:
        pathlib.Path: Location of the SQLite database file.
    """
    # Only a unique file name is needed here; the handle is closed right
    # away and sqlite3 opens the file itself.
    with tempfile.NamedTemporaryFile(mode='w', suffix='.db', delete=False) as handle:
        db_path = Path(handle.name)

    try:
        conn = sqlite3.connect(str(db_path))
        try:
            sample_dataframe.to_sql(
                'nlbse_tool_competition_data_by_issue',
                conn,
                if_exists='replace',
                index=False,
            )
        finally:
            # Close the connection even if to_sql raises.
            conn.close()

        yield db_path
    finally:
        # A test may already have removed the file; that is not an error.
        try:
            db_path.unlink()
        except FileNotFoundError:
            pass
|
|
|
|
@pytest.fixture
def sample_tfidf_vectorizer():
    """Return a new, unfitted TfidfVectorizer with a small configuration.

    The vectorizer is limited to 100 features, uses unigrams and bigrams,
    and applies the built-in English stop-word list.
    """
    settings = {
        'max_features': 100,
        'ngram_range': (1, 2),
        'stop_words': 'english',
    }
    return TfidfVectorizer(**settings)
|
|
|
|
@pytest.fixture
def sample_sparse_features():
    """Fixture providing a reproducible, mostly-zero feature matrix.

    Despite the name, the result is a dense ``numpy.ndarray``, not a
    ``scipy.sparse`` matrix.

    Returns:
        numpy.ndarray: Float array of shape (100, 50). In each row, between
        5 and 10 distinct, randomly chosen columns are filled with values
        drawn uniformly from [0, 1); all remaining entries are 0.
    """
    n_samples, n_features = 100, 50
    # A locally seeded generator makes the fixture return identical data on
    # every run and leaves NumPy's global random state untouched.
    rng = np.random.RandomState(42)

    features = np.zeros((n_samples, n_features))
    for row_index in range(n_samples):
        # randint's upper bound is exclusive, so this yields 5..10.
        n_nonzero = rng.randint(5, 11)
        columns = rng.choice(n_features, n_nonzero, replace=False)
        features[row_index, columns] = rng.rand(n_nonzero)

    return features
|
|
|
|
@pytest.fixture
def sample_multilabel_data():
    """Fixture providing a reproducible multi-label indicator matrix.

    Returns:
        numpy.ndarray: Integer array of shape (100, 10) containing only 0s
        and 1s. Every row has between 1 and 5 labels set, at distinct,
        randomly chosen positions.
    """
    n_samples = 100
    n_labels = 10
    # A locally seeded generator makes the fixture return identical data on
    # every run and leaves NumPy's global random state untouched.
    rng = np.random.RandomState(7)

    labels = np.zeros((n_samples, n_labels), dtype=int)
    for sample_index in range(n_samples):
        # randint's upper bound is exclusive, so this yields 1..5.
        n_active = rng.randint(1, 6)
        active = rng.choice(n_labels, n_active, replace=False)
        labels[sample_index, active] = 1

    return labels
|
|
|
|
@pytest.fixture
def empty_text_samples():
    """Return edge-case inputs: empty, missing, blank and minimal text.

    The list mixes ``str`` values with one ``None`` so that consumers can
    be checked against missing as well as empty text.
    """
    edge_cases = [
        "",        # empty string
        None,      # missing value
        " ",       # whitespace only
        "\n\n\n",  # newlines only
        "a",       # single character
    ]
    return edge_cases
|
|