"""
Unit tests for the dataset.py module.

Tests functions for downloading and extracting the SkillScope dataset.
"""
import sqlite3
import tempfile
import zipfile
from contextlib import closing
from pathlib import Path
from unittest.mock import MagicMock, patch

import pytest

from hopcroft_skill_classification_tool_competition.dataset import (
    download_skillscope_dataset,
)
|
|
|
|
@pytest.mark.unit
class TestDatasetDownload:
    """Unit tests for dataset download functionality.

    Every test patches ``hf_hub_download`` in the dataset module; tests that
    expect a download point the mock at a small zip archive built inside a
    temporary directory.
    """

    # Patch target: the name as it is looked up inside the dataset module.
    _HF_HUB_DOWNLOAD = (
        "hopcroft_skill_classification_tool_competition.dataset.hf_hub_download"
    )

    @staticmethod
    def _write_db_archive(
        zip_path,
        scratch_db,
        ddl="CREATE TABLE test (id INTEGER)",
    ):
        """Create a zip at ``zip_path`` containing a one-table SQLite file.

        Args:
            zip_path: Where the archive is written.
            scratch_db: Temporary location used to build the database; the
                file is removed once it has been added to the archive.
            ddl: ``CREATE TABLE`` statement run against the new database.

        Returns:
            ``zip_path`` as a string, ready to be used as the mocked return
            value of ``hf_hub_download``.
        """
        # ``closing`` releases the handle even if the DDL fails; a leaked
        # handle can block temporary-directory cleanup on Windows.
        with closing(sqlite3.connect(scratch_db)) as conn:
            conn.execute(ddl)

        with zipfile.ZipFile(zip_path, "w") as archive:
            archive.write(scratch_db, arcname="skillscope_data.db")

        scratch_db.unlink()
        return str(zip_path)

    def test_download_returns_path(self):
        """Test that download function returns a Path object."""
        with tempfile.TemporaryDirectory() as tmpdir:
            output_dir = Path(tmpdir)

            with patch(self._HF_HUB_DOWNLOAD) as mock_download:
                # The scratch file sits at the final database path; the helper
                # deletes it so that only the archived copy remains.
                mock_download.return_value = self._write_db_archive(
                    output_dir / "skillscope_data.zip",
                    output_dir / "skillscope_data.db",
                )

                result = download_skillscope_dataset(output_dir)

            assert isinstance(result, Path)
            assert result.exists()
            assert result.name == "skillscope_data.db"

    def test_download_creates_directory(self):
        """Test that download creates output directory if it doesn't exist."""
        with tempfile.TemporaryDirectory() as tmpdir:
            output_dir = Path(tmpdir) / "nonexistent" / "nested" / "dir"
            assert not output_dir.exists()

            with patch(self._HF_HUB_DOWNLOAD) as mock_download:
                # The archive lives outside ``output_dir`` so that building it
                # does not create the directory under test.
                mock_download.return_value = self._write_db_archive(
                    Path(tmpdir) / "skillscope_data.zip",
                    Path(tmpdir) / "skillscope_data.db",
                )

                download_skillscope_dataset(output_dir)

            assert output_dir.is_dir()

    def test_download_skips_if_exists(self):
        """Test that download is skipped if database already exists."""
        with tempfile.TemporaryDirectory() as tmpdir:
            output_dir = Path(tmpdir)
            db_path = output_dir / "skillscope_data.db"

            with closing(sqlite3.connect(db_path)) as conn:
                conn.execute("CREATE TABLE test (id INTEGER)")

            with patch(self._HF_HUB_DOWNLOAD) as mock_download:
                result = download_skillscope_dataset(output_dir)

            mock_download.assert_not_called()
            assert result == db_path

    def test_download_extracts_zip(self):
        """Test that zip file is properly extracted."""
        table_name = "nlbse_tool_competition_data_by_issue"

        with tempfile.TemporaryDirectory() as tmpdir:
            output_dir = Path(tmpdir)

            with patch(self._HF_HUB_DOWNLOAD) as mock_download:
                mock_download.return_value = self._write_db_archive(
                    output_dir / "skillscope_data.zip",
                    output_dir / "temp_skillscope_data.db",
                    ddl=f"CREATE TABLE {table_name} (id INTEGER)",
                )

                result = download_skillscope_dataset(output_dir)

            assert result.exists()

            with closing(sqlite3.connect(result)) as conn:
                tables = conn.execute(
                    "SELECT name FROM sqlite_master WHERE type='table'"
                ).fetchall()

            # Check for the archived table itself, not merely "some table".
            assert (table_name,) in tables

    def test_download_cleans_up_zip(self):
        """Test that zip file is deleted after extraction."""
        with tempfile.TemporaryDirectory() as tmpdir:
            output_dir = Path(tmpdir)
            zip_path = output_dir / "skillscope_data.zip"

            with patch(self._HF_HUB_DOWNLOAD) as mock_download:
                mock_download.return_value = self._write_db_archive(
                    zip_path,
                    output_dir / "temp_db.db",
                )

                download_skillscope_dataset(output_dir)

            assert not zip_path.exists()

    def test_download_raises_on_missing_database(self):
        """Test that error is raised if database not in zip."""
        with tempfile.TemporaryDirectory() as tmpdir:
            output_dir = Path(tmpdir)
            zip_path = output_dir / "skillscope_data.zip"

            # An archive that deliberately lacks ``skillscope_data.db``.
            with zipfile.ZipFile(zip_path, "w") as archive:
                archive.writestr("dummy.txt", "dummy content")

            with patch(self._HF_HUB_DOWNLOAD) as mock_download:
                mock_download.return_value = str(zip_path)

                with pytest.raises(FileNotFoundError):
                    download_skillscope_dataset(output_dir)
|
|
|
|
@pytest.mark.unit
class TestDatasetEdgeCases:
    """Unit tests for edge cases in dataset handling."""

    def test_download_with_none_output_dir(self):
        """Test download with None as output directory (should use default)."""
        with tempfile.TemporaryDirectory() as tmpdir:
            db_path = Path(tmpdir) / "skillscope_data.db"

            # Pre-create the database so the call has nothing to download.
            with closing(sqlite3.connect(db_path)) as conn:
                conn.execute("CREATE TABLE test (id INTEGER)")

            with patch(
                "hopcroft_skill_classification_tool_competition.dataset.hf_hub_download"
            ) as mock_download:
                with patch(
                    "hopcroft_skill_classification_tool_competition.dataset.RAW_DATA_DIR"
                ) as mock_raw_dir:
                    # Any ``RAW_DATA_DIR / <name>`` expression resolves to the
                    # database prepared above.
                    mock_raw_dir.__truediv__ = MagicMock(return_value=db_path)

                    result = download_skillscope_dataset(None)

            assert isinstance(result, Path)
            # The database already exists, so no download may be attempted.
            mock_download.assert_not_called()

    @pytest.mark.skip(reason="Permission-error scenario is not implemented yet")
    def test_download_handles_permission_error(self):
        """Test handling of permission errors during file operations."""
        # TODO: simulate a PermissionError (for example through the
        # ``side_effect`` of a patched ``hf_hub_download``) and assert on the
        # outcome once the expected behaviour is decided. Until then the test
        # is reported as skipped instead of passing without checking anything.
|
|
|
|
@pytest.mark.unit
class TestDatasetIntegration:
    """Integration-like tests for dataset module (still unit-scoped)."""

    def test_download_produces_valid_sqlite_database(self):
        """Test that downloaded file is a valid SQLite database."""
        with tempfile.TemporaryDirectory() as tmpdir:
            output_dir = Path(tmpdir)
            temp_db = output_dir / "temp.db"
            zip_path = output_dir / "skillscope_data.zip"

            # Build a database with one known row to look for after the
            # download/extract round trip. ``closing`` releases the handle
            # even if a statement fails, so directory cleanup is not blocked.
            with closing(sqlite3.connect(temp_db)) as conn:
                conn.execute("""
                    CREATE TABLE nlbse_tool_competition_data_by_issue (
                        id INTEGER PRIMARY KEY,
                        repo_name TEXT,
                        pr_number INTEGER
                    )
                """)
                conn.execute("""
                    INSERT INTO nlbse_tool_competition_data_by_issue
                    VALUES (1, 'test_repo', 123)
                """)
                conn.commit()

            with zipfile.ZipFile(zip_path, "w") as archive:
                archive.write(temp_db, arcname="skillscope_data.db")
            temp_db.unlink()

            with patch(
                "hopcroft_skill_classification_tool_competition.dataset.hf_hub_download"
            ) as mock_download:
                mock_download.return_value = str(zip_path)

                result = download_skillscope_dataset(output_dir)

            with closing(sqlite3.connect(result)) as conn:
                rows = conn.execute(
                    "SELECT * FROM nlbse_tool_competition_data_by_issue"
                ).fetchall()

            # Exactly the inserted row must come back, column for column.
            assert rows == [(1, "test_repo", 123)]
|
|