| """ |
| Prepare baseline/reference data for drift detection. |
| This script samples representative data from the training set. |
| """ |
|
|
| import pickle |
| import pandas as pd |
| import numpy as np |
| import sqlite3 |
| from pathlib import Path |
| from sklearn.model_selection import train_test_split |
|
|
| |
# Resolve paths relative to this file so the script works from any CWD.
# NOTE(review): assumes this file lives four directory levels below the
# project root — confirm against the repository layout.
PROJECT_ROOT = Path(__file__).parent.parent.parent.parent
# Directory where the sampled reference data is written.
BASELINE_DIR = Path(__file__).parent.parent / "baseline"
# Import-time side effect: ensure the output directory exists.
BASELINE_DIR.mkdir(parents=True, exist_ok=True)
|
|
|
|
def load_training_data():
    """Load the original training dataset from the SQLite database.

    Reads up to 10,000 rows from the
    ``nlbse_tool_competition_data_by_issue`` table.

    Returns:
        pd.DataFrame: the loaded training samples.

    Raises:
        FileNotFoundError: if the database file does not exist.
    """
    db_path = PROJECT_ROOT / "data" / "raw" / "skillscope_data.db"

    if not db_path.exists():
        raise FileNotFoundError(f"Database not found at {db_path}")

    print(f"Loading data from database: {db_path}")
    conn = sqlite3.connect(db_path)
    try:
        # Fix: previously the connection leaked if read_sql_query raised.
        query = "SELECT * FROM nlbse_tool_competition_data_by_issue LIMIT 10000"
        df = pd.read_sql_query(query, conn)
    finally:
        conn.close()

    print(f"Loaded {len(df)} training samples")
    return df
|
|
|
|
def prepare_baseline(df, sample_size=1000, random_state=42):
    """
    Sample representative baseline data.

    When a 'label' column is present, the sample is stratified by label so
    the baseline preserves the class distribution; otherwise a uniform
    random sample is drawn.

    Args:
        df: Training dataframe
        sample_size: Number of samples for baseline (clamped to len(df))
        random_state: Random seed for reproducibility

    Returns:
        Baseline dataframe
    """
    # Fix: clamp so we never request more rows than exist. Previously the
    # stratified branch raised when sample_size >= len(df) (only the
    # unlabeled branch had the min() guard).
    n = min(sample_size, len(df))

    if 'label' in df.columns and n < len(df):
        _, baseline_df = train_test_split(
            df,
            test_size=n,
            random_state=random_state,
            stratify=df['label']
        )
    else:
        # Plain random sample; also covers n == len(df), where a
        # train/test split would leave an empty train side.
        baseline_df = df.sample(n=n, random_state=random_state)

    print(f"Sampled {len(baseline_df)} baseline samples")
    return baseline_df
|
|
|
|
def extract_features(df):
    """
    Build the numeric feature matrix used for drift detection.
    Should match the features used by your model.
    """
    # Identifier / target / metadata columns are never model features.
    excluded = {'label', 'id', 'timestamp', 'issue_id', 'file_id',
                'method_id', 'class_id'}

    candidates = df.select_dtypes(include=[np.number]).columns
    feature_columns = [name for name in candidates if name not in excluded]

    X = df[feature_columns].values

    print(f"Extracted {X.shape[1]} numeric features from {X.shape[0]} samples")
    return X
|
|
|
|
def save_baseline(baseline_data, filename="reference_data.pkl"):
    """Save baseline data to disk."""
    # NOTE(review): pickle files must only be loaded from trusted sources.
    baseline_path = BASELINE_DIR / filename

    # Serialize in memory and write in one shot — same bytes as pickle.dump.
    baseline_path.write_bytes(pickle.dumps(baseline_data))

    print(f"Baseline saved to {baseline_path}")
    print(f"  Shape: {baseline_data.shape}")
    print(f"  Size: {baseline_path.stat().st_size / 1024:.2f} KB")
|
|
|
|
def main():
    """Main execution."""
    banner = "=" * 60
    print(banner)
    print("Preparing Baseline Data for Drift Detection")
    print(banner)

    # Pipeline: load -> sample -> featurize -> persist.
    df = load_training_data()
    baseline_df = prepare_baseline(df, sample_size=1000)
    X_baseline = extract_features(baseline_df)
    save_baseline(X_baseline)

    print("\n" + banner)
    print("Baseline preparation complete!")
    print(banner)
|
|
|
|
# Run only when executed as a script; importing the module stays side-effect
# free apart from the BASELINE_DIR creation above.
if __name__ == "__main__":
    main()