File size: 7,811 Bytes
86b932c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
import os
import sys
import json
import time
import logging
import pickle
import numpy as np
import pandas as pd
import yaml
from sklearn.model_selection import StratifiedShuffleSplit

# Fix paths for imports
_PROJECT_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
if str(_PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(_PROJECT_ROOT))

from src.utils.text_utils import clean_text, build_full_text, word_count, text_length_bucket
from src.utils.domain_weights import compute_domain_weights
from src.utils.freshness import apply_freshness_score

# Stage-wide logging setup: timestamped, level-aligned records for this script.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s | %(levelname)-8s | %(name)s | %(message)s",
    datefmt="%H:%M:%S"
)
# Module logger used by every function in this stage.
logger = logging.getLogger("stage2_preprocessing")

class KerasStyleTokenizer:
    """A lightweight, PyTorch-compatible word tokenizer mimicking Keras's Tokenizer.

    Index conventions (matching Keras):
      * 0 is reserved for padding and never assigned to a word.
      * 1 is the OOV token.
      * Real words get indices >= 2, ranked by descending corpus frequency.
    """

    def __init__(self, num_words=None, oov_token="<OOV>"):
        # num_words is the total index budget, including the reserved
        # padding (0) and OOV (1) slots; None means "keep every word".
        self.num_words = num_words
        self.oov_token = oov_token
        self.word_index = {self.oov_token: 1}  # 0 is reserved for padding
        self.word_counts = {}

    def fit_on_texts(self, texts):
        """Accumulate word frequencies and (re)build the index table.

        Counts accumulate across repeated calls (Keras semantics), but the
        index table is rebuilt from scratch on every call. The previous
        in-place-update implementation left stale indices behind for words
        that dropped out of the top-N after a re-fit.
        """
        for text in texts:
            # clean_text already removes punctuation, we just split by space
            for w in str(text).split():
                self.word_counts[w] = self.word_counts.get(w, 0) + 1

        # Rank by frequency; Python's stable sort keeps first-seen order for ties.
        sorted_words = sorted(self.word_counts.items(), key=lambda x: x[1], reverse=True)

        # Rebuild (not mutate) so a re-fit cannot leave stale entries.
        index = {self.oov_token: 1}
        for idx, (w, _) in enumerate(sorted_words):
            # Two slots are reserved (padding=0, OOV=1), so keep at most
            # num_words - 2 real words.
            if self.num_words and idx >= self.num_words - 2:
                break
            index[w] = idx + 2
        self.word_index = index

    def texts_to_sequences(self, texts):
        """Map each text to a list of word indices; unknown words map to OOV (1)."""
        return [
            [self.word_index.get(w, 1) for w in str(text).split()]
            for text in texts
        ]

def truncate_str_array(df, col):
    """Memory fix: force string type for arrays.

    Returns column *col* of *df* coerced to Python strings, as a numpy array.
    """
    as_strings = df[col].astype(str)
    return as_strings.values

def _load_default_config() -> dict:
    """Load config/config.yaml from the project root."""
    cfg_path = os.path.join(_PROJECT_ROOT, "config", "config.yaml")
    with open(cfg_path, "r", encoding="utf-8") as f:
        return yaml.safe_load(f)


def _build_text_features(df: pd.DataFrame) -> pd.DataFrame:
    """Add full_text, clean_text, word_count and text_length_bucket columns."""
    # "Concatenate title + '. ' + text as full_text" -> use build_full_text
    df["full_text"] = df.apply(lambda r: build_full_text(
        str(r["title"]) if pd.notna(r["title"]) else "",
        str(r["text"]) if pd.notna(r["text"]) else ""
    ), axis=1)
    # Lowercasing, HTML/URL stripping, whitespace and punctuation removal are
    # all handled by clean_text.
    logger.info("Applying text cleaning (HTML, URLs, whitespace, punctuation) ...")
    df["clean_text"] = df["full_text"].apply(clean_text)

    logger.info("Calculating word counts and text buckets ...")
    df["word_count"] = df["clean_text"].apply(word_count)
    df["text_length_bucket"] = df["word_count"].apply(text_length_bucket)
    return df


def _make_splits(df: pd.DataFrame):
    """Return (train, val, holdout, test) DataFrames.

    Routing by dataset_origin:
      * "testing"             -> sacred test set (never touched in training)
      * "stratified_holdout"  -> stage-3 proxy
      * everything else       -> train pool, split 85/15 into train/val with
                                 StratifiedShuffleSplit on binary_label.
    """
    test_mask = df["dataset_origin"] == "testing"
    holdout_mask = df["dataset_origin"] == "stratified_holdout"
    train_pool_df = df[~(test_mask | holdout_mask)].copy().reset_index(drop=True)

    sss = StratifiedShuffleSplit(n_splits=1, test_size=0.15, random_state=42)
    train_idx, val_idx = next(sss.split(train_pool_df, train_pool_df["binary_label"]))
    train_df = train_pool_df.iloc[train_idx].copy()
    val_df = train_pool_df.iloc[val_idx].copy()
    return train_df, val_df, df[holdout_mask].copy(), df[test_mask].copy()


def _save_split(splits_dir: str, split_name: str, split_data: pd.DataFrame) -> None:
    """Persist text/label/weight arrays plus a JSON metadata summary for one split."""
    np.save(os.path.join(splits_dir, f"X_text_{split_name}.npy"), truncate_str_array(split_data, "clean_text"))
    np.save(os.path.join(splits_dir, f"y_{split_name}.npy"), split_data["binary_label"].values)
    np.save(os.path.join(splits_dir, f"w_{split_name}.npy"), split_data["sample_weight"].values)

    meta = {
        "size": len(split_data),
        "fake_count": int((split_data["binary_label"] == 0).sum()),
        "true_count": int((split_data["binary_label"] == 1).sum()),
        "word_count_median": float(split_data["word_count"].median()),
        "freshness_mean": float(split_data["freshness_score"].mean())
    }
    with open(os.path.join(splits_dir, f"meta_{split_name}.json"), "w") as f:
        json.dump(meta, f, indent=2)


def run_preprocessing(cfg: dict = None):
    """Stage 2: clean text, weight samples, build splits and fit the tokenizer.

    Parameters
    ----------
    cfg : dict, optional
        Parsed pipeline configuration. When None, config/config.yaml is
        loaded from the project root.

    Side effects: writes .npy/.json/.csv split artifacts into splits_dir and
    pickles the fitted tokenizer into models_dir.
    """
    t0 = time.perf_counter()
    if cfg is None:
        cfg = _load_default_config()

    logger.info("STAGE 2: PREPROCESSING START")
    processed_dir = os.path.join(_PROJECT_ROOT, cfg["paths"]["processed_dir"])
    splits_dir = os.path.join(_PROJECT_ROOT, cfg["paths"]["splits_dir"])
    models_dir = os.path.join(_PROJECT_ROOT, cfg["paths"]["models_dir"])
    os.makedirs(splits_dir, exist_ok=True)
    os.makedirs(models_dir, exist_ok=True)

    # 1. Load Data
    csv_path = os.path.join(processed_dir, "unified.csv")
    df = pd.read_csv(csv_path)
    df["published_date"] = pd.to_datetime(df["published_date"], errors="coerce")
    logger.info("Loaded unified CSV: %d rows", len(df))

    # 2. Extract Text Length Features & Clean
    df = _build_text_features(df)

    # 3. Domain Weights
    ds_cfg = cfg.get("dataset", {})
    min_domains = ds_cfg.get("min_domain_samples", 20)
    max_multi = cfg.get("inference", {}).get("max_multiplier", 10)

    logger.info("Computing domain-aware sample weights...")
    df = compute_domain_weights(df, min_domain_samples=min_domains, max_multiplier=max_multi)

    # 4. Freshness
    logger.info("Applying temporal freshness scores...")
    df = apply_freshness_score(df, is_inference=False)

    # 5. Train/Val/Test Splits
    train_df, val_df, holdout_df, test_df = _make_splits(df)

    logger.info("SPLITS SUMMARY:")
    logger.info("  Train:     %d rows", len(train_df))
    logger.info("  Val:       %d rows", len(val_df))
    logger.info("  Holdout:   %d rows", len(holdout_df))
    logger.info("  Sacred:    %d rows", len(test_df))

    # 6. Save splits metadata and arrays
    # Saving raw text separately just for PyTorch dataset convenience (faster
    # than pd.read_csv for big models).
    splits_dict = {
        "train": train_df,
        "val": val_df,
        "holdout": holdout_df,
        "test": test_df
    }
    for split_name, split_data in splits_dict.items():
        _save_split(splits_dir, split_name, split_data)

    # Save train_ids.csv explicitly
    train_df[["article_id"]].to_csv(os.path.join(splits_dir, "train_ids.csv"), index=False)
    # Also save the full preprocessed test sets to CSV for easy loading during
    # Stage 3 / Evaluation.
    train_df.to_csv(os.path.join(splits_dir, "df_train.csv"), index=False)
    val_df.to_csv(os.path.join(splits_dir, "df_val.csv"), index=False)
    holdout_df.to_csv(os.path.join(splits_dir, "df_holdout.csv"), index=False)
    test_df.to_csv(os.path.join(splits_dir, "df_test.csv"), index=False)

    # 7. Tokenization (LSTM)
    logger.info("Fitting LSTM Tokenizer on Train split...")
    # NOTE(review): vocab size is read from the TF-IDF key ("max_tfidf_features")
    # for the LSTM tokenizer as well, defaulting to 50k — confirm intentional.
    vocab_size = cfg.get("preprocessing", {}).get("max_tfidf_features", 50000)
    tok = KerasStyleTokenizer(num_words=vocab_size)
    tok.fit_on_texts(train_df["clean_text"])

    tok_path = os.path.join(models_dir, "tokenizer.pkl")
    with open(tok_path, "wb") as f:
        pickle.dump(tok, f)
    # Lazy %-formatting for consistency with the rest of this module's logging.
    logger.info("Saved tokenizer to %s (vocab size: %d)", tok_path, len(tok.word_index))

    t_end = time.perf_counter()
    logger.info("STAGE 2 FINISHED in %.2f seconds", t_end - t0)

# Script entry point: run the full preprocessing stage with the default config.
if __name__ == "__main__":
    run_preprocessing()