# TruthLens / src / utils / freshness.py
# DevPatel0611 - "Clean build with correct gitignore" (86b932c)
from datetime import datetime, timezone
import pandas as pd
import numpy as np
def _coerce_to_utc_aware(moment):
    """Return ``moment`` as a timezone-aware datetime, or None if unreadable.

    Naive datetimes (including naive ``pd.Timestamp``) are assumed to be UTC.
    Values that are not datetimes (strings, ``datetime.date``,
    ``numpy.datetime64``, ...) are parsed with ``pd.to_datetime``; anything
    that cannot be parsed yields None.
    """
    if isinstance(moment, datetime):
        if moment.tzinfo is None:
            return moment.replace(tzinfo=timezone.utc)
        return moment

    try:
        parsed = pd.to_datetime(moment, utc=True, errors="coerce")
    except (TypeError, ValueError):
        return None
    return None if parsed is pd.NaT else parsed


def calculate_freshness(
    published_date,
    has_date: bool,
    is_inference: bool = False,
    reference_date: datetime = None
) -> float:
    """
    Calculate the temporal freshness score for a single article.

    Rules:
    - score = 1.0 if the article is < 30 days old (future dates count as 0 days)
    - score = max(0.1, 1 - (days_old / 365)) for older articles
    - score = 0.5 if no usable date is available (neutral for training)
    - score = 0.35 if no usable date is available AND called from inference

    Args:
        published_date: The published date of the article. Datetimes and
            pandas Timestamps are used directly (naive values are assumed to
            be UTC); other values are parsed with ``pd.to_datetime``. None,
            NaT, or an unparseable value is treated as "no date".
        has_date: Boolean flag indicating if a valid date is present.
        is_inference: Whether the scoring is happening during live inference.
        reference_date: The date to compute 'days_old' against (defaults to
            the current UTC time). Naive values are assumed to be UTC.

    Returns:
        Float score between 0.1 and 1.0.

    Raises:
        TypeError: If ``reference_date`` is given but cannot be interpreted
            as a date.
    """
    undated_score = 0.35 if is_inference else 0.50

    if not has_date or pd.isna(published_date):
        return undated_score

    if reference_date is None:
        reference_date = datetime.now(timezone.utc)
    else:
        # Normalise the reference as well: subtracting a naive datetime from
        # an aware one raises TypeError.
        reference_date = _coerce_to_utc_aware(reference_date)
        if reference_date is None:
            raise TypeError(
                "reference_date must be a datetime or a parseable date value"
            )

    published_date = _coerce_to_utc_aware(published_date)
    if published_date is None:
        # The flag claimed a date was present but the value is unusable;
        # fall back to the same neutral score as a missing date.
        return undated_score

    # Future dates (e.g. badly parsed data) are clamped to "published today".
    days_old = max((reference_date - published_date).days, 0)

    if days_old < 30:
        return 1.0
    return max(0.1, 1.0 - (days_old / 365.0))
def apply_freshness_score(df: pd.DataFrame, is_inference: bool = False) -> pd.DataFrame:
    """
    Return a copy of ``df`` with a ``freshness_score`` column added.

    Each row is scored with ``calculate_freshness`` using the row's
    ``published_date`` value. When the row has no ``has_date`` entry, the
    flag defaults to whether ``published_date`` is non-null. All rows are
    scored against the same reference time, taken once per call.
    """
    scored = df.copy()
    now_utc = datetime.now(timezone.utc)

    def _score_row(row):
        # Row-by-row scoring: calculate_freshness works on scalars.
        dated_flag = row.get("has_date", pd.notna(row.get("published_date")))
        return calculate_freshness(
            row.get("published_date"),
            dated_flag,
            is_inference,
            now_utc,
        )

    scored["freshness_score"] = scored.apply(_score_row, axis=1)
    return scored