Spaces:
Running
Running
| from datetime import datetime, timezone | |
| import pandas as pd | |
| import numpy as np | |
def _ensure_utc(moment):
    """Return ``moment`` as a timezone-aware datetime, reading naive values as UTC.

    Args:
        moment: A ``datetime``, ``pd.Timestamp`` or ``np.datetime64`` value.

    Returns:
        A timezone-aware ``datetime`` (``pd.Timestamp`` inputs stay Timestamps).

    Raises:
        TypeError: If ``moment`` is not a datetime-like value.
    """
    if isinstance(moment, np.datetime64):
        # numpy datetimes have no tz support and cannot be subtracted from a
        # stdlib datetime, so lift them into a pandas Timestamp first.
        moment = pd.Timestamp(moment)
    if not isinstance(moment, datetime):
        raise TypeError(
            f"expected a datetime-like value, got {type(moment).__name__}"
        )
    if moment.tzinfo is None:
        # Naive values are assumed to be UTC, which is typical for web dates.
        return moment.replace(tzinfo=timezone.utc)
    return moment


def calculate_freshness(
    published_date,
    has_date: bool,
    is_inference: bool = False,
    reference_date: datetime = None
) -> float:
    """
    Calculate the temporal freshness score for a single article.

    Rules:
    - score = 1.0 if the article is < 30 days old (future dates count as 0 days)
    - score = max(0.1, 1 - (days_old / 365)) for older articles
    - score = 0.5 if no usable date is present (neutral for training)
    - score = 0.35 if no usable date is present AND called from inference

    Args:
        published_date: The published date of the article (datetime,
            pd.Timestamp, np.datetime64, or a missing value such as None/NaT).
            Naive values are interpreted as UTC.
        has_date: Boolean flag indicating if a valid date is present.
        is_inference: Whether the scoring is happening during live inference.
        reference_date: The date to compute 'days_old' against (defaults to
            the current UTC time). Naive values are interpreted as UTC.

    Returns:
        Float score between 0.1 and 1.0.

    Raises:
        TypeError: If a date is flagged as present but ``published_date`` (or
            ``reference_date``) is not a datetime-like value.
    """
    if not has_date or pd.isna(published_date):
        return 0.35 if is_inference else 0.50

    if reference_date is None:
        reference = datetime.now(timezone.utc)
    else:
        # Normalise the caller-supplied reference too; otherwise a naive
        # reference cannot be subtracted from the (now aware) published date.
        reference = _ensure_utc(reference_date)

    published = _ensure_utc(published_date)

    # Clamp future dates (e.g. badly parsed data) to "published today".
    days_old = max((reference - published).days, 0)

    if days_old < 30:
        return 1.0
    return max(0.1, 1.0 - (days_old / 365.0))
def apply_freshness_score(df: pd.DataFrame, is_inference: bool = False) -> pd.DataFrame:
    """
    Return a copy of ``df`` with a ``freshness_score`` column added.

    Each row is scored with ``calculate_freshness`` using the row's
    ``published_date`` and ``has_date`` values. When a row has no
    ``has_date`` entry, the flag falls back to whether ``published_date``
    is non-null. The input frame is not modified.

    Args:
        df: Frame of articles to score.
        is_inference: Forwarded to ``calculate_freshness`` for every row.

    Returns:
        A new DataFrame with the extra ``freshness_score`` column.
    """
    scored = df.copy()

    # One shared reference time so every row is aged against the same instant.
    now_utc = datetime.now(timezone.utc)

    def _score_row(row):
        published = row.get("published_date")
        date_flag = row.get("has_date", pd.notna(published))
        return calculate_freshness(published, date_flag, is_inference, now_utc)

    # Row-wise apply: this runs the scorer once per row (not truly vectorized).
    scored["freshness_score"] = scored.apply(_score_row, axis=1)
    return scored