HONGRIZON
/

JNU-TSB

@@ -1,374 +0,0 @@
-from __future__ import annotations
-import json
-import os
-from pathlib import Path
-from typing import Any, Dict, Iterable, List, Optional, Sequence, Union
-import pandas as pd
-import torch
-try:
-    from .configuration_jnu_tsb import JNUTSBConfig
-    from .event_extractor import COVARIATE_COLUMNS, EventExtractor
-except ImportError:  # pragma: no cover - local execution fallback
-    from configuration_jnu_tsb import JNUTSBConfig
-    from event_extractor import COVARIATE_COLUMNS, EventExtractor
class JNUTSBRuntime:
    """Runtime used by the custom pipeline, handler.py, Gradio Space, and R examples.

    Routes inputs into three paths:
      1. stock only -> Chronos-2 forecast
      2. news only -> Polyglot/keyword event extraction
      3. both -> news covariates + stock context -> Chronos-2 forecast

    The Chronos pipeline, tokenizer, causal LM and event extractor are all
    created lazily on first access and cached on the instance.
    """

    def __init__(
        self,
        chronos_model_id: str = "amazon/chronos-2",
        llm_model_id: str = "EleutherAI/polyglot-ko-1.3b",
        device: Optional[str] = None,
        quantile_levels: Optional[Sequence[float]] = None,
        use_llm_extractor: bool = True,
        max_new_tokens: int = 96,
    ) -> None:
        """Store the configuration; no model is loaded here.

        Args:
            chronos_model_id: Identifier passed to ``Chronos2Pipeline.from_pretrained``.
            llm_model_id: Identifier passed to the tokenizer / causal-LM loaders.
            device: Target device string. When falsy, ``"cuda"`` is used if
                ``torch.cuda.is_available()`` and ``"cpu"`` otherwise.
            quantile_levels: Default forecast quantiles; a falsy value selects
                ``[0.1, 0.5, 0.9]``.
            use_llm_extractor: Whether the extractor is given the LLM text generator.
            max_new_tokens: Generation budget used by ``_generate_text``.
        """
        self.chronos_model_id = chronos_model_id
        self.llm_model_id = llm_model_id
        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
        self.quantile_levels = list(quantile_levels or [0.1, 0.5, 0.9])
        self.use_llm_extractor = bool(use_llm_extractor)
        self.max_new_tokens = int(max_new_tokens)
        # Lazily populated caches (see the properties below).
        self._chronos = None
        self._tokenizer = None
        self._llm = None
        self._extractor = None

    @classmethod
    def from_config(cls, config: "Union[JNUTSBConfig, Dict[str, Any]]", **overrides: Any) -> "JNUTSBRuntime":
        """Build a runtime from a ``JNUTSBConfig`` or a plain mapping.

        Keyword ``overrides`` replace config values; overrides whose value is
        ``None`` are ignored so callers can forward optional arguments as-is.
        """
        if isinstance(config, JNUTSBConfig):
            data = config.to_runtime_dict()
        else:
            data = dict(config)
        data.update({k: v for k, v in overrides.items() if v is not None})
        return cls(
            chronos_model_id=data.get("chronos_model_id", "amazon/chronos-2"),
            llm_model_id=data.get("llm_model_id", "EleutherAI/polyglot-ko-1.3b"),
            quantile_levels=data.get("quantile_levels", [0.1, 0.5, 0.9]),
            use_llm_extractor=data.get("use_llm_extractor", True),
            device=data.get("device"),
            max_new_tokens=data.get("max_new_tokens", 96),
        )

    @classmethod
    def from_config_dir(cls, model_dir: Union[str, os.PathLike[str]], **overrides: Any) -> "JNUTSBRuntime":
        """Build a runtime from ``<model_dir>/config.json`` (read as UTF-8 JSON)."""
        config_path = Path(model_dir) / "config.json"
        with open(config_path, "r", encoding="utf-8") as f:
            config = json.load(f)
        return cls.from_config(config, **overrides)

    @property
    def chronos(self):
        """Chronos-2 pipeline, loaded on first access and cached."""
        if self._chronos is None:
            # Imported lazily so news-only usage does not need the chronos package.
            from chronos import Chronos2Pipeline

            self._chronos = Chronos2Pipeline.from_pretrained(
                self.chronos_model_id,
                device_map=self.device,
            )
        return self._chronos

    @property
    def tokenizer(self):
        """Tokenizer for ``llm_model_id``, loaded on first access and cached."""
        if self._tokenizer is None:
            from transformers import AutoTokenizer

            self._tokenizer = AutoTokenizer.from_pretrained(self.llm_model_id)
            # generate() is called with pad_token_id, so make sure one exists.
            if self._tokenizer.pad_token is None:
                self._tokenizer.pad_token = self._tokenizer.eos_token
        return self._tokenizer

    @property
    def llm(self):
        """Causal LM for ``llm_model_id``, loaded on first access and cached.

        Uses float16 with ``device_map="auto"`` when the device string starts
        with ``"cuda"``; otherwise float32, moved explicitly to ``self.device``.
        """
        if self._llm is None:
            from transformers import AutoModelForCausalLM

            on_cuda = self.device.startswith("cuda")
            dtype = torch.float16 if on_cuda else torch.float32
            self._llm = AutoModelForCausalLM.from_pretrained(
                self.llm_model_id,
                torch_dtype=dtype,
                device_map="auto" if on_cuda else None,
            )
            if not on_cuda:
                self._llm.to(self.device)
            self._llm.eval()
        return self._llm

    @property
    def extractor(self) -> "EventExtractor":
        """Event extractor, built on first access and cached.

        ``generate_fn`` is only supplied when ``use_llm_extractor`` is true.
        """
        if self._extractor is None:
            self._extractor = EventExtractor(
                generate_fn=self._generate_text if self.use_llm_extractor else None,
                use_llm=self.use_llm_extractor,
            )
        return self._extractor

    def _generate_text(self, prompt: str) -> str:
        """Greedily generate a continuation of ``prompt`` and return only the new text."""
        tokenizer = self.tokenizer
        model = self.llm
        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1536)
        # With device_map="auto" the model decides placement; follow its parameters.
        model_device = next(model.parameters()).device
        inputs = {k: v.to(model_device) for k, v in inputs.items()}
        with torch.no_grad():
            out = model.generate(
                **inputs,
                max_new_tokens=self.max_new_tokens,
                do_sample=False,
                pad_token_id=tokenizer.pad_token_id,
                eos_token_id=tokenizer.eos_token_id,
            )
        # Drop the echoed prompt tokens; keep only what was generated.
        gen_ids = out[0, inputs["input_ids"].shape[1]:]
        return tokenizer.decode(gen_ids, skip_special_tokens=True)

    def predict(
        self,
        inputs: Optional[Dict[str, Any]] = None,
        *,
        news: Optional[Iterable[Dict[str, Any]]] = None,
        stock: Optional[Union[pd.DataFrame, List[Dict[str, Any]], Dict[str, Any], str, os.PathLike[str]]] = None,
        future_news: Optional[Iterable[Dict[str, Any]]] = None,
        future_covariates: Optional[Union[pd.DataFrame, List[Dict[str, Any]], Dict[str, Any], str, os.PathLike[str]]] = None,
        prediction_length: int = 5,
        quantile_levels: Optional[Sequence[float]] = None,
        timestamp_column: str = "timestamp",
        target: str = "target",
        id_column: str = "item_id",
        use_llm_extractor: Optional[bool] = None,
    ) -> Dict[str, Any]:
        """Route the request to the text-only, hybrid, or Chronos-only path.

        Args:
            inputs: Optional payload dict; its ``news``, ``stock``,
                ``future_news`` and ``future_covariates`` entries are used for
                any of those keyword arguments left as ``None``.
            news: News items (dicts) for the context window.
            stock: Price history as a frame, records, mapping, or CSV path.
            future_news: News items for the forecast horizon.
            future_covariates: Pre-built horizon covariates; takes precedence
                over ``future_news``.
            prediction_length: Number of future steps to forecast.
            quantile_levels: Per-call quantiles; falsy selects the instance default.
            timestamp_column: Name of the timestamp column.
            target: Name of the target column.
            id_column: Name of the series-id column.
            use_llm_extractor: Per-call override of the instance setting; the
                previous setting is restored before returning.

        Returns:
            A dict with ``"model"`` and ``"route"`` plus route-specific keys
            (``"events"``/``"daily_covariates"`` or ``"prediction"``).

        Raises:
            ValueError: If neither usable news nor a non-empty stock table is given.
        """
        inputs = dict(inputs or {})
        news = news if news is not None else inputs.get("news")
        stock = stock if stock is not None else inputs.get("stock")
        future_news = future_news if future_news is not None else inputs.get("future_news")
        future_covariates = future_covariates if future_covariates is not None else inputs.get("future_covariates")

        # Temporarily flip the extractor mode; the cached extractor is dropped
        # so the `extractor` property rebuilds it with the new setting.
        old_use_llm = self.use_llm_extractor
        override_active = use_llm_extractor is not None and bool(use_llm_extractor) != old_use_llm
        if override_active:
            self.use_llm_extractor = bool(use_llm_extractor)
            self._extractor = None
        try:
            news_list = self._normalize_news(news)
            future_news_list = self._normalize_news(future_news)
            has_text = len(news_list) > 0
            stock_df = self._to_dataframe(stock) if stock is not None else None
            has_numeric = stock_df is not None and len(stock_df) > 0
            if not has_text and not has_numeric:
                raise ValueError("news와 stock 중 최소 하나는 필요합니다.")
            q = list(quantile_levels or self.quantile_levels)

            if has_text and not has_numeric:
                daily_covariates = self.extractor.aggregate_to_daily(news_list, timestamp_column=timestamp_column)
                return {
                    "model": "JNU-TSB",
                    "route": "text_only",
                    "events": [self.extractor.extract(str(item.get("title", ""))) for item in news_list],
                    "daily_covariates": self._df_to_records(daily_covariates),
                }

            stock_df = self._prepare_stock_df(stock_df, timestamp_column=timestamp_column, target=target, id_column=id_column)

            if has_text and has_numeric:
                context_df = self._merge_news_covariates(stock_df, news_list, timestamp_column=timestamp_column)
                future_df = self._prepare_future_covariates(
                    stock_df=context_df,
                    future_news=future_news_list,
                    future_covariates=future_covariates,
                    prediction_length=prediction_length,
                    timestamp_column=timestamp_column,
                    id_column=id_column,
                )
                pred = self._predict_chronos_df(
                    context_df,
                    future_df=future_df,
                    prediction_length=int(prediction_length),
                    quantile_levels=q,
                    id_column=id_column,
                    timestamp_column=timestamp_column,
                    target=target,
                )
                return {
                    "model": "JNU-TSB",
                    "route": "hybrid",
                    "prediction": self._df_to_records(pred),
                    "context_columns": list(context_df.columns),
                    "future_covariates_used": future_df is not None,
                    "notes": "News was converted to daily covariates and merged into the Chronos-2 context.",
                }

            pred = self._predict_chronos_df(
                stock_df,
                future_df=None,
                prediction_length=int(prediction_length),
                quantile_levels=q,
                id_column=id_column,
                timestamp_column=timestamp_column,
                target=target,
            )
            return {"model": "JNU-TSB", "route": "chronos_only", "prediction": self._df_to_records(pred)}
        finally:
            if override_active:
                self.use_llm_extractor = old_use_llm
                self._extractor = None

    def __call__(self, *args: Any, **kwargs: Any) -> Dict[str, Any]:
        """Alias for :meth:`predict`."""
        return self.predict(*args, **kwargs)

    def _normalize_news(self, news: Optional[Iterable[Dict[str, Any]]]) -> List[Dict[str, Any]]:
        """Return news as a list of dict copies that always carry a ``"title"``.

        Accepts ``None`` (-> empty list), a single item dict, a ``{"data": [...]}``
        wrapper, a ``pandas.DataFrame`` (one item per row), or any iterable of
        dicts. Non-dict items are skipped. The title falls back through
        ``title`` / ``headline`` / ``text``; ``"date"`` is filled from
        ``date`` / ``timestamp`` when either is present.
        """
        if news is None:
            return []
        if isinstance(news, pd.DataFrame):
            # Iterating a DataFrame yields column names, which would all be
            # skipped below and silently drop the news; use row records instead.
            news = news.to_dict(orient="records")
        elif isinstance(news, dict):
            if "data" in news and isinstance(news["data"], list):
                news = news["data"]
            else:
                news = [news]
        out: List[Dict[str, Any]] = []
        for item in news:
            if not isinstance(item, dict):
                continue
            title = item.get("title") or item.get("headline") or item.get("text") or ""
            date = item.get("date") or item.get("timestamp")
            normalized = dict(item)
            normalized["title"] = str(title)
            if date is not None:
                normalized["date"] = date
            out.append(normalized)
        return out

    def _to_dataframe(self, data: Union[pd.DataFrame, List[Dict[str, Any]], Dict[str, Any], str, os.PathLike[str], None]) -> Optional[pd.DataFrame]:
        """Coerce supported tabular inputs into a new ``DataFrame``.

        ``None`` passes through; frames are copied; strings and path-likes are
        read with ``pd.read_csv``; lists become rows; dicts are either a
        ``{"data": [...]}`` wrapper, a column mapping, or (when pandas rejects
        the mapping with ``ValueError``) a single row.

        Raises:
            TypeError: For any other input type.
        """
        if data is None:
            return None
        if isinstance(data, pd.DataFrame):
            return data.copy()
        if isinstance(data, (str, os.PathLike)):
            return pd.read_csv(data)
        if isinstance(data, list):
            return pd.DataFrame(data)
        if isinstance(data, dict):
            if "data" in data and isinstance(data["data"], list):
                return pd.DataFrame(data["data"])
            try:
                return pd.DataFrame(data)
            except ValueError:
                # e.g. a dict of scalars: treat it as one record.
                return pd.DataFrame([data])
        raise TypeError(f"지원하지 않는 데이터 타입입니다: {type(data)}")

    def _prepare_stock_df(self, df: pd.DataFrame, timestamp_column: str, target: str, id_column: str) -> pd.DataFrame:
        """Standardise column names/dtypes of the stock table and sort it.

        Missing timestamp/target columns are filled by renaming the first
        matching common alias; a missing id column is set to ``"series_0"``.
        Rows whose timestamp or numeric target is missing are dropped.

        Raises:
            ValueError: If no timestamp or target column can be found.
        """
        df = df.copy()
        if timestamp_column not in df.columns:
            for candidate in ["date", "Date", "datetime", "time"]:
                if candidate in df.columns:
                    df = df.rename(columns={candidate: timestamp_column})
                    break
        if target not in df.columns:
            for candidate in ["close", "Close", "price", "value", "y"]:
                if candidate in df.columns:
                    df = df.rename(columns={candidate: target})
                    break
        if timestamp_column not in df.columns or target not in df.columns:
            raise ValueError(f"stock에는 `{timestamp_column}`와 `{target}` 컬럼이 필요합니다.")
        if id_column not in df.columns:
            df[id_column] = "series_0"
        df[timestamp_column] = pd.to_datetime(df[timestamp_column])
        # Unparseable targets become NaN and are removed by the dropna below.
        df[target] = pd.to_numeric(df[target], errors="coerce")
        df = df.dropna(subset=[timestamp_column, target])
        return df.sort_values([id_column, timestamp_column]).reset_index(drop=True)

    def _merge_news_covariates(self, stock_df: pd.DataFrame, news: Iterable[Dict[str, Any]], timestamp_column: str) -> pd.DataFrame:
        """Left-join daily news covariates onto the stock rows by calendar day.

        Every column in ``COVARIATE_COLUMNS`` is guaranteed to exist in the
        result as float, with 0.0 where no news matched.
        """
        # NOTE(review): assumes aggregate_to_daily returns a frame keyed by a
        # `timestamp_column` column of day-level datetimes - confirm against
        # EventExtractor.
        cov = self.extractor.aggregate_to_daily(news, timestamp_column=timestamp_column)
        context = stock_df.copy()
        day_col = "__day__"
        context[day_col] = pd.to_datetime(context[timestamp_column]).dt.floor("D")
        cov = cov.rename(columns={timestamp_column: day_col})
        merged = context.merge(cov, on=day_col, how="left").drop(columns=[day_col])
        for col in COVARIATE_COLUMNS:
            if col not in merged.columns:
                merged[col] = 0.0
            merged[col] = merged[col].fillna(0.0).astype(float)
        return merged

    @staticmethod
    def _future_timestamps(timestamps: Any, prediction_length: int) -> pd.DatetimeIndex:
        """Return ``prediction_length`` timestamps following the last observed one.

        The step is the frequency inferred from the distinct, sorted history;
        daily (``"D"``) is used when none can be inferred.
        """
        ordered = pd.to_datetime(timestamps).drop_duplicates().sort_values()
        try:
            freq = pd.infer_freq(ordered)
        except ValueError:
            # infer_freq raises (rather than returning None) when fewer than
            # three timestamps are available; fall back like any other
            # "could not infer" case instead of crashing.
            freq = None
        horizon = pd.date_range(start=ordered.max(), periods=int(prediction_length) + 1, freq=freq or "D")
        # The first generated point is the last observed timestamp itself.
        return horizon[1:]

    def _prepare_future_covariates(
        self,
        stock_df: pd.DataFrame,
        future_news: Optional[List[Dict[str, Any]]],
        future_covariates: Optional[Union[pd.DataFrame, List[Dict[str, Any]], Dict[str, Any], str, os.PathLike[str]]],
        prediction_length: int,
        timestamp_column: str,
        id_column: str,
    ) -> Optional[pd.DataFrame]:
        """Build the horizon covariate table, or ``None`` when there is none.

        Explicit ``future_covariates`` win: they are coerced to a frame, given
        the first series id if the id column is missing, and padded with 0.0
        for absent ``COVARIATE_COLUMNS``. An explicit but empty table yields
        ``None`` (``future_news`` is not consulted in that case). Otherwise
        ``future_news`` is aggregated per day and joined onto generated future
        timestamps.

        Only the first series id in ``stock_df`` is used for generated rows.

        Raises:
            ValueError: If ``future_covariates`` lacks the timestamp column.
        """
        if future_covariates is not None:
            fut = self._to_dataframe(future_covariates)
            if fut is None or len(fut) == 0:
                return None
            if id_column not in fut.columns:
                fut[id_column] = stock_df[id_column].iloc[0]
            if timestamp_column not in fut.columns:
                raise ValueError(f"future_covariates에는 `{timestamp_column}` 컬럼이 필요합니다.")
            fut[timestamp_column] = pd.to_datetime(fut[timestamp_column])
            for col in COVARIATE_COLUMNS:
                if col not in fut.columns:
                    fut[col] = 0.0
            return fut

        if not future_news:
            return None

        first_id = stock_df[id_column].iloc[0]
        future_dates = self._future_timestamps(stock_df[timestamp_column], prediction_length)
        base = pd.DataFrame({id_column: first_id, timestamp_column: future_dates})
        cov = self.extractor.aggregate_to_daily(future_news, timestamp_column=timestamp_column)
        base["__day__"] = pd.to_datetime(base[timestamp_column]).dt.floor("D")
        cov = cov.rename(columns={timestamp_column: "__day__"})
        fut = base.merge(cov, on="__day__", how="left").drop(columns=["__day__"])
        for col in COVARIATE_COLUMNS:
            if col not in fut.columns:
                fut[col] = 0.0
            fut[col] = fut[col].fillna(0.0).astype(float)
        return fut

    def _predict_chronos_df(
        self,
        context_df: pd.DataFrame,
        *,
        future_df: Optional[pd.DataFrame],
        prediction_length: int,
        quantile_levels: Sequence[float],
        id_column: str,
        timestamp_column: str,
        target: str,
    ) -> pd.DataFrame:
        """Call ``self.chronos.predict_df``; ``future_df`` is passed only when not ``None``."""
        kwargs: Dict[str, Any] = {
            "prediction_length": int(prediction_length),
            "quantile_levels": list(quantile_levels),
            "id_column": id_column,
            "timestamp_column": timestamp_column,
            "target": target,
        }
        if future_df is not None:
            kwargs["future_df"] = future_df
        return self.chronos.predict_df(context_df, **kwargs)

    def _df_to_records(self, df: pd.DataFrame) -> List[Dict[str, Any]]:
        """Convert a frame to a list of row dicts, rendering datetime columns as strings."""
        out = df.copy()
        for col in out.columns:
            if pd.api.types.is_datetime64_any_dtype(out[col]):
                out[col] = out[col].astype(str)
        return out.to_dict(orient="records")