Time Series Forecasting
Transformers
PyTorch
Korean
jnu_tsb
feature-extraction
jnu-tsb
time-series
forecasting
chronos-2
polyglot-ko
korean
finance
covariates
r
reticulate
education
custom_code
Instructions to use HONGRIZON/JNU-TSB with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use HONGRIZON/JNU-TSB with Transformers:
# Load model directly
from transformers import AutoModel
model = AutoModel.from_pretrained("HONGRIZON/JNU-TSB", trust_remote_code=True, dtype="auto")
- Notebooks
- Google Colab
- Kaggle
Delete event_extractor.py
Browse files
- event_extractor.py +0 -206
event_extractor.py
DELETED
|
@@ -1,206 +0,0 @@
|
|
| 1 |
-
from __future__ import annotations
|
| 2 |
-
|
| 3 |
-
import json
|
| 4 |
-
import re
|
| 5 |
-
from collections import Counter
|
| 6 |
-
from dataclasses import dataclass
|
| 7 |
-
from typing import Any, Callable, Dict, Iterable, List, Optional
|
| 8 |
-
|
| 9 |
-
import pandas as pd
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
# Event labels the extractor may assign to a headline; "other" is the catch-all
# used whenever a label is missing or not recognised.
DEFAULT_CATEGORIES = [
    "earnings",
    "product",
    "macro",
    "regulation",
    "supply_chain",
    "competition",
    "other",
]

# Column layout of the daily covariate frame: one count column per default
# category (same order as DEFAULT_CATEGORIES), followed by the sentiment
# counters and the per-day summary statistics.
COVARIATE_COLUMNS = [f"cov_{label}_count" for label in DEFAULT_CATEGORIES] + [
    "cov_sentiment_pos_count",
    "cov_sentiment_neg_count",
    "cov_sentiment_neu_count",
    "cov_news_count",
    "cov_sentiment_mean",
    "cov_confidence_mean",
    "cov_event_score",
]
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
@dataclass
class EventResult:
    """One labelled news headline.

    Instances are built by the extractor in this file and are normally
    handed onwards as plain dictionaries via :meth:`to_dict`.
    """

    # Event label, e.g. "earnings" or "other".
    category: str
    # Direction of the headline; the producers in this file emit -1, 0 or 1.
    sentiment: int
    # Score attached to the label; the producers in this file keep it in [0, 1].
    confidence: float
    # Which path produced the label ("polyglot-ko" or "keyword" in this file).
    source: str
    # Text the label was derived from (raw model output or the headline itself).
    raw_text: str = ""

    def to_dict(self) -> Dict[str, Any]:
        """Return the fields as a plain dict with numeric fields coerced.

        ``sentiment`` is passed through ``int`` and ``confidence`` through
        ``float`` so the result holds builtin numbers regardless of what the
        instance was constructed with.
        """
        return dict(
            category=self.category,
            sentiment=int(self.sentiment),
            confidence=float(self.confidence),
            source=self.source,
            raw_text=self.raw_text,
        )
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
class EventExtractor:
    """Convert Korean financial news titles into event labels and daily covariates.

    The preferred path asks Polyglot-Ko to emit a small JSON object. Because
    Polyglot-Ko-1.3B is a base LM rather than a dedicated instruction-tuned JSON
    extractor, this class includes a deterministic keyword fallback for classroom
    stability and CPU-only smoke tests.

    Args:
        generate_fn: Callable that takes a prompt string and returns the model's
            text completion. When ``None`` only the keyword fallback is used.
        categories: Allowed event labels. Defaults to a copy of
            ``DEFAULT_CATEGORIES``; the list is copied so later mutation of the
            instance never leaks into the caller's list or the module constant.
        use_llm: Set to ``False`` to force the keyword fallback even when
            ``generate_fn`` is provided.
    """

    # Keyword rules for the fallback path, checked in order: the first rule with
    # any keyword contained in the lower-cased title decides the category.
    _CATEGORY_KEYWORDS = (
        ("earnings", ("실적", "매출", "영업이익", "순이익", "어닝", "흑자", "적자")),
        ("product", ("신제품", "출시", "양산", "개발", "특허", "수주", "계약")),
        ("macro", ("금리", "환율", "물가", "경기", "수출", "고용", "fed", "fomc", "인플레이션")),
        ("regulation", ("규제", "과징금", "소송", "정부", "공정위", "금감원", "제재", "법안")),
        ("supply_chain", ("공급", "공장", "파업", "재고", "물류", "원자재", "반도체", "hbm")),
        ("competition", ("경쟁", "점유율", "추격", "라이벌", "중국", "미국", "엔비디아", "sk하이닉스")),
    )
    _POSITIVE_KEYWORDS = (
        "상승", "호조", "개선", "수혜", "확대", "증가", "돌파", "계약", "수주", "흑자", "최대", "출시",
    )
    _NEGATIVE_KEYWORDS = (
        "하락", "부진", "둔화", "우려", "감소", "적자", "철회", "급락", "리콜", "과징금", "악화", "파업",
    )

    def __init__(
        self,
        generate_fn: Optional[Callable[[str], str]] = None,
        categories: Optional[List[str]] = None,
        use_llm: bool = True,
    ) -> None:
        self.generate_fn = generate_fn
        # Copy so the instance never aliases the module default or the caller's list.
        self.categories = list(categories) if categories else list(DEFAULT_CATEGORIES)
        self.use_llm = bool(use_llm)

    def build_prompt(self, title: str) -> str:
        """Return the Korean instruction prompt asking the model for a JSON label."""
        cats = ", ".join(self.categories)
        return (
            "다음 한국어 금융뉴스 제목을 분석해 JSON만 출력하세요.\n"
            f"가능한 category: {cats}\n"
            "sentiment는 주가 관점에서 -1, 0, 1 중 하나입니다.\n"
            "confidence는 0.0~1.0 숫자입니다.\n"
            "출력 형식: {\"category\":\"...\",\"sentiment\":0,\"confidence\":0.5}\n"
            f"뉴스 제목: {title}\n"
            "JSON:"
        )

    def extract(self, title: str) -> Dict[str, Any]:
        """Label one headline and return the result as a plain dict.

        The LLM path is tried first when enabled and a ``generate_fn`` exists.
        If it raises or yields no parseable JSON object, the keyword fallback
        is used instead, so this method does not propagate model errors.

        Returns:
            Dict with keys ``category``, ``sentiment``, ``confidence``,
            ``source`` and ``raw_text``.
        """
        title = str(title or "").strip()
        if self.use_llm and self.generate_fn is not None:
            try:
                raw = self.generate_fn(self.build_prompt(title))
                parsed = self._parse_json(raw)
                if parsed is not None:
                    return self._normalize(parsed, raw_text=raw, source="polyglot-ko").to_dict()
            except Exception:
                # Deliberate best-effort: classroom demos must keep running, so
                # fall through to the keyword path. The failure is recorded at
                # DEBUG level so a broken generate_fn can still be diagnosed.
                import logging

                logging.getLogger(__name__).debug(
                    "LLM event extraction failed; using keyword fallback", exc_info=True
                )
        return self._keyword_fallback(title).to_dict()

    def aggregate_to_daily(
        self,
        news: Iterable[Dict[str, Any]],
        date_key: str = "date",
        title_key: str = "title",
        timestamp_column: str = "timestamp",
    ) -> pd.DataFrame:
        """Label every news item and aggregate the labels into one row per day.

        Items that are not dicts, have no title (``title_key``, ``"text"`` or
        ``"headline"``), have no date (``date_key`` or ``timestamp_column``),
        or whose date cannot be parsed are skipped.

        Returns:
            DataFrame sorted by ``timestamp_column`` whose columns are
            ``timestamp_column``, then ``COVARIATE_COLUMNS``, then one
            ``cov_<category>_count`` column for every configured category that
            is not already part of ``COVARIATE_COLUMNS``. Covariate columns
            with no data are filled with ``0.0``. With no usable items an empty
            frame with the same columns is returned.
        """
        # Counts for custom categories are appended after the standard columns
        # rather than being computed and then discarded.
        extra_columns = list(
            dict.fromkeys(
                f"cov_{cat}_count"
                for cat in self.categories
                if f"cov_{cat}_count" not in COVARIATE_COLUMNS
            )
        )
        columns = [timestamp_column] + COVARIATE_COLUMNS + extra_columns

        rows: List[Dict[str, Any]] = []
        for item in news or []:
            if not isinstance(item, dict):
                continue
            title = item.get(title_key) or item.get("text") or item.get("headline") or ""
            date_value = item.get(date_key) or item.get(timestamp_column)
            if not title or date_value is None:
                continue
            # An unparseable date becomes NaT and the item is skipped like any
            # other malformed item. NaT rows would otherwise vanish in groupby
            # and could leave a column-less frame that breaks sort_values.
            day = pd.to_datetime(date_value, errors="coerce")
            if day is pd.NaT:
                continue
            rows.append({timestamp_column: day.floor("D"), **self.extract(str(title))})

        if not rows:
            return pd.DataFrame(columns=columns)

        events = pd.DataFrame(rows)
        daily: List[Dict[str, Any]] = []
        for day, group in events.groupby(timestamp_column):
            record: Dict[str, Any] = {timestamp_column: day}

            category_counts = Counter(group["category"].tolist())
            for cat in self.categories:
                record[f"cov_{cat}_count"] = float(category_counts.get(cat, 0))

            sentiments = [int(x) for x in group["sentiment"].tolist()]
            record["cov_sentiment_pos_count"] = float(sum(s > 0 for s in sentiments))
            record["cov_sentiment_neg_count"] = float(sum(s < 0 for s in sentiments))
            record["cov_sentiment_neu_count"] = float(sum(s == 0 for s in sentiments))
            record["cov_news_count"] = float(len(group))
            record["cov_sentiment_mean"] = float(sum(sentiments) / max(1, len(sentiments)))
            record["cov_confidence_mean"] = float(group["confidence"].astype(float).mean())
            # Mean sentiment weighted by mean confidence.
            record["cov_event_score"] = float(
                record["cov_sentiment_mean"] * record["cov_confidence_mean"]
            )
            daily.append(record)

        result = pd.DataFrame(daily).sort_values(timestamp_column)
        # Fixed column order; categories that are not configured get 0.0.
        return result.reindex(columns=columns, fill_value=0.0)

    def _parse_json(self, raw: str) -> Optional[Dict[str, Any]]:
        """Return the first JSON object found in ``raw``, or ``None``.

        The whole (stripped) text is tried first. Otherwise every ``{`` is
        tried in turn as the start of an object, so surrounding prose, trailing
        continuation text, nested objects and stray brace groups ahead of the
        real payload are all tolerated.
        """
        if not raw:
            return None
        text = raw.strip()
        try:
            obj = json.loads(text)
            if isinstance(obj, dict):
                return obj
        except ValueError:
            pass

        decoder = json.JSONDecoder()
        start = text.find("{")
        while start != -1:
            try:
                # raw_decode parses one complete value and ignores what follows it.
                candidate, _ = decoder.raw_decode(text, start)
            except ValueError:
                candidate = None
            if isinstance(candidate, dict):
                return candidate
            start = text.find("{", start + 1)
        return None

    def _normalize(self, obj: Dict[str, Any], raw_text: str, source: str) -> "EventResult":
        """Coerce a parsed model answer into a well-formed ``EventResult``.

        Unknown categories become ``"other"``; sentiment is truncated to an
        integer and clamped to [-1, 1] (0 when unusable); confidence is clamped
        to [0.0, 1.0] (0.5 when missing, unusable or NaN).
        """
        category = str(obj.get("category", "other")).strip().lower()
        if category not in self.categories:
            category = "other"

        try:
            sentiment = int(float(obj.get("sentiment", 0)))
        except (TypeError, ValueError, OverflowError):
            # Covers non-numeric values as well as NaN / infinity.
            sentiment = 0
        sentiment = max(-1, min(1, sentiment))

        try:
            confidence = float(obj.get("confidence", 0.5))
        except (TypeError, ValueError, OverflowError):
            confidence = 0.5
        if confidence != confidence:
            # NaN (json accepts the bare literal) would pass through min/max
            # as 1.0; treat it as "unknown" instead.
            confidence = 0.5
        confidence = max(0.0, min(1.0, confidence))

        return EventResult(
            category=category,
            sentiment=sentiment,
            confidence=confidence,
            source=source,
            raw_text=raw_text,
        )

    def _keyword_fallback(self, title: str) -> "EventResult":
        """Label a headline with fixed keyword rules (no model required).

        The category is decided by the first matching rule in
        ``_CATEGORY_KEYWORDS``; sentiment is the sign of (positive keyword
        hits - negative keyword hits). Confidence is 0.65 when either a
        category or a sentiment was detected and 0.45 otherwise.
        """
        t = str(title or "").lower()

        category = "other"
        for label, keywords in self._CATEGORY_KEYWORDS:
            if any(k in t for k in keywords):
                category = label
                break

        score = sum(k in t for k in self._POSITIVE_KEYWORDS) - sum(
            k in t for k in self._NEGATIVE_KEYWORDS
        )
        sentiment = 1 if score > 0 else (-1 if score < 0 else 0)
        confidence = 0.65 if category != "other" or sentiment != 0 else 0.45
        return EventResult(
            category=category,
            sentiment=sentiment,
            confidence=confidence,
            source="keyword",
            raw_text=title,
        )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|