HONGRIZON
/

JNU-TSB

@@ -1,206 +0,0 @@
-from __future__ import annotations
-import json
-import re
-from collections import Counter
-from dataclasses import dataclass
-from typing import Any, Callable, Dict, Iterable, List, Optional
-import pandas as pd
# Event categories a headline can be assigned to; "other" is the catch-all
# bucket used when no more specific category applies.
DEFAULT_CATEGORIES = [
    "earnings", "product", "macro", "regulation",
    "supply_chain", "competition", "other",
]

# Daily covariate columns, in output order: one count per default category,
# then the three sentiment-polarity counts, then the per-day summary values.
COVARIATE_COLUMNS = (
    [f"cov_{name}_count" for name in DEFAULT_CATEGORIES]
    + [f"cov_sentiment_{polarity}_count" for polarity in ("pos", "neg", "neu")]
    + [
        "cov_news_count",
        "cov_sentiment_mean",
        "cov_confidence_mean",
        "cov_event_score",
    ]
)
@dataclass
class EventResult:
    """Label produced for a single news title.

    Holds the assigned category, a sentiment value, a confidence value, the
    name of the path that produced the label (``source``) and the raw text
    the label was derived from.
    """

    category: str
    sentiment: int
    confidence: float
    source: str
    raw_text: str = ""

    def to_dict(self) -> Dict[str, Any]:
        """Return the fields as a plain dict.

        ``sentiment`` is coerced with ``int()`` and ``confidence`` with
        ``float()``; the remaining fields are passed through unchanged.
        """
        payload: Dict[str, Any] = dict(
            category=self.category,
            sentiment=int(self.sentiment),
            confidence=float(self.confidence),
            source=self.source,
            raw_text=self.raw_text,
        )
        return payload
class EventExtractor:
    """Convert Korean financial news titles into event labels and daily covariates.

    The preferred path asks Polyglot-Ko to emit a small JSON object. Because
    Polyglot-Ko-1.3B is a base LM rather than a dedicated instruction-tuned JSON
    extractor, this class includes a deterministic keyword fallback for classroom
    stability and CPU-only smoke tests.
    """

    # Ordered (category, keywords) table for the keyword fallback: the first
    # category with any keyword present in the lower-cased title wins.
    _CATEGORY_KEYWORDS = (
        ("earnings", ("실적", "매출", "영업이익", "순이익", "어닝", "흑자", "적자")),
        ("product", ("신제품", "출시", "양산", "개발", "특허", "수주", "계약")),
        ("macro", ("금리", "환율", "물가", "경기", "수출", "고용", "fed", "fomc", "인플레이션")),
        ("regulation", ("규제", "과징금", "소송", "정부", "공정위", "금감원", "제재", "법안")),
        ("supply_chain", ("공급", "공장", "파업", "재고", "물류", "원자재", "반도체", "hbm")),
        ("competition", ("경쟁", "점유율", "추격", "라이벌", "중국", "미국", "엔비디아", "sk하이닉스")),
    )
    # Keywords that add to / subtract from the fallback sentiment score.
    _POSITIVE_KEYWORDS = (
        "상승", "호조", "개선", "수혜", "확대", "증가", "돌파", "계약", "수주", "흑자", "최대", "출시",
    )
    _NEGATIVE_KEYWORDS = (
        "하락", "부진", "둔화", "우려", "감소", "적자", "철회", "급락", "리콜", "과징금", "악화", "파업",
    )

    def __init__(
        self,
        generate_fn: Optional[Callable[[str], str]] = None,
        categories: Optional[List[str]] = None,
        use_llm: bool = True,
    ) -> None:
        """Configure the extractor.

        Args:
            generate_fn: Callable that takes a prompt and returns generated
                text. When ``None`` only the keyword fallback is used.
            categories: Allowed category labels; defaults to
                ``DEFAULT_CATEGORIES`` when omitted or empty.
            use_llm: When false, ``generate_fn`` is never called.
        """
        self.generate_fn = generate_fn
        # Copy so that mutating ``self.categories`` never alters the shared
        # module-level default or the caller's own list.
        self.categories = list(categories) if categories else list(DEFAULT_CATEGORIES)
        self.use_llm = bool(use_llm)

    def build_prompt(self, title: str) -> str:
        """Return the Korean instruction prompt asking for a JSON label of *title*."""
        cats = ", ".join(self.categories)
        return (
            "다음 한국어 금융뉴스 제목을 분석해 JSON만 출력하세요.\n"
            f"가능한 category: {cats}\n"
            "sentiment는 주가 관점에서 -1, 0, 1 중 하나입니다.\n"
            "confidence는 0.0~1.0 숫자입니다.\n"
            "출력 형식: {\"category\":\"...\",\"sentiment\":0,\"confidence\":0.5}\n"
            f"뉴스 제목: {title}\n"
            "JSON:"
        )

    def extract(self, title: str) -> Dict[str, Any]:
        """Label a single title and return the result as a dict.

        The LLM path is tried first when enabled and a ``generate_fn`` is
        set; if it raises or yields no parseable JSON object, the keyword
        fallback is used instead.

        Args:
            title: News title; falsy values are treated as an empty string.

        Returns:
            The ``EventResult.to_dict()`` mapping for the title.
        """
        title = str(title or "").strip()
        if self.use_llm and self.generate_fn is not None:
            prompt = self.build_prompt(title)
            try:
                raw = self.generate_fn(prompt)
                completion = raw
                # Some generators return prompt + continuation. The prompt
                # contains a JSON format example, which would otherwise be
                # the first object found, so parse only the continuation.
                if isinstance(raw, str) and raw.startswith(prompt):
                    completion = raw[len(prompt):]
                parsed = self._parse_json(completion)
                if parsed is not None:
                    return self._normalize(parsed, raw_text=raw, source="polyglot-ko").to_dict()
            except Exception:
                # Best-effort by design: keep demos running, but leave a
                # trace at DEBUG level instead of discarding the error.
                import logging

                logging.getLogger(__name__).debug(
                    "LLM event extraction failed; using keyword fallback",
                    exc_info=True,
                )
        return self._keyword_fallback(title).to_dict()

    def aggregate_to_daily(
        self,
        news: Iterable[Dict[str, Any]],
        date_key: str = "date",
        title_key: str = "title",
        timestamp_column: str = "timestamp",
    ) -> pd.DataFrame:
        """Label every news item and aggregate the labels per calendar day.

        Items that are not dicts, lack a title, lack a date, or carry a date
        that cannot be parsed are skipped.

        Args:
            news: Iterable of dict records.
            date_key: Key holding the item's date; ``timestamp_column`` is
                tried when it is missing or falsy.
            title_key: Key holding the title; ``"text"`` and ``"headline"``
                are tried as alternatives.
            timestamp_column: Name of the date column in the output.

        Returns:
            A DataFrame with ``timestamp_column`` followed by
            ``COVARIATE_COLUMNS``, one row per day, sorted by day. Empty
            (columns only) when no usable item was found.
        """
        rows: List[Dict[str, Any]] = []
        for item in news or []:
            if not isinstance(item, dict):
                continue
            title = item.get(title_key) or item.get("text") or item.get("headline") or ""
            date_value = item.get(date_key) or item.get(timestamp_column)
            if not title or date_value is None:
                continue
            # One malformed date must not abort the whole aggregation.
            try:
                stamp = pd.to_datetime(date_value, errors="coerce")
            except (ValueError, TypeError, OverflowError):
                continue
            if not isinstance(stamp, pd.Timestamp) or pd.isna(stamp):
                continue
            event = self.extract(str(title))
            rows.append({timestamp_column: stamp.floor("D"), **event})
        if not rows:
            return pd.DataFrame(columns=[timestamp_column] + COVARIATE_COLUMNS)

        frame = pd.DataFrame(rows)
        daily: List[Dict[str, Any]] = []
        for day, group in frame.groupby(timestamp_column):
            out: Dict[str, Any] = {timestamp_column: day}
            category_counts = Counter(group["category"].tolist())
            for cat in self.categories:
                out[f"cov_{cat}_count"] = float(category_counts.get(cat, 0))
            sentiments = [int(x) for x in group["sentiment"].tolist()]
            out["cov_sentiment_pos_count"] = float(sum(s > 0 for s in sentiments))
            out["cov_sentiment_neg_count"] = float(sum(s < 0 for s in sentiments))
            out["cov_sentiment_neu_count"] = float(sum(s == 0 for s in sentiments))
            out["cov_news_count"] = float(len(group))
            out["cov_sentiment_mean"] = float(sum(sentiments) / max(1, len(sentiments)))
            out["cov_confidence_mean"] = float(group["confidence"].astype(float).mean())
            out["cov_event_score"] = float(out["cov_sentiment_mean"] * out["cov_confidence_mean"])
            daily.append(out)

        result = pd.DataFrame(daily).sort_values(timestamp_column)
        # Columns for categories absent from ``self.categories`` still have
        # to exist so the output schema is stable.
        for col in COVARIATE_COLUMNS:
            if col not in result.columns:
                result[col] = 0.0
        return result[[timestamp_column] + COVARIATE_COLUMNS]

    def _parse_json(self, raw: str) -> Optional[Dict[str, Any]]:
        """Return the first JSON object found in *raw*, or ``None``.

        The whole text is tried first. Otherwise every ``{`` is tried as the
        start of an object using the JSON decoder itself, so nested objects
        and braces inside string values are handled, and an unparseable
        candidate does not stop the search.
        """
        if not raw:
            return None
        if isinstance(raw, (bytes, bytearray)):
            raw = raw.decode("utf-8", errors="replace")
        if not isinstance(raw, str):
            return None
        text = raw.strip()
        if not text:
            return None
        try:
            obj = json.loads(text)
        except (ValueError, RecursionError):
            obj = None
        if isinstance(obj, dict):
            return obj

        decoder = json.JSONDecoder()
        start = text.find("{")
        while start != -1:
            try:
                candidate, _end = decoder.raw_decode(text, start)
            except (ValueError, RecursionError):
                candidate = None
            if isinstance(candidate, dict):
                return candidate
            start = text.find("{", start + 1)
        return None

    def _normalize(self, obj: Dict[str, Any], raw_text: str, source: str) -> "EventResult":
        """Coerce a parsed JSON object into a valid ``EventResult``.

        Unknown categories become ``"other"``. Sentiment is truncated to an
        integer and clamped to [-1, 1]; confidence is clamped to [0.0, 1.0].
        Values that cannot be converted fall back to 0 and 0.5 respectively.
        """
        category = str(obj.get("category", "other")).strip().lower()
        if category not in self.categories:
            category = "other"
        try:
            sentiment = int(float(obj.get("sentiment", 0)))
        except (TypeError, ValueError, OverflowError):
            sentiment = 0
        sentiment = max(-1, min(1, sentiment))
        try:
            confidence = float(obj.get("confidence", 0.5))
        except (TypeError, ValueError, OverflowError):
            confidence = 0.5
        # NaN compares unequal to itself; without this guard the clamp below
        # would turn NaN into 1.0 because min(1.0, nan) returns 1.0.
        if confidence != confidence:
            confidence = 0.5
        confidence = max(0.0, min(1.0, confidence))
        return EventResult(
            category=category,
            sentiment=sentiment,
            confidence=confidence,
            source=source,
            raw_text=raw_text,
        )

    def _keyword_fallback(self, title: str) -> "EventResult":
        """Label *title* deterministically from keyword matches.

        The category is the first entry of the keyword table with a match
        (``"other"`` if none). Sentiment is the sign of positive minus
        negative keyword hits. Confidence is 0.65 when either a category or
        a non-zero sentiment was found, otherwise 0.45.
        """
        t = str(title or "").lower()
        category = next(
            (
                name
                for name, keywords in self._CATEGORY_KEYWORDS
                if any(k in t for k in keywords)
            ),
            "other",
        )
        score = sum(k in t for k in self._POSITIVE_KEYWORDS) - sum(
            k in t for k in self._NEGATIVE_KEYWORDS
        )
        sentiment = 1 if score > 0 else (-1 if score < 0 else 0)
        confidence = 0.65 if category != "other" or sentiment != 0 else 0.45
        return EventResult(
            category=category,
            sentiment=sentiment,
            confidence=confidence,
            source="keyword",
            raw_text=title,
        )