HONGRIZON committed on
Commit
3ddf519
·
verified ·
1 Parent(s): 9733c0c

Delete event_extractor.py

Browse files
Files changed (1) hide show
  1. event_extractor.py +0 -206
event_extractor.py DELETED
@@ -1,206 +0,0 @@
from __future__ import annotations

import json
import logging
import math
import re
from collections import Counter
from dataclasses import dataclass
from typing import Any, Callable, Dict, Iterable, List, Optional

import pandas as pd


# Event categories a news title can be assigned to. "other" is the catch-all
# used whenever a predicted label is not in the active category list.
DEFAULT_CATEGORIES = [
    "earnings",
    "product",
    "macro",
    "regulation",
    "supply_chain",
    "competition",
    "other",
]

# Fixed, ordered set of covariate columns returned by
# EventExtractor.aggregate_to_daily: one count per default category, sentiment
# counts, and per-day summary statistics.
COVARIATE_COLUMNS = [
    "cov_earnings_count",
    "cov_product_count",
    "cov_macro_count",
    "cov_regulation_count",
    "cov_supply_chain_count",
    "cov_competition_count",
    "cov_other_count",
    "cov_sentiment_pos_count",
    "cov_sentiment_neg_count",
    "cov_sentiment_neu_count",
    "cov_news_count",
    "cov_sentiment_mean",
    "cov_confidence_mean",
    "cov_event_score",
]


@dataclass
class EventResult:
    """Single event label extracted from one news title."""

    category: str  # event category label
    sentiment: int  # sentiment label; the extractor clamps it to -1, 0 or 1
    confidence: float  # confidence score; the extractor clamps it to [0.0, 1.0]
    source: str  # which path produced the label (e.g. "polyglot-ko", "keyword")
    raw_text: str = ""  # raw model output or the original title

    def to_dict(self) -> Dict[str, Any]:
        """Return the result as a plain dict with numeric fields coerced.

        ``sentiment`` is converted with ``int`` and ``confidence`` with
        ``float`` so the dict holds built-in numeric types regardless of what
        was stored on the instance.
        """
        return {
            "category": self.category,
            "sentiment": int(self.sentiment),
            "confidence": float(self.confidence),
            "source": self.source,
            "raw_text": self.raw_text,
        }


class EventExtractor:
    """Convert Korean financial news titles into event labels and daily covariates.

    The preferred path asks Polyglot-Ko to emit a small JSON object. Because
    Polyglot-Ko-1.3B is a base LM rather than a dedicated instruction-tuned JSON
    extractor, this class includes a deterministic keyword fallback for classroom
    stability and CPU-only smoke tests.
    """

    # Ordered keyword rules for the fallback classifier: the first category
    # whose keyword list hits the (lower-cased) title wins.
    _CATEGORY_KEYWORDS = (
        ("earnings", ("실적", "매출", "영업이익", "순이익", "어닝", "흑자", "적자")),
        ("product", ("신제품", "출시", "양산", "개발", "특허", "수주", "계약")),
        ("macro", ("금리", "환율", "물가", "경기", "수출", "고용", "fed", "fomc", "인플레이션")),
        ("regulation", ("규제", "과징금", "소송", "정부", "공정위", "금감원", "제재", "법안")),
        ("supply_chain", ("공급", "공장", "파업", "재고", "물류", "원자재", "반도체", "hbm")),
        ("competition", ("경쟁", "점유율", "추격", "라이벌", "중국", "미국", "엔비디아", "sk하이닉스")),
    )
    _POSITIVE_KEYWORDS = (
        "상승", "호조", "개선", "수혜", "확대", "증가", "돌파", "계약", "수주", "흑자", "최대", "출시",
    )
    _NEGATIVE_KEYWORDS = (
        "하락", "부진", "둔화", "우려", "감소", "적자", "철회", "급락", "리콜", "과징금", "악화", "파업",
    )

    def __init__(
        self,
        generate_fn: Optional[Callable[[str], str]] = None,
        categories: Optional[List[str]] = None,
        use_llm: bool = True,
    ) -> None:
        """Create an extractor.

        Args:
            generate_fn: Callable mapping a prompt string to raw model text.
                When ``None`` only the keyword fallback is used.
            categories: Allowed category labels; defaults to
                ``DEFAULT_CATEGORIES`` when omitted or empty.
            use_llm: Set to ``False`` to skip ``generate_fn`` entirely.
        """
        self.generate_fn = generate_fn
        # Copy the list so mutating ``self.categories`` can never alter the
        # caller's list or the shared module-level default.
        self.categories = list(categories) if categories else list(DEFAULT_CATEGORIES)
        self.use_llm = bool(use_llm)

    def build_prompt(self, title: str) -> str:
        """Return the Korean instruction prompt asking for a JSON label of ``title``."""
        cats = ", ".join(self.categories)
        return (
            "다음 한국어 금융뉴스 제목을 분석해 JSON만 출력하세요.\n"
            f"가능한 category: {cats}\n"
            "sentiment는 주가 관점에서 -1, 0, 1 중 하나입니다.\n"
            "confidence는 0.0~1.0 숫자입니다.\n"
            "출력 형식: {\"category\":\"...\",\"sentiment\":0,\"confidence\":0.5}\n"
            f"뉴스 제목: {title}\n"
            "JSON:"
        )

    def extract(self, title: str) -> Dict[str, Any]:
        """Label a single title and return the result as a dict.

        The LLM path is tried first when enabled and a ``generate_fn`` is set.
        Any failure there (exception, or output with no JSON object) falls back
        to the keyword classifier, so this method does not raise because of the
        model.

        Returns:
            Dict with keys ``category``, ``sentiment``, ``confidence``,
            ``source`` and ``raw_text``.
        """
        title = str(title or "").strip()
        if self.use_llm and self.generate_fn is not None:
            prompt = self.build_prompt(title)
            try:
                raw = self.generate_fn(prompt)
                parsed = self._parse_json(raw)
                if parsed is not None:
                    return self._normalize(parsed, raw_text=raw, source="polyglot-ko").to_dict()
            except Exception:
                # Deliberately best-effort so classroom demos keep running, but
                # leave a debug-level trace instead of hiding the failure.
                logging.getLogger(__name__).debug(
                    "LLM event extraction failed; using keyword fallback",
                    exc_info=True,
                )
        return self._keyword_fallback(title).to_dict()

    def aggregate_to_daily(
        self,
        news: Iterable[Dict[str, Any]],
        date_key: str = "date",
        title_key: str = "title",
        timestamp_column: str = "timestamp",
    ) -> pd.DataFrame:
        """Label every news item and aggregate the labels into one row per day.

        Items that are not dicts, have no title (``title_key``, ``"text"`` or
        ``"headline"``), have no date (``date_key`` or ``timestamp_column``),
        or whose date cannot be parsed are skipped.

        Args:
            news: Iterable of news dicts.
            date_key: Preferred key holding the item date.
            title_key: Preferred key holding the item title.
            timestamp_column: Name of the date column in the output (also used
                as a secondary date key on the input).

        Returns:
            DataFrame with ``timestamp_column`` followed by ``COVARIATE_COLUMNS``,
            sorted by day. Counts for categories outside ``COVARIATE_COLUMNS``
            (custom ``categories``) are computed but not part of the returned
            columns.
        """
        columns = [timestamp_column] + COVARIATE_COLUMNS

        rows: List[Dict[str, Any]] = []
        for item in news or []:
            if not isinstance(item, dict):
                continue
            title = item.get(title_key) or item.get("text") or item.get("headline") or ""
            date_value = item.get(date_key) or item.get(timestamp_column)
            if not title or date_value is None:
                continue
            # Validate the date before labelling so one malformed value neither
            # aborts the whole aggregation nor costs a model call.
            stamp = pd.to_datetime(date_value, errors="coerce")
            if not isinstance(stamp, pd.Timestamp) or pd.isna(stamp):
                continue
            event = self.extract(str(title))
            rows.append({timestamp_column: stamp.floor("D"), **event})

        if not rows:
            return pd.DataFrame(columns=columns)

        df = pd.DataFrame(rows)
        daily: List[Dict[str, Any]] = []
        for day, group in df.groupby(timestamp_column):
            out: Dict[str, Any] = {timestamp_column: day}

            category_counts = Counter(group["category"].tolist())
            for cat in self.categories:
                out[f"cov_{cat}_count"] = float(category_counts.get(cat, 0))

            sentiments = [int(x) for x in group["sentiment"].tolist()]
            out["cov_sentiment_pos_count"] = float(sum(s > 0 for s in sentiments))
            out["cov_sentiment_neg_count"] = float(sum(s < 0 for s in sentiments))
            out["cov_sentiment_neu_count"] = float(sum(s == 0 for s in sentiments))
            out["cov_news_count"] = float(len(group))
            out["cov_sentiment_mean"] = float(sum(sentiments) / max(1, len(sentiments)))
            out["cov_confidence_mean"] = float(group["confidence"].astype(float).mean())
            # Signed daily signal: mean sentiment weighted by mean confidence.
            out["cov_event_score"] = float(out["cov_sentiment_mean"] * out["cov_confidence_mean"])
            daily.append(out)

        result = pd.DataFrame(daily).sort_values(timestamp_column)
        # Categories missing from ``self.categories`` still get a zero column
        # so the output schema is stable.
        for col in COVARIATE_COLUMNS:
            if col not in result.columns:
                result[col] = 0.0
        return result[columns]

    def _parse_json(self, raw: str) -> Optional[Dict[str, Any]]:
        """Return the first JSON object found in ``raw``, or ``None``.

        Every ``{`` is tried as the start of an object using
        ``JSONDecoder.raw_decode``, which handles nested objects, braces inside
        strings, and surrounding non-JSON text. A candidate that fails to decode
        does not stop the search.
        """
        if not raw or not isinstance(raw, str):
            return None
        text = raw.strip()
        decoder = json.JSONDecoder()
        start = text.find("{")
        while start != -1:
            try:
                obj, _ = decoder.raw_decode(text, start)
            except ValueError:
                obj = None
            if isinstance(obj, dict):
                return obj
            start = text.find("{", start + 1)
        return None

    @staticmethod
    def _coerce_sentiment(value: Any) -> int:
        """Coerce ``value`` to a sentiment in {-1, 0, 1}; invalid input gives 0."""
        try:
            number = float(value)
        except (TypeError, ValueError, OverflowError):
            return 0
        if not math.isfinite(number):
            return 0
        # Truncate toward zero, then clamp.
        return max(-1, min(1, int(number)))

    @staticmethod
    def _coerce_confidence(value: Any) -> float:
        """Coerce ``value`` to a confidence in [0.0, 1.0]; invalid input gives 0.5."""
        try:
            number = float(value)
        except (TypeError, ValueError, OverflowError):
            return 0.5
        # NaN must be rejected explicitly: min(1.0, nan) evaluates to 1.0, which
        # would turn an undefined confidence into full confidence.
        if math.isnan(number):
            return 0.5
        return max(0.0, min(1.0, number))

    def _normalize(self, obj: Dict[str, Any], raw_text: str, source: str) -> EventResult:
        """Validate a parsed model dict and turn it into an ``EventResult``.

        Unknown categories become ``"other"``; sentiment and confidence are
        coerced and clamped, falling back to 0 and 0.5 on invalid values.
        """
        category = str(obj.get("category", "other")).strip().lower()
        if category not in self.categories:
            category = "other"
        return EventResult(
            category=category,
            sentiment=self._coerce_sentiment(obj.get("sentiment", 0)),
            confidence=self._coerce_confidence(obj.get("confidence", 0.5)),
            source=source,
            raw_text=raw_text,
        )

    def _keyword_fallback(self, title: str) -> EventResult:
        """Classify ``title`` with fixed keyword lists (no model required).

        The category is the first rule in ``_CATEGORY_KEYWORDS`` with a keyword
        contained in the lower-cased title, else ``"other"``. Sentiment is the
        sign of (positive keyword hits - negative keyword hits). Confidence is
        0.65 when either a category or a non-neutral sentiment was found,
        otherwise 0.45.
        """
        t = str(title or "").lower()

        category = next(
            (
                name
                for name, keywords in self._CATEGORY_KEYWORDS
                if any(k in t for k in keywords)
            ),
            "other",
        )

        score = sum(k in t for k in self._POSITIVE_KEYWORDS) - sum(
            k in t for k in self._NEGATIVE_KEYWORDS
        )
        sentiment = 1 if score > 0 else (-1 if score < 0 else 0)
        confidence = 0.65 if category != "other" or sentiment != 0 else 0.45
        return EventResult(
            category=category,
            sentiment=sentiment,
            confidence=confidence,
            source="keyword",
            raw_text=title,
        )