"""
VDash ๋‰ด์Šค ๋ชจ๋“ˆ v2
- AI Times + Hacker News ํฌ๋กค๋ง
- HN ์˜๋ฌธ ์ œ๋ชฉ โ†’ ํ•œ๊ธ€ ์ž๋™ ๋ฒˆ์—ญ
- ๋น„๋“œ๋ž˜ํ”„ํŠธ ๊ด€์  ์ž๋™ ๋ถ„๋ฅ˜
- HF Dataset ์˜๊ตฌ ์ €์žฅ
"""
import requests, json, re, time, os, tempfile
from datetime import datetime, timedelta
from bs4 import BeautifulSoup
from huggingface_hub import HfApi, hf_hub_download
try:
from deep_translator import GoogleTranslator
translator = GoogleTranslator(source='en', target='ko')
HAS_TRANSLATOR = True
except Exception:
HAS_TRANSLATOR = False
print("[NEWS] deep-translator not available, HN titles will stay English")
HF_TOKEN = os.getenv("HF_TOKEN")
SPACE_ID = os.getenv("SPACE_ID", "")
OWNER = SPACE_ID.split("/")[0] if SPACE_ID else "vidraft"
DATASET_REPO = os.getenv("DATASET_REPO", f"{OWNER}/vidraft-dashboard-data")
NEWS_FILE = "news.json"
hf_api = HfApi(token=HF_TOKEN)
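# Classification rules: (keyword list, tag label, badge hex color).
# Keywords are matched case-insensitively against "title + source" in
# classify_news; Korean keywords mainly hit AI Times headlines, English
# ones mainly hit Hacker News titles.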
TAG_RULES = [
(["์ •๋ถ€","๊ณผ์ œ","๊ณต๋ชจ","์ง€์›์‚ฌ์—…","IITP","NIA","NIPA","๊ตญ์ฑ…","government","grant"], "๐Ÿ›๏ธ ์ •๋ถ€๊ณผ์ œ", "#3b82f6"),
(["ํˆฌ์ž","ํŽ€๋”ฉ","์‹œ๋ฆฌ์ฆˆ","VC","IPO","์ธ์ˆ˜","M&A","๋ฐธ๋ฅ˜์—์ด์…˜","funding","investment","acquisition"], "๐Ÿ’ฐ ํˆฌ์ž/IR", "#f59e0b"),
(["์˜์ƒ","๋น„๋””์˜ค","video","์ƒ์„ฑ","sora","gen-","๋™์˜์ƒ","์ด๋ฏธ์ง€์ƒ์„ฑ"], "๐ŸŽฌ ์˜์ƒAI", "#ef4444"),
(["ํ•œ๊ตญ์–ด","korean","multilingual","๋ฒˆ์—ญ","๋‹ค๊ตญ์–ด"], "๐Ÿ‡ฐ๐Ÿ‡ท ํ•œ๊ตญ์–ดAI", "#8b5cf6"),
(["ํ—ˆ๊น…ํŽ˜์ด์Šค","hugging","HF","spaces","์˜คํ”ˆ์†Œ์Šค","open source","github"], "๐Ÿค— HF/์˜คํ”ˆ์†Œ์Šค", "#10b981"),
(["LLM","GPT","Claude","Gemini","๊ฑฐ๋Œ€์–ธ์–ด","ํŒŒ์ธํŠœ๋‹","RAG","์—์ด์ „ํŠธ","agent","transformer","llama","mistral"], "๐Ÿง  LLM/์—์ด์ „ํŠธ", "#6366f1"),
(["GPU","์นฉ","๋ฐ˜๋„์ฒด","์—”๋น„๋””์•„","NVIDIA","์ธํ”„๋ผ","์„œ๋ฒ„","ํด๋ผ์šฐ๋“œ","๋ฐ์ดํ„ฐ์„ผํ„ฐ","chip","server","cloud"], "๐Ÿ–ฅ๏ธ ์ธํ”„๋ผ/GPU", "#0d9488"),
(["๋ณด์•ˆ","๊ฐœ์ธ์ •๋ณด","๊ทœ์ œ","๋ฒ•์•ˆ","์œค๋ฆฌ","์•ˆ์ „","์ €์ž‘๊ถŒ","AI๋ฒ•","regulation","safety","privacy"], "๐Ÿ”’ ๊ทœ์ œ/์œค๋ฆฌ", "#dc2626"),
(["์Šคํƒ€ํŠธ์—…","์ฐฝ์—…","์‚ฌ์—…","์ œํœด","ํŒŒํŠธ๋„ˆ","๊ณ„์•ฝ","๋งค์ถœ","startup","business","revenue"], "๐Ÿ’ผ ๋น„์ฆˆ๋‹ˆ์Šค", "#ea580c"),
(["๊ต์œก","ํ•™์Šต","์—ฐ๊ตฌ","๋…ผ๋ฌธ","arXiv","๋ฒค์น˜๋งˆํฌ","์„ฑ๋Šฅ","ํ‰๊ฐ€","paper","research","benchmark"], "๐Ÿ“š R&D/์—ฐ๊ตฌ", "#059669"),
(["๋งˆ์ผ€ํŒ…","์ฝ˜ํ…์ธ ","SNS","๋ธŒ๋žœ๋”ฉ","ํ™๋ณด","PR","๋ฏธ๋””์–ด","marketing"], "๐Ÿ“ข ๋งˆ์ผ€ํŒ…/PR", "#9333ea"),
]
RELEVANCE_KW = {
"ํ•ต์‹ฌ": ["AI ์˜์ƒ","๋น„๋””์˜ค ์ƒ์„ฑ","ํ•œ๊ตญ์–ด","ํ—ˆ๊น…ํŽ˜์ด์Šค","์˜คํ”ˆ์†Œ์Šค","์—์ด์ „ํŠธ","LLM","์Šคํƒ€ํŠธ์—…","์ •๋ถ€๊ณผ์ œ","video generation","hugging face"],
"์ฃผ๋ชฉ": ["GPU","ํด๋ผ์šฐ๋“œ","์ธํ”„๋ผ","ํˆฌ์ž","์ƒ์„ฑAI","ํŒŒ์ธํŠœ๋‹","RAG","API","generative"],
"์ฐธ๊ณ ": ["๊ทœ์ œ","๊ต์œก","์—ฐ๊ตฌ","๋ณด์•ˆ","๋งˆ์ผ€ํŒ…","benchmark","safety"],
}
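# Relevance tiers, checked in priority order by classify_news:
# "핵심" = core, "주목" = notable, "참고" = reference; anything matching
# none of the keywords falls through to "일반" (general).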
def classify_news(title, source=""):
    """Assign category tags and a relevance tier based on keyword matches."""
    text = (title + " " + source).lower()
tags, colors = [], {}
for keywords, tag, color in TAG_RULES:
for kw in keywords:
if kw.lower() in text:
if tag not in tags:
tags.append(tag)
colors[tag] = color
break
if not tags:
tags.append("๐Ÿ“ฐ ์ผ๋ฐ˜AI๋‰ด์Šค")
colors["๐Ÿ“ฐ ์ผ๋ฐ˜AI๋‰ด์Šค"] = "#64748b"
relevance = "์ผ๋ฐ˜"
for level in ["ํ•ต์‹ฌ", "์ฃผ๋ชฉ", "์ฐธ๊ณ "]:
for kw in RELEVANCE_KW[level]:
if kw.lower() in text:
relevance = level
break
if relevance != "์ผ๋ฐ˜":
break
return {"tags": tags, "colors": colors, "relevance": relevance}
def translate_to_korean(text):
    """Translate English text into Korean via Google Translate."""
    if not text or not HAS_TRANSLATOR:
        return text
    # Skip text that already contains Hangul
    if re.search(r'[가-힣]', text):
        return text
    try:
        result = translator.translate(text)
        return result if result else text
    except Exception as e:
        print(f"    translation failed: {e}")
    return text
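# e.g. translate_to_korean("AI model released") returns the Korean rendering
# when deep-translator is installed, or the input unchanged otherwise.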
def gen_summary(title):
    """Fallback one-line summary: the title itself, truncated to 80 characters."""
    t = title.strip()
    return t[:80] + "..." if len(t) > 80 else t
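# Desktop browser User-Agent for scraping; news sites often block the
# default requests UA.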
UA = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
def fetch_aitimes(max_items=20):
    print("📰 Collecting AI Times...")
urls = [
"https://www.aitimes.com/news/articleList.html?sc_multi_code=S2&view_type=sm",
"https://www.aitimes.com/news/articleList.html?sc_section_code=S1N24&view_type=sm",
]
all_news = []
today = datetime.now().strftime("%m-%d")
yesterday = (datetime.now() - timedelta(days=1)).strftime("%m-%d")
for url in urls:
try:
r = requests.get(url, timeout=15, headers={"User-Agent": UA})
r.raise_for_status()
r.encoding = "utf-8"
soup = BeautifulSoup(r.text, "html.parser")
for tag in soup.find_all("a", href=re.compile(r"/news/articleView\.html\?idxno=\d+")):
title = tag.get_text(strip=True)
link = tag.get("href", "")
if not title or len(title) < 10:
continue
if link and not link.startswith("http"):
link = "https://www.aitimes.com" + link
                date_text = today
                if tag.parent:
                    m = re.search(r"(\d{2}-\d{2}\s+\d{2}:\d{2})", tag.parent.get_text())
                    if m:
                        date_text = m.group(1)
                # Keep only articles dated today or yesterday
                if today not in date_text and yesterday not in date_text:
                    continue
cls = classify_news(title, "AI Times")
all_news.append({"title": title, "url": link, "date": date_text, "source": "AI Times", "summary": gen_summary(title), **cls})
time.sleep(0.5)
except Exception as e:
print(f" โš ๏ธ AI Times: {e}")
    # De-duplicate by URL, preserving order (set.add returns None, so the
    # `not seen.add(...)` idiom records each URL as a side effect)
    seen = set()
    unique = [n for n in all_news if n["url"] not in seen and not seen.add(n["url"])]
    print(f"  ✅ AI Times: {len(unique)} items")
return unique[:max_items]
def fetch_hackernews(limit=15):
    print("🔥 Collecting Hacker News...")
news = []
try:
        r = requests.get("https://hacker-news.firebaseio.com/v0/topstories.json", timeout=10)
        # Over-fetch IDs: many stories are filtered out by type/url/age below
        ids = r.json()[:limit * 3]
        # Skip stories older than 36 hours
        cutoff = datetime.utcnow() - timedelta(hours=36)
for sid in ids:
if len(news) >= limit:
break
try:
sr = requests.get(f"https://hacker-news.firebaseio.com/v0/item/{sid}.json", timeout=5)
s = sr.json()
if s.get("type") != "story" or not s.get("url"):
continue
st = datetime.utcfromtimestamp(s.get("time", 0))
if st < cutoff:
continue
                title_en = s.get("title", "")
                # Translate the title into Korean for display
                title_ko = translate_to_korean(title_en)
                # Classify against both languages so Korean and English keywords match
                cls = classify_news(title_en + " " + title_ko, "Hacker News")
news.append({
"title": title_ko,
"title_en": title_en,
"url": s["url"],
"date": st.strftime("%m-%d %H:%M"),
"source": "Hacker News",
"summary": gen_summary(title_ko),
"score": s.get("score", 0),
**cls,
})
time.sleep(0.2)
except Exception:
continue
print(f" โœ… HN {len(news)}๊ฑด (ํ•œ๊ธ€ ๋ฒˆ์—ญ ์™„๋ฃŒ)")
except Exception as e:
print(f" โš ๏ธ HN: {e}")
return news
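# Each collected item is a dict shaped roughly like:
#   {"title", "url", "date" ("MM-DD HH:MM"), "source", "summary",
#    "tags", "colors", "relevance"}
# HN items additionally carry "title_en" and "score".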
def load_news_from_hf():
    """Load the cached news list from the HF Dataset; return [] on any failure."""
    try:
        # force_download=True bypasses the local hub cache so updates are picked up
        path = hf_hub_download(repo_id=DATASET_REPO, filename=NEWS_FILE, repo_type="dataset", token=HF_TOKEN, force_download=True)
with open(path, "r", encoding="utf-8") as f:
return json.load(f)
except Exception:
return []
def save_news_to_hf(news_list):
    """Persist the news list as JSON to the HF Dataset repo."""
    try:
tmp = os.path.join(tempfile.gettempdir(), NEWS_FILE)
with open(tmp, "w", encoding="utf-8") as f:
json.dump(news_list, f, ensure_ascii=False, indent=2)
hf_api.upload_file(path_or_fileobj=tmp, path_in_repo=NEWS_FILE, repo_id=DATASET_REPO, repo_type="dataset", token=HF_TOKEN)
except Exception as e:
print(f"[ERROR] News save: {e}")
def collect_news(force=False):
    """Return cached news when fresh (under 6 hours old); otherwise re-collect."""
    if not force:
        cached = load_news_from_hf()
        if cached:
            try:
                last = cached[0].get("collected_at", "")
                # Reuse the cache while it is younger than 6 hours (21600 s)
                if last and (datetime.now() - datetime.fromisoformat(last)).total_seconds() < 21600:
                    return cached
except Exception:
pass
print("\n[NEWS] Collecting fresh news...")
now_iso = datetime.now().isoformat()
all_news = fetch_aitimes(20) + fetch_hackernews(15)
for n in all_news:
n["collected_at"] = now_iso
    # Show higher-relevance stories first: 핵심 (core), 주목 (notable), 참고 (reference), 일반 (general)
    order = {"핵심": 0, "주목": 1, "참고": 2, "일반": 3}
    all_news.sort(key=lambda x: order.get(x.get("relevance", "일반"), 3))
if HF_TOKEN and all_news:
save_news_to_hf(all_news)
return all_news
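# Minimal local smoke test (a sketch; needs network access; persistence to the
# HF Dataset is skipped automatically when HF_TOKEN is unset):
if __name__ == "__main__":
    items = collect_news(force=True)
    print(f"collected {len(items)} items")
    for item in items[:3]:
        print(item["relevance"], "|", ", ".join(item["tags"]), "|", item["title"])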