| """ |
| VDash ๋ด์ค ๋ชจ๋ v2 |
| - AI Times + Hacker News ํฌ๋กค๋ง |
| - HN ์๋ฌธ ์ ๋ชฉ โ ํ๊ธ ์๋ ๋ฒ์ญ |
| - ๋น๋๋ํํธ ๊ด์ ์๋ ๋ถ๋ฅ |
| - HF Dataset ์๊ตฌ ์ ์ฅ |
| """ |
|
|
| import requests, json, re, time, os, tempfile |
| from datetime import datetime, timedelta |
| from typing import List, Dict |
| from bs4 import BeautifulSoup |
| from huggingface_hub import HfApi, hf_hub_download |
|
|
# Optional dependency: deep-translator powers EN->KO title translation.
# When the import (or translator construction) fails, HN titles are left
# in English and HAS_TRANSLATOR records that fact for callers.
try:
    from deep_translator import GoogleTranslator
    translator = GoogleTranslator(source='en', target='ko')
except Exception:
    HAS_TRANSLATOR = False
    print("[NEWS] deep-translator not available, HN titles will stay English")
else:
    HAS_TRANSLATOR = True
|
|
| HF_TOKEN = os.getenv("HF_TOKEN") |
| SPACE_ID = os.getenv("SPACE_ID", "") |
| OWNER = SPACE_ID.split("/")[0] if SPACE_ID else "vidraft" |
| DATASET_REPO = os.getenv("DATASET_REPO", f"{OWNER}/vidraft-dashboard-data") |
| NEWS_FILE = "news.json" |
| hf_api = HfApi(token=HF_TOKEN) |
|
|
# Ordered classification table used by classify_news().
# Each entry is (keywords, tag label, hex badge color): if any keyword
# (case-insensitive substring match) appears in the headline text, the tag
# and its color are attached to the news item.
# NOTE(review): the Korean keywords in this copy are mojibake-damaged and
# some contain stray control characters from an earlier encoding mix-up —
# matching on those keywords may silently fail; re-encode as UTF-8.
TAG_RULES = [
    (["์ ๋ถ","๊ณผ์ ","๊ณต๋ชจ","์ง์์ฌ์
","IITP","NIA","NIPA","๊ตญ์ฑ
","government","grant"], "๐๏ธ ์ ๋ถ๊ณผ์ ", "#3b82f6"),
    (["ํฌ์","ํ๋ฉ","์๋ฆฌ์ฆ","VC","IPO","์ธ์","M&A","๋ฐธ๋ฅ์์ด์
","funding","investment","acquisition"], "๐ฐ ํฌ์/IR", "#f59e0b"),
    (["์์","๋น๋์ค","video","์์ฑ","sora","gen-","๋์์","์ด๋ฏธ์ง์์ฑ"], "๐ฌ ์์AI", "#ef4444"),
    (["ํ๊ตญ์ด","korean","multilingual","๋ฒ์ญ","๋ค๊ตญ์ด"], "๐ฐ๐ท ํ๊ตญ์ดAI", "#8b5cf6"),
    (["ํ๊น
ํ์ด์ค","hugging","HF","spaces","์คํ์์ค","open source","github"], "๐ค HF/์คํ์์ค", "#10b981"),
    (["LLM","GPT","Claude","Gemini","๊ฑฐ๋์ธ์ด","ํ์ธํ๋","RAG","์์ด์ ํธ","agent","transformer","llama","mistral"], "๐ง LLM/์์ด์ ํธ", "#6366f1"),
    (["GPU","์นฉ","๋ฐ๋์ฒด","์๋น๋์","NVIDIA","์ธํ๋ผ","์๋ฒ","ํด๋ผ์ฐ๋","๋ฐ์ดํฐ์ผํฐ","chip","server","cloud"], "๐ฅ๏ธ ์ธํ๋ผ/GPU", "#0d9488"),
    (["๋ณด์","๊ฐ์ธ์ ๋ณด","๊ท์ ","๋ฒ์","์ค๋ฆฌ","์์ ","์ ์๊ถ","AI๋ฒ","regulation","safety","privacy"], "๐ ๊ท์ /์ค๋ฆฌ", "#dc2626"),
    (["์คํํธ์
","์ฐฝ์
","์ฌ์
","์ ํด","ํํธ๋","๊ณ์ฝ","๋งค์ถ","startup","business","revenue"], "๐ผ ๋น์ฆ๋์ค", "#ea580c"),
    (["๊ต์ก","ํ์ต","์ฐ๊ตฌ","๋
ผ๋ฌธ","arXiv","๋ฒค์น๋งํฌ","์ฑ๋ฅ","ํ๊ฐ","paper","research","benchmark"], "๐ R&D/์ฐ๊ตฌ", "#059669"),
    (["๋ง์ผํ
","์ฝํ
์ธ ","SNS","๋ธ๋๋ฉ","ํ๋ณด","PR","๋ฏธ๋์ด","marketing"], "๐ข ๋ง์ผํ
/PR", "#9333ea"),
]
|
|
# Relevance buckets, checked highest-priority-first by classify_news():
# first key ("core") > second ("notable") > third ("reference"); headlines
# matching none of them fall through to the default bucket.
# NOTE(review): Korean keywords are mojibake-damaged in this copy (some
# include stray control characters) — re-encode as UTF-8 to restore matching.
RELEVANCE_KW = {
    "ํต์ฌ": ["AI ์์","๋น๋์ค ์์ฑ","ํ๊ตญ์ด","ํ๊น
ํ์ด์ค","์คํ์์ค","์์ด์ ํธ","LLM","์คํํธ์
","์ ๋ถ๊ณผ์ ","video generation","hugging face"],
    "์ฃผ๋ชฉ": ["GPU","ํด๋ผ์ฐ๋","์ธํ๋ผ","ํฌ์","์์ฑAI","ํ์ธํ๋","RAG","API","generative"],
    "์ฐธ๊ณ ": ["๊ท์ ","๊ต์ก","์ฐ๊ตฌ","๋ณด์","๋ง์ผํ
","benchmark","safety"],
}
|
|
|
|
def classify_news(title, source=""):
    """Tag a headline and rate its relevance.

    Args:
        title: headline text (Korean and/or English).
        source: optional source name, folded into the keyword search text.

    Returns:
        dict with "tags" (matched tag labels in TAG_RULES order, or the
        generic fallback tag when nothing matched), "colors" (tag -> hex
        color), and "relevance" (highest-priority RELEVANCE_KW bucket that
        matched, otherwise the default bucket).
    """
    haystack = f"{title} {source}".lower()
    tags, colors = [], {}
    # One pass over the rule table; a rule fires on its first keyword hit.
    for keywords, tag, color in TAG_RULES:
        if any(kw.lower() in haystack for kw in keywords) and tag not in tags:
            tags.append(tag)
            colors[tag] = color
    if not tags:
        # Nothing matched: attach the generic "general AI news" tag.
        tags.append("๐ฐ ์ผ๋ฐAI๋ด์ค")
        colors["๐ฐ ์ผ๋ฐAI๋ด์ค"] = "#64748b"
    # Relevance: stop at the first (highest-priority) bucket that matches.
    relevance = "์ผ๋ฐ"
    for level in ("ํต์ฌ", "์ฃผ๋ชฉ", "์ฐธ๊ณ "):
        if any(kw.lower() in haystack for kw in RELEVANCE_KW[level]):
            relevance = level
            break
    return {"tags": tags, "colors": colors, "relevance": relevance}
|
|
|
|
def translate_to_korean(text):
    """Translate English text to Korean, best-effort.

    Returns the input unchanged when it is empty/None, when no translator
    backend is available, or when the text already contains Hangul; on any
    translator error the original text is returned as a fallback.
    """
    # Short-circuits: nothing to do, or no backend installed.
    if not text or not HAS_TRANSLATOR:
        return text
    # Already contains Hangul -> assume it is Korean and skip translation.
    if re.search(r'[๊ฐ-ํฃ]', text):
        return text
    try:
        translated = translator.translate(text)
    except Exception as e:
        print(f" ๋ฒ์ญ ์คํจ: {e}")
        return text
    # Guard against an empty/None translator response.
    return translated or text
|
|
|
|
def gen_summary(title):
    """Return the stripped title, truncated to 80 chars with an ellipsis
    appended when it was longer."""
    text = title.strip()
    if len(text) > 80:
        return text[:80] + "..."
    return text
|
|
|
|
| UA = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36" |
|
|
|
|
def fetch_aitimes(max_items=20):
    """Scrape today's and yesterday's headlines from AI Times.

    Crawls two article-list pages, keeps article links whose surrounding
    text carries an MM-DD timestamp from today or yesterday, classifies
    each title via classify_news(), de-duplicates by URL, and returns at
    most ``max_items`` news dicts (title/url/date/source/summary plus the
    tags/colors/relevance keys from classify_news()).
    """
    print("๐ฐ AI Times ์์ง ์ค...")  # "collecting AI Times..." (mojibake)
    urls = [
        "https://www.aitimes.com/news/articleList.html?sc_multi_code=S2&view_type=sm",
        "https://www.aitimes.com/news/articleList.html?sc_section_code=S1N24&view_type=sm",
    ]
    all_news = []
    # The site stamps list entries as "MM-DD HH:MM"; we compare on MM-DD only.
    today = datetime.now().strftime("%m-%d")
    yesterday = (datetime.now() - timedelta(days=1)).strftime("%m-%d")
    for url in urls:
        try:
            r = requests.get(url, timeout=15, headers={"User-Agent": UA})
            r.raise_for_status()
            r.encoding = "utf-8"  # force UTF-8 before .text decodes the body
            soup = BeautifulSoup(r.text, "html.parser")
            # Article links look like /news/articleView.html?idxno=12345
            for tag in soup.find_all("a", href=re.compile(r"/news/articleView\.html\?idxno=\d+")):
                title = tag.get_text(strip=True)
                link = tag.get("href", "")
                if not title or len(title) < 10:  # skip nav/short junk anchors
                    continue
                if link and not link.startswith("http"):
                    link = "https://www.aitimes.com" + link
                date_text = today  # default when no timestamp is found nearby
                if tag.parent:
                    m = re.search(r"(\d{2}-\d{2}\s+\d{2}:\d{2})", tag.parent.get_text())
                    if m:
                        date_text = m.group(1)
                # Keep only articles stamped today or yesterday.
                if today not in date_text and yesterday not in date_text:
                    continue
                cls = classify_news(title, "AI Times")
                all_news.append({"title": title, "url": link, "date": date_text, "source": "AI Times", "summary": gen_summary(title), **cls})
            time.sleep(0.5)  # be polite between the two list pages
        except Exception as e:
            print(f" โ ๏ธ AI Times: {e}")
    # De-duplicate by URL preserving first-seen order: set.add() returns
    # None, so `not seen.add(...)` records the URL as a side effect.
    seen = set()
    unique = [n for n in all_news if n["url"] not in seen and not seen.add(n["url"])]
    print(f" โ
 AI Times {len(unique)}๊ฑด")
    return unique[:max_items]
|
|
|
|
def fetch_hackernews(limit=15):
    """Fetch recent Hacker News top stories with Korean-translated titles.

    Pulls up to ``limit * 3`` top-story ids from the Firebase API, keeps
    link-type stories no older than 36 hours, translates each title to
    Korean, classifies on the combined English+Korean title text so both
    keyword languages can match, and returns at most ``limit`` news dicts.
    """
    print("๐ฅ Hacker News ์์ง ์ค...")  # "collecting Hacker News..." (mojibake)
    news = []
    try:
        r = requests.get("https://hacker-news.firebaseio.com/v0/topstories.json", timeout=10)
        # Over-fetch ids (3x) because many stories are filtered out below.
        ids = r.json()[:limit * 3]
        # NOTE(review): datetime.utcnow()/utcfromtimestamp() are deprecated
        # since Python 3.12 — consider datetime.now(timezone.utc). Both
        # sides here are naive UTC, so the comparison itself is consistent.
        cutoff = datetime.utcnow() - timedelta(hours=36)
        for sid in ids:
            if len(news) >= limit:
                break
            try:
                sr = requests.get(f"https://hacker-news.firebaseio.com/v0/item/{sid}.json", timeout=5)
                s = sr.json()
                # Only external-link stories; skips Ask HN/job/comment items.
                if s.get("type") != "story" or not s.get("url"):
                    continue
                st = datetime.utcfromtimestamp(s.get("time", 0))
                if st < cutoff:  # older than 36 hours
                    continue
                title_en = s.get("title", "")

                title_ko = translate_to_korean(title_en)
                cls = classify_news(title_en + " " + title_ko, "Hacker News")
                news.append({
                    "title": title_ko,
                    "title_en": title_en,
                    "url": s["url"],
                    "date": st.strftime("%m-%d %H:%M"),
                    "source": "Hacker News",
                    "summary": gen_summary(title_ko),
                    "score": s.get("score", 0),
                    **cls,
                })
                time.sleep(0.2)  # gentle rate limit on the item endpoint
            except Exception:
                continue  # best-effort: one bad item never aborts the run
        print(f" โ
 HN {len(news)}๊ฑด (ํ๊ธ ๋ฒ์ญ ์๋ฃ)")
    except Exception as e:
        print(f" โ ๏ธ HN: {e}")
    return news
|
|
|
|
def load_news_from_hf():
    """Download and parse the cached news list from the HF dataset repo.

    Best-effort: any failure (network, auth, missing file, bad JSON)
    yields an empty list instead of raising.
    """
    try:
        local_path = hf_hub_download(
            repo_id=DATASET_REPO,
            filename=NEWS_FILE,
            repo_type="dataset",
            token=HF_TOKEN,
            force_download=True,  # always bypass the local hub cache
        )
        with open(local_path, "r", encoding="utf-8") as fh:
            return json.load(fh)
    except Exception:
        return []
|
|
|
|
def save_news_to_hf(news_list):
    """Serialize ``news_list`` to a temp JSON file and upload it to the
    HF dataset repo. Best-effort: failures are logged, never raised."""
    try:
        local_path = os.path.join(tempfile.gettempdir(), NEWS_FILE)
        with open(local_path, "w", encoding="utf-8") as fh:
            json.dump(news_list, fh, ensure_ascii=False, indent=2)
        hf_api.upload_file(
            path_or_fileobj=local_path,
            path_in_repo=NEWS_FILE,
            repo_id=DATASET_REPO,
            repo_type="dataset",
            token=HF_TOKEN,
        )
    except Exception as e:
        print(f"[ERROR] News save: {e}")
|
|
|
|
def collect_news(force=False):
    """Return the news feed, refreshing it when the HF cache is stale.

    Unless ``force`` is set, serves the cached list when its first item's
    collected_at stamp is younger than six hours. Otherwise re-crawls
    AI Times + Hacker News, stamps every item with collected_at, sorts by
    relevance priority, persists to the HF dataset when a token is
    configured, and returns the fresh list.
    """
    if not force:
        cached = load_news_from_hf()
        if cached:
            try:
                stamp = cached[0].get("collected_at", "")
                if stamp:
                    age = (datetime.now() - datetime.fromisoformat(stamp)).total_seconds()
                    if age < 21600:  # 6-hour TTL
                        return cached
            except Exception:
                pass  # missing/unparsable stamp -> treat cache as stale
    print("\n[NEWS] Collecting fresh news...")
    now_iso = datetime.now().isoformat()
    all_news = fetch_aitimes(20) + fetch_hackernews(15)
    for item in all_news:
        item["collected_at"] = now_iso
    # Sort by relevance bucket; unknown buckets sink to the bottom.
    priority = {"ํต์ฌ": 0, "์ฃผ๋ชฉ": 1, "์ฐธ๊ณ ": 2, "์ผ๋ฐ": 3}
    all_news.sort(key=lambda item: priority.get(item.get("relevance", "์ผ๋ฐ"), 3))
    if HF_TOKEN and all_news:
        save_news_to_hf(all_news)
    return all_news