Spaces:

VIDraft
/

dash

Running

App Files Files Community

dash / news.py

openfree

Update news.py

a6fc716 verified 26 days ago

raw

history blame contribute delete

9.12 kB

	"""
	VDash 뉴스 모듈 v2
	- AI Times + Hacker News 크롤링
	- HN 영문 제목 → 한글 자동 번역
	- 비드래프트 관점 자동 분류
	- HF Dataset 영구 저장
	"""

	import requests, json, re, time, os, tempfile
	from datetime import datetime, timedelta
	from typing import List, Dict
	from bs4 import BeautifulSoup
	from huggingface_hub import HfApi, hf_hub_download

	try:
	from deep_translator import GoogleTranslator
	translator = GoogleTranslator(source='en', target='ko')
	HAS_TRANSLATOR = True
	except Exception:
	HAS_TRANSLATOR = False
	print("[NEWS] deep-translator not available, HN titles will stay English")

	HF_TOKEN = os.getenv("HF_TOKEN")
	SPACE_ID = os.getenv("SPACE_ID", "")
	OWNER = SPACE_ID.split("/")[0] if SPACE_ID else "vidraft"
	DATASET_REPO = os.getenv("DATASET_REPO", f"{OWNER}/vidraft-dashboard-data")
	NEWS_FILE = "news.json"
	hf_api = HfApi(token=HF_TOKEN)

	TAG_RULES = [
	(["정부","과제","공모","지원사업","IITP","NIA","NIPA","국책","government","grant"], "🏛️ 정부과제", "#3b82f6"),
	(["투자","펀딩","시리즈","VC","IPO","인수","M&A","밸류에이션","funding","investment","acquisition"], "💰 투자/IR", "#f59e0b"),
	(["영상","비디오","video","생성","sora","gen-","동영상","이미지생성"], "🎬 영상AI", "#ef4444"),
	(["한국어","korean","multilingual","번역","다국어"], "🇰🇷 한국어AI", "#8b5cf6"),
	(["허깅페이스","hugging","HF","spaces","오픈소스","open source","github"], "🤗 HF/오픈소스", "#10b981"),
	(["LLM","GPT","Claude","Gemini","거대언어","파인튜닝","RAG","에이전트","agent","transformer","llama","mistral"], "🧠 LLM/에이전트", "#6366f1"),
	(["GPU","칩","반도체","엔비디아","NVIDIA","인프라","서버","클라우드","데이터센터","chip","server","cloud"], "🖥️ 인프라/GPU", "#0d9488"),
	(["보안","개인정보","규제","법안","윤리","안전","저작권","AI법","regulation","safety","privacy"], "🔒 규제/윤리", "#dc2626"),
	(["스타트업","창업","사업","제휴","파트너","계약","매출","startup","business","revenue"], "💼 비즈니스", "#ea580c"),
	(["교육","학습","연구","논문","arXiv","벤치마크","성능","평가","paper","research","benchmark"], "📚 R&D/연구", "#059669"),
	(["마케팅","콘텐츠","SNS","브랜딩","홍보","PR","미디어","marketing"], "📢 마케팅/PR", "#9333ea"),
	]

	RELEVANCE_KW = {
	"핵심": ["AI 영상","비디오 생성","한국어","허깅페이스","오픈소스","에이전트","LLM","스타트업","정부과제","video generation","hugging face"],
	"주목": ["GPU","클라우드","인프라","투자","생성AI","파인튜닝","RAG","API","generative"],
	"참고": ["규제","교육","연구","보안","마케팅","benchmark","safety"],
	}


	def classify_news(title, source=""):
	text = (title + " " + source).lower()
	tags, colors = [], {}
	for keywords, tag, color in TAG_RULES:
	for kw in keywords:
	if kw.lower() in text:
	if tag not in tags:
	tags.append(tag)
	colors[tag] = color
	break
	if not tags:
	tags.append("📰 일반AI뉴스")
	colors["📰 일반AI뉴스"] = "#64748b"
	relevance = "일반"
	for level in ["핵심", "주목", "참고"]:
	for kw in RELEVANCE_KW[level]:
	if kw.lower() in text:
	relevance = level
	break
	if relevance != "일반":
	break
	return {"tags": tags, "colors": colors, "relevance": relevance}


	def translate_to_korean(text):
	"""영문 텍스트를 한글로 번역"""
	if not text or not HAS_TRANSLATOR:
	return text
	# 이미 한글이면 스킵
	if re.search(r'[가-힣]', text):
	return text
	try:
	result = translator.translate(text)
	return result if result else text
	except Exception as e:
	print(f" 번역 실패: {e}")
	return text


	def gen_summary(title):
	t = title.strip()
	return t[:80] + "..." if len(t) > 80 else t


	UA = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"


	def fetch_aitimes(max_items=20):
	print("📰 AI Times 수집 중...")
	urls = [
	"https://www.aitimes.com/news/articleList.html?sc_multi_code=S2&view_type=sm",
	"https://www.aitimes.com/news/articleList.html?sc_section_code=S1N24&view_type=sm",
	]
	all_news = []
	today = datetime.now().strftime("%m-%d")
	yesterday = (datetime.now() - timedelta(days=1)).strftime("%m-%d")
	for url in urls:
	try:
	r = requests.get(url, timeout=15, headers={"User-Agent": UA})
	r.raise_for_status()
	r.encoding = "utf-8"
	soup = BeautifulSoup(r.text, "html.parser")
	for tag in soup.find_all("a", href=re.compile(r"/news/articleView\.html\?idxno=\d+")):
	title = tag.get_text(strip=True)
	link = tag.get("href", "")
	if not title or len(title) < 10:
	continue
	if link and not link.startswith("http"):
	link = "https://www.aitimes.com" + link
	date_text = today
	if tag.parent:
	m = re.search(r"(\d{2}-\d{2}\s+\d{2}:\d{2})", tag.parent.get_text())
	if m:
	date_text = m.group(1)
	if today not in date_text and yesterday not in date_text:
	continue
	cls = classify_news(title, "AI Times")
	all_news.append({"title": title, "url": link, "date": date_text, "source": "AI Times", "summary": gen_summary(title), **cls})
	time.sleep(0.5)
	except Exception as e:
	print(f" ⚠️ AI Times: {e}")
	seen = set()
	unique = [n for n in all_news if n["url"] not in seen and not seen.add(n["url"])]
	print(f" ✅ AI Times {len(unique)}건")
	return unique[:max_items]


	def fetch_hackernews(limit=15):
	print("🔥 Hacker News 수집 중...")
	news = []
	try:
	r = requests.get("https://hacker-news.firebaseio.com/v0/topstories.json", timeout=10)
	ids = r.json()[:limit * 3]
	cutoff = datetime.utcnow() - timedelta(hours=36)
	for sid in ids:
	if len(news) >= limit:
	break
	try:
	sr = requests.get(f"https://hacker-news.firebaseio.com/v0/item/{sid}.json", timeout=5)
	s = sr.json()
	if s.get("type") != "story" or not s.get("url"):
	continue
	st = datetime.utcfromtimestamp(s.get("time", 0))
	if st < cutoff:
	continue
	title_en = s.get("title", "")
	# 한글 번역
	title_ko = translate_to_korean(title_en)
	cls = classify_news(title_en + " " + title_ko, "Hacker News")
	news.append({
	"title": title_ko,
	"title_en": title_en,
	"url": s["url"],
	"date": st.strftime("%m-%d %H:%M"),
	"source": "Hacker News",
	"summary": gen_summary(title_ko),
	"score": s.get("score", 0),
	**cls,
	})
	time.sleep(0.2)
	except Exception:
	continue
	print(f" ✅ HN {len(news)}건 (한글 번역 완료)")
	except Exception as e:
	print(f" ⚠️ HN: {e}")
	return news


	def load_news_from_hf():
	try:
	path = hf_hub_download(repo_id=DATASET_REPO, filename=NEWS_FILE, repo_type="dataset", token=HF_TOKEN, force_download=True)
	with open(path, "r", encoding="utf-8") as f:
	return json.load(f)
	except Exception:
	return []


	def save_news_to_hf(news_list):
	try:
	tmp = os.path.join(tempfile.gettempdir(), NEWS_FILE)
	with open(tmp, "w", encoding="utf-8") as f:
	json.dump(news_list, f, ensure_ascii=False, indent=2)
	hf_api.upload_file(path_or_fileobj=tmp, path_in_repo=NEWS_FILE, repo_id=DATASET_REPO, repo_type="dataset", token=HF_TOKEN)
	except Exception as e:
	print(f"[ERROR] News save: {e}")


	def collect_news(force=False):
	if not force:
	cached = load_news_from_hf()
	if cached:
	try:
	last = cached[0].get("collected_at", "")
	if last and (datetime.now() - datetime.fromisoformat(last)).total_seconds() < 21600:
	return cached
	except Exception:
	pass
	print("\n[NEWS] Collecting fresh news...")
	now_iso = datetime.now().isoformat()
	all_news = fetch_aitimes(20) + fetch_hackernews(15)
	for n in all_news:
	n["collected_at"] = now_iso
	order = {"핵심": 0, "주목": 1, "참고": 2, "일반": 3}
	all_news.sort(key=lambda x: order.get(x.get("relevance", "일반"), 3))
	if HF_TOKEN and all_news:
	save_news_to_hf(all_news)
	return all_news