|
|
|
|
| """
|
| λλΆμ΄λ―Όμ£ΌλΉ ν¬λ‘€λ¬ - κ³ μ±λ₯ λΉλκΈ° λ²μ + νκΉ
νμ΄μ€ μλ μ
λ‘λ
|
| - asyncio + aiohttp (10-20λ°° λΉ λ₯Έ μλ)
|
| - λμ μμ² μ μ μ΄ (μλ² λΆλ΄ μ΅μν)
|
| - μ¦λΆ μ
λ°μ΄νΈ (λ§μ§λ§ λ μ§ μ΄νλ§ ν¬λ‘€λ§)
|
| - νκΉ
νμ΄μ€ μλ μ
λ‘λ
|
| - μΌ λ¨μ μ€μΌμ€λ§
|
| """
|
|
|
| import os
|
| import json
|
| import time
|
| import re
|
| import asyncio
|
| from datetime import datetime, timedelta
|
| from typing import List, Dict, Optional
|
| import pandas as pd
|
| from tqdm.asyncio import tqdm as async_tqdm
|
| import aiohttp
|
| from bs4 import BeautifulSoup
|
| from dotenv import load_dotenv
|
| from huggingface_hub import HfApi, login
|
| from datasets import Dataset, load_dataset, concatenate_datasets
|
|
|
|
|
| load_dotenv()
|
|
|
class MinjooAsyncCrawler:
    """Async crawler for Democratic Party of Korea (더불어민주당) press releases.

    Pages through the party site's boards with aiohttp/asyncio, fetches
    article bodies, saves results locally, and pushes them to a Hugging
    Face dataset repository.
    """

    def __init__(self, config_path="crawler_config.json"):
        """Load configuration and credentials and set up the request limiter.

        Args:
            config_path: Path to the JSON crawler configuration file.
        """
        self.base_url = "https://theminjoo.kr/main/sub"
        self.party_name = "더불어민주당"
        self.config_path = config_path
        self.state_path = "crawler_state.json"

        self.load_config()

        # Hugging Face credentials come from the environment (.env via load_dotenv).
        self.hf_token = os.getenv("HF_TOKEN")
        self.hf_repo_id = os.getenv("HF_REPO_ID", "minjoo-press-releases")

        # FIX: honor the configured concurrency limit instead of a
        # hard-coded 20 that silently ignored "concurrent_requests".
        self.semaphore = asyncio.Semaphore(self.config.get("concurrent_requests", 20))

    def load_config(self):
        """Load crawler settings, merging file values over built-in defaults."""
        default_config = {
            "boards": {
                "보도자료": "188",    # press releases
                "논평_브리핑": "11",  # commentary / briefings
                "모두발언": "230",    # opening remarks
            },
            "start_date": "2003-11-11",
            "max_pages": 10000,
            "concurrent_requests": 20,
            "request_delay": 0.1,
            "output_path": "./data",
        }

        self.config = default_config
        if os.path.exists(self.config_path):
            with open(self.config_path, 'r', encoding='utf-8') as f:
                config = json.load(f)
            if 'minjoo' in config:
                # FIX: merge over the defaults so a partial 'minjoo' section
                # cannot raise KeyError on the required keys below.
                self.config = {**default_config, **config['minjoo']}

        self.boards = self.config["boards"]
        self.start_date = self.config["start_date"]
        self.max_pages = self.config["max_pages"]
        self.output_path = self.config["output_path"]
|
|
|
| def load_state(self) -> Dict:
|
| """ν¬λ‘€λ¬ μν λ‘λ (λ§μ§λ§ ν¬λ‘€λ§ λ μ§)"""
|
| if os.path.exists(self.state_path):
|
| with open(self.state_path, 'r', encoding='utf-8') as f:
|
| state = json.load(f)
|
| return state.get('minjoo', {})
|
| return {}
|
|
|
| def save_state(self, state: Dict):
|
| """ν¬λ‘€λ¬ μν μ μ₯"""
|
| all_state = {}
|
| if os.path.exists(self.state_path):
|
| with open(self.state_path, 'r', encoding='utf-8') as f:
|
| all_state = json.load(f)
|
|
|
| all_state['minjoo'] = state
|
|
|
| with open(self.state_path, 'w', encoding='utf-8') as f:
|
| json.dump(all_state, f, ensure_ascii=False, indent=2)
|
|
|
| @staticmethod
|
| def parse_date(date_str: str) -> Optional[datetime]:
|
| """λ μ§ νμ±"""
|
| try:
|
| return datetime.strptime(date_str.strip().split()[0], '%Y-%m-%d')
|
| except:
|
| return None
|
|
|
| @staticmethod
|
| def clean_text(text: str) -> str:
|
| """ν
μ€νΈ μ 리"""
|
| text = text.replace('\xa0', '').replace('\u200b', '').replace('β', '')
|
| return text.strip()
|
|
|
| async def fetch_with_retry(self, session: aiohttp.ClientSession, url: str,
|
| max_retries: int = 3) -> Optional[str]:
|
| """μ¬μλ λ‘μ§μ΄ μλ λΉλκΈ° μμ²"""
|
| async with self.semaphore:
|
| for attempt in range(max_retries):
|
| try:
|
| await asyncio.sleep(self.config.get("request_delay", 0.1))
|
| async with session.get(url, timeout=aiohttp.ClientTimeout(total=15)) as response:
|
| if response.status == 200:
|
| return await response.text()
|
| except Exception as e:
|
| if attempt < max_retries - 1:
|
| await asyncio.sleep(1)
|
| else:
|
| return None
|
| return None
|
|
|
| async def fetch_list_page(self, session: aiohttp.ClientSession,
|
| board_id: str, page_num: int,
|
| start_date: datetime, end_date: datetime) -> tuple:
|
| """λͺ©λ‘ νμ΄μ§ νλ κ°μ Έμ€κΈ°"""
|
| if page_num == 0:
|
| url = f"{self.base_url}/news/list.php?brd={board_id}"
|
| else:
|
| url = f"{self.base_url}/news/list.php?sno={page_num}&par=&&brd={board_id}"
|
|
|
| html = await self.fetch_with_retry(session, url)
|
| if not html:
|
| return [], False
|
|
|
| soup = BeautifulSoup(html, 'html.parser')
|
| board_items = soup.find_all('div', {'class': 'board-item'})
|
|
|
| if not board_items:
|
| return [], True
|
|
|
| data = []
|
| stop_flag = False
|
|
|
| for item in board_items:
|
| try:
|
| link_tag = item.find('a')
|
| if not link_tag:
|
| continue
|
|
|
| title_span = link_tag.find('span')
|
| if not title_span:
|
| continue
|
|
|
| title = title_span.get_text(strip=True).replace('\n', ' ')
|
|
|
|
|
| article_url = link_tag.get('href', '')
|
| if article_url.startswith('./'):
|
| article_url = self.base_url + '/news/' + article_url[2:]
|
| elif not article_url.startswith('http'):
|
| article_url = self.base_url + article_url
|
|
|
|
|
| category_tag = item.find('p', {'class': 'category'})
|
| category = ""
|
| if category_tag:
|
| category_span = category_tag.find('span')
|
| if category_span:
|
| category = category_span.get_text(strip=True)
|
|
|
|
|
| time_tag = item.find('time')
|
| if not time_tag:
|
| continue
|
|
|
| date_str = time_tag.get('datetime', '') or time_tag.get_text(strip=True)
|
| article_date = self.parse_date(date_str)
|
|
|
| if not article_date:
|
| continue
|
| if article_date < start_date:
|
| stop_flag = True
|
| break
|
| if article_date > end_date:
|
| continue
|
|
|
| data.append({
|
| 'category': category,
|
| 'title': title,
|
| 'date': date_str.split()[0] if ' ' in date_str else date_str,
|
| 'url': article_url
|
| })
|
| except:
|
| continue
|
|
|
| return data, stop_flag
|
|
|
    async def fetch_article_detail(self, session: aiohttp.ClientSession,
                                   url: str) -> Dict:
        """Fetch an article page and extract body text, writer, and date.

        Returns a dict with keys 'text', 'writer', 'published_date'.
        On a failed request 'text' holds a fixed error-marker string that
        downstream code does not filter out.
        """
        html = await self.fetch_with_retry(session, url)
        if not html:
            return {'text': "λ³Έλ¬Έ μ‘°ν μ€ν¨", 'writer': "", 'published_date': ""}

        soup = BeautifulSoup(html, 'html.parser')
        text_parts = []
        writer = ""
        published_date = ""

        # Publication date: first YYYY-MM-DD found in the <li class="date"> element.
        date_li = soup.find('li', {'class': 'date'})
        if date_li:
            date_text = date_li.get_text(strip=True)
            match = re.search(r'(\d{4}-\d{2}-\d{2})', date_text)
            if match:
                published_date = match.group(1)

        # Walk the article container: <p> elements contribute body paragraphs;
        # the first <b> element matching a byline keyword becomes the writer.
        contents_div = soup.find('div', {'class': 'board-view__contents'})
        if contents_div:
            for element in contents_div.descendants:
                if element.name == 'p':
                    text = element.get_text(strip=True)
                    cleaned = self.clean_text(text)
                    if cleaned:
                        text_parts.append(cleaned)
                elif element.name == 'b':
                    text = element.get_text(strip=True)
                    cleaned = self.clean_text(text)
                    if cleaned and not writer:
                        # Keyword match on party/spokesperson markers; the
                        # literals appear mis-encoded in this source — confirm
                        # the intended Korean strings before relying on them.
                        if 'λ―Όμ£ΌλΉ' in cleaned or '곡보κ΅' in cleaned or 'λλ³μΈ' in cleaned:
                            writer = cleaned

        return {
            'text': '\n'.join(text_parts),
            'writer': writer,
            'published_date': published_date
        }
|
|
|
| async def collect_board(self, board_name: str, board_id: str,
|
| start_date: str, end_date: str) -> List[Dict]:
|
| """ν κ²μν μ 체 μμ§ (λΉλκΈ°)"""
|
| start_dt = datetime.strptime(start_date, '%Y-%m-%d')
|
| end_dt = datetime.strptime(end_date, '%Y-%m-%d')
|
|
|
| print(f"\nβΆ [{board_name}] λͺ©λ‘ μμ§ μμ...")
|
|
|
| headers = {
|
| 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
|
| 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
|
| 'Accept-Language': 'ko-KR,ko;q=0.9',
|
| }
|
|
|
| async with aiohttp.ClientSession(headers=headers) as session:
|
|
|
| all_items = []
|
| page_num = 0
|
| empty_pages = 0
|
| max_empty_pages = 3
|
|
|
| with async_tqdm(desc=f"[{board_name}] λͺ©λ‘", unit="νμ΄μ§") as pbar:
|
| while page_num <= self.max_pages * 20:
|
| items, stop_flag = await self.fetch_list_page(
|
| session, board_id, page_num, start_dt, end_dt
|
| )
|
|
|
| if not items:
|
| empty_pages += 1
|
| if empty_pages >= max_empty_pages or stop_flag:
|
| break
|
| else:
|
| empty_pages = 0
|
| all_items.extend(items)
|
|
|
| pbar.update(1)
|
| pbar.set_postfix({"μμ§": len(all_items)})
|
|
|
| if stop_flag:
|
| break
|
|
|
| page_num += 20
|
|
|
| print(f" β {len(all_items)}κ° νλͺ© λ°κ²¬")
|
|
|
|
|
| if all_items:
|
| print(f" βΆ μμΈ νμ΄μ§ μμ§ μ€...")
|
| tasks = [self.fetch_article_detail(session, item['url']) for item in all_items]
|
|
|
|
|
| details = []
|
| for coro in async_tqdm(asyncio.as_completed(tasks),
|
| total=len(tasks),
|
| desc=f"[{board_name}] μμΈ"):
|
| detail = await coro
|
| details.append(detail)
|
|
|
|
|
| for item, detail in zip(all_items, details):
|
| item.update(detail)
|
| item['board_name'] = board_name
|
|
|
| print(f"β [{board_name}] μλ£: {len(all_items)}κ°")
|
| return all_items
|
|
|
    async def collect_all(self, start_date: Optional[str] = None,
                          end_date: Optional[str] = None) -> pd.DataFrame:
        """Crawl every configured board concurrently and return one DataFrame.

        Args:
            start_date: 'YYYY-MM-DD' lower bound; defaults to the configured start.
            end_date: 'YYYY-MM-DD' upper bound; defaults to today.

        Returns:
            DataFrame with columns [board_name, title, category, date, writer,
            text, url]; an empty DataFrame when nothing was collected.
        """
        if not end_date:
            end_date = datetime.now().strftime('%Y-%m-%d')
        if not start_date:
            start_date = self.start_date

        print(f"\n{'='*60}")
        print(f"λλΆμ΄λ―Όμ£ΌλΉ 보λμλ£ μμ§ - λΉλκΈ° κ³ μ±λ₯ λ²μ ")
        print(f"κΈ°κ°: {start_date} ~ {end_date}")
        print(f"{'='*60}")

        # One task per board; all boards are crawled concurrently.
        tasks = [
            self.collect_board(board_name, board_id, start_date, end_date)
            for board_name, board_id in self.boards.items()
        ]

        results = await asyncio.gather(*tasks)

        all_data = []
        for items in results:
            all_data.extend(items)

        if not all_data:
            print("\nβ οΈ μμ§λ λ°μ΄ν° μμ")
            return pd.DataFrame()

        df = pd.DataFrame(all_data)
        # Keep the detail-page 'published_date'; the listing-page 'date'
        # column is dropped by this column selection.
        df = df[['board_name', 'title', 'category', 'published_date', 'writer', 'text', 'url']]
        # Drop rows with an empty title or empty body text.
        df = df[(df['title'] != "") & (df['text'] != "")]
        # Unparseable dates become NaT rather than raising.
        df['published_date'] = pd.to_datetime(df['published_date'], errors='coerce')
        df = df.rename(columns={'published_date': 'date'})

        print(f"\nβ μ΄ {len(df)}κ° μμ§ μλ£")
        return df
|
|
|
| def save_local(self, df: pd.DataFrame):
|
| """λ‘컬μ μ μ₯"""
|
| os.makedirs(self.output_path, exist_ok=True)
|
| timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
|
|
|
|
|
| csv_path = os.path.join(self.output_path, f"{self.party_name}_{timestamp}.csv")
|
| df.to_csv(csv_path, index=False, encoding='utf-8-sig')
|
|
|
|
|
| xlsx_path = os.path.join(self.output_path, f"{self.party_name}_{timestamp}.xlsx")
|
| df.to_excel(xlsx_path, index=False, engine='openpyxl')
|
|
|
| print(f"β CSV: {csv_path}")
|
| print(f"β Excel: {xlsx_path}")
|
|
|
| def upload_to_huggingface(self, df: pd.DataFrame):
|
| """νκΉ
νμ΄μ€μ μ
λ‘λ"""
|
| if not self.hf_token:
|
| print("\nβ οΈ HF_TOKENμ΄ μ€μ λμ§ μμμ΅λλ€. .env νμΌμ νμΈνμΈμ.")
|
| return
|
|
|
| print(f"\nβΆ νκΉ
νμ΄μ€ μ
λ‘λ μ€... (repo: {self.hf_repo_id})")
|
|
|
| try:
|
|
|
| login(token=self.hf_token)
|
| api = HfApi()
|
|
|
|
|
| new_dataset = Dataset.from_pandas(df)
|
|
|
|
|
| try:
|
| existing_dataset = load_dataset(self.hf_repo_id, split='train')
|
| print(f" βΉοΈ κΈ°μ‘΄ λ°μ΄ν°: {len(existing_dataset)}κ°")
|
|
|
|
|
| existing_df = existing_dataset.to_pandas()
|
| combined_df = pd.concat([existing_df, df], ignore_index=True)
|
| combined_df = combined_df.drop_duplicates(subset=['url'], keep='last')
|
| combined_df = combined_df.sort_values('date', ascending=False).reset_index(drop=True)
|
|
|
| final_dataset = Dataset.from_pandas(combined_df)
|
| print(f" β λ³ν© ν: {len(final_dataset)}κ° (μ€λ³΅ μ κ±°λ¨)")
|
| except:
|
| print(f" βΉοΈ μ κ· λ°μ΄ν°μ
μμ±")
|
| final_dataset = new_dataset
|
|
|
|
|
| final_dataset.push_to_hub(self.hf_repo_id, token=self.hf_token)
|
| print(f"β νκΉ
νμ΄μ€ μ
λ‘λ μλ£!")
|
| print(f" π https://huggingface.co/datasets/{self.hf_repo_id}")
|
|
|
| except Exception as e:
|
| print(f"β μ
λ‘λ μ€ν¨: {e}")
|
|
|
| async def run_incremental(self):
|
| """μ¦λΆ μ
λ°μ΄νΈ μ€ν (λ§μ§λ§ λ μ§ μ΄νλ§)"""
|
| state = self.load_state()
|
| last_date = state.get('last_crawl_date')
|
|
|
| if last_date:
|
|
|
| start_date = (datetime.strptime(last_date, '%Y-%m-%d') + timedelta(days=1)).strftime('%Y-%m-%d')
|
| print(f"π
μ¦λΆ μ
λ°μ΄νΈ: {start_date} μ΄ν λ°μ΄ν°λ§ μμ§")
|
| else:
|
| start_date = self.start_date
|
| print(f"π
μ 체 μμ§: {start_date}λΆν°")
|
|
|
| end_date = datetime.now().strftime('%Y-%m-%d')
|
|
|
|
|
| df = await self.collect_all(start_date, end_date)
|
|
|
| if df.empty:
|
| print("β μλ‘μ΄ λ°μ΄ν° μμ")
|
| return
|
|
|
|
|
| self.save_local(df)
|
|
|
|
|
| self.upload_to_huggingface(df)
|
|
|
|
|
| state['last_crawl_date'] = end_date
|
| state['last_crawl_time'] = datetime.now().isoformat()
|
| state['last_count'] = len(df)
|
| self.save_state(state)
|
|
|
| print(f"\n{'='*60}")
|
| print(f"β μλ£! λ€μ μ€ν: λ΄μΌ")
|
| print(f"{'='*60}\n")
|
|
|
async def main():
    """Entry point: build the crawler and run one incremental pass."""
    await MinjooAsyncCrawler().run_incremental()
|
|
|
# Run a single incremental crawl when executed as a script.
if __name__ == "__main__":
    asyncio.run(main())
|
|
|