#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Unified party crawler.

- Crawls six parties concurrently: Democratic Party (더불어민주당), People Power
  Party (국민의힘), Rebuilding Korea Party (조국혁신당), Reform Party (개혁신당),
  Basic Income Party (기본소득당), Jinbo Party (진보당).
- Each party's crawler handles its own Hugging Face upload independently.
- Uses asyncio for parallel execution.

NOTE: use main.py if CLI argument support is needed.
"""

import asyncio
import logging
from datetime import datetime

from minjoo_crawler_async import MinjooAsyncCrawler
from ppp_crawler_async import PPPAsyncCrawler
from rebuilding_crawler_async import RebuildingAsyncCrawler
from reform_crawler_async import ReformAsyncCrawler
from basic_income_crawler_async import BasicIncomeAsyncCrawler
from jinbo_crawler_async import JinboAsyncCrawler

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s [%(levelname)s] %(message)s',
    handlers=[
        logging.FileHandler('unified_crawler.log', encoding='utf-8'),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)

# Registry of party name -> crawler class; iteration order defines the
# pairing of names to results below.
CRAWLERS = {
    '더불어민주당': MinjooAsyncCrawler,
    '국민의힘': PPPAsyncCrawler,
    '조국혁신당': RebuildingAsyncCrawler,
    '개혁신당': ReformAsyncCrawler,
    '기본소득당': BasicIncomeAsyncCrawler,
    '진보당': JinboAsyncCrawler,
}


async def crawl_all_parties():
    """Crawl all registered parties concurrently.

    Instantiates one crawler per entry in ``CRAWLERS`` and awaits their
    ``run_incremental()`` coroutines in parallel. Individual failures are
    logged and do not abort the other crawlers (``return_exceptions=True``).

    Returns:
        dict: party name -> result of ``run_incremental()`` (or the
        Exception instance if that crawler failed). Callers that previously
        ignored the ``None`` return are unaffected.
    """
    logger.info("=" * 60)
    logger.info("통합 정당 크롤러 시작")
    logger.info(" + ".join(CRAWLERS.keys()))
    logger.info("=" * 60)

    start_time = datetime.now()

    crawlers = [cls() for cls in CRAWLERS.values()]
    party_names = list(CRAWLERS.keys())

    # Generator (not a throwaway list) feeds gather; exceptions are
    # collected in-place rather than propagated, so one failing party
    # cannot cancel the others.
    results = await asyncio.gather(
        *(crawler.run_incremental() for crawler in crawlers),
        return_exceptions=True,
    )

    for party, result in zip(party_names, results):
        if isinstance(result, Exception):
            # Lazy %-formatting: the message is only rendered if emitted.
            logger.error("%s 크롤링 실패: %s", party, result)
        else:
            logger.info("%s 크롤링 완료", party)

    duration = (datetime.now() - start_time).total_seconds()
    logger.info("=" * 60)
    logger.info("전체 크롤링 완료")
    logger.info("소요 시간: %.1f초 (%.1f분)", duration, duration / 60)
    logger.info("=" * 60)

    return dict(zip(party_names, results))


# Backward-compatibility alias kept for old callers; prefer
# crawl_all_parties() in new code.
async def crawl_both_parties():
    """Deprecated alias for :func:`crawl_all_parties`."""
    return await crawl_all_parties()


async def main():
    """Entry point: run the full concurrent crawl."""
    await crawl_all_parties()


if __name__ == "__main__":
    asyncio.run(main())