hanjunlee's picture
Upload 23 files
3a36548 verified
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Political party press-release crawler - main entry point.

Supported parties: Democratic Party of Korea, People Power Party,
Rebuilding Korea Party, Reform Party, Basic Income Party, Progressive Party.

Usage:
    python main.py                          # incremental update for all parties
    python main.py --party minjoo           # Democratic Party of Korea only
    python main.py --party ppp              # People Power Party only
    python main.py --party rebuilding       # Rebuilding Korea Party only
    python main.py --party reform           # Reform Party only
    python main.py --party basic_income     # Basic Income Party only
    python main.py --party jinbo            # Progressive Party only
    python main.py --start-date 2024-01-01  # restrict to a date range
    python main.py --party ppp --start-date 2024-01-01 --end-date 2024-06-30
"""
import asyncio
import argparse
import logging
from datetime import datetime
from minjoo_crawler_async import MinjooAsyncCrawler
from ppp_crawler_async import PPPAsyncCrawler
from rebuilding_crawler_async import RebuildingAsyncCrawler
from reform_crawler_async import ReformAsyncCrawler
from basic_income_crawler_async import BasicIncomeAsyncCrawler
from jinbo_crawler_async import JinboAsyncCrawler
# Root logging setup: every record goes both to a UTF-8 log file and to a
# stream handler (stderr by default), sharing one timestamped line format.
_LOG_FORMAT = '%(asctime)s [%(levelname)s] %(message)s'
_log_handlers = [
    logging.FileHandler('main.log', encoding='utf-8'),
    logging.StreamHandler(),
]
logging.basicConfig(
    handlers=_log_handlers,
    format=_LOG_FORMAT,
    level=logging.INFO,
)

# Module-level logger used by the functions in this file.
logger = logging.getLogger(__name__)
# Party code -> display label used in log output. The extra 'all' entry is a
# pseudo-code meaning "every party" and is also a valid --party choice.
# NOTE(review): the label text looks like mis-decoded UTF-8 Korean; it is kept
# byte-identical here - confirm the file's encoding before changing it.
PARTY_LABELS = dict(
    minjoo='๋”๋ถˆ์–ด๋ฏผ์ฃผ๋‹น',
    ppp='๊ตญ๋ฏผ์˜ํž˜',
    rebuilding='์กฐ๊ตญํ˜์‹ ๋‹น',
    reform='๊ฐœํ˜์‹ ๋‹น',
    basic_income='๊ธฐ๋ณธ์†Œ๋“๋‹น',
    jinbo='์ง„๋ณด๋‹น',
    all='์ „์ฒด (6๊ฐœ ์ •๋‹น)',
)

# Concrete party codes, in label order, with the 'all' pseudo-code left out.
ALL_PARTIES = [code for code in PARTY_LABELS if code != 'all']
def parse_args(argv=None):
    """Parse the crawler's command-line options.

    Args:
        argv: Optional list of argument strings. ``None`` (the default) lets
            argparse read ``sys.argv[1:]``, exactly like the previous
            zero-argument form of this function.

    Returns:
        argparse.Namespace with:
            party: one of the ``PARTY_LABELS`` keys (default ``'all'``).
            start_date / end_date: zero-padded ``YYYY-MM-DD`` strings, or
                ``None`` when the option was not given.

    A date that does not match ``YYYY-MM-DD`` makes argparse print a usage
    error and exit, instead of being passed on to the crawlers unchecked.
    """
    def iso_date(text):
        # Validate at the CLI boundary so a typo is reported once, up front,
        # rather than surfacing separately inside each crawler.
        try:
            parsed = datetime.strptime(text, '%Y-%m-%d')
        except ValueError:
            raise argparse.ArgumentTypeError(
                f"invalid date {text!r}; expected YYYY-MM-DD"
            ) from None
        # Still hand back a string (callers received strings before);
        # isoformat() only normalises zero padding, e.g. 2024-1-5 -> 2024-01-05.
        return parsed.date().isoformat()

    parser = argparse.ArgumentParser(
        description='์ •๋‹น ๋ณด๋„์ž๋ฃŒ ํฌ๋กค๋Ÿฌ',
        formatter_class=argparse.RawTextHelpFormatter
    )
    parser.add_argument(
        '--party',
        choices=list(PARTY_LABELS.keys()),
        default='all',
        help=(
            'ํฌ๋กค๋งํ•  ์ •๋‹น ์„ ํƒ (๊ธฐ๋ณธ๊ฐ’: all)\n'
            ' minjoo : ๋”๋ถˆ์–ด๋ฏผ์ฃผ๋‹น\n'
            ' ppp : ๊ตญ๋ฏผ์˜ํž˜\n'
            ' rebuilding : ์กฐ๊ตญํ˜์‹ ๋‹น\n'
            ' reform : ๊ฐœํ˜์‹ ๋‹น\n'
            ' basic_income : ๊ธฐ๋ณธ์†Œ๋“๋‹น\n'
            ' jinbo : ์ง„๋ณด๋‹น\n'
            ' all : ์ „์ฒด ๋™์‹œ ํฌ๋กค๋ง'
        )
    )
    parser.add_argument(
        '--start-date',
        metavar='YYYY-MM-DD',
        type=iso_date,
        default=None,
        help='์ˆ˜์ง‘ ์‹œ์ž‘ ๋‚ ์งœ (์˜ˆ: 2024-01-01)\n๋ฏธ์ž…๋ ฅ ์‹œ ๋งˆ์ง€๋ง‰ ํฌ๋กค๋ง ์ดํ›„๋ถ€ํ„ฐ (์ฆ๋ถ„ ์—…๋ฐ์ดํŠธ)'
    )
    parser.add_argument(
        '--end-date',
        metavar='YYYY-MM-DD',
        type=iso_date,
        default=None,
        help='์ˆ˜์ง‘ ์ข…๋ฃŒ ๋‚ ์งœ (์˜ˆ: 2024-12-31)\n๋ฏธ์ž…๋ ฅ ์‹œ ์˜ค๋Š˜ ๋‚ ์งœ'
    )
    return parser.parse_args(argv)
def get_crawler(party: str):
    """Return a freshly constructed crawler for the given party code.

    Args:
        party: One of 'minjoo', 'ppp', 'rebuilding', 'reform',
            'basic_income' or 'jinbo'.

    Returns:
        A new instance of the crawler class registered for ``party``,
        created with no constructor arguments.

    Raises:
        KeyError: If ``party`` is not one of the six codes above
            (this includes the 'all' pseudo-code).
    """
    crawler_classes = {
        'minjoo': MinjooAsyncCrawler,
        'ppp': PPPAsyncCrawler,
        'rebuilding': RebuildingAsyncCrawler,
        'reform': ReformAsyncCrawler,
        'basic_income': BasicIncomeAsyncCrawler,
        'jinbo': JinboAsyncCrawler,
    }
    crawler_cls = crawler_classes[party]
    return crawler_cls()
async def run_party(party: str, start_date=None, end_date=None):
    """Run the crawl for a single party.

    Args:
        party: Party code accepted by ``get_crawler``.
        start_date: Lower bound forwarded to the crawler's ``collect_all``;
            falsy means "not specified".
        end_date: Upper bound forwarded to the crawler's ``collect_all``;
            falsy means "not specified".

    When neither bound is given, the crawler's own ``run_incremental`` is
    awaited. Otherwise ``collect_all`` is awaited and, unless its result
    reports ``.empty``, the result is passed to ``save_local`` and then
    ``upload_to_huggingface``. Returns ``None`` in every case.
    """
    crawler = get_crawler(party)

    # No explicit range: let the crawler decide what is new.
    if not (start_date or end_date):
        await crawler.run_incremental()
        return

    # Explicit range: collect first, then persist only if something came back.
    # The result must expose `.empty` (presumably a pandas DataFrame - confirm).
    collected = await crawler.collect_all(start_date, end_date)
    if collected.empty:
        return
    crawler.save_local(collected)
    crawler.upload_to_huggingface(collected)
async def main():
    """Parse the CLI options, run the selected crawl(s) and log a summary.

    A single party is awaited directly, so any exception it raises
    propagates to the caller. When several parties are selected they run
    concurrently via ``asyncio.gather(..., return_exceptions=True)``: one
    party failing does not stop the others, and each outcome is logged
    individually. The total elapsed time is logged at the end.
    """
    args = parse_args()
    start_time = datetime.now()
    target_parties = ALL_PARTIES if args.party == 'all' else [args.party]

    logger.info("=" * 60)
    logger.info("์ •๋‹น ๋ณด๋„์ž๋ฃŒ ํฌ๋กค๋Ÿฌ ์‹œ์ž‘")
    logger.info(f"๋Œ€์ƒ ์ •๋‹น : {PARTY_LABELS[args.party]}")
    logger.info(f"์ˆ˜์ง‘ ๊ธฐ๊ฐ„ : {args.start_date or '์ฆ๋ถ„ ์—…๋ฐ์ดํŠธ'} ~ {args.end_date or '์˜ค๋Š˜'}")
    logger.info("=" * 60)

    if len(target_parties) == 1:
        await run_party(target_parties[0], args.start_date, args.end_date)
    else:
        results = await asyncio.gather(
            *[run_party(p, args.start_date, args.end_date) for p in target_parties],
            return_exceptions=True
        )
        for party, result in zip(target_parties, results):
            # Check BaseException, not Exception: a cancelled crawl is returned
            # as asyncio.CancelledError, which has derived from BaseException
            # since Python 3.8 and would otherwise be logged as a success.
            if isinstance(result, BaseException):
                # exc_info attaches the traceback so the failure can be
                # diagnosed from the log, not just from its message.
                logger.error(
                    f"{PARTY_LABELS[party]} ํฌ๋กค๋ง ์‹คํŒจ: {result}",
                    exc_info=result,
                )
            else:
                logger.info(f"{PARTY_LABELS[party]} ํฌ๋กค๋ง ์™„๋ฃŒ")

    duration = (datetime.now() - start_time).total_seconds()
    logger.info("=" * 60)
    logger.info(f"์ „์ฒด ์™„๋ฃŒ! ์†Œ์š” ์‹œ๊ฐ„: {duration:.1f}์ดˆ ({duration / 60:.1f}๋ถ„)")
    logger.info("=" * 60)
# Script entry point: only when executed directly (not on import), run the
# async main() to completion on a new event loop.
if __name__ == "__main__":
    asyncio.run(main())