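# Scrape the National Health Commission (NHC) daily COVID-19 bulletins under
# http://www.nhc.gov.cn/xcs/yqtb/, extract the headline case counts from each
# bulletin with regular expressions, and write one CSV row per day.
# pyppeteer drives a real Chromium instance so that the JavaScript redirect
# the site serves first can run; plain HTTP clients never reach the content.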
import sys
# Make user-installed packages importable (machine-specific; adjust as needed).
sys.path.append('/Users/machi/Library/Python/3.8/lib/python/site-packages')

import os
import re
import asyncio

from pyppeteer import launch
from bs4 import BeautifulSoup


async def pyppeteer_fetchUrl(url):
    # Launch a visible (non-headless) Chromium; dumpio=True echoes browser
    # console output, which helps when the site blocks a request.
    browser = await launch({'headless': False, 'dumpio': True, 'autoClose': True})
    page = await browser.newPage()
    await page.goto(url)
    # The site first serves a JavaScript redirect; wait for the second
    # navigation. asyncio.wait returns without raising if it never comes.
    await asyncio.wait([asyncio.ensure_future(page.waitForNavigation())])
    content = await page.content()
    await browser.close()
    return content


def fetchUrl(url):
    # Synchronous wrapper so the rest of the script can stay non-async.
    return asyncio.get_event_loop().run_until_complete(pyppeteer_fetchUrl(url))

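# Note: asyncio.get_event_loop() is deprecated outside a running loop on
# Python 3.10+; asyncio.run(pyppeteer_fetchUrl(url)) is the modern equivalent.

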
def getPageUrl():
    # Page 1 of the bulletin index has its own URL; pages 2+ append _N.
    for page in range(1, 5):
        if page == 1:
            yield 'http://www.nhc.gov.cn/xcs/yqtb/list_gzbd.shtml'
        else:
            yield 'http://www.nhc.gov.cn/xcs/yqtb/list_gzbd_' + str(page) + '.shtml'

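# Yields, in order:
#   http://www.nhc.gov.cn/xcs/yqtb/list_gzbd.shtml
#   http://www.nhc.gov.cn/xcs/yqtb/list_gzbd_2.shtml ... list_gzbd_4.shtml

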
def getTitleUrl(html):
    # Each index page lists bulletins as <li> items under <div class="list">.
    bsobj = BeautifulSoup(html, 'html.parser')
    titleList = bsobj.find('div', attrs={"class": "list"}).ul.find_all("li")
    for item in titleList:
        link = "http://www.nhc.gov.cn" + item.a["href"]
        title = item.a["title"]
        date = item.span.text
        yield title, link, date


def getInfo(pat, s):
    # Return the first captured group of pat in s, or '0' if there is no match.
    res = re.search(pat, s)
    if res:
        return res.group(1)
    return '0'

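# e.g. getInfo(r'新增确诊病例(\d+)例', '……新增确诊病例212例……') returns '212';
# the sample sentence here is illustrative, not taken from a real bulletin.

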
def getContent(html):
    # The bulletin body is <div id="xw_box">. The counts sit at fixed
    # paragraph positions, so the indices below assume the standard
    # bulletin layout; getInfo falls back to '0' when a pattern is absent.
    bsobj = BeautifulSoup(html, 'html.parser')
    cnt = bsobj.find('div', attrs={"id": "xw_box"}).find_all("p")
    res = []

    if cnt:
        # Paragraph 0: new confirmed cases (total and domestic) and deaths.
        s = cnt[0].text
        res.append(getInfo(r'新增确诊病例(\d+)例', s))
        res.append(getInfo(r'本土病例(\d+)例', s))
        res.append(getInfo(r'新增死亡病例(\d+)例', s))

        # Paragraph 1: newly cured and discharged cases.
        s = cnt[1].text
        res.append(getInfo(r'新增治愈出院病例(\d+)例', s))

        # Paragraph 4: new asymptomatic carriers (total and domestic).
        s = cnt[4].text
        res.append(getInfo(r'新增无症状感染者(\d+)例', s))
        res.append(getInfo(r'本土(\d+)例', s))

    return res


def saveFile(path, filename, content):
    # Write content to <path>/<filename>.txt, creating the directory first.
    # Not called from the main flow below.
    if not os.path.exists(path):
        os.makedirs(path)

    with open(os.path.join(path, filename + ".txt"), 'w', encoding='utf-8') as f:
        f.write(content)

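# Hypothetical usage, e.g. to archive raw bulletin HTML next to the CSV:
#   saveFile('./bulletins/', '2022-04-01', html)

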
if __name__ == "__main__":
    # Open the CSV with an explicit utf-8 encoding so the Chinese column
    # names survive on any platform.
    with open('/Users/machi/Desktop/covid.csv', 'w', encoding='utf-8') as f:
        header = ','.join(['日期', '新增确诊病例', '本土新增确诊病例', '新增死亡病例',
                           '新增治愈出院病例', '新增无症状感染者', '本土新增无症状感染者'])
        f.write(header + '\n')

        for url in getPageUrl():
            print(url)
            try:
                s = fetchUrl(url)
            except Exception:
                continue

            for title, link, date in getTitleUrl(s):
                print(title, link)
                try:
                    html = fetchUrl(link)
                    content = getContent(html)

                    # One CSV row: the date followed by the six counts.
                    row = ','.join([date] + content)
                    f.write(row + '\n')
                    print('%s write finish' % date)
                except Exception as e:
                    print('%s process failed' % date, e)
                    continue