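# Scrape the National Health Commission (NHC) daily COVID-19 bulletins under
# http://www.nhc.gov.cn/xcs/yqtb/, extract the headline case counts from each
# bulletin with regular expressions, and write one CSV row per day.
# pyppeteer drives a real Chromium instance so that the JavaScript redirect
# the site serves first can run; plain HTTP clients never reach the content.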
import sys
# Make user-installed packages importable (machine-specific; adjust as needed).
sys.path.append('/Users/machi/Library/Python/3.8/lib/python/site-packages')

import os
import re
import asyncio

from pyppeteer import launch
from bs4 import BeautifulSoup


async def pyppeteer_fetchUrl(url):
    # Launch a visible (non-headless) Chromium; dumpio=True echoes browser
    # console output, which helps when the site blocks a request.
    browser = await launch({'headless': False, 'dumpio': True, 'autoClose': True})
    page = await browser.newPage()
    await page.goto(url)
    # The site first serves a JavaScript redirect; wait for the second
    # navigation. asyncio.wait returns without raising if it never comes.
    await asyncio.wait([asyncio.ensure_future(page.waitForNavigation())])
    content = await page.content()
    await browser.close()
    return content


def fetchUrl(url):
    # Synchronous wrapper so the rest of the script can stay non-async.
    return asyncio.get_event_loop().run_until_complete(pyppeteer_fetchUrl(url))

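# Note: asyncio.get_event_loop() is deprecated outside a running loop on
# Python 3.10+; asyncio.run(pyppeteer_fetchUrl(url)) is the modern equivalent.

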
def getPageUrl():
    # Page 1 of the bulletin index has its own URL; pages 2+ append _N.
    for page in range(1, 5):
        if page == 1:
            yield 'http://www.nhc.gov.cn/xcs/yqtb/list_gzbd.shtml'
        else:
            yield 'http://www.nhc.gov.cn/xcs/yqtb/list_gzbd_' + str(page) + '.shtml'

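# Yields, in order:
#   http://www.nhc.gov.cn/xcs/yqtb/list_gzbd.shtml
#   http://www.nhc.gov.cn/xcs/yqtb/list_gzbd_2.shtml ... list_gzbd_4.shtml

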
def getTitleUrl(html):
    # Each index page lists bulletins as <li> items under <div class="list">.
    bsobj = BeautifulSoup(html, 'html.parser')
    titleList = bsobj.find('div', attrs={"class": "list"}).ul.find_all("li")
    for item in titleList:
        link = "http://www.nhc.gov.cn" + item.a["href"]
        title = item.a["title"]
        date = item.span.text
        yield title, link, date


def getInfo(pat, s):
    # Return the first captured group of pat in s, or '0' if there is no match.
    res = re.search(pat, s)
    if res:
        return res.group(1)
    return '0'

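# e.g. getInfo(r'新增确诊病例(\d+)例', '……新增确诊病例212例……') returns '212';
# the sample sentence here is illustrative, not taken from a real bulletin.

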
def getContent(html):
    # The bulletin body is <div id="xw_box">. The counts sit at fixed
    # paragraph positions, so the indices below assume the standard
    # bulletin layout; getInfo falls back to '0' when a pattern is absent.
    bsobj = BeautifulSoup(html, 'html.parser')
    cnt = bsobj.find('div', attrs={"id": "xw_box"}).find_all("p")
    res = []

    if cnt:
        # Paragraph 0: new confirmed cases (total and domestic) and deaths.
        s = cnt[0].text
        res.append(getInfo(r'新增确诊病例(\d+)例', s))
        res.append(getInfo(r'本土病例(\d+)例', s))
        res.append(getInfo(r'新增死亡病例(\d+)例', s))

        # Paragraph 1: newly cured and discharged cases.
        s = cnt[1].text
        res.append(getInfo(r'新增治愈出院病例(\d+)例', s))

        # Paragraph 4: new asymptomatic carriers (total and domestic).
        s = cnt[4].text
        res.append(getInfo(r'新增无症状感染者(\d+)例', s))
        res.append(getInfo(r'本土(\d+)例', s))

    return res


def saveFile(path, filename, content):
    # Write content to <path>/<filename>.txt, creating the directory first.
    # Not called from the main flow below.
    if not os.path.exists(path):
        os.makedirs(path)

    with open(os.path.join(path, filename + ".txt"), 'w', encoding='utf-8') as f:
        f.write(content)

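# Hypothetical usage, e.g. to archive raw bulletin HTML next to the CSV:
#   saveFile('./bulletins/', '2022-04-01', html)

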
if __name__ == "__main__":
    # Open the CSV with an explicit utf-8 encoding so the Chinese column
    # names survive on any platform.
    with open('/Users/machi/Desktop/covid.csv', 'w', encoding='utf-8') as f:
        header = ','.join(['日期', '新增确诊病例', '本土新增确诊病例', '新增死亡病例',
                           '新增治愈出院病例', '新增无症状感染者', '本土新增无症状感染者'])
        f.write(header + '\n')

        for url in getPageUrl():
            print(url)
            try:
                s = fetchUrl(url)
            except Exception:
                continue

            for title, link, date in getTitleUrl(s):
                print(title, link)
                try:
                    html = fetchUrl(link)
                    content = getContent(html)

                    # One CSV row: the date followed by the six counts.
                    row = ','.join([date] + content)
                    f.write(row + '\n')
                    print('%s write finish' % date)
                except Exception as e:
                    print('%s process failed' % date, e)
                    continue