import re
import os
import os.path as op
from glob import glob
import urllib.request
import subprocess

import requests
from bs4 import BeautifulSoup, NavigableString

# or https://cloud.science-miner.com/grobid/ for cloud service
GROBID_URL = "http://localhost:8070"
DIR_PATH = op.dirname(op.abspath(__file__))
PDF_FIGURES_JAR_PATH = op.join(
    DIR_PATH, "pdffigures2", "pdffigures2-assembly-0.0.12-SNAPSHOT.jar"
)
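
# Optional sanity check before parsing: GROBID exposes an ``isalive`` endpoint.
# A minimal sketch, assuming a GROBID server at ``GROBID_URL``; this helper is
# an addition for illustration, not part of the original module.
def _grobid_is_alive(grobid_url: str = GROBID_URL) -> bool:
    """Return True if the GROBID server at ``grobid_url`` responds."""
    try:
        return requests.get("%s/api/isalive" % grobid_url, timeout=5).ok
    except requests.RequestException:
        return False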

def list_pdf_paths(pdf_folder: str):
    """
    List PDF paths found in a given ``pdf_folder``
    """
    return glob(op.join(pdf_folder, "*", "*", "*.pdf"))
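
# Note that the glob pattern above assumes PDFs nested exactly two directories
# deep, e.g. ``pdf_folder/<journal>/<article>/<file>.pdf`` (a hedged reading of
# the pattern; adjust it if your folder layout is flatter):
# >> list_pdf_paths("papers")  # -> ['papers/a/b/x.pdf', ...]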

def validate_url(path: str):
    """
    Validate whether a given ``path`` is a URL or not
    """
    regex = re.compile(
        r"^(?:http|ftp)s?://"  # http:// or https://
        # domain...
        r"(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|"
        r"localhost|"  # localhost...
        r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})"  # ...or ip
        r"(?::\d+)?"  # optional port
        r"(?:/?|[/?]\S+)$",
        re.IGNORECASE,
    )
    return re.match(regex, path) is not None
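
# Quick behavior check (inputs are illustrative):
# >> validate_url("https://example.com/paper.pdf")  # True
# >> validate_url("local/path/to/file.pdf")  # False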

def parse_pdf(
    pdf_path: str,
    fulltext: bool = True,
    soup: bool = False,
    grobid_url: str = GROBID_URL,
):
    """
    Parse a PDF to XML or BeautifulSoup using the GROBID tool

    See http://grobid.readthedocs.io/en/latest/Install-Grobid/ on how to run GROBID locally.
    After unpacking the GROBID zip file, you can run GROBID with the following
    >> ./gradlew run

    Parameters
    ==========
    pdf_path: str or bytes, path or URL to a publication or article, or bytes string of a PDF
    fulltext: bool, parsing option; if True, parse the full text of the article,
        if False, parse only the header
    soup: bool, if True, return a BeautifulSoup of the article
    grobid_url: str, URL to the GROBID parser, default is 'http://localhost:8070'
        This could be changed to "https://cloud.science-miner.com/grobid/" for the cloud service

    Output
    ======
    parsed_article: if soup is False, return the parsed XML in text format,
        else return a BeautifulSoup of the XML

    Example
    =======
    >> parsed_article = parse_pdf(pdf_path, fulltext=True, soup=True)
    """
    # GROBID URL
    if fulltext:
        url = "%s/api/processFulltextDocument" % grobid_url
    else:
        url = "%s/api/processHeaderDocument" % grobid_url

    if isinstance(pdf_path, str):
        if validate_url(pdf_path) and op.splitext(pdf_path)[-1].lower() != ".pdf":
            print("The input URL has to end with ``.pdf``")
            parsed_article = None
        elif validate_url(pdf_path) and op.splitext(pdf_path)[-1].lower() == ".pdf":
            page = urllib.request.urlopen(pdf_path).read()
            parsed_article = requests.post(url, files={"input": page}).text
        elif op.exists(pdf_path):
            with open(pdf_path, "rb") as f:
                parsed_article = requests.post(url, files={"input": f}).text
        else:
            parsed_article = None
    elif isinstance(pdf_path, bytes):
        # assume the incoming input is a bytes string of a PDF
        parsed_article = requests.post(url, files={"input": pdf_path}).text
    else:
        parsed_article = None

    if soup and parsed_article is not None:
        parsed_article = BeautifulSoup(parsed_article, "lxml")
    return parsed_article
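
# A minimal usage sketch (assumes a running local GROBID server and a
# hypothetical ``example.pdf``; this helper is illustrative only):
def _example_parse_pdf():
    # raw TEI XML as text
    xml_text = parse_pdf("example.pdf", fulltext=True, soup=False)
    print(xml_text is not None)
    # parsed BeautifulSoup for the downstream helpers (parse_authors, etc.)
    article = parse_pdf("example.pdf", fulltext=True, soup=True)
    if article is not None:
        print(article.find("title").text)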

def parse_authors(article):
    """
    Parse authors from a given BeautifulSoup of an article
    """
    author_names = article.find("sourcedesc").findAll("persname")
    authors = []
    for author in author_names:
        firstname = author.find("forename", {"type": "first"})
        firstname = firstname.text.strip() if firstname is not None else ""
        middlename = author.find("forename", {"type": "middle"})
        middlename = middlename.text.strip() if middlename is not None else ""
        lastname = author.find("surname")
        lastname = lastname.text.strip() if lastname is not None else ""
        if middlename != "":
            authors.append(firstname + " " + middlename + " " + lastname)
        else:
            authors.append(firstname + " " + lastname)
    authors = "; ".join(authors)
    return authors
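
# Authors come back as a single "; "-joined string rather than a list,
# e.g. "Jane A Doe; John Smith" (illustrative names):
# >> parse_authors(article)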

def parse_date(article):
    """
    Parse the publication date from a given BeautifulSoup of an article
    """
    pub_date = article.find("publicationstmt")
    year = pub_date.find("date")
    year = year.attrs.get("when") if year is not None else ""
    return year

def parse_abstract(article):
    """
    Parse the abstract from a given BeautifulSoup of an article
    """
    div = article.find("abstract")
    abstract = ""
    if div is None:
        return abstract
    for p in list(div.children):
        if not isinstance(p, NavigableString) and len(list(p)) > 0:
            abstract += " ".join(
                [elem.text for elem in p if not isinstance(elem, NavigableString)]
            )
    return abstract

def calculate_number_of_references(div):
    """
    For a given section, calculate the number of references made in the section
    """
    n_publication_ref = len(
        [ref for ref in div.find_all("ref") if ref.attrs.get("type") == "bibr"]
    )
    n_figure_ref = len(
        [ref for ref in div.find_all("ref") if ref.attrs.get("type") == "figure"]
    )
    return {"n_publication_ref": n_publication_ref, "n_figure_ref": n_figure_ref}

def parse_sections(article, as_list: bool = False):
    """
    Parse a list of sections from a given BeautifulSoup of an article

    Parameters
    ==========
    as_list: bool, if True, output text as a list of paragraphs instead
        of joining it together as one single text
    """
    article_text = article.find("text")
    divs = article_text.find_all(
        "div", attrs={"xmlns": "http://www.tei-c.org/ns/1.0"}
    )
    sections = []
    for div in divs:
        div_list = list(div.children)
        if len(div_list) == 0:
            heading = ""
            text = ""
            all_paragraphs = []
        elif len(div_list) == 1:
            if isinstance(div_list[0], NavigableString):
                heading = str(div_list[0])
                text = ""
                all_paragraphs = []
            else:
                heading = ""
                text = div_list[0].text
                all_paragraphs = [text]
        else:
            text = []
            heading = div_list[0]
            all_paragraphs = []
            if isinstance(heading, NavigableString):
                heading = str(heading)
                p_all = list(div.children)[1:]
            else:
                heading = ""
                p_all = list(div.children)
            for p in p_all:
                if p is not None:
                    try:
                        text.append(p.text)
                        all_paragraphs.append(p.text)
                    except AttributeError:
                        # skip nodes that do not expose a ``text`` attribute
                        pass
            if not as_list:
                text = "\n".join(text)
        if heading != "" or text != "":
            ref_dict = calculate_number_of_references(div)
            sections.append(
                {
                    "heading": heading,
                    "text": text,
                    "all_paragraphs": all_paragraphs,
                    "n_publication_ref": ref_dict["n_publication_ref"],
                    "n_figure_ref": ref_dict["n_figure_ref"],
                }
            )
    return sections
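
# Illustrative shape of the parsed sections (a hedged sketch; ``article`` is a
# soup returned by parse_pdf with soup=True):
def _example_parse_sections(article):
    for section in parse_sections(article, as_list=False):
        print(section["heading"], "|", section["n_publication_ref"], "bibr refs")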

def parse_references(article):
    """
    Parse a list of references from a given BeautifulSoup of an article
    """
    references = article.find("text").find("div", attrs={"type": "references"})
    references = references.find_all("biblstruct") if references is not None else []
    reference_list = []
    for reference in references:
        title = reference.find("title", attrs={"level": "a"})
        if title is None:
            title = reference.find("title", attrs={"level": "m"})
        title = title.text if title is not None else ""
        journal = reference.find("title", attrs={"level": "j"})
        journal = journal.text if journal is not None else ""
        if journal == "":
            journal = reference.find("publisher")
            journal = journal.text if journal is not None else ""
        year = reference.find("date")
        year = year.attrs.get("when") if year is not None else ""
        authors = []
        for author in reference.find_all("author"):
            firstname = author.find("forename", {"type": "first"})
            firstname = firstname.text.strip() if firstname is not None else ""
            middlename = author.find("forename", {"type": "middle"})
            middlename = middlename.text.strip() if middlename is not None else ""
            lastname = author.find("surname")
            lastname = lastname.text.strip() if lastname is not None else ""
            if middlename != "":
                authors.append(firstname + " " + middlename + " " + lastname)
            else:
                authors.append(firstname + " " + lastname)
        authors = "; ".join(authors)
        reference_list.append(
            {"title": title, "journal": journal, "year": year, "authors": authors}
        )
    return reference_list
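
# In TEI (which GROBID emits), title level "a" is an analytic (article) title,
# "m" a monograph title, and "j" a journal title, hence the fallback order
# above. Each returned entry looks like (illustrative values):
# {"title": "...", "journal": "...", "year": "2020", "authors": "Jane Doe; John Smith"}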

def parse_figure_caption(article):
    """
    Parse a list of figures/tables from a given BeautifulSoup of an article
    """
    figures_list = []
    figures = article.find_all("figure")
    for figure in figures:
        figure_type = figure.attrs.get("type") or ""
        figure_id = figure.attrs.get("xml:id") or ""
        label = figure.find("label")
        label = label.text if label is not None else ""
        if figure_type == "table":
            caption = figure.find("figdesc").text
            data = figure.table.text
        else:
            caption = figure.text
            data = ""
        figures_list.append(
            {
                "figure_label": label,
                "figure_type": figure_type,
                "figure_id": figure_id,
                "figure_caption": caption,
                "figure_data": data,
            }
        )
    return figures_list
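
# Note: GROBID represents tables as <figure type="table"> nodes, so tables
# appear in this list with their raw cell text in "figure_data" (a hedged
# reading of the TEI output). Figures themselves carry only captions here;
# use parse_figures() below to extract the actual images.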

def convert_article_soup_to_dict(article, as_list: bool = False):
    """
    Convert a BeautifulSoup article to JSON format
    similar to the output from https://github.com/allenai/science-parse/

    Parameters
    ==========
    article: BeautifulSoup

    Output
    ======
    article_json: dict, parsed dictionary of a given article in the following format
        {
            'title': ...,
            'authors': ...,
            'pub_date': ...,
            'abstract': ...,
            'sections': [
                {'heading': ..., 'text': ...},
                ...
            ],
            'references': [
                {'title': ..., 'journal': ..., 'year': ..., 'authors': ...},
                ...
            ],
            'figures': [
                {'figure_label': ..., 'figure_type': ..., 'figure_id': ..., 'figure_caption': ..., 'figure_data': ...},
                ...
            ],
            'doi': ...
        }
    """
    article_dict = {}
    if article is not None:
        title = article.find("title", attrs={"type": "main"})
        title = title.text.strip() if title is not None else ""
        article_dict["authors"] = parse_authors(article)
        article_dict["pub_date"] = parse_date(article)
        article_dict["title"] = title
        article_dict["abstract"] = parse_abstract(article)
        article_dict["sections"] = parse_sections(article, as_list=as_list)
        article_dict["references"] = parse_references(article)
        article_dict["figures"] = parse_figure_caption(article)

        doi = article.find("idno", attrs={"type": "DOI"})
        doi = doi.text if doi is not None else ""
        article_dict["doi"] = doi

        return article_dict
    else:
        return None
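
# Hedged sketch of the two-step flow this function supports: a soup from
# parse_pdf() converted to a dict ("example.pdf" is hypothetical):
def _example_convert():
    article = parse_pdf("example.pdf", fulltext=True, soup=True)
    article_dict = convert_article_soup_to_dict(article)
    if article_dict is not None:
        print(article_dict["title"], article_dict["doi"])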

def parse_pdf_to_dict(
    pdf_path: str,
    fulltext: bool = True,
    soup: bool = True,
    as_list: bool = False,
    grobid_url: str = GROBID_URL,
):
    """
    Parse the given PDF and return a dictionary of the parsed article

    Parameters
    ==========
    pdf_path: str, path to a publication or article
    fulltext: bool, whether to extract the full text or not
    soup: bool, whether to parse via BeautifulSoup; should stay True,
        since the conversion step expects a soup
    as_list: bool, whether to return sections' text as a list of paragraphs or not
    grobid_url: str, URL to the GROBID server, default is `GROBID_URL`
        This could be changed to "https://cloud.science-miner.com/grobid/" for the cloud service

    Output
    ======
    article_dict: dict, dictionary of an article
    """
    parsed_article = parse_pdf(
        pdf_path, fulltext=fulltext, soup=soup, grobid_url=grobid_url
    )
    article_dict = convert_article_soup_to_dict(parsed_article, as_list=as_list)
    return article_dict
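
# This is the main entry point in practice. A minimal, hedged usage sketch
# (assumes a running GROBID server and a hypothetical local file):
# >> article_dict = parse_pdf_to_dict("example.pdf")
# >> article_dict.keys()
# dict_keys(['authors', 'pub_date', 'title', 'abstract', 'sections', 'references', 'figures', 'doi'])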

def parse_figures(
    pdf_folder: str,
    jar_path: str = PDF_FIGURES_JAR_PATH,
    resolution: int = 300,
    output_folder: str = "figures",
):
    """
    Parse figures from the given scientific PDFs using pdffigures2

    Parameters
    ==========
    pdf_folder: str, path to a folder that contains PDF files.
        The folder must contain only PDF files
    jar_path: str, default path to the pdffigures2-assembly-0.0.12-SNAPSHOT.jar file
    resolution: int, resolution of the output figures
    output_folder: str, path to the folder where we want to save the parsed data
        (related to figures) and the figures themselves

    Output
    ======
    folder: creates ``output_folder/data`` and ``output_folder/figures`` subfolders
        containing the parsed data and figures respectively
    """
    if not op.isdir(output_folder):
        os.makedirs(output_folder)

    # create ``data`` and ``figures`` subfolders within ``output_folder``
    data_path = op.join(output_folder, "data")
    figure_path = op.join(output_folder, "figures")
    if not op.exists(data_path):
        os.makedirs(data_path)
    if not op.exists(figure_path):
        os.makedirs(figure_path)

    if op.isdir(data_path) and op.isdir(figure_path):
        args = [
            "java",
            "-jar",
            jar_path,
            pdf_folder,
            "-i",
            str(resolution),
            "-d",
            op.join(op.abspath(data_path), ""),
            "-m",
            op.join(op.abspath(figure_path), ""),  # end path with "/"
        ]
        _ = subprocess.run(
            args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, timeout=20
        )
        print("Done parsing figures from PDFs!")
    else:
        print("You may have to check the ``data`` and ``figures`` subfolders in the output folder path.")