import re
import os
import os.path as op
from glob import glob
import urllib.request
import subprocess

import requests
from bs4 import BeautifulSoup, NavigableString

# or https://cloud.science-miner.com/grobid/ for cloud service
GROBID_URL = "http://localhost:8070"
DIR_PATH = op.dirname(op.abspath(__file__))
PDF_FIGURES_JAR_PATH = op.join(
    DIR_PATH, "pdffigures2", "pdffigures2-assembly-0.0.12-SNAPSHOT.jar"
)
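
# Optional sanity check before parsing: GROBID exposes an ``isalive`` endpoint.
# A minimal sketch, assuming a GROBID server at ``GROBID_URL``; this helper is
# an addition for illustration, not part of the original module.
def _grobid_is_alive(grobid_url: str = GROBID_URL) -> bool:
    """Return True if the GROBID server at ``grobid_url`` responds."""
    try:
        return requests.get("%s/api/isalive" % grobid_url, timeout=5).ok
    except requests.RequestException:
        return False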

def list_pdf_paths(pdf_folder: str):
    """
    List PDF paths found in a given ``pdf_folder``
    """
    return glob(op.join(pdf_folder, "*", "*", "*.pdf"))
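
# Note that the glob pattern above assumes PDFs nested exactly two directories
# deep, e.g. ``pdf_folder/<journal>/<article>/<file>.pdf`` (a hedged reading of
# the pattern; adjust it if your folder layout is flatter):
# >> list_pdf_paths("papers")  # -> ['papers/a/b/x.pdf', ...]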

def validate_url(path: str):
    """
    Validate whether a given ``path`` is a URL or not
    """
    regex = re.compile(
        r"^(?:http|ftp)s?://"  # http:// or https://
        # domain...
        r"(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|"
        r"localhost|"  # localhost...
        r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})"  # ...or ip
        r"(?::\d+)?"  # optional port
        r"(?:/?|[/?]\S+)$",
        re.IGNORECASE,
    )
    return re.match(regex, path) is not None
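
# Quick behavior check (inputs are illustrative):
# >> validate_url("https://example.com/paper.pdf")  # True
# >> validate_url("local/path/to/file.pdf")  # False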

def parse_pdf(
    pdf_path: str,
    fulltext: bool = True,
    soup: bool = False,
    grobid_url: str = GROBID_URL,
):
    """
    Parse a PDF to XML or BeautifulSoup using the GROBID tool

    See http://grobid.readthedocs.io/en/latest/Install-Grobid/ on how to run GROBID locally.
    After unpacking the GROBID zip file, you can run GROBID with the following
    >> ./gradlew run

    Parameters
    ==========
    pdf_path: str or bytes, path or URL to a publication or article, or bytes string of a PDF
    fulltext: bool, parsing option; if True, parse the full text of the article,
        if False, parse only the header
    soup: bool, if True, return a BeautifulSoup of the article
    grobid_url: str, URL to the GROBID parser, default is 'http://localhost:8070'
        This could be changed to "https://cloud.science-miner.com/grobid/" for the cloud service

    Output
    ======
    parsed_article: if soup is False, return the parsed XML in text format,
        else return a BeautifulSoup of the XML

    Example
    =======
    >> parsed_article = parse_pdf(pdf_path, fulltext=True, soup=True)
    """
    # GROBID URL
    if fulltext:
        url = "%s/api/processFulltextDocument" % grobid_url
    else:
        url = "%s/api/processHeaderDocument" % grobid_url

    if isinstance(pdf_path, str):
        if validate_url(pdf_path) and op.splitext(pdf_path)[-1].lower() != ".pdf":
            print("The input URL has to end with ``.pdf``")
            parsed_article = None
        elif validate_url(pdf_path) and op.splitext(pdf_path)[-1].lower() == ".pdf":
            page = urllib.request.urlopen(pdf_path).read()
            parsed_article = requests.post(url, files={"input": page}).text
        elif op.exists(pdf_path):
            with open(pdf_path, "rb") as f:
                parsed_article = requests.post(url, files={"input": f}).text
        else:
            parsed_article = None
    elif isinstance(pdf_path, bytes):
        # assume the incoming input is a bytes string of a PDF
        parsed_article = requests.post(url, files={"input": pdf_path}).text
    else:
        parsed_article = None

    if soup and parsed_article is not None:
        parsed_article = BeautifulSoup(parsed_article, "lxml")
    return parsed_article
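
# A minimal usage sketch (assumes a running local GROBID server and a
# hypothetical ``example.pdf``; this helper is illustrative only):
def _example_parse_pdf():
    # raw TEI XML as text
    xml_text = parse_pdf("example.pdf", fulltext=True, soup=False)
    print(xml_text is not None)
    # parsed BeautifulSoup for the downstream helpers (parse_authors, etc.)
    article = parse_pdf("example.pdf", fulltext=True, soup=True)
    if article is not None:
        print(article.find("title").text)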

def parse_authors(article):
    """
    Parse authors from a given BeautifulSoup of an article
    """
    author_names = article.find("sourcedesc").findAll("persname")
    authors = []
    for author in author_names:
        firstname = author.find("forename", {"type": "first"})
        firstname = firstname.text.strip() if firstname is not None else ""
        middlename = author.find("forename", {"type": "middle"})
        middlename = middlename.text.strip() if middlename is not None else ""
        lastname = author.find("surname")
        lastname = lastname.text.strip() if lastname is not None else ""
        if middlename != "":
            authors.append(firstname + " " + middlename + " " + lastname)
        else:
            authors.append(firstname + " " + lastname)
    authors = "; ".join(authors)
    return authors
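
# Authors come back as a single "; "-joined string rather than a list,
# e.g. "Jane A Doe; John Smith" (illustrative names):
# >> parse_authors(article)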

def parse_date(article):
    """
    Parse the publication date from a given BeautifulSoup of an article
    """
    pub_date = article.find("publicationstmt")
    year = pub_date.find("date")
    year = year.attrs.get("when") if year is not None else ""
    return year

def parse_abstract(article):
    """
    Parse the abstract from a given BeautifulSoup of an article
    """
    div = article.find("abstract")
    abstract = ""
    if div is None:
        return abstract
    for p in list(div.children):
        if not isinstance(p, NavigableString) and len(list(p)) > 0:
            abstract += " ".join(
                [elem.text for elem in p if not isinstance(elem, NavigableString)]
            )
    return abstract

def calculate_number_of_references(div):
    """
    For a given section, calculate the number of references made in the section
    """
    n_publication_ref = len(
        [ref for ref in div.find_all("ref") if ref.attrs.get("type") == "bibr"]
    )
    n_figure_ref = len(
        [ref for ref in div.find_all("ref") if ref.attrs.get("type") == "figure"]
    )
    return {"n_publication_ref": n_publication_ref, "n_figure_ref": n_figure_ref}

def parse_sections(article, as_list: bool = False):
    """
    Parse a list of sections from a given BeautifulSoup of an article

    Parameters
    ==========
    as_list: bool, if True, output text as a list of paragraphs instead
        of joining it together as one single text
    """
    article_text = article.find("text")
    divs = article_text.find_all(
        "div", attrs={"xmlns": "http://www.tei-c.org/ns/1.0"}
    )
    sections = []
    for div in divs:
        div_list = list(div.children)
        if len(div_list) == 0:
            heading = ""
            text = ""
            all_paragraphs = []
        elif len(div_list) == 1:
            if isinstance(div_list[0], NavigableString):
                heading = str(div_list[0])
                text = ""
                all_paragraphs = []
            else:
                heading = ""
                text = div_list[0].text
                all_paragraphs = [text]
        else:
            text = []
            heading = div_list[0]
            all_paragraphs = []
            if isinstance(heading, NavigableString):
                heading = str(heading)
                p_all = list(div.children)[1:]
            else:
                heading = ""
                p_all = list(div.children)
            for p in p_all:
                if p is not None:
                    try:
                        text.append(p.text)
                        all_paragraphs.append(p.text)
                    except AttributeError:
                        # skip nodes that do not expose a ``text`` attribute
                        pass
            if not as_list:
                text = "\n".join(text)
        if heading != "" or text != "":
            ref_dict = calculate_number_of_references(div)
            sections.append(
                {
                    "heading": heading,
                    "text": text,
                    "all_paragraphs": all_paragraphs,
                    "n_publication_ref": ref_dict["n_publication_ref"],
                    "n_figure_ref": ref_dict["n_figure_ref"],
                }
            )
    return sections
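
# Illustrative shape of the parsed sections (a hedged sketch; ``article`` is a
# soup returned by parse_pdf with soup=True):
def _example_parse_sections(article):
    for section in parse_sections(article, as_list=False):
        print(section["heading"], "|", section["n_publication_ref"], "bibr refs")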

def parse_references(article):
    """
    Parse a list of references from a given BeautifulSoup of an article
    """
    references = article.find("text").find("div", attrs={"type": "references"})
    references = references.find_all("biblstruct") if references is not None else []
    reference_list = []
    for reference in references:
        title = reference.find("title", attrs={"level": "a"})
        if title is None:
            title = reference.find("title", attrs={"level": "m"})
        title = title.text if title is not None else ""
        journal = reference.find("title", attrs={"level": "j"})
        journal = journal.text if journal is not None else ""
        if journal == "":
            journal = reference.find("publisher")
            journal = journal.text if journal is not None else ""
        year = reference.find("date")
        year = year.attrs.get("when") if year is not None else ""
        authors = []
        for author in reference.find_all("author"):
            firstname = author.find("forename", {"type": "first"})
            firstname = firstname.text.strip() if firstname is not None else ""
            middlename = author.find("forename", {"type": "middle"})
            middlename = middlename.text.strip() if middlename is not None else ""
            lastname = author.find("surname")
            lastname = lastname.text.strip() if lastname is not None else ""
            if middlename != "":
                authors.append(firstname + " " + middlename + " " + lastname)
            else:
                authors.append(firstname + " " + lastname)
        authors = "; ".join(authors)
        reference_list.append(
            {"title": title, "journal": journal, "year": year, "authors": authors}
        )
    return reference_list
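
# In TEI (which GROBID emits), title level "a" is an analytic (article) title,
# "m" a monograph title, and "j" a journal title, hence the fallback order
# above. Each returned entry looks like (illustrative values):
# {"title": "...", "journal": "...", "year": "2020", "authors": "Jane Doe; John Smith"}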

def parse_figure_caption(article):
    """
    Parse a list of figures/tables from a given BeautifulSoup of an article
    """
    figures_list = []
    figures = article.find_all("figure")
    for figure in figures:
        figure_type = figure.attrs.get("type") or ""
        figure_id = figure.attrs.get("xml:id") or ""
        label = figure.find("label")
        label = label.text if label is not None else ""
        if figure_type == "table":
            caption = figure.find("figdesc").text
            data = figure.table.text
        else:
            caption = figure.text
            data = ""
        figures_list.append(
            {
                "figure_label": label,
                "figure_type": figure_type,
                "figure_id": figure_id,
                "figure_caption": caption,
                "figure_data": data,
            }
        )
    return figures_list
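
# Note: GROBID represents tables as <figure type="table"> nodes, so tables
# appear in this list with their raw cell text in "figure_data" (a hedged
# reading of the TEI output). Figures themselves carry only captions here;
# use parse_figures() below to extract the actual images.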

def convert_article_soup_to_dict(article, as_list: bool = False):
    """
    Convert a BeautifulSoup article to JSON format
    similar to the output from https://github.com/allenai/science-parse/

    Parameters
    ==========
    article: BeautifulSoup

    Output
    ======
    article_json: dict, parsed dictionary of a given article in the following format
        {
            'title': ...,
            'authors': ...,
            'pub_date': ...,
            'abstract': ...,
            'sections': [
                {'heading': ..., 'text': ...},
                ...
            ],
            'references': [
                {'title': ..., 'journal': ..., 'year': ..., 'authors': ...},
                ...
            ],
            'figures': [
                {'figure_label': ..., 'figure_type': ..., 'figure_id': ..., 'figure_caption': ..., 'figure_data': ...},
                ...
            ],
            'doi': ...
        }
    """
    article_dict = {}
    if article is not None:
        title = article.find("title", attrs={"type": "main"})
        title = title.text.strip() if title is not None else ""
        article_dict["authors"] = parse_authors(article)
        article_dict["pub_date"] = parse_date(article)
        article_dict["title"] = title
        article_dict["abstract"] = parse_abstract(article)
        article_dict["sections"] = parse_sections(article, as_list=as_list)
        article_dict["references"] = parse_references(article)
        article_dict["figures"] = parse_figure_caption(article)

        doi = article.find("idno", attrs={"type": "DOI"})
        doi = doi.text if doi is not None else ""
        article_dict["doi"] = doi

        return article_dict
    else:
        return None
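
# Hedged sketch of the two-step flow this function supports: a soup from
# parse_pdf() converted to a dict ("example.pdf" is hypothetical):
def _example_convert():
    article = parse_pdf("example.pdf", fulltext=True, soup=True)
    article_dict = convert_article_soup_to_dict(article)
    if article_dict is not None:
        print(article_dict["title"], article_dict["doi"])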

def parse_pdf_to_dict(
    pdf_path: str,
    fulltext: bool = True,
    soup: bool = True,
    as_list: bool = False,
    grobid_url: str = GROBID_URL,
):
    """
    Parse the given PDF and return a dictionary of the parsed article

    Parameters
    ==========
    pdf_path: str, path to a publication or article
    fulltext: bool, whether to extract the full text or not
    soup: bool, whether to parse via BeautifulSoup; should stay True,
        since the conversion step expects a soup
    as_list: bool, whether to return sections' text as a list of paragraphs or not
    grobid_url: str, URL to the GROBID server, default is `GROBID_URL`
        This could be changed to "https://cloud.science-miner.com/grobid/" for the cloud service

    Output
    ======
    article_dict: dict, dictionary of an article
    """
    parsed_article = parse_pdf(
        pdf_path, fulltext=fulltext, soup=soup, grobid_url=grobid_url
    )
    article_dict = convert_article_soup_to_dict(parsed_article, as_list=as_list)
    return article_dict
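
# This is the main entry point in practice. A minimal, hedged usage sketch
# (assumes a running GROBID server and a hypothetical local file):
# >> article_dict = parse_pdf_to_dict("example.pdf")
# >> article_dict.keys()
# dict_keys(['authors', 'pub_date', 'title', 'abstract', 'sections', 'references', 'figures', 'doi'])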

def parse_figures(
    pdf_folder: str,
    jar_path: str = PDF_FIGURES_JAR_PATH,
    resolution: int = 300,
    output_folder: str = "figures",
):
    """
    Parse figures from the given scientific PDFs using pdffigures2

    Parameters
    ==========
    pdf_folder: str, path to a folder that contains PDF files.
        The folder must contain only PDF files
    jar_path: str, default path to the pdffigures2-assembly-0.0.12-SNAPSHOT.jar file
    resolution: int, resolution of the output figures
    output_folder: str, path to the folder where we want to save the parsed data
        (related to figures) and the figures themselves

    Output
    ======
    folder: creates ``output_folder/data`` and ``output_folder/figures`` subfolders
        containing the parsed data and figures respectively
    """
    if not op.isdir(output_folder):
        os.makedirs(output_folder)

    # create ``data`` and ``figures`` subfolders within ``output_folder``
    data_path = op.join(output_folder, "data")
    figure_path = op.join(output_folder, "figures")
    if not op.exists(data_path):
        os.makedirs(data_path)
    if not op.exists(figure_path):
        os.makedirs(figure_path)

    if op.isdir(data_path) and op.isdir(figure_path):
        args = [
            "java",
            "-jar",
            jar_path,
            pdf_folder,
            "-i",
            str(resolution),
            "-d",
            op.join(op.abspath(data_path), ""),
            "-m",
            op.join(op.abspath(figure_path), ""),  # end path with "/"
        ]
        _ = subprocess.run(
            args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, timeout=20
        )
        print("Done parsing figures from PDFs!")
    else:
        print("You may have to check the ``data`` and ``figures`` subfolders in the output folder path.")