import gradio as gr
import requests
import pandas as pd
from bs4 import BeautifulSoup
from urllib.parse import urlparse

def scrape_conference_papers(url):
    """
    Scrapes paper data from a conference URL and returns structured data.

    Args:
        url (str): The conference track URL to scrape

    Returns:
        dict: Contains 'papers' (list of dicts), 'track_name', and 'display_name'

    Raises:
        ValueError: If the URL is invalid or scraping fails
    """
    if not url:
        raise ValueError("URL cannot be empty.")

    print(f"Fetching: {url} at {pd.Timestamp.now()}")
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }
    try:
        # A timeout prevents the request from hanging indefinitely on an
        # unresponsive server.
        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
    except requests.exceptions.RequestException as e:
        raise ValueError(f"Failed to fetch URL: {e}")

    papers_table = soup.select_one("div#event-overview table")
    if not papers_table:
        raise ValueError(
            "Could not find the 'Accepted Papers' table. Check that the URL is correct and points to a supported conference page."
        )

    papers_list = []
    for row in papers_table.find_all("tr"):
        title_element = row.find("a", attrs={"data-event-modal": True})
        if not title_element:
            continue
        paper_title = title_element.get_text(strip=True)
        performers_div = row.find("div", class_="performers")
        authors_str = (
            ", ".join([a.get_text(strip=True) for a in performers_div.find_all("a")])
            if performers_div
            else "N/A"
        )
        papers_list.append({"Title": paper_title, "Authors": authors_str})

    if not papers_list:
        raise ValueError("Found the table, but it contains no paper data.")

    # Generate track information from the URL, e.g. ".../pldi-2025-papers"
    # yields track_name "pldi-2025-papers" and display_name "PLDI 2025 Papers".
    try:
        path = urlparse(url).path
        track_name = path.strip("/").split("/")[-1]
        words = track_name.replace("-", " ").split()
        display_name = (
            " ".join([words[0].upper()] + [word.title() for word in words[1:]])
            if words
            else track_name
        )
    except Exception:
        track_name = "accepted_papers"
        display_name = "Accepted Papers"

    return {
        "papers": papers_list,
        "track_name": track_name,
        "display_name": display_name,
    }
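

# A minimal usage sketch for calling the scraper outside the Gradio UI
# (a hypothetical snippet: it assumes network access and that the track page
# still uses the "div#event-overview table" layout targeted above; the URL is
# one of the examples wired into the interface below):
#
#   result = scrape_conference_papers(
#       "https://pldi25.sigplan.org/track/pldi-2025-papers"
#   )
#   print(result["display_name"], "-", len(result["papers"]), "papers")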


def scrape_and_save_papers(url):
    """
    Gradio wrapper function that scrapes paper data and prepares UI components.
    """
    if not url:
        raise gr.Error("Please enter a URL.")

    try:
        result = scrape_conference_papers(url)
        papers_list = result["papers"]
        track_name = result["track_name"]
        display_name = result["display_name"]
    except ValueError as e:
        raise gr.Error(str(e))

    df = pd.DataFrame(papers_list)
    output_filename = f"{track_name}.csv"
    dynamic_label = f"Accepted Papers - {display_name}"

    # Save the DataFrame to a CSV file to make it downloadable
    df.to_csv(output_filename, index=True, encoding="utf-8-sig")

    # Create a new DataFrame component with the dynamic label and make it visible
    updated_table = gr.DataFrame(label=dynamic_label, value=df, wrap=True, visible=True)

    # Create an updated download button that is visible when there's data
    updated_download_button = gr.DownloadButton(
        label="Download CSV", visible=True, value=output_filename
    )

    # Return the updated DataFrame component and the updated download button
    return updated_table, updated_download_button


def create_gradio_app():
    """
    Defines and returns the Gradio web application interface.
    """
    with gr.Blocks(theme=gr.themes.Default()) as iface:
        gr.Markdown(
            """
            # Researchr Conference Paper Scraper (PL/SE)

            Scrape lists of accepted papers from conferences on [`conf.researchr.org`](https://conf.researchr.org/) and download them as a CSV file. This tool primarily targets conferences in the domains of Programming Languages and Software Engineering.
            """
        )
        url_input = gr.Textbox(
            placeholder="Enter the URL of a specific track (e.g., https://pldi25.sigplan.org/track/pldi-2025-papers)",
            container=False,
        )
        gr.Examples(
            examples=[
                "https://pldi25.sigplan.org/track/pldi-2025-papers",
                "https://conf.researchr.org/track/ase-2025/ase-2025-nier-track",
                "https://conf.researchr.org/track/vlhcc-2025/vlhcc-2025-research-papers",
                "https://2024.splashcon.org/track/splash-2024-oopsla",
                "https://conf.researchr.org/track/icse-2024/icse-2024-demonstrations?",
            ],
            inputs=url_input,
            example_labels=[
                "PLDI 2025",
                "ASE 2025 NIER",
                "VL/HCC 2025",
                "OOPSLA 2024",
                "ICSE 2024 Demo",
            ],
        )
        submit_button = gr.Button("Scrape Papers", variant="primary")

        empty_df_with_headers = pd.DataFrame({"Title": [], "Authors": []})
        output_table = gr.DataFrame(
            label="Accepted Papers",
            value=empty_df_with_headers,
            wrap=True,
            visible=False,
        )
        download_button = gr.DownloadButton(label="Download CSV", visible=False)

        # Connect the button click to the scraping function
        submit_button.click(
            fn=scrape_and_save_papers,
            inputs=url_input,
            outputs=[output_table, download_button],
        )

        gr.Markdown(
            """
            *Developed by [Ningzhi Tang](https://www.nztang.com/).*
            """
        )
    return iface


if __name__ == "__main__":
    app = create_gradio_app()
    app.launch()
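
# Note: app.launch() serves the interface locally by default. Gradio's
# standard share=True option, e.g. app.launch(share=True), would additionally
# create a temporary public link; the plain launch() used here is the usual
# choice when hosting on Hugging Face Spaces.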