# Source: ningzhitang2001 — "Update app.py" (revision 6362987, verified)
import gradio as gr
import requests
import pandas as pd
from bs4 import BeautifulSoup
from urllib.parse import urlparse
def scrape_conference_papers(url):
    """
    Scrape accepted-paper data from a conf.researchr.org-style track URL.

    Args:
        url (str): The conference track URL to scrape.

    Returns:
        dict: Contains 'papers' (list of {"Title", "Authors"} dicts),
            'track_name' (slug taken from the last URL path segment), and
            'display_name' (human-readable form of that slug).

    Raises:
        ValueError: If the URL is empty, the page cannot be fetched, the
            expected table is missing, or the table holds no paper rows.
    """
    if not url:
        raise ValueError("URL cannot be empty.")
    print(f"Fetching: {url} at {pd.Timestamp.now()}")
    # Browser-like User-Agent: some conference sites reject default HTTP clients.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }
    try:
        # Timeout so a stalled server cannot hang the request (and the UI) forever.
        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
    except requests.exceptions.RequestException as e:
        # Chain the cause so server logs keep the original network error.
        raise ValueError(f"Failed to fetch URL: {e}") from e
    papers_table = soup.select_one("div#event-overview table")
    if not papers_table:
        raise ValueError(
            "Could not find the 'Accepted Papers' table. Check if the URL is correct and points to a supported conference page."
        )
    papers_list = []
    for row in papers_table.find_all("tr"):
        # Paper rows are identified by a title link carrying data-event-modal.
        title_element = row.find("a", attrs={"data-event-modal": True})
        if not title_element:
            continue  # header or other non-paper row
        paper_title = title_element.get_text(strip=True)
        # Authors live in a div.performers; fall back to "N/A" when absent.
        performers_div = row.find("div", class_="performers")
        authors_str = (
            ", ".join([a.get_text(strip=True) for a in performers_div.find_all("a")])
            if performers_div
            else "N/A"
        )
        papers_list.append({"Title": paper_title, "Authors": authors_str})
    if not papers_list:
        raise ValueError("Found the table, but it contains no paper data.")
    # Derive track names from the last URL path segment, e.g.
    # ".../track/pldi-2025-papers" -> "pldi-2025-papers" / "PLDI 2025 Papers".
    try:
        path = urlparse(url).path
        track_name = path.strip("/").split("/")[-1]
        words = track_name.replace("-", " ").split()
        display_name = (
            # First word is usually an acronym (PLDI, ASE) -> uppercase it.
            " ".join([words[0].upper()] + [word.title() for word in words[1:]])
            if words
            else track_name
        )
    except Exception:
        # Naming is cosmetic: fall back to generic labels rather than
        # failing after a successful scrape.
        track_name = "accepted_papers"
        display_name = "Accepted Papers"
    return {
        "papers": papers_list,
        "track_name": track_name,
        "display_name": display_name,
    }
def scrape_and_save_papers(url):
    """
    Gradio wrapper: scrape papers for *url*, save a CSV, and build UI updates.

    Args:
        url (str): Conference track URL entered by the user.

    Returns:
        tuple: (gr.DataFrame showing the scraped papers, gr.DownloadButton
            pointing at the saved CSV file), both made visible.

    Raises:
        gr.Error: If the URL is missing or scraping fails.
    """
    if not url:
        raise gr.Error("Please enter a URL.")
    try:
        result = scrape_conference_papers(url)
    except ValueError as e:
        # Surface the scraper's message in the UI; chain the cause so the
        # server-side traceback keeps the original error.
        raise gr.Error(str(e)) from e
    papers_list = result["papers"]
    track_name = result["track_name"]
    display_name = result["display_name"]
    df = pd.DataFrame(papers_list)
    output_filename = f"{track_name}.csv"
    dynamic_label = f"Accepted Papers - {display_name}"
    # utf-8-sig so Excel renders non-ASCII author names correctly; the index
    # column serves as a running paper number in the exported file.
    df.to_csv(output_filename, index=True, encoding="utf-8-sig")
    # Re-create the components so label, value, and visibility update together.
    updated_table = gr.DataFrame(label=dynamic_label, value=df, wrap=True, visible=True)
    updated_download_button = gr.DownloadButton(
        label="Download CSV", visible=True, value=output_filename
    )
    return updated_table, updated_download_button
def create_gradio_app():
    """
    Defines and returns the Gradio web application interface.

    Returns:
        gr.Blocks: The assembled (but not yet launched) Gradio app.
    """
    with gr.Blocks(theme=gr.themes.Default()) as iface:
        # Page header and usage description.
        gr.Markdown(
            """
            # Researchr Conference Paper Scraper (PL/SE)
            Scrape lists of accepted papers from conferences on [`conf.researchr.org`](https://conf.researchr.org/) and download them as a CSV file. This tool primarily targets conferences in the domains of Programming Languages and Software Engineering.
            """
        )
        url_input = gr.Textbox(
            placeholder="Enter the URL of a specific track (e.g., https://pldi25.sigplan.org/track/pldi-2025-papers)",
            container=False,
        )
        # One-click example URLs; clicking fills url_input with the URL.
        gr.Examples(
            examples=[
                "https://pldi25.sigplan.org/track/pldi-2025-papers",
                "https://conf.researchr.org/track/ase-2025/ase-2025-nier-track",
                "https://conf.researchr.org/track/vlhcc-2025/vlhcc-2025-research-papers",
                "https://2024.splashcon.org/track/splash-2024-oopsla",
                "https://conf.researchr.org/track/icse-2024/icse-2024-demonstrations?",
            ],
            inputs=url_input,
            example_labels=[
                "PLDI 2025",
                "ASE 2025 NIER",
                "VL/HCC 2025",
                "OOPSLA 2024",
                "ICSE 2024 Demo",
            ],
        )
        submit_button = gr.Button("Scrape Papers", variant="primary")
        # Seed the results table with the expected columns; hidden until
        # the first successful scrape replaces it.
        empty_df_with_headers = pd.DataFrame({"Title": [], "Authors": []})
        output_table = gr.DataFrame(
            label="Accepted Papers",
            value=empty_df_with_headers,
            wrap=True,
            visible=False,
        )
        # Hidden until a CSV exists to download.
        download_button = gr.DownloadButton(label="Download CSV", visible=False)
        # Connect the button click to the scraping function
        submit_button.click(
            fn=scrape_and_save_papers,
            inputs=url_input,
            outputs=[output_table, download_button],
        )
        # Footer / attribution.
        gr.Markdown(
            """
            *Developed by [Ningzhi Tang](https://www.nztang.com/).*
            """
        )
    return iface
if __name__ == "__main__":
    # Build the UI and start the local Gradio server when run as a script.
    app = create_gradio_app()
    app.launch()