import gradio as gr
import requests
import pandas as pd
from bs4 import BeautifulSoup
from urllib.parse import urlparse


def scrape_conference_papers(url):
    """
    Scrapes paper data from a conference URL and returns structured data.

    Args:
        url (str): The conference track URL to scrape

    Returns:
        dict: Contains 'papers' (list of dicts), 'track_name', and 'display_name'

    Raises:
        ValueError: If URL is invalid or scraping fails
    """
    if not url:
        raise ValueError("URL cannot be empty.")

    print(f"Fetching: {url} at {pd.Timestamp.now()}")
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }

    try:
        response = requests.get(url, headers=headers)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
    except requests.exceptions.RequestException as e:
        raise ValueError(f"Failed to fetch URL: {e}")

    papers_table = soup.select_one("div#event-overview table")
    if not papers_table:
        raise ValueError(
            "Could not find the 'Accepted Papers' table. Check if the URL is correct and points to a supported conference page."
        )

    papers_list = []
    for row in papers_table.find_all("tr"):
        title_element = row.find("a", attrs={"data-event-modal": True})
        if not title_element:
            continue

        paper_title = title_element.get_text(strip=True)
        performers_div = row.find("div", class_="performers")
        authors_str = (
            ", ".join([a.get_text(strip=True) for a in performers_div.find_all("a")])
            if performers_div
            else "N/A"
        )
        papers_list.append({"Title": paper_title, "Authors": authors_str})

    if not papers_list:
        raise ValueError("Found the table, but it contains no paper data.")

    # Generate track information from URL
    try:
        path = urlparse(url).path
        track_name = path.strip("/").split("/")[-1]
        words = track_name.replace("-", " ").split()
        display_name = (
            " ".join([words[0].upper()] + [word.title() for word in words[1:]])
            if words
            else track_name
        )
    except Exception:
        track_name = "accepted_papers"
        display_name = "Accepted Papers"

    return {
        "papers": papers_list,
        "track_name": track_name,
        "display_name": display_name,
    }


def scrape_and_save_papers(url):
    """
    Gradio wrapper function that scrapes paper data and prepares UI components.
    """
    if not url:
        raise gr.Error("Please enter a URL.")

    try:
        result = scrape_conference_papers(url)
        papers_list = result["papers"]
        track_name = result["track_name"]
        display_name = result["display_name"]
    except ValueError as e:
        raise gr.Error(str(e))

    df = pd.DataFrame(papers_list)
    output_filename = f"{track_name}.csv"
    dynamic_label = f"Accepted Papers - {display_name}"

    # Save the DataFrame to a CSV file to make it downloadable
    df.to_csv(output_filename, index=True, encoding="utf-8-sig")

    # Create a new DataFrame component with the dynamic label and make it visible
    updated_table = gr.DataFrame(label=dynamic_label, value=df, wrap=True, visible=True)

    # Create an updated download button that is visible when there's data
    updated_download_button = gr.DownloadButton(
        label="Download CSV", visible=True, value=output_filename
    )

    # Return the updated DataFrame component and the updated download button
    return updated_table, updated_download_button


def create_gradio_app():
    """
    Defines and returns the Gradio web application interface.
    """
    with gr.Blocks(theme=gr.themes.Default()) as iface:
        gr.Markdown(
            """
            # Researchr Conference Paper Scraper (PL/SE)

            Scrape lists of accepted papers from conferences on [`conf.researchr.org`](https://conf.researchr.org/) and download them as a CSV file.
            This tool primarily targets conferences in the domains of Programming Languages and Software Engineering.
            """
        )
        url_input = gr.Textbox(
            placeholder="Enter the URL of a specific track (e.g., https://pldi25.sigplan.org/track/pldi-2025-papers)",
            container=False,
        )
        gr.Examples(
            examples=[
                "https://pldi25.sigplan.org/track/pldi-2025-papers",
                "https://conf.researchr.org/track/ase-2025/ase-2025-nier-track",
                "https://conf.researchr.org/track/vlhcc-2025/vlhcc-2025-research-papers",
                "https://2024.splashcon.org/track/splash-2024-oopsla",
                "https://conf.researchr.org/track/icse-2024/icse-2024-demonstrations?",
            ],
            inputs=url_input,
            example_labels=[
                "PLDI 2025",
                "ASE 2025 NIER",
                "VL/HCC 2025",
                "OOPSLA 2024",
                "ICSE 2024 Demo",
            ],
        )
        submit_button = gr.Button("Scrape Papers", variant="primary")

        empty_df_with_headers = pd.DataFrame({"Title": [], "Authors": []})
        output_table = gr.DataFrame(
            label="Accepted Papers",
            value=empty_df_with_headers,
            wrap=True,
            visible=False,
        )
        download_button = gr.DownloadButton(label="Download CSV", visible=False)

        # Connect the button click to the scraping function
        submit_button.click(
            fn=scrape_and_save_papers,
            inputs=url_input,
            outputs=[output_table, download_button],
        )

        gr.Markdown(
            """
            *Developed by [Ningzhi Tang](https://www.nztang.com/).*
            """
        )

    return iface


if __name__ == "__main__":
    app = create_gradio_app()
    app.launch()
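# A minimal sketch of using the scraper without the Gradio UI, assuming the example
# track URL below is still live (any supported conf.researchr.org track page should
# behave the same way):
#
#     result = scrape_conference_papers("https://pldi25.sigplan.org/track/pldi-2025-papers")
#     print(f"{result['display_name']}: {len(result['papers'])} papers")
#     pd.DataFrame(result["papers"]).to_csv(f"{result['track_name']}.csv", index=False)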