import gradio as gr
import requests
import pandas as pd
from bs4 import BeautifulSoup
from urllib.parse import urlparse

def scrape_conference_papers(url):
    """
    Scrapes paper data from a conference URL and returns structured data.

    Args:
        url (str): The conference track URL to scrape

    Returns:
        dict: Contains 'papers' (list of dicts), 'track_name', and 'display_name'

    Raises:
        ValueError: If the URL is invalid or scraping fails
    """
    if not url:
        raise ValueError("URL cannot be empty.")

    print(f"Fetching: {url} at {pd.Timestamp.now()}")
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }
    try:
        # A timeout prevents the request from hanging indefinitely on an
        # unresponsive server.
        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
    except requests.exceptions.RequestException as e:
        raise ValueError(f"Failed to fetch URL: {e}")

    papers_table = soup.select_one("div#event-overview table")
    if not papers_table:
        raise ValueError(
            "Could not find the 'Accepted Papers' table. Check that the URL is correct and points to a supported conference page."
        )

    papers_list = []
    for row in papers_table.find_all("tr"):
        title_element = row.find("a", attrs={"data-event-modal": True})
        if not title_element:
            continue
        paper_title = title_element.get_text(strip=True)
        performers_div = row.find("div", class_="performers")
        authors_str = (
            ", ".join([a.get_text(strip=True) for a in performers_div.find_all("a")])
            if performers_div
            else "N/A"
        )
        papers_list.append({"Title": paper_title, "Authors": authors_str})

    if not papers_list:
        raise ValueError("Found the table, but it contains no paper data.")

    # Generate track information from the URL, e.g. ".../pldi-2025-papers"
    # yields track_name "pldi-2025-papers" and display_name "PLDI 2025 Papers".
    try:
        path = urlparse(url).path
        track_name = path.strip("/").split("/")[-1]
        words = track_name.replace("-", " ").split()
        display_name = (
            " ".join([words[0].upper()] + [word.title() for word in words[1:]])
            if words
            else track_name
        )
    except Exception:
        track_name = "accepted_papers"
        display_name = "Accepted Papers"

    return {
        "papers": papers_list,
        "track_name": track_name,
        "display_name": display_name,
    }
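

# A minimal usage sketch for calling the scraper outside the Gradio UI
# (a hypothetical snippet: it assumes network access and that the track page
# still uses the "div#event-overview table" layout targeted above; the URL is
# one of the examples wired into the interface below):
#
#   result = scrape_conference_papers(
#       "https://pldi25.sigplan.org/track/pldi-2025-papers"
#   )
#   print(result["display_name"], "-", len(result["papers"]), "papers")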


def scrape_and_save_papers(url):
    """
    Gradio wrapper function that scrapes paper data and prepares UI components.
    """
    if not url:
        raise gr.Error("Please enter a URL.")

    try:
        result = scrape_conference_papers(url)
        papers_list = result["papers"]
        track_name = result["track_name"]
        display_name = result["display_name"]
    except ValueError as e:
        raise gr.Error(str(e))

    df = pd.DataFrame(papers_list)
    output_filename = f"{track_name}.csv"
    dynamic_label = f"Accepted Papers - {display_name}"

    # Save the DataFrame to a CSV file to make it downloadable
    df.to_csv(output_filename, index=True, encoding="utf-8-sig")

    # Create a new DataFrame component with the dynamic label and make it visible
    updated_table = gr.DataFrame(label=dynamic_label, value=df, wrap=True, visible=True)

    # Create an updated download button that is visible when there's data
    updated_download_button = gr.DownloadButton(
        label="Download CSV", visible=True, value=output_filename
    )

    # Return the updated DataFrame component and the updated download button
    return updated_table, updated_download_button


def create_gradio_app():
    """
    Defines and returns the Gradio web application interface.
    """
    with gr.Blocks(theme=gr.themes.Default()) as iface:
        gr.Markdown(
            """
            # Researchr Conference Paper Scraper (PL/SE)

            Scrape lists of accepted papers from conferences on [`conf.researchr.org`](https://conf.researchr.org/) and download them as a CSV file. This tool primarily targets conferences in the domains of Programming Languages and Software Engineering.
            """
        )
        url_input = gr.Textbox(
            placeholder="Enter the URL of a specific track (e.g., https://pldi25.sigplan.org/track/pldi-2025-papers)",
            container=False,
        )
        gr.Examples(
            examples=[
                "https://pldi25.sigplan.org/track/pldi-2025-papers",
                "https://conf.researchr.org/track/ase-2025/ase-2025-nier-track",
                "https://conf.researchr.org/track/vlhcc-2025/vlhcc-2025-research-papers",
                "https://2024.splashcon.org/track/splash-2024-oopsla",
                "https://conf.researchr.org/track/icse-2024/icse-2024-demonstrations?",
            ],
            inputs=url_input,
            example_labels=[
                "PLDI 2025",
                "ASE 2025 NIER",
                "VL/HCC 2025",
                "OOPSLA 2024",
                "ICSE 2024 Demo",
            ],
        )
        submit_button = gr.Button("Scrape Papers", variant="primary")

        empty_df_with_headers = pd.DataFrame({"Title": [], "Authors": []})
        output_table = gr.DataFrame(
            label="Accepted Papers",
            value=empty_df_with_headers,
            wrap=True,
            visible=False,
        )
        download_button = gr.DownloadButton(label="Download CSV", visible=False)

        # Connect the button click to the scraping function
        submit_button.click(
            fn=scrape_and_save_papers,
            inputs=url_input,
            outputs=[output_table, download_button],
        )

        gr.Markdown(
            """
            *Developed by [Ningzhi Tang](https://www.nztang.com/).*
            """
        )
    return iface


if __name__ == "__main__":
    app = create_gradio_app()
    app.launch()
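
# Note: app.launch() serves the interface locally by default. Gradio's
# standard share=True option, e.g. app.launch(share=True), would additionally
# create a temporary public link; the plain launch() used here is the usual
# choice when hosting on Hugging Face Spaces.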