import gradio as gr
import requests
import pandas as pd
from bs4 import BeautifulSoup
from urllib.parse import urlparse


def scrape_conference_papers(url):
    """
    Scrapes paper data from a conference URL and returns structured data.

    Args:
        url (str): The conference track URL to scrape

    Returns:
        dict: Contains 'papers' (list of dicts), 'track_name', and 'display_name'

    Raises:
        ValueError: If URL is invalid or scraping fails
    """
    if not url:
        raise ValueError("URL cannot be empty.")

    print(f"Fetching: {url} at {pd.Timestamp.now()}")
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }

    try:
        response = requests.get(url, headers=headers)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
    except requests.exceptions.RequestException as e:
        raise ValueError(f"Failed to fetch URL: {e}")

    papers_table = soup.select_one("div#event-overview table")
    if not papers_table:
        raise ValueError(
            "Could not find the 'Accepted Papers' table. Check if the URL is correct and points to a supported conference page."
        )

    papers_list = []
    for row in papers_table.find_all("tr"):
        title_element = row.find("a", attrs={"data-event-modal": True})
        if not title_element:
            continue

        paper_title = title_element.get_text(strip=True)
        performers_div = row.find("div", class_="performers")
        authors_str = (
            ", ".join([a.get_text(strip=True) for a in performers_div.find_all("a")])
            if performers_div
            else "N/A"
        )
        papers_list.append({"Title": paper_title, "Authors": authors_str})

    if not papers_list:
        raise ValueError("Found the table, but it contains no paper data.")

    # Generate track information from URL
    try:
        path = urlparse(url).path
        track_name = path.strip("/").split("/")[-1]
        words = track_name.replace("-", " ").split()
        display_name = (
            " ".join([words[0].upper()] + [word.title() for word in words[1:]])
            if words
            else track_name
        )
    except Exception:
        track_name = "accepted_papers"
        display_name = "Accepted Papers"

    return {
        "papers": papers_list,
        "track_name": track_name,
        "display_name": display_name,
    }


def scrape_and_save_papers(url):
    """
    Gradio wrapper function that scrapes paper data and prepares UI components.
    """
    if not url:
        raise gr.Error("Please enter a URL.")

    try:
        result = scrape_conference_papers(url)
        papers_list = result["papers"]
        track_name = result["track_name"]
        display_name = result["display_name"]
    except ValueError as e:
        raise gr.Error(str(e))

    df = pd.DataFrame(papers_list)
    output_filename = f"{track_name}.csv"
    dynamic_label = f"Accepted Papers - {display_name}"

    # Save the DataFrame to a CSV file to make it downloadable
    df.to_csv(output_filename, index=True, encoding="utf-8-sig")

    # Create a new DataFrame component with the dynamic label and make it visible
    updated_table = gr.DataFrame(label=dynamic_label, value=df, wrap=True, visible=True)

    # Create an updated download button that is visible when there's data
    updated_download_button = gr.DownloadButton(
        label="Download CSV", visible=True, value=output_filename
    )

    # Return the updated DataFrame component and the updated download button
    return updated_table, updated_download_button


def create_gradio_app():
    """
    Defines and returns the Gradio web application interface.
    """
    with gr.Blocks(theme=gr.themes.Default()) as iface:
        gr.Markdown(
            """
            # Researchr Conference Paper Scraper (PL/SE)

            Scrape lists of accepted papers from conferences on [`conf.researchr.org`](https://conf.researchr.org/) and download them as a CSV file.
            This tool primarily targets conferences in the domains of Programming Languages and Software Engineering.
            """
        )
        url_input = gr.Textbox(
            placeholder="Enter the URL of a specific track (e.g., https://pldi25.sigplan.org/track/pldi-2025-papers)",
            container=False,
        )
        gr.Examples(
            examples=[
                "https://pldi25.sigplan.org/track/pldi-2025-papers",
                "https://conf.researchr.org/track/ase-2025/ase-2025-nier-track",
                "https://conf.researchr.org/track/vlhcc-2025/vlhcc-2025-research-papers",
                "https://2024.splashcon.org/track/splash-2024-oopsla",
                "https://conf.researchr.org/track/icse-2024/icse-2024-demonstrations?",
            ],
            inputs=url_input,
            example_labels=[
                "PLDI 2025",
                "ASE 2025 NIER",
                "VL/HCC 2025",
                "OOPSLA 2024",
                "ICSE 2024 Demo",
            ],
        )
        submit_button = gr.Button("Scrape Papers", variant="primary")

        empty_df_with_headers = pd.DataFrame({"Title": [], "Authors": []})
        output_table = gr.DataFrame(
            label="Accepted Papers",
            value=empty_df_with_headers,
            wrap=True,
            visible=False,
        )
        download_button = gr.DownloadButton(label="Download CSV", visible=False)

        # Connect the button click to the scraping function
        submit_button.click(
            fn=scrape_and_save_papers,
            inputs=url_input,
            outputs=[output_table, download_button],
        )

        gr.Markdown(
            """
            *Developed by [Ningzhi Tang](https://www.nztang.com/).*
            """
        )

    return iface


if __name__ == "__main__":
    app = create_gradio_app()
    app.launch()
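# A minimal sketch of using the scraper without the Gradio UI, assuming the example
# track URL below is still live (any supported conf.researchr.org track page should
# behave the same way):
#
#     result = scrape_conference_papers("https://pldi25.sigplan.org/track/pldi-2025-papers")
#     print(f"{result['display_name']}: {len(result['papers'])} papers")
#     pd.DataFrame(result["papers"]).to_csv(f"{result['track_name']}.csv", index=False)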