"""VC Deal Flow Signal — Interactive Explorer.

Hugging Face Space that loads the live HF dataset
(huggingface.co/datasets/the-data-nerd/vc-deal-flow-signal) and renders
an interactive Gradio dashboard.

5 tabs:
  - Overview            : KPI strip + signal-type composition + top movers (latest quarter)
  - Sector heatmap      : sector x quarter avg commit velocity
  - Top movers          : filterable ranking of accelerating startups
  - Startup drilldown   : per-startup four-quarter trajectory
  - Methodology & cite  : SSRN, Zenodo, classifier code, MCP server, citation BibTeX

The dataset is public (CC-BY-4.0); no auth required at runtime.
"""
from __future__ import annotations

import gradio as gr
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from huggingface_hub import hf_hub_download

# Hugging Face dataset repository that backs this Space.
REPO_ID = "the-data-nerd/vc-deal-flow-signal"

# Quarter keys in chronological order (oldest first).
PERIOD_ORDER = [
    "q3-2025",
    "q4-2025",
    "q1-2026",
    "q2-2026",
]

# Display label for each quarter key, e.g. "q3-2025" -> "Q3 2025".
PERIOD_LABEL = dict(
    zip(PERIOD_ORDER, (key.replace("-", " ").upper() for key in PERIOD_ORDER))
)


def load_csv(name: str) -> pd.DataFrame:
    """Read one CSV file of the ``REPO_ID`` dataset into a DataFrame.

    Args:
        name: File name passed to ``hf_hub_download`` as ``filename``.

    Returns:
        The file parsed by ``pandas.read_csv`` with default options.
    """
    # hf_hub_download hands back a local path, which read_csv then parses.
    return pd.read_csv(
        hf_hub_download(repo_id=REPO_ID, filename=name, repo_type="dataset")
    )


# Load the three dataset tables once, at import time; every tab reads these frames.
SIGNALS = load_csv("startup_signals.csv")
SECTORS = load_csv("sector_aggregates.csv")
TIMESERIES = load_csv("signal_type_timeseries.csv")

# An ordered categorical makes sorting and max() follow PERIOD_ORDER rather than
# plain string order. Any period value missing from PERIOD_ORDER becomes NaN here.
for _df in (SIGNALS, SECTORS, TIMESERIES):
    _df["period"] = pd.Categorical(_df["period"], categories=PERIOD_ORDER, ordered=True)

# max() skips NaN, so the result is NaN only when no row has a recognised period.
# Fail here with a clear message rather than with an unrelated IndexError/KeyError
# once the UI code starts using LATEST.
_latest_period = SIGNALS["period"].max()
if pd.isna(_latest_period):
    raise ValueError(
        f"startup_signals.csv contains no rows whose period is one of {PERIOD_ORDER}"
    )
LATEST = str(_latest_period)

# Headline counts (nunique() ignores missing values).
N_STARTUPS = SIGNALS["startup_name"].nunique()
N_SECTORS = SIGNALS["sector_name"].nunique()
N_QUARTERS = SIGNALS["period"].nunique()
N_OBSERVATIONS = len(SIGNALS)

# Dropdown choices. Missing values are removed before sorting in all four lists:
# sorted() cannot compare a NaN float with str and would raise TypeError.
SECTORS_SORTED = sorted(SIGNALS["sector_name"].dropna().unique())
STAGES_SORTED = sorted(s for s in SIGNALS["stage"].unique() if isinstance(s, str))
SIGNAL_TYPES = sorted(s for s in SIGNALS["signal_type"].unique() if isinstance(s, str))
STARTUPS_SORTED = sorted(SIGNALS["startup_name"].dropna().unique())


def overview_kpis() -> str:
    """Build the Markdown KPI strip for the Overview tab.

    Returns:
        Two Markdown lines: the dataset-wide counts, followed by the startup
        with the highest ``commit_velocity_change_pct`` in the ``LATEST``
        quarter.
    """
    in_latest = SIGNALS["period"] == LATEST
    ranked = SIGNALS[in_latest].sort_values("commit_velocity_change_pct", ascending=False)
    leader = ranked.iloc[0]  # first row of the descending sort

    counts_line = (
        f"**{N_STARTUPS}** startups · **{N_SECTORS}** sectors · "
        f"**{N_QUARTERS}** quarters · **{N_OBSERVATIONS}** observations  \n"
    )
    mover_line = (
        f"**Top mover {PERIOD_LABEL[LATEST]}**: `{leader['startup_name']}` "
        f"({leader['sector_name']}) — "
        f"`{leader['commit_velocity_change_pct']:+.0f}%` Δ commit velocity"
    )
    return counts_line + mover_line


def overview_signal_share_fig() -> go.Figure:
    """Create the bar chart of signal-type share per quarter.

    Returns:
        A Plotly Express bar figure built from ``TIMESERIES``: quarter on the
        x axis, ``share_of_total`` on the y axis, coloured by ``signal_type``.
    """
    # Plot from a copy whose period column is plain text; TIMESERIES is untouched.
    share_by_quarter = TIMESERIES.assign(period=TIMESERIES["period"].astype(str))

    axis_labels = {
        "share_of_total": "Share of total",
        "period": "Quarter",
        "signal_type": "Signal",
    }
    chart = px.bar(
        share_by_quarter,
        x="period",
        y="share_of_total",
        color="signal_type",
        title="Signal Type Share by Quarter",
        labels=axis_labels,
        # Keep the quarters in PERIOD_ORDER on the axis.
        category_orders={"period": PERIOD_ORDER},
    )
    chart.update_layout(
        height=380,
        legend_title_text="",
        margin={"l": 20, "r": 20, "t": 50, "b": 20},
    )
    return chart


def overview_top10() -> pd.DataFrame:
    """Return the ten biggest movers of the ``LATEST`` quarter as a display table.

    Returns:
        At most ten rows ordered by ``commit_velocity_change_pct`` (descending),
        reduced to the display columns, with user-facing headers and a fresh
        0-based index.
    """
    # Raw column -> header shown in the UI; dict order fixes the column order.
    headers = {
        "startup_name": "Startup",
        "sector_name": "Sector",
        "stage": "Stage",
        "commit_velocity_14d": "Velocity (14d)",
        "commit_velocity_change_pct": "Change %",
        "signal_type": "Signal",
        "github_url": "GitHub",
    }

    is_latest = SIGNALS["period"] == LATEST
    ranked = SIGNALS[is_latest].sort_values("commit_velocity_change_pct", ascending=False)
    top_rows = ranked.head(10)[list(headers)]
    return top_rows.rename(columns=headers).reset_index(drop=True)


def sector_heatmap_fig() -> go.Figure:
    """Create the sector-by-quarter heatmap of average commit velocity.

    Returns:
        A Plotly Express ``imshow`` figure with one row per sector and one
        column per entry of ``PERIOD_ORDER``; cells hold the mean of
        ``avg_commit_velocity_14d`` from ``SECTORS``.
    """
    velocity_grid = SECTORS.pivot_table(
        values="avg_commit_velocity_14d",
        index="sector_name",
        columns="period",
        aggfunc="mean",
        observed=True,
    )
    # One column per quarter in PERIOD_ORDER (in that order), rows sorted by sector.
    velocity_grid = velocity_grid.reindex(columns=PERIOD_ORDER).sort_index()

    heatmap = px.imshow(
        velocity_grid,
        title="Average Commit Velocity by Sector × Quarter",
        labels={"x": "Quarter", "y": "Sector", "color": "Avg Commit Velocity (14d)"},
        color_continuous_scale="Viridis",
        aspect="auto",
        text_auto=".0f",
    )
    heatmap.update_layout(height=620, margin={"l": 20, "r": 20, "t": 50, "b": 20})
    return heatmap


# Raw SIGNALS column -> header shown in the movers table (dict order = column order).
_MOVERS_TABLE_COLUMNS = {
    "startup_name": "Startup",
    "sector_name": "Sector",
    "stage": "Stage",
    "commit_velocity_14d": "Velocity (14d)",
    "commit_velocity_change_pct": "Change %",
    "signal_type": "Signal",
    "github_url": "GitHub",
}


def _movers_table(rows: pd.DataFrame) -> pd.DataFrame:
    """Reduce *rows* to the display columns, rename the headers, reset the index."""
    return (
        rows[list(_MOVERS_TABLE_COLUMNS)]
        .rename(columns=_MOVERS_TABLE_COLUMNS)
        .reset_index(drop=True)
    )


def filter_movers(period: str, sector: str, stage: str, signal: str, top_n: int):
    """Rank startups by commit-velocity change under the selected filters.

    Args:
        period: Quarter key to keep, or ``"All"`` for no quarter filter.
        sector: ``sector_name`` to keep, or ``"All"``.
        stage: ``stage`` to keep, or ``"All"``.
        signal: ``signal_type`` to keep, or ``"All"``.
        top_n: Maximum number of rows to return; coerced with ``int()``.

    Returns:
        ``(table, figure)``. ``table`` always has the seven display columns with
        user-facing headers, including when no row matches, so the table keeps
        the same schema for every filter combination. ``figure`` is a horizontal
        bar chart of the selected rows, or a placeholder figure carrying a
        "no rows" annotation when the selection is empty.
    """
    selections = {
        "period": period,
        "sector_name": sector,
        "stage": stage,
        "signal_type": signal,
    }
    # Boolean indexing and sort_values both return new frames, so SIGNALS itself
    # is never modified and no defensive copy is needed.
    df = SIGNALS
    for column, wanted in selections.items():
        if wanted != "All":
            df = df[df[column] == wanted]

    df = df.sort_values("commit_velocity_change_pct", ascending=False).head(int(top_n))
    # Built before the empty check so both return paths share one table layout.
    table = _movers_table(df)

    if df.empty:
        empty_fig = go.Figure()
        empty_fig.add_annotation(text="No rows match the selected filters", x=0.5, y=0.5, showarrow=False)
        empty_fig.update_layout(height=380, margin=dict(l=20, r=20, t=50, b=20))
        return table, empty_fig

    fig = px.bar(
        df,
        y="startup_name",
        x="commit_velocity_change_pct",
        color="signal_type",
        orientation="h",
        title=f"Top {len(df)} Movers — Δ Commit Velocity",
        labels={"commit_velocity_change_pct": "Velocity Change %", "startup_name": "Startup", "signal_type": "Signal"},
        hover_data=["sector_name", "stage", "commit_velocity_14d"],
    )
    fig.update_layout(
        yaxis={"categoryorder": "total ascending"},
        # Grow the chart with the number of bars, never below the 380 px used elsewhere.
        height=max(380, 28 * len(df)),
        margin=dict(l=20, r=20, t=50, b=20),
        legend_title_text="",
    )
    return table, fig


def drilldown(startup: str):
    """Summarise one startup as a Markdown fact sheet plus a velocity line chart.

    Args:
        startup: ``startup_name`` value to look up in ``SIGNALS``.

    Returns:
        ``(markdown, figure)``. When ``startup`` is falsy or matches no rows,
        the markdown is a short hint and the figure is empty.
    """
    if not startup:
        return "_Pick a startup above_", go.Figure()

    history = SIGNALS[SIGNALS["startup_name"] == startup]
    if history.empty:
        return f"_No rows for `{startup}`_", go.Figure()

    history = history.sort_values("period")
    velocity = history["commit_velocity_14d"]

    trajectory = go.Scatter(
        x=history["period"].astype(str),
        y=velocity,
        mode="lines+markers+text",
        name="Commit velocity (14d)",
        text=velocity.astype(str),  # value label drawn above each marker
        textposition="top center",
    )
    chart = go.Figure()
    chart.add_trace(trajectory)
    chart.update_layout(
        title=f"{startup} — commit velocity trajectory",
        xaxis_title="Quarter",
        yaxis_title="Commit velocity (14d)",
        height=380,
        margin={"l": 20, "r": 20, "t": 50, "b": 20},
    )

    newest = history.iloc[-1]  # last row after sorting by period
    facts = [
        f"**Sector:** {newest['sector_name']}",
        f"**Stage:** {newest['stage']}",
        f"**Geography:** {newest['geography']}",
        f"**Latest signal ({PERIOD_LABEL[str(newest['period'])]}):** `{newest['signal_type']}`",
        f"**Commit velocity (14d):** {newest['commit_velocity_14d']} (Δ {newest['commit_velocity_change_pct']:+.0f}%)",
        f"**Contributors:** {newest['contributors']} (growth {newest['contributor_growth_pct']:+.0f}%)",
        f"**GitHub:** [{newest['github_url']}]({newest['github_url']})",
    ]
    # Two trailing spaces before the newline separate the facts, as in the UI's other Markdown.
    return "  \n".join(facts), chart


# Build the five-tab dashboard. Everything passed as `value=` below is computed
# once, while this block runs at import time; only the "Top movers" and
# "Startup drilldown" tabs register callbacks that recompute on user input.
with gr.Blocks(title="VC Deal Flow Signal — Interactive Explorer", theme=gr.themes.Soft()) as demo:
    # Page header with the dataset-wide counts and source links.
    gr.Markdown(
        f"""
# 📊 VC Deal Flow Signal — Interactive Explorer

Live engineering-velocity panel across **{N_STARTUPS}** venture-backed startups in **{N_SECTORS}** sectors over **{N_QUARTERS}** quarters of GitHub data.

Source: [`the-data-nerd/vc-deal-flow-signal`](https://huggingface.co/datasets/the-data-nerd/vc-deal-flow-signal) · CC-BY-4.0 · methodology on [SSRN 6606558](https://ssrn.com/abstract=6606558) · companion chat agent: [`vc-deal-flow-deepseek`](https://huggingface.co/spaces/the-data-nerd/vc-deal-flow-deepseek)
"""
    )

    with gr.Tabs():
        # Tab 1: KPI strip, signal-share chart and top-10 table for the LATEST quarter.
        with gr.Tab("Overview"):
            gr.Markdown(overview_kpis())
            gr.Plot(value=overview_signal_share_fig(), label="Signal-type composition over quarters")
            gr.Markdown(f"### Top 10 movers — {PERIOD_LABEL[LATEST]}")
            gr.Dataframe(value=overview_top10(), interactive=False, wrap=True)

        # Tab 2: sector x quarter heatmap with a short reading guide.
        with gr.Tab("Sector heatmap"):
            gr.Plot(value=sector_heatmap_fig(), label="Sector × Quarter heatmap")
            gr.Markdown(
                "_Each cell shows the **average 14-day commit velocity** of the startups tracked in that sector "
                "for that quarter. Brighter = more engineering throughput. Use it to spot sector-level rotations._"
            )

        # Tab 3: filter controls driving a ranked table and bar chart.
        with gr.Tab("Top movers"):
            with gr.Row():
                period_dd = gr.Dropdown(["All"] + PERIOD_ORDER, value=LATEST, label="Quarter")
                sector_dd = gr.Dropdown(["All"] + SECTORS_SORTED, value="All", label="Sector")
                stage_dd = gr.Dropdown(["All"] + STAGES_SORTED, value="All", label="Stage")
                signal_dd = gr.Dropdown(["All"] + SIGNAL_TYPES, value="All", label="Signal type")
                topn_slider = gr.Slider(5, 50, value=15, step=5, label="Top N")

            # Initial view, computed with the same values the controls start on.
            init_table, init_fig = filter_movers(LATEST, "All", "All", "All", 15)
            movers_table = gr.Dataframe(value=init_table, label="Filtered movers", interactive=False, wrap=True)
            movers_fig = gr.Plot(value=init_fig, label="Velocity-change ranking")

            # A change on any control re-runs filter_movers with the current
            # values of all five controls and replaces both outputs.
            for control in (period_dd, sector_dd, stage_dd, signal_dd, topn_slider):
                control.change(
                    filter_movers,
                    inputs=[period_dd, sector_dd, stage_dd, signal_dd, topn_slider],
                    outputs=[movers_table, movers_fig],
                )

        # Tab 4: per-startup fact sheet and trajectory chart.
        with gr.Tab("Startup drilldown"):
            # Starts on the first entry of the sorted startup list.
            startup_dd = gr.Dropdown(STARTUPS_SORTED, value=STARTUPS_SORTED[0], label="Pick a startup")
            init_md, init_drill_fig = drilldown(STARTUPS_SORTED[0])
            drill_md = gr.Markdown(value=init_md)
            drill_fig = gr.Plot(value=init_drill_fig, label="Commit velocity over time")
            startup_dd.change(drilldown, inputs=[startup_dd], outputs=[drill_md, drill_fig])

        # Tab 5: static methodology notes, citation and links.
        with gr.Tab("Methodology & cite"):
            gr.Markdown(
                """
## How signals are computed

The dataset is derived live from the [GitHub REST API v3](https://docs.github.com/en/rest). For each tracked startup we sample its most active public organisation repository on a 14-day rolling window, four times per quarter.

**Working hypothesis (testable, falsifiable):** sustained engineering acceleration — commit velocity rising significantly above a startup's own baseline — tends to precede fundraise announcements by roughly 6–12 weeks.

The classifier maps every (startup, quarter) observation onto one of four signal types:

| Signal | Definition |
|---|---|
| `Engineering hiring burst` | Unique-contributor count spikes vs. trailing 90-day baseline |
| `Infrastructure buildout` | Multiple new public repos created in the last 30 days |
| `Deploy frequency spike` | Commit velocity ≥ 2× the trailing 90-day baseline |
| `Framework migration` | High commit volume with low contributor growth and zero new repos |

Full classifier source (MIT): [github.com/kindrat86/gitdealflow-signal-classifier](https://github.com/kindrat86/gitdealflow-signal-classifier)

## Cite this dataset

```bibtex
@dataset{vc_deal_flow_signal_2026,
  author    = {The Data Nerd},
  title     = {Startup GitHub Engineering Velocity Panel},
  year      = {2026},
  publisher = {Zenodo},
  doi       = {10.5281/zenodo.19650920},
  url       = {https://huggingface.co/datasets/the-data-nerd/vc-deal-flow-signal}
}
```

## Live mirrors and related artefacts

- **HF dataset (this Space's source):** https://huggingface.co/datasets/the-data-nerd/vc-deal-flow-signal
- **Zenodo (DOI'd version):** https://zenodo.org/records/19650920 — concept DOI [10.5281/zenodo.19650919](https://doi.org/10.5281/zenodo.19650919)
- **Kaggle mirror:** https://www.kaggle.com/datasets/thedatanerd2026/vc-deal-flow-signal
- **Data.world mirror:** https://data.world/thedatanerd2026/vc-deal-flow-signal-startup-engineering-acceleration
- **SSRN preprint (methodology):** https://ssrn.com/abstract=6606558
- **Live MCP server (read-only, public):** https://signals.gitdealflow.com/api/mcp/rpc
- **Companion chat agent (Space):** https://huggingface.co/spaces/the-data-nerd/vc-deal-flow-deepseek
- **Production web app:** https://signals.gitdealflow.com
"""
            )

    # Footer shown below the tabs: licences and disclaimer.
    gr.Markdown(
        """---
Built with [Gradio](https://gradio.app) on top of [Hugging Face Datasets](https://huggingface.co/docs/datasets). Code: MIT. Data: CC-BY-4.0. _Past acceleration does not guarantee future outcomes — this is alternative-data research, not investment advice._
"""
    )


# Start the app only when this file is executed directly, not when it is imported.
if __name__ == "__main__":
    demo.launch()