"""VC Deal Flow Signal — Interactive Explorer. Hugging Face Space that loads the live HF dataset (huggingface.co/datasets/the-data-nerd/vc-deal-flow-signal) and renders an interactive Gradio dashboard. 5 tabs: - Overview : KPI strip + signal-type composition + top movers (latest quarter) - Sector heatmap : sector x quarter avg commit velocity - Top movers : filterable ranking of accelerating startups - Startup drilldown : per-startup four-quarter trajectory - Methodology & cite : SSRN, Zenodo, classifier code, MCP server, citation BibTeX The dataset is public (CC-BY-4.0); no auth required at runtime. """ from __future__ import annotations import gradio as gr import pandas as pd import plotly.express as px import plotly.graph_objects as go from huggingface_hub import hf_hub_download REPO_ID = "the-data-nerd/vc-deal-flow-signal" PERIOD_ORDER = ["q3-2025", "q4-2025", "q1-2026", "q2-2026"] PERIOD_LABEL = {p: p.upper().replace("-", " ") for p in PERIOD_ORDER} def load_csv(name: str) -> pd.DataFrame: path = hf_hub_download(repo_id=REPO_ID, filename=name, repo_type="dataset") return pd.read_csv(path) SIGNALS = load_csv("startup_signals.csv") SECTORS = load_csv("sector_aggregates.csv") TIMESERIES = load_csv("signal_type_timeseries.csv") for _df in (SIGNALS, SECTORS, TIMESERIES): _df["period"] = pd.Categorical(_df["period"], categories=PERIOD_ORDER, ordered=True) LATEST = str(SIGNALS["period"].max()) N_STARTUPS = SIGNALS["startup_name"].nunique() N_SECTORS = SIGNALS["sector_name"].nunique() N_QUARTERS = SIGNALS["period"].nunique() N_OBSERVATIONS = len(SIGNALS) SECTORS_SORTED = sorted(SIGNALS["sector_name"].unique()) STAGES_SORTED = sorted(s for s in SIGNALS["stage"].unique() if isinstance(s, str)) SIGNAL_TYPES = sorted(s for s in SIGNALS["signal_type"].unique() if isinstance(s, str)) STARTUPS_SORTED = sorted(SIGNALS["startup_name"].unique()) def overview_kpis() -> str: latest_df = SIGNALS[SIGNALS["period"] == LATEST] top_mover = latest_df.sort_values("commit_velocity_change_pct", ascending=False).iloc[0] return ( f"**{N_STARTUPS}** startups · **{N_SECTORS}** sectors · **{N_QUARTERS}** quarters · **{N_OBSERVATIONS}** observations \n" f"**Top mover {PERIOD_LABEL[LATEST]}**: `{top_mover['startup_name']}` " f"({top_mover['sector_name']}) — `{top_mover['commit_velocity_change_pct']:+.0f}%` Δ commit velocity" ) def overview_signal_share_fig() -> go.Figure: ts = TIMESERIES.copy() ts["period"] = ts["period"].astype(str) fig = px.bar( ts, x="period", y="share_of_total", color="signal_type", title="Signal Type Share by Quarter", labels={"share_of_total": "Share of total", "period": "Quarter", "signal_type": "Signal"}, category_orders={"period": PERIOD_ORDER}, ) fig.update_layout(margin=dict(l=20, r=20, t=50, b=20), height=380, legend_title_text="") return fig def overview_top10() -> pd.DataFrame: latest_df = SIGNALS[SIGNALS["period"] == LATEST] return ( latest_df.sort_values("commit_velocity_change_pct", ascending=False) .head(10)[ [ "startup_name", "sector_name", "stage", "commit_velocity_14d", "commit_velocity_change_pct", "signal_type", "github_url", ] ] .rename( columns={ "startup_name": "Startup", "sector_name": "Sector", "stage": "Stage", "commit_velocity_14d": "Velocity (14d)", "commit_velocity_change_pct": "Change %", "signal_type": "Signal", "github_url": "GitHub", } ) .reset_index(drop=True) ) def sector_heatmap_fig() -> go.Figure: pivot = ( SECTORS.pivot_table( index="sector_name", columns="period", values="avg_commit_velocity_14d", aggfunc="mean", observed=True, ) .reindex(columns=PERIOD_ORDER) .sort_index() ) fig = px.imshow( pivot, labels=dict(x="Quarter", y="Sector", color="Avg Commit Velocity (14d)"), aspect="auto", color_continuous_scale="Viridis", title="Average Commit Velocity by Sector × Quarter", text_auto=".0f", ) fig.update_layout(margin=dict(l=20, r=20, t=50, b=20), height=620) return fig def filter_movers(period: str, sector: str, stage: str, signal: str, top_n: int): df = SIGNALS.copy() if period != "All": df = df[df["period"] == period] if sector != "All": df = df[df["sector_name"] == sector] if stage != "All": df = df[df["stage"] == stage] if signal != "All": df = df[df["signal_type"] == signal] df = df.sort_values("commit_velocity_change_pct", ascending=False).head(int(top_n)) if df.empty: empty_fig = go.Figure() empty_fig.add_annotation(text="No rows match the selected filters", x=0.5, y=0.5, showarrow=False) empty_fig.update_layout(height=380, margin=dict(l=20, r=20, t=50, b=20)) return df, empty_fig fig = px.bar( df, y="startup_name", x="commit_velocity_change_pct", color="signal_type", orientation="h", title=f"Top {len(df)} Movers — Δ Commit Velocity", labels={"commit_velocity_change_pct": "Velocity Change %", "startup_name": "Startup", "signal_type": "Signal"}, hover_data=["sector_name", "stage", "commit_velocity_14d"], ) fig.update_layout( yaxis={"categoryorder": "total ascending"}, height=max(380, 28 * len(df)), margin=dict(l=20, r=20, t=50, b=20), legend_title_text="", ) table = ( df[ [ "startup_name", "sector_name", "stage", "commit_velocity_14d", "commit_velocity_change_pct", "signal_type", "github_url", ] ] .rename( columns={ "startup_name": "Startup", "sector_name": "Sector", "stage": "Stage", "commit_velocity_14d": "Velocity (14d)", "commit_velocity_change_pct": "Change %", "signal_type": "Signal", "github_url": "GitHub", } ) .reset_index(drop=True) ) return table, fig def drilldown(startup: str): if not startup: return "_Pick a startup above_", go.Figure() df = SIGNALS[SIGNALS["startup_name"] == startup].copy() if df.empty: return f"_No rows for `{startup}`_", go.Figure() df = df.sort_values("period") df["period_str"] = df["period"].astype(str) fig = go.Figure() fig.add_trace( go.Scatter( x=df["period_str"], y=df["commit_velocity_14d"], mode="lines+markers+text", name="Commit velocity (14d)", text=df["commit_velocity_14d"].astype(str), textposition="top center", ) ) fig.update_layout( title=f"{startup} — commit velocity trajectory", xaxis_title="Quarter", yaxis_title="Commit velocity (14d)", height=380, margin=dict(l=20, r=20, t=50, b=20), ) latest = df.iloc[-1] md = ( f"**Sector:** {latest['sector_name']} \n" f"**Stage:** {latest['stage']} \n" f"**Geography:** {latest['geography']} \n" f"**Latest signal ({PERIOD_LABEL[str(latest['period'])]}):** `{latest['signal_type']}` \n" f"**Commit velocity (14d):** {latest['commit_velocity_14d']} (Δ {latest['commit_velocity_change_pct']:+.0f}%) \n" f"**Contributors:** {latest['contributors']} (growth {latest['contributor_growth_pct']:+.0f}%) \n" f"**GitHub:** [{latest['github_url']}]({latest['github_url']})" ) return md, fig with gr.Blocks(title="VC Deal Flow Signal — Interactive Explorer", theme=gr.themes.Soft()) as demo: gr.Markdown( f""" # 📊 VC Deal Flow Signal — Interactive Explorer Live engineering-velocity panel across **{N_STARTUPS}** venture-backed startups in **{N_SECTORS}** sectors over **{N_QUARTERS}** quarters of GitHub data. Source: [`the-data-nerd/vc-deal-flow-signal`](https://huggingface.co/datasets/the-data-nerd/vc-deal-flow-signal) · CC-BY-4.0 · methodology on [SSRN 6606558](https://ssrn.com/abstract=6606558) · companion chat agent: [`vc-deal-flow-deepseek`](https://huggingface.co/spaces/the-data-nerd/vc-deal-flow-deepseek) """ ) with gr.Tabs(): with gr.Tab("Overview"): gr.Markdown(overview_kpis()) gr.Plot(value=overview_signal_share_fig(), label="Signal-type composition over quarters") gr.Markdown(f"### Top 10 movers — {PERIOD_LABEL[LATEST]}") gr.Dataframe(value=overview_top10(), interactive=False, wrap=True) with gr.Tab("Sector heatmap"): gr.Plot(value=sector_heatmap_fig(), label="Sector × Quarter heatmap") gr.Markdown( "_Each cell shows the **average 14-day commit velocity** of the startups tracked in that sector " "for that quarter. Brighter = more engineering throughput. Use it to spot sector-level rotations._" ) with gr.Tab("Top movers"): with gr.Row(): period_dd = gr.Dropdown(["All"] + PERIOD_ORDER, value=LATEST, label="Quarter") sector_dd = gr.Dropdown(["All"] + SECTORS_SORTED, value="All", label="Sector") stage_dd = gr.Dropdown(["All"] + STAGES_SORTED, value="All", label="Stage") signal_dd = gr.Dropdown(["All"] + SIGNAL_TYPES, value="All", label="Signal type") topn_slider = gr.Slider(5, 50, value=15, step=5, label="Top N") init_table, init_fig = filter_movers(LATEST, "All", "All", "All", 15) movers_table = gr.Dataframe(value=init_table, label="Filtered movers", interactive=False, wrap=True) movers_fig = gr.Plot(value=init_fig, label="Velocity-change ranking") for control in (period_dd, sector_dd, stage_dd, signal_dd, topn_slider): control.change( filter_movers, inputs=[period_dd, sector_dd, stage_dd, signal_dd, topn_slider], outputs=[movers_table, movers_fig], ) with gr.Tab("Startup drilldown"): startup_dd = gr.Dropdown(STARTUPS_SORTED, value=STARTUPS_SORTED[0], label="Pick a startup") init_md, init_drill_fig = drilldown(STARTUPS_SORTED[0]) drill_md = gr.Markdown(value=init_md) drill_fig = gr.Plot(value=init_drill_fig, label="Commit velocity over time") startup_dd.change(drilldown, inputs=[startup_dd], outputs=[drill_md, drill_fig]) with gr.Tab("Methodology & cite"): gr.Markdown( """ ## How signals are computed The dataset is derived live from the [GitHub REST API v3](https://docs.github.com/en/rest). For each tracked startup we sample its most active public organisation repository on a 14-day rolling window, four times per quarter. **Working hypothesis (testable, falsifiable):** sustained engineering acceleration — commit velocity rising significantly above a startup's own baseline — tends to precede fundraise announcements by roughly 6–12 weeks. The classifier maps every (startup, quarter) observation onto one of four signal types: | Signal | Definition | |---|---| | `Engineering hiring burst` | Unique-contributor count spikes vs. trailing 90-day baseline | | `Infrastructure buildout` | Multiple new public repos created in the last 30 days | | `Deploy frequency spike` | Commit velocity ≥ 2× the trailing 90-day baseline | | `Framework migration` | High commit volume with low contributor growth and zero new repos | Full classifier source (MIT): [github.com/kindrat86/gitdealflow-signal-classifier](https://github.com/kindrat86/gitdealflow-signal-classifier) ## Cite this dataset ```bibtex @dataset{vc_deal_flow_signal_2026, author = {The Data Nerd}, title = {Startup GitHub Engineering Velocity Panel}, year = {2026}, publisher = {Zenodo}, doi = {10.5281/zenodo.19650920}, url = {https://huggingface.co/datasets/the-data-nerd/vc-deal-flow-signal} } ``` ## Live mirrors and related artefacts - **HF dataset (this Space's source):** https://huggingface.co/datasets/the-data-nerd/vc-deal-flow-signal - **Zenodo (DOI'd version):** https://zenodo.org/records/19650920 — concept DOI [10.5281/zenodo.19650919](https://doi.org/10.5281/zenodo.19650919) - **Kaggle mirror:** https://www.kaggle.com/datasets/thedatanerd2026/vc-deal-flow-signal - **Data.world mirror:** https://data.world/thedatanerd2026/vc-deal-flow-signal-startup-engineering-acceleration - **SSRN preprint (methodology):** https://ssrn.com/abstract=6606558 - **Live MCP server (read-only, public):** https://signals.gitdealflow.com/api/mcp/rpc - **Companion chat agent (Space):** https://huggingface.co/spaces/the-data-nerd/vc-deal-flow-deepseek - **Production web app:** https://signals.gitdealflow.com """ ) gr.Markdown( """--- Built with [Gradio](https://gradio.app) on top of [Hugging Face Datasets](https://huggingface.co/docs/datasets). Code: MIT. Data: CC-BY-4.0. _Past acceleration does not guarantee future outcomes — this is alternative-data research, not investment advice._ """ ) if __name__ == "__main__": demo.launch()