Spaces:
Sleeping
Sleeping
| """VC Deal Flow Signal — Interactive Explorer. | |
| Hugging Face Space that loads the live HF dataset | |
| (huggingface.co/datasets/the-data-nerd/vc-deal-flow-signal) and renders | |
| an interactive Gradio dashboard. | |
| 5 tabs: | |
| - Overview : KPI strip + signal-type composition + top movers (latest quarter) | |
| - Sector heatmap : sector x quarter avg commit velocity | |
| - Top movers : filterable ranking of accelerating startups | |
| - Startup drilldown : per-startup four-quarter trajectory | |
| - Methodology & cite : SSRN, Zenodo, classifier code, MCP server, citation BibTeX | |
| The dataset is public (CC-BY-4.0); no auth required at runtime. | |
| """ | |
| from __future__ import annotations | |
| import gradio as gr | |
| import pandas as pd | |
| import plotly.express as px | |
| import plotly.graph_objects as go | |
| from huggingface_hub import hf_hub_download | |
# Hugging Face dataset repository that every table in this app is read from.
REPO_ID = "the-data-nerd/vc-deal-flow-signal"

# Quarter slugs in chronological order; used as the category order for the
# "period" column and for chart axes.
PERIOD_ORDER = ["q3-2025", "q4-2025", "q1-2026", "q2-2026"]

# Display label for each quarter slug, e.g. "q3-2025" -> "Q3 2025".
PERIOD_LABEL = {slug: " ".join(slug.upper().split("-")) for slug in PERIOD_ORDER}
def load_csv(name: str) -> pd.DataFrame:
    """Fetch one CSV file from the dataset repository and parse it.

    Args:
        name: File name inside the ``REPO_ID`` dataset repository.

    Returns:
        The file's contents as a DataFrame, parsed with pandas defaults.
    """
    local_file = hf_hub_download(repo_type="dataset", repo_id=REPO_ID, filename=name)
    frame = pd.read_csv(local_file)
    return frame
# Load the three dataset tables once at import time; every view reads these frames.
SIGNALS = load_csv("startup_signals.csv")
SECTORS = load_csv("sector_aggregates.csv")
TIMESERIES = load_csv("signal_type_timeseries.csv")

# An ordered categorical makes sorting and max() follow quarter order rather
# than plain string order.
# NOTE: any period value that is not listed in PERIOD_ORDER becomes NaN here.
for _df in (SIGNALS, SECTORS, TIMESERIES):
    _df["period"] = pd.Categorical(_df["period"], categories=PERIOD_ORDER, ordered=True)

# Most recent quarter present in the data, as its plain string slug.
LATEST = str(SIGNALS["period"].max())

# Headline counts shown in the page header and the KPI strip.
N_STARTUPS = SIGNALS["startup_name"].nunique()
N_SECTORS = SIGNALS["sector_name"].nunique()
N_QUARTERS = SIGNALS["period"].nunique()
N_OBSERVATIONS = len(SIGNALS)

# Dropdown choices. Every list keeps only string values: unique() includes NaN
# for missing cells, and sorted() raises TypeError when floats and strings mix.
SECTORS_SORTED = sorted(s for s in SIGNALS["sector_name"].unique() if isinstance(s, str))
STAGES_SORTED = sorted(s for s in SIGNALS["stage"].unique() if isinstance(s, str))
SIGNAL_TYPES = sorted(s for s in SIGNALS["signal_type"].unique() if isinstance(s, str))
STARTUPS_SORTED = sorted(s for s in SIGNALS["startup_name"].unique() if isinstance(s, str))
def overview_kpis() -> str:
    """Build the Markdown KPI strip shown at the top of the Overview tab.

    Returns:
        Two Markdown lines: the dataset totals, then the startup with the
        largest ``commit_velocity_change_pct`` in the latest quarter. Only the
        totals line is returned when the latest quarter has no rows.
    """
    totals = (
        f"**{N_STARTUPS}** startups · **{N_SECTORS}** sectors · "
        f"**{N_QUARTERS}** quarters · **{N_OBSERVATIONS}** observations"
    )
    latest_df = SIGNALS[SIGNALS["period"] == LATEST]
    if latest_df.empty:
        # Nothing to rank; avoid the IndexError that .iloc[0] would raise.
        return totals

    top_mover = latest_df.sort_values("commit_velocity_change_pct", ascending=False).iloc[0]
    # Fall back to the raw slug if LATEST has no display label.
    period_label = PERIOD_LABEL.get(LATEST, LATEST)
    # Markdown needs two trailing spaces before the newline for a hard line
    # break; with a single space both lines merge into one paragraph.
    return (
        f"{totals}  \n"
        f"**Top mover {period_label}**: `{top_mover['startup_name']}` "
        f"({top_mover['sector_name']}) — `{top_mover['commit_velocity_change_pct']:+.0f}%` Δ commit velocity"
    )
def overview_signal_share_fig() -> go.Figure:
    """Draw the per-quarter signal-type share bar chart for the Overview tab.

    Returns:
        A bar figure with one bar segment per signal type and quarter, with
        the x axis ordered by ``PERIOD_ORDER``.
    """
    # Work on a derived frame with string periods so TIMESERIES itself stays
    # categorical; axis ordering is supplied through category_orders instead.
    shares = TIMESERIES.assign(period=TIMESERIES["period"].astype(str))
    axis_labels = {
        "share_of_total": "Share of total",
        "period": "Quarter",
        "signal_type": "Signal",
    }
    chart = px.bar(
        shares,
        x="period",
        y="share_of_total",
        color="signal_type",
        title="Signal Type Share by Quarter",
        labels=axis_labels,
        category_orders={"period": PERIOD_ORDER},
    )
    chart.update_layout(
        margin={"l": 20, "r": 20, "t": 50, "b": 20},
        height=380,
        legend_title_text="",
    )
    return chart
def overview_top10() -> pd.DataFrame:
    """Return the ten biggest movers of the latest quarter as a display table.

    Returns:
        Up to ten rows from the ``LATEST`` quarter, sorted by
        ``commit_velocity_change_pct`` descending, restricted to the display
        columns and renamed to human-readable headers, with a fresh 0-based
        index.
    """
    # Source column -> header shown in the UI; key order is the column order.
    headers = {
        "startup_name": "Startup",
        "sector_name": "Sector",
        "stage": "Stage",
        "commit_velocity_14d": "Velocity (14d)",
        "commit_velocity_change_pct": "Change %",
        "signal_type": "Signal",
        "github_url": "GitHub",
    }
    is_latest = SIGNALS["period"] == LATEST
    ranked = SIGNALS[is_latest].sort_values("commit_velocity_change_pct", ascending=False)
    top_rows = ranked.head(10)[list(headers)]
    return top_rows.rename(columns=headers).reset_index(drop=True)
def sector_heatmap_fig() -> go.Figure:
    """Draw the sector-by-quarter heatmap of average commit velocity.

    Returns:
        A heatmap figure with one row per sector (sorted by name) and one
        column per entry of ``PERIOD_ORDER``; each cell is the mean of
        ``avg_commit_velocity_14d`` for that sector and quarter.
    """
    grid = SECTORS.pivot_table(
        values="avg_commit_velocity_14d",
        index="sector_name",
        columns="period",
        aggfunc="mean",
        observed=True,
    )
    # Reindex so every quarter in PERIOD_ORDER is a column, in that order.
    grid = grid.reindex(columns=PERIOD_ORDER).sort_index()

    heatmap = px.imshow(
        grid,
        labels={"x": "Quarter", "y": "Sector", "color": "Avg Commit Velocity (14d)"},
        aspect="auto",
        color_continuous_scale="Viridis",
        title="Average Commit Velocity by Sector × Quarter",
        text_auto=".0f",
    )
    heatmap.update_layout(margin={"l": 20, "r": 20, "t": 50, "b": 20}, height=620)
    return heatmap
def filter_movers(
    period: str, sector: str, stage: str, signal: str, top_n: int
) -> tuple[pd.DataFrame, go.Figure]:
    """Rank startups by commit-velocity change after applying the UI filters.

    Args:
        period: Quarter slug to keep, or ``"All"`` to skip this filter.
        sector: Sector name to keep, or ``"All"`` to skip this filter.
        stage: Stage to keep, or ``"All"`` to skip this filter.
        signal: Signal type to keep, or ``"All"`` to skip this filter.
        top_n: Maximum number of rows to return; coerced with ``int()``
            because the slider may deliver a float.

    Returns:
        ``(table, figure)``. ``table`` always has the same seven renamed
        display columns, including when no row matches. ``figure`` is a
        horizontal bar chart of the ranked rows, or a blank figure carrying a
        "no rows" annotation when the selection is empty.
    """
    # Source column -> header shown in the UI; key order is the column order.
    headers = {
        "startup_name": "Startup",
        "sector_name": "Sector",
        "stage": "Stage",
        "commit_velocity_14d": "Velocity (14d)",
        "commit_velocity_change_pct": "Change %",
        "signal_type": "Signal",
        "github_url": "GitHub",
    }
    selections = {
        "period": period,
        "sector_name": sector,
        "stage": stage,
        "signal_type": signal,
    }

    # No defensive copy needed: boolean indexing and sort_values both return
    # new frames, so SIGNALS is never mutated.
    df = SIGNALS
    for column, wanted in selections.items():
        if wanted != "All":
            df = df[df[column] == wanted]
    df = df.sort_values("commit_velocity_change_pct", ascending=False).head(int(top_n))

    # Build the table before the empty check so both branches hand the
    # Dataframe component the same columns and headers.
    table = df[list(headers)].rename(columns=headers).reset_index(drop=True)

    if df.empty:
        empty_fig = go.Figure()
        empty_fig.add_annotation(
            text="No rows match the selected filters", x=0.5, y=0.5, showarrow=False
        )
        empty_fig.update_layout(height=380, margin=dict(l=20, r=20, t=50, b=20))
        return table, empty_fig

    fig = px.bar(
        df,
        y="startup_name",
        x="commit_velocity_change_pct",
        color="signal_type",
        orientation="h",
        title=f"Top {len(df)} Movers — Δ Commit Velocity",
        labels={
            "commit_velocity_change_pct": "Velocity Change %",
            "startup_name": "Startup",
            "signal_type": "Signal",
        },
        hover_data=["sector_name", "stage", "commit_velocity_14d"],
    )
    fig.update_layout(
        # Largest change at the top of the horizontal chart.
        yaxis={"categoryorder": "total ascending"},
        # Grow the chart with the row count so bar labels stay readable.
        height=max(380, 28 * len(df)),
        margin=dict(l=20, r=20, t=50, b=20),
        legend_title_text="",
    )
    return table, fig
def drilldown(startup: str) -> tuple[str, go.Figure]:
    """Build the detail card and velocity trajectory for one startup.

    Args:
        startup: Value of ``startup_name`` to look up; may be empty or
            ``None`` when nothing is selected.

    Returns:
        ``(markdown, figure)``. When no startup is given, or no row matches,
        the Markdown is a short placeholder and the figure is blank.
        Otherwise the Markdown lists the fields of the startup's last row in
        period order, one per line, and the figure plots
        ``commit_velocity_14d`` per quarter.
    """
    if not startup:
        return "_Pick a startup above_", go.Figure()

    df = SIGNALS[SIGNALS["startup_name"] == startup]
    if df.empty:
        return f"_No rows for `{startup}`_", go.Figure()

    # "period" is an ordered categorical, so this sorts chronologically.
    df = df.sort_values("period")
    quarters = df["period"].astype(str)

    fig = go.Figure()
    fig.add_trace(
        go.Scatter(
            x=quarters,
            y=df["commit_velocity_14d"],
            mode="lines+markers+text",
            name="Commit velocity (14d)",
            text=df["commit_velocity_14d"].astype(str),
            textposition="top center",
        )
    )
    fig.update_layout(
        title=f"{startup} — commit velocity trajectory",
        xaxis_title="Quarter",
        yaxis_title="Commit velocity (14d)",
        height=380,
        margin=dict(l=20, r=20, t=50, b=20),
    )

    latest = df.iloc[-1]
    # A period outside PERIOD_ORDER is NaN after the categorical conversion and
    # sorts last; fall back to the raw text instead of raising KeyError.
    period_key = str(latest["period"])
    period_label = PERIOD_LABEL.get(period_key, period_key)

    facts = [
        f"**Sector:** {latest['sector_name']}",
        f"**Stage:** {latest['stage']}",
        f"**Geography:** {latest['geography']}",
        f"**Latest signal ({period_label}):** `{latest['signal_type']}`",
        f"**Commit velocity (14d):** {latest['commit_velocity_14d']} (Δ {latest['commit_velocity_change_pct']:+.0f}%)",
        f"**Contributors:** {latest['contributors']} (growth {latest['contributor_growth_pct']:+.0f}%)",
        f"**GitHub:** [{latest['github_url']}]({latest['github_url']})",
    ]
    # Markdown needs two trailing spaces before the newline for a hard line
    # break; with a single space all fields merge into one paragraph.
    md = "  \n".join(facts)
    return md, fig
# Markdown shown in the UI. The text is kept at column 0 so no line is read as
# an indented code block, and blank lines separate paragraphs, headings, the
# table and the code fence so each one renders as its own block.
_INTRO_MD = f"""
# 📊 VC Deal Flow Signal — Interactive Explorer

Live engineering-velocity panel across **{N_STARTUPS}** venture-backed startups in **{N_SECTORS}** sectors over **{N_QUARTERS}** quarters of GitHub data.

Source: [`the-data-nerd/vc-deal-flow-signal`](https://huggingface.co/datasets/the-data-nerd/vc-deal-flow-signal) · CC-BY-4.0 · methodology on [SSRN 6606558](https://ssrn.com/abstract=6606558) · companion chat agent: [`vc-deal-flow-deepseek`](https://huggingface.co/spaces/the-data-nerd/vc-deal-flow-deepseek)
"""

_HEATMAP_NOTE_MD = (
    "_Each cell shows the **average 14-day commit velocity** of the startups tracked in that sector "
    "for that quarter. Brighter = more engineering throughput. Use it to spot sector-level rotations._"
)

_METHODOLOGY_MD = """
## How signals are computed

The dataset is derived live from the [GitHub REST API v3](https://docs.github.com/en/rest). For each tracked startup we sample its most active public organisation repository on a 14-day rolling window, four times per quarter.

**Working hypothesis (testable, falsifiable):** sustained engineering acceleration — commit velocity rising significantly above a startup's own baseline — tends to precede fundraise announcements by roughly 6–12 weeks.

The classifier maps every (startup, quarter) observation onto one of four signal types:

| Signal | Definition |
|---|---|
| `Engineering hiring burst` | Unique-contributor count spikes vs. trailing 90-day baseline |
| `Infrastructure buildout` | Multiple new public repos created in the last 30 days |
| `Deploy frequency spike` | Commit velocity ≥ 2× the trailing 90-day baseline |
| `Framework migration` | High commit volume with low contributor growth and zero new repos |

Full classifier source (MIT): [github.com/kindrat86/gitdealflow-signal-classifier](https://github.com/kindrat86/gitdealflow-signal-classifier)

## Cite this dataset

```bibtex
@dataset{vc_deal_flow_signal_2026,
author = {The Data Nerd},
title = {Startup GitHub Engineering Velocity Panel},
year = {2026},
publisher = {Zenodo},
doi = {10.5281/zenodo.19650920},
url = {https://huggingface.co/datasets/the-data-nerd/vc-deal-flow-signal}
}
```

## Live mirrors and related artefacts

- **HF dataset (this Space's source):** https://huggingface.co/datasets/the-data-nerd/vc-deal-flow-signal
- **Zenodo (DOI'd version):** https://zenodo.org/records/19650920 — concept DOI [10.5281/zenodo.19650919](https://doi.org/10.5281/zenodo.19650919)
- **Kaggle mirror:** https://www.kaggle.com/datasets/thedatanerd2026/vc-deal-flow-signal
- **Data.world mirror:** https://data.world/thedatanerd2026/vc-deal-flow-signal-startup-engineering-acceleration
- **SSRN preprint (methodology):** https://ssrn.com/abstract=6606558
- **Live MCP server (read-only, public):** https://signals.gitdealflow.com/api/mcp/rpc
- **Companion chat agent (Space):** https://huggingface.co/spaces/the-data-nerd/vc-deal-flow-deepseek
- **Production web app:** https://signals.gitdealflow.com
"""

_FOOTER_MD = """---

Built with [Gradio](https://gradio.app) on top of [Hugging Face Datasets](https://huggingface.co/docs/datasets). Code: MIT. Data: CC-BY-4.0. _Past acceleration does not guarantee future outcomes — this is alternative-data research, not investment advice._
"""

# Five-tab dashboard. Static views are rendered once at build time; the
# "Top movers" and "Startup drilldown" tabs re-render from their controls.
with gr.Blocks(title="VC Deal Flow Signal — Interactive Explorer", theme=gr.themes.Soft()) as demo:
    gr.Markdown(_INTRO_MD)

    with gr.Tabs():
        with gr.Tab("Overview"):
            gr.Markdown(overview_kpis())
            gr.Plot(value=overview_signal_share_fig(), label="Signal-type composition over quarters")
            gr.Markdown(f"### Top 10 movers — {PERIOD_LABEL[LATEST]}")
            gr.Dataframe(value=overview_top10(), interactive=False, wrap=True)

        with gr.Tab("Sector heatmap"):
            gr.Plot(value=sector_heatmap_fig(), label="Sector × Quarter heatmap")
            gr.Markdown(_HEATMAP_NOTE_MD)

        with gr.Tab("Top movers"):
            with gr.Row():
                period_dd = gr.Dropdown(["All"] + PERIOD_ORDER, value=LATEST, label="Quarter")
                sector_dd = gr.Dropdown(["All"] + SECTORS_SORTED, value="All", label="Sector")
                stage_dd = gr.Dropdown(["All"] + STAGES_SORTED, value="All", label="Stage")
                signal_dd = gr.Dropdown(["All"] + SIGNAL_TYPES, value="All", label="Signal type")
                topn_slider = gr.Slider(5, 50, value=15, step=5, label="Top N")

            # Pre-render with the same defaults the controls start with.
            init_table, init_fig = filter_movers(LATEST, "All", "All", "All", 15)
            movers_table = gr.Dataframe(value=init_table, label="Filtered movers", interactive=False, wrap=True)
            movers_fig = gr.Plot(value=init_fig, label="Velocity-change ranking")

            # Any control change recomputes both outputs from all five inputs.
            mover_inputs = [period_dd, sector_dd, stage_dd, signal_dd, topn_slider]
            for control in mover_inputs:
                control.change(
                    filter_movers,
                    inputs=mover_inputs,
                    outputs=[movers_table, movers_fig],
                )

        with gr.Tab("Startup drilldown"):
            # Guard the empty-dataset case: STARTUPS_SORTED[0] would raise
            # IndexError, whereas drilldown(None) returns its placeholder.
            default_startup = STARTUPS_SORTED[0] if STARTUPS_SORTED else None
            startup_dd = gr.Dropdown(STARTUPS_SORTED, value=default_startup, label="Pick a startup")
            init_md, init_drill_fig = drilldown(default_startup)
            drill_md = gr.Markdown(value=init_md)
            drill_fig = gr.Plot(value=init_drill_fig, label="Commit velocity over time")
            startup_dd.change(drilldown, inputs=[startup_dd], outputs=[drill_md, drill_fig])

        with gr.Tab("Methodology & cite"):
            gr.Markdown(_METHODOLOGY_MD)

    gr.Markdown(_FOOTER_MD)

if __name__ == "__main__":
    demo.launch()