# the-data-nerd's picture
# chore: upload app.py
# 163ee0b verified
"""VC Deal Flow Signal — Interactive Explorer.
Hugging Face Space that loads the live HF dataset
(huggingface.co/datasets/the-data-nerd/vc-deal-flow-signal) and renders
an interactive Gradio dashboard.
5 tabs:
- Overview : KPI strip + signal-type composition + top movers (latest quarter)
- Sector heatmap : sector x quarter avg commit velocity
- Top movers : filterable ranking of accelerating startups
- Startup drilldown : per-startup four-quarter trajectory
- Methodology & cite : SSRN, Zenodo, classifier code, MCP server, citation BibTeX
The dataset is public (CC-BY-4.0); no auth required at runtime.
"""
from __future__ import annotations
import gradio as gr
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from huggingface_hub import hf_hub_download
# Hugging Face dataset repository that every table in this app is read from.
REPO_ID = "the-data-nerd/vc-deal-flow-signal"

# Quarter slugs in the order they are displayed and sorted.
PERIOD_ORDER = [
    "q3-2025",
    "q4-2025",
    "q1-2026",
    "q2-2026",
]

# Display label for each slug: upper-cased, hyphen replaced by a space
# (e.g. "q3-2025" -> "Q3 2025").
PERIOD_LABEL = dict(
    zip(PERIOD_ORDER, (" ".join(slug.upper().split("-")) for slug in PERIOD_ORDER))
)
def load_csv(name: str) -> pd.DataFrame:
    """Fetch one CSV file from the dataset repository and parse it.

    Args:
        name: File name inside the ``REPO_ID`` dataset repo.

    Returns:
        The file contents parsed by ``pandas.read_csv`` with default options.
    """
    return pd.read_csv(
        hf_hub_download(repo_id=REPO_ID, filename=name, repo_type="dataset")
    )
# The three dataset tables are downloaded once, at import time, and shared by
# every callback below.
SIGNALS = load_csv("startup_signals.csv")
SECTORS = load_csv("sector_aggregates.csv")
TIMESERIES = load_csv("signal_type_timeseries.csv")

# An ordered categorical makes sorting and max() follow PERIOD_ORDER instead
# of plain string order. Values not listed in PERIOD_ORDER become NaN.
for _df in (SIGNALS, SECTORS, TIMESERIES):
    _df["period"] = pd.Categorical(_df["period"], categories=PERIOD_ORDER, ordered=True)

# Most recent quarter present in the signals table (as a plain string slug).
LATEST = str(SIGNALS["period"].max())

# Headline counts shown in the page header and the KPI strip.
N_STARTUPS = SIGNALS["startup_name"].nunique()
N_SECTORS = SIGNALS["sector_name"].nunique()
N_QUARTERS = SIGNALS["period"].nunique()
N_OBSERVATIONS = len(SIGNALS)


def _sorted_labels(column: pd.Series) -> list[str]:
    """Return the distinct string values of *column*, sorted.

    Non-string cells (e.g. NaN from an empty CSV field) are skipped: mixing
    them with strings would make ``sorted()`` raise ``TypeError``.
    """
    return sorted(value for value in column.unique() if isinstance(value, str))


# Dropdown choices. All four lists use the same string-only filter so that a
# single missing cell in any of these columns cannot break app start-up.
SECTORS_SORTED = _sorted_labels(SIGNALS["sector_name"])
STAGES_SORTED = _sorted_labels(SIGNALS["stage"])
SIGNAL_TYPES = _sorted_labels(SIGNALS["signal_type"])
STARTUPS_SORTED = _sorted_labels(SIGNALS["startup_name"])
def overview_kpis() -> str:
    """Build the Markdown KPI strip shown at the top of the Overview tab.

    Returns:
        A Markdown string. The first line carries the dataset-wide counts.
        When the latest quarter has at least one row with a usable velocity
        change, a second line names the startup with the largest change.
    """
    counts = (
        f"**{N_STARTUPS}** startups · **{N_SECTORS}** sectors · "
        f"**{N_QUARTERS}** quarters · **{N_OBSERVATIONS}** observations"
    )

    latest_df = SIGNALS[SIGNALS["period"] == LATEST]
    if latest_df.empty:
        # No rows for the latest quarter: there is no top mover to report.
        return counts

    # sort_values places NaN last, so the first row is NaN only when every
    # row in the quarter lacks a change value.
    top_mover = latest_df.sort_values("commit_velocity_change_pct", ascending=False).iloc[0]
    if pd.isna(top_mover["commit_velocity_change_pct"]):
        return counts

    return (
        # Two trailing spaces before the newline form a Markdown hard line
        # break; a single space would merge both lines into one paragraph.
        f"{counts}  \n"
        f"**Top mover {PERIOD_LABEL[LATEST]}**: `{top_mover['startup_name']}` "
        f"({top_mover['sector_name']}) — `{top_mover['commit_velocity_change_pct']:+.0f}%` Δ commit velocity"
    )
def overview_signal_share_fig() -> go.Figure:
    """Build the bar chart of signal-type share per quarter.

    Returns:
        A Plotly figure with one bar segment per (quarter, signal type),
        quarters laid out in ``PERIOD_ORDER``.
    """
    # Work on a new frame whose period column holds plain strings rather than
    # the ordered categorical; the module-level table is left untouched.
    frame = TIMESERIES.assign(period=TIMESERIES["period"].astype(str))

    axis_labels = {
        "share_of_total": "Share of total",
        "period": "Quarter",
        "signal_type": "Signal",
    }
    chart = px.bar(
        frame,
        x="period",
        y="share_of_total",
        color="signal_type",
        title="Signal Type Share by Quarter",
        labels=axis_labels,
        category_orders={"period": PERIOD_ORDER},
    )
    chart.update_layout(
        height=380,
        legend_title_text="",
        margin={"l": 20, "r": 20, "t": 50, "b": 20},
    )
    return chart
def overview_top10() -> pd.DataFrame:
    """Return the ten largest velocity-change rows of the latest quarter.

    Returns:
        A DataFrame with display-ready column headers and a fresh 0..n-1
        index, ordered by descending ``commit_velocity_change_pct``.
    """
    # Source column -> display header. Insertion order is the column order
    # of the returned table.
    headers = {
        "startup_name": "Startup",
        "sector_name": "Sector",
        "stage": "Stage",
        "commit_velocity_14d": "Velocity (14d)",
        "commit_velocity_change_pct": "Change %",
        "signal_type": "Signal",
        "github_url": "GitHub",
    }

    in_latest = SIGNALS["period"] == LATEST
    ranked = SIGNALS[in_latest].sort_values("commit_velocity_change_pct", ascending=False)
    top_rows = ranked.head(10)[list(headers)]
    return top_rows.rename(columns=headers).reset_index(drop=True)
def sector_heatmap_fig() -> go.Figure:
    """Build the sector-by-quarter heatmap of average commit velocity.

    Returns:
        A Plotly heatmap with one row per sector (sorted by sector name) and
        one column per entry of ``PERIOD_ORDER``.
    """
    grid = SECTORS.pivot_table(
        index="sector_name",
        columns="period",
        values="avg_commit_velocity_14d",
        aggfunc="mean",
        observed=True,
    )
    # Force the full quarter list as columns (quarters without data become
    # NaN columns), then order the rows by sector name.
    grid = grid.reindex(columns=PERIOD_ORDER)
    grid = grid.sort_index()

    heatmap = px.imshow(
        grid,
        labels={"x": "Quarter", "y": "Sector", "color": "Avg Commit Velocity (14d)"},
        aspect="auto",
        color_continuous_scale="Viridis",
        title="Average Commit Velocity by Sector × Quarter",
        text_auto=".0f",
    )
    heatmap.update_layout(height=620, margin={"l": 20, "r": 20, "t": 50, "b": 20})
    return heatmap
def filter_movers(
    period: str, sector: str, stage: str, signal: str, top_n: int
) -> tuple[pd.DataFrame, go.Figure]:
    """Rank startups by commit-velocity change after applying the UI filters.

    Args:
        period: Quarter slug to keep, or ``"All"`` for every quarter.
        sector: Sector name to keep, or ``"All"``.
        stage: Stage to keep, or ``"All"``.
        signal: Signal type to keep, or ``"All"``.
        top_n: Maximum number of rows to keep; coerced with ``int()``.

    Returns:
        ``(table, fig)``. ``table`` always has the same seven display
        columns, including when no row matches, so the Dataframe component
        keeps a stable header. ``fig`` is a horizontal bar chart of the kept
        rows, or a placeholder figure with a message when there are none.
    """
    # Source column -> display header. Insertion order is the column order
    # of the returned table.
    headers = {
        "startup_name": "Startup",
        "sector_name": "Sector",
        "stage": "Stage",
        "commit_velocity_14d": "Velocity (14d)",
        "commit_velocity_change_pct": "Change %",
        "signal_type": "Signal",
        "github_url": "GitHub",
    }
    selections = {
        "period": period,
        "sector_name": sector,
        "stage": stage,
        "signal_type": signal,
    }

    # Boolean indexing and sort_values both return new frames, so the shared
    # SIGNALS table is never modified and no defensive copy is needed.
    df = SIGNALS
    for column, wanted in selections.items():
        if wanted != "All":
            df = df[df[column] == wanted]
    df = df.sort_values("commit_velocity_change_pct", ascending=False).head(int(top_n))

    # Built before the empty check so both return paths share one schema.
    table = df[list(headers)].rename(columns=headers).reset_index(drop=True)

    if df.empty:
        empty_fig = go.Figure()
        empty_fig.add_annotation(text="No rows match the selected filters", x=0.5, y=0.5, showarrow=False)
        empty_fig.update_layout(height=380, margin=dict(l=20, r=20, t=50, b=20))
        return table, empty_fig

    fig = px.bar(
        df,
        y="startup_name",
        x="commit_velocity_change_pct",
        color="signal_type",
        orientation="h",
        title=f"Top {len(df)} Movers — Δ Commit Velocity",
        labels={"commit_velocity_change_pct": "Velocity Change %", "startup_name": "Startup", "signal_type": "Signal"},
        hover_data=["sector_name", "stage", "commit_velocity_14d"],
    )
    fig.update_layout(
        # Largest change at the top of the horizontal chart.
        yaxis={"categoryorder": "total ascending"},
        # Grow the chart with the number of bars, never below 380 px.
        height=max(380, 28 * len(df)),
        margin=dict(l=20, r=20, t=50, b=20),
        legend_title_text="",
    )
    return table, fig
def drilldown(startup: str) -> tuple[str, go.Figure]:
    """Build the per-startup summary and commit-velocity trajectory.

    Args:
        startup: Value of ``startup_name`` to look up. A falsy value yields a
            prompt to pick a startup.

    Returns:
        ``(markdown, fig)``. ``markdown`` summarises the startup's last row in
        period order; ``fig`` plots ``commit_velocity_14d`` per quarter. When
        nothing is selected or no row matches, ``markdown`` is a short notice
        and ``fig`` is an empty figure.
    """
    if not startup:
        return "_Pick a startup above_", go.Figure()

    df = SIGNALS[SIGNALS["startup_name"] == startup].copy()
    if df.empty:
        return f"_No rows for `{startup}`_", go.Figure()

    # "period" is an ordered categorical, so this sorts by PERIOD_ORDER.
    df = df.sort_values("period")
    df["period_str"] = df["period"].astype(str)

    fig = go.Figure()
    fig.add_trace(
        go.Scatter(
            x=df["period_str"],
            y=df["commit_velocity_14d"],
            mode="lines+markers+text",
            name="Commit velocity (14d)",
            text=df["commit_velocity_14d"].astype(str),
            textposition="top center",
        )
    )
    fig.update_layout(
        title=f"{startup} — commit velocity trajectory",
        xaxis_title="Quarter",
        yaxis_title="Commit velocity (14d)",
        height=380,
        margin=dict(l=20, r=20, t=50, b=20),
    )

    latest = df.iloc[-1]
    # Fall back to the raw value when the period has no label (for example a
    # missing period), instead of raising KeyError.
    period_key = str(latest["period"])
    period_label = PERIOD_LABEL.get(period_key, period_key)

    # Each line ends with two spaces before the newline: a Markdown hard line
    # break, so every field is rendered on its own line.
    md = (
        f"**Sector:** {latest['sector_name']}  \n"
        f"**Stage:** {latest['stage']}  \n"
        f"**Geography:** {latest['geography']}  \n"
        f"**Latest signal ({period_label}):** `{latest['signal_type']}`  \n"
        f"**Commit velocity (14d):** {latest['commit_velocity_14d']} ({latest['commit_velocity_change_pct']:+.0f}%)  \n"
        f"**Contributors:** {latest['contributors']} (growth {latest['contributor_growth_pct']:+.0f}%)  \n"
        f"**GitHub:** [{latest['github_url']}]({latest['github_url']})"
    )
    return md, fig
# Static page copy, kept apart from the layout code below so the component
# tree stays readable. The text itself is rendered by gr.Markdown.
_INTRO_MD = f"""
# 📊 VC Deal Flow Signal — Interactive Explorer
Live engineering-velocity panel across **{N_STARTUPS}** venture-backed startups in **{N_SECTORS}** sectors over **{N_QUARTERS}** quarters of GitHub data.
Source: [`the-data-nerd/vc-deal-flow-signal`](https://huggingface.co/datasets/the-data-nerd/vc-deal-flow-signal) · CC-BY-4.0 · methodology on [SSRN 6606558](https://ssrn.com/abstract=6606558) · companion chat agent: [`vc-deal-flow-deepseek`](https://huggingface.co/spaces/the-data-nerd/vc-deal-flow-deepseek)
"""

_HEATMAP_NOTE_MD = (
    "_Each cell shows the **average 14-day commit velocity** of the startups tracked in that sector "
    "for that quarter. Brighter = more engineering throughput. Use it to spot sector-level rotations._"
)

_METHODOLOGY_MD = """
## How signals are computed
The dataset is derived live from the [GitHub REST API v3](https://docs.github.com/en/rest). For each tracked startup we sample its most active public organisation repository on a 14-day rolling window, four times per quarter.
**Working hypothesis (testable, falsifiable):** sustained engineering acceleration — commit velocity rising significantly above a startup's own baseline — tends to precede fundraise announcements by roughly 6–12 weeks.
The classifier maps every (startup, quarter) observation onto one of four signal types:
| Signal | Definition |
|---|---|
| `Engineering hiring burst` | Unique-contributor count spikes vs. trailing 90-day baseline |
| `Infrastructure buildout` | Multiple new public repos created in the last 30 days |
| `Deploy frequency spike` | Commit velocity ≥ 2× the trailing 90-day baseline |
| `Framework migration` | High commit volume with low contributor growth and zero new repos |
Full classifier source (MIT): [github.com/kindrat86/gitdealflow-signal-classifier](https://github.com/kindrat86/gitdealflow-signal-classifier)
## Cite this dataset
```bibtex
@dataset{vc_deal_flow_signal_2026,
author = {The Data Nerd},
title = {Startup GitHub Engineering Velocity Panel},
year = {2026},
publisher = {Zenodo},
doi = {10.5281/zenodo.19650920},
url = {https://huggingface.co/datasets/the-data-nerd/vc-deal-flow-signal}
}
```
## Live mirrors and related artefacts
- **HF dataset (this Space's source):** https://huggingface.co/datasets/the-data-nerd/vc-deal-flow-signal
- **Zenodo (DOI'd version):** https://zenodo.org/records/19650920 — concept DOI [10.5281/zenodo.19650919](https://doi.org/10.5281/zenodo.19650919)
- **Kaggle mirror:** https://www.kaggle.com/datasets/thedatanerd2026/vc-deal-flow-signal
- **Data.world mirror:** https://data.world/thedatanerd2026/vc-deal-flow-signal-startup-engineering-acceleration
- **SSRN preprint (methodology):** https://ssrn.com/abstract=6606558
- **Live MCP server (read-only, public):** https://signals.gitdealflow.com/api/mcp/rpc
- **Companion chat agent (Space):** https://huggingface.co/spaces/the-data-nerd/vc-deal-flow-deepseek
- **Production web app:** https://signals.gitdealflow.com
"""

_FOOTER_MD = """---
Built with [Gradio](https://gradio.app) on top of [Hugging Face Datasets](https://huggingface.co/docs/datasets). Code: MIT. Data: CC-BY-4.0. _Past acceleration does not guarantee future outcomes — this is alternative-data research, not investment advice._
"""

# Page layout: header, five tabs, footer. Components are created in display
# order inside the Blocks context.
with gr.Blocks(title="VC Deal Flow Signal — Interactive Explorer", theme=gr.themes.Soft()) as demo:
    gr.Markdown(_INTRO_MD)

    with gr.Tabs():
        # Tab 1: static content computed once when the app is built.
        with gr.Tab("Overview"):
            gr.Markdown(overview_kpis())
            gr.Plot(value=overview_signal_share_fig(), label="Signal-type composition over quarters")
            gr.Markdown(f"### Top 10 movers — {PERIOD_LABEL[LATEST]}")
            gr.Dataframe(value=overview_top10(), interactive=False, wrap=True)

        # Tab 2: static heatmap plus a reading hint.
        with gr.Tab("Sector heatmap"):
            gr.Plot(value=sector_heatmap_fig(), label="Sector × Quarter heatmap")
            gr.Markdown(_HEATMAP_NOTE_MD)

        # Tab 3: filter controls driving a table and a chart.
        with gr.Tab("Top movers"):
            with gr.Row():
                period_dd = gr.Dropdown(["All"] + PERIOD_ORDER, value=LATEST, label="Quarter")
                sector_dd = gr.Dropdown(["All"] + SECTORS_SORTED, value="All", label="Sector")
                stage_dd = gr.Dropdown(["All"] + STAGES_SORTED, value="All", label="Stage")
                signal_dd = gr.Dropdown(["All"] + SIGNAL_TYPES, value="All", label="Signal type")
                topn_slider = gr.Slider(5, 50, value=15, step=5, label="Top N")

            # Initial contents use the same values the controls start with.
            init_table, init_fig = filter_movers(LATEST, "All", "All", "All", 15)
            movers_table = gr.Dataframe(value=init_table, label="Filtered movers", interactive=False, wrap=True)
            movers_fig = gr.Plot(value=init_fig, label="Velocity-change ranking")

            # Changing any one control recomputes both outputs from all five.
            movers_inputs = (period_dd, sector_dd, stage_dd, signal_dd, topn_slider)
            for control in movers_inputs:
                control.change(
                    filter_movers,
                    inputs=list(movers_inputs),
                    outputs=[movers_table, movers_fig],
                )

        # Tab 4: one dropdown driving a summary and a trajectory chart.
        with gr.Tab("Startup drilldown"):
            startup_dd = gr.Dropdown(STARTUPS_SORTED, value=STARTUPS_SORTED[0], label="Pick a startup")
            init_md, init_drill_fig = drilldown(STARTUPS_SORTED[0])
            drill_md = gr.Markdown(value=init_md)
            drill_fig = gr.Plot(value=init_drill_fig, label="Commit velocity over time")
            startup_dd.change(drilldown, inputs=[startup_dd], outputs=[drill_md, drill_fig])

        # Tab 5: static methodology, citation and links.
        with gr.Tab("Methodology & cite"):
            gr.Markdown(_METHODOLOGY_MD)

    gr.Markdown(_FOOTER_MD)

# Start the server only when run as a script, not when imported.
if __name__ == "__main__":
    demo.launch()