Spaces:

the-data-nerd
/

vc-deal-flow-explorer

Sleeping

App Files Files Community

vc-deal-flow-explorer / app.py

the-data-nerd

chore: upload app.py

163ee0b verified 13 days ago

raw

history blame contribute delete

13.8 kB

	"""VC Deal Flow Signal — Interactive Explorer.

	Hugging Face Space that loads the live HF dataset
	(huggingface.co/datasets/the-data-nerd/vc-deal-flow-signal) and renders
	an interactive Gradio dashboard.

	5 tabs:
	- Overview : KPI strip + signal-type composition + top movers (latest quarter)
	- Sector heatmap : sector x quarter avg commit velocity
	- Top movers : filterable ranking of accelerating startups
	- Startup drilldown : per-startup four-quarter trajectory
	- Methodology & cite : SSRN, Zenodo, classifier code, MCP server, citation BibTeX

	The dataset is public (CC-BY-4.0); no auth required at runtime.
	"""
	from __future__ import annotations

	import gradio as gr
	import pandas as pd
	import plotly.express as px
	import plotly.graph_objects as go
	from huggingface_hub import hf_hub_download

	# Hugging Face dataset repository that every table in this app is loaded from.
	REPO_ID = "the-data-nerd/vc-deal-flow-signal"
	# Quarter keys used as the ordered categories of each table's ``period`` column.
	PERIOD_ORDER = ["q3-2025", "q4-2025", "q1-2026", "q2-2026"]
	# Display form of each quarter key, e.g. "q3-2025" -> "Q3 2025".
	PERIOD_LABEL = {period: period.replace("-", " ").upper() for period in PERIOD_ORDER}


	def load_csv(name: str) -> pd.DataFrame:
	    """Fetch one CSV file from the dataset repository and parse it.

	    Args:
	        name: File name inside the ``REPO_ID`` dataset repository.

	    Returns:
	        The file's contents as a DataFrame, read with ``pd.read_csv`` defaults.
	    """
	    return pd.read_csv(
	        hf_hub_download(repo_id=REPO_ID, filename=name, repo_type="dataset")
	    )


	def _period_sort_key(label: str) -> tuple[float, float, str]:
	    """Sort key that orders ``qN-YYYY`` quarter keys chronologically.

	    Keys that do not follow that pattern sort after every well-formed key.
	    """
	    quarter, _, year = label.partition("-")
	    if quarter[:1] == "q" and quarter[1:].isdigit() and year.isdigit():
	        return (int(year), int(quarter[1:]), label)
	    return (float("inf"), float("inf"), label)


	# The three CSV tables are downloaded and parsed once, at import time.
	SIGNALS = load_csv("startup_signals.csv")
	SECTORS = load_csv("sector_aggregates.csv")
	TIMESERIES = load_csv("signal_type_timeseries.csv")

	# pd.Categorical turns every value outside ``categories`` into NaN. Because the
	# dataset is live, a quarter newer than the hard-coded PERIOD_ORDER would
	# otherwise vanish from every period filter and from LATEST. Register unseen
	# quarter keys first, mutating in place so PERIOD_ORDER / PERIOD_LABEL remain
	# the same objects the rest of the module reads.
	_unseen_periods = {
	    value
	    for _df in (SIGNALS, SECTORS, TIMESERIES)
	    for value in _df["period"].dropna().unique()
	    if isinstance(value, str) and value not in PERIOD_ORDER
	}
	if _unseen_periods:
	    PERIOD_ORDER[:] = sorted({*PERIOD_ORDER, *_unseen_periods}, key=_period_sort_key)
	    PERIOD_LABEL.update({p: p.upper().replace("-", " ") for p in _unseen_periods})

	# An ordered categorical makes sorting and ``max()`` follow quarter order
	# instead of alphabetical order.
	for _df in (SIGNALS, SECTORS, TIMESERIES):
	    _df["period"] = pd.Categorical(_df["period"], categories=PERIOD_ORDER, ordered=True)

	# Highest quarter key present in the signals table; ``period`` is an ordered
	# categorical, so ``max()`` follows PERIOD_ORDER.
	LATEST = str(SIGNALS["period"].max())

	# Headline counts shown in the page header and the KPI strip.
	N_STARTUPS = SIGNALS["startup_name"].nunique()
	N_SECTORS = SIGNALS["sector_name"].nunique()
	N_QUARTERS = SIGNALS["period"].nunique()
	N_OBSERVATIONS = len(SIGNALS)

	# Dropdown choices. Missing values are removed before sorting: a NaN among
	# strings makes ``sorted`` raise TypeError, and ``nunique`` above already
	# ignores missing values, so the lists stay consistent with the counts.
	SECTORS_SORTED = sorted(SIGNALS["sector_name"].dropna().unique())
	STAGES_SORTED = sorted(s for s in SIGNALS["stage"].unique() if isinstance(s, str))
	SIGNAL_TYPES = sorted(s for s in SIGNALS["signal_type"].unique() if isinstance(s, str))
	STARTUPS_SORTED = sorted(SIGNALS["startup_name"].dropna().unique())


	def overview_kpis() -> str:
	    """Build the Markdown KPI strip shown at the top of the Overview tab.

	    Returns:
	        A line of dataset-wide counts followed by a line naming the startup
	        with the largest ``commit_velocity_change_pct`` in the ``LATEST``
	        quarter. Only the counts line is returned when that quarter has no rows.
	    """
	    counts = (
	        f"{N_STARTUPS} startups · {N_SECTORS} sectors · {N_QUARTERS} quarters · "
	        f"{N_OBSERVATIONS} observations"
	    )
	    latest_df = SIGNALS[SIGNALS["period"] == LATEST]
	    if latest_df.empty:
	        # Nothing to rank; ``.iloc[0]`` below would raise IndexError.
	        return counts
	    top_mover = latest_df.sort_values("commit_velocity_change_pct", ascending=False).iloc[0]
	    # Two trailing spaces before "\n" form a Markdown hard line break; with a
	    # single space the two lines are merged into one paragraph.
	    return (
	        f"{counts}  \n"
	        f"Top mover {PERIOD_LABEL[LATEST]}: `{top_mover['startup_name']}` "
	        f"({top_mover['sector_name']}) — `{top_mover['commit_velocity_change_pct']:+.0f}%` Δ commit velocity"
	    )


	def overview_signal_share_fig() -> go.Figure:
	    """Return a bar chart of ``share_of_total`` per quarter, coloured by signal type."""
	    # Work on a copy whose period column is plain strings; the x-axis order is
	    # then pinned explicitly through ``category_orders``.
	    frame = TIMESERIES.assign(period=TIMESERIES["period"].astype(str))
	    axis_labels = {
	        "share_of_total": "Share of total",
	        "period": "Quarter",
	        "signal_type": "Signal",
	    }
	    chart = px.bar(
	        frame,
	        x="period",
	        y="share_of_total",
	        color="signal_type",
	        title="Signal Type Share by Quarter",
	        labels=axis_labels,
	        category_orders={"period": PERIOD_ORDER},
	    )
	    chart.update_layout(
	        height=380,
	        legend_title_text="",
	        margin={"l": 20, "r": 20, "t": 50, "b": 20},
	    )
	    return chart


	def overview_top10() -> pd.DataFrame:
	    """Return the ten ``LATEST``-quarter rows with the largest velocity change.

	    The result keeps only the display columns, renamed to their table headers,
	    with a fresh 0-based index.
	    """
	    # Source column -> table header; insertion order is the column order.
	    display_names = {
	        "startup_name": "Startup",
	        "sector_name": "Sector",
	        "stage": "Stage",
	        "commit_velocity_14d": "Velocity (14d)",
	        "commit_velocity_change_pct": "Change %",
	        "signal_type": "Signal",
	        "github_url": "GitHub",
	    }
	    in_latest = SIGNALS["period"] == LATEST
	    ranked = SIGNALS[in_latest].sort_values("commit_velocity_change_pct", ascending=False)
	    top_rows = ranked.head(10)[list(display_names)]
	    return top_rows.rename(columns=display_names).reset_index(drop=True)


	def sector_heatmap_fig() -> go.Figure:
	    """Return a sector-by-quarter heatmap of mean ``avg_commit_velocity_14d``."""
	    # One row per sector, one column per quarter, averaging duplicate cells.
	    grid = SECTORS.pivot_table(
	        values="avg_commit_velocity_14d",
	        index="sector_name",
	        columns="period",
	        aggfunc="mean",
	        observed=True,
	    )
	    # Force the column order to PERIOD_ORDER, then order the sector rows.
	    grid = grid.reindex(columns=PERIOD_ORDER).sort_index()

	    heatmap = px.imshow(
	        grid,
	        title="Average Commit Velocity by Sector × Quarter",
	        labels={"x": "Quarter", "y": "Sector", "color": "Avg Commit Velocity (14d)"},
	        color_continuous_scale="Viridis",
	        aspect="auto",
	        text_auto=".0f",
	    )
	    heatmap.update_layout(height=620, margin={"l": 20, "r": 20, "t": 50, "b": 20})
	    return heatmap


	def filter_movers(period: str, sector: str, stage: str, signal: str, top_n: int):
	    """Filter the signals table and rank the rows by commit-velocity change.

	    Args:
	        period: Quarter key to keep, or ``"All"`` for no period filter.
	        sector: Sector name to keep, or ``"All"``.
	        stage: Stage to keep, or ``"All"``.
	        signal: Signal type to keep, or ``"All"``.
	        top_n: Maximum number of rows to return (converted with ``int``).

	    Returns:
	        A ``(table, figure)`` pair. ``table`` always has the same renamed display
	        columns, even when no row matches; ``figure`` is a horizontal bar chart
	        of the ranked rows, or a placeholder figure when the table is empty.
	    """
	    # Source column -> table header; insertion order is the column order.
	    display_names = {
	        "startup_name": "Startup",
	        "sector_name": "Sector",
	        "stage": "Stage",
	        "commit_velocity_14d": "Velocity (14d)",
	        "commit_velocity_change_pct": "Change %",
	        "signal_type": "Signal",
	        "github_url": "GitHub",
	    }

	    # Every step below returns a new frame, so SIGNALS itself is never mutated
	    # and no up-front copy is needed.
	    df = SIGNALS
	    selections = (
	        ("period", period),
	        ("sector_name", sector),
	        ("stage", stage),
	        ("signal_type", signal),
	    )
	    for column, choice in selections:
	        if choice != "All":
	            df = df[df[column] == choice]

	    df = df.sort_values("commit_velocity_change_pct", ascending=False).head(int(top_n))

	    # Build the table before the empty check so that an empty result keeps the
	    # same headers as a populated one instead of exposing the raw columns.
	    table = df[list(display_names)].rename(columns=display_names).reset_index(drop=True)

	    if df.empty:
	        empty_fig = go.Figure()
	        empty_fig.add_annotation(text="No rows match the selected filters", x=0.5, y=0.5, showarrow=False)
	        empty_fig.update_layout(height=380, margin=dict(l=20, r=20, t=50, b=20))
	        return table, empty_fig

	    fig = px.bar(
	        df,
	        y="startup_name",
	        x="commit_velocity_change_pct",
	        color="signal_type",
	        orientation="h",
	        title=f"Top {len(df)} Movers — Δ Commit Velocity",
	        labels={"commit_velocity_change_pct": "Velocity Change %", "startup_name": "Startup", "signal_type": "Signal"},
	        hover_data=["sector_name", "stage", "commit_velocity_14d"],
	    )
	    fig.update_layout(
	        yaxis={"categoryorder": "total ascending"},
	        # Grow the chart with the number of bars so labels stay readable.
	        height=max(380, 28 * len(df)),
	        margin=dict(l=20, r=20, t=50, b=20),
	        legend_title_text="",
	    )
	    return table, fig


	def drilldown(startup: str):
	    """Build the profile text and velocity chart for one startup.

	    Args:
	        startup: Value of the ``startup_name`` column to look up. A falsy value
	            yields a prompt message and an empty figure.

	    Returns:
	        A ``(markdown, figure)`` pair: a Markdown summary taken from the
	        startup's last row in period order, and a line chart of
	        ``commit_velocity_14d`` per quarter.
	    """
	    if not startup:
	        return "_Pick a startup above_", go.Figure()
	    df = SIGNALS[SIGNALS["startup_name"] == startup].copy()
	    if df.empty:
	        return f"_No rows for `{startup}`_", go.Figure()
	    df = df.sort_values("period")
	    df["period_str"] = df["period"].astype(str)

	    fig = go.Figure()
	    fig.add_trace(
	        go.Scatter(
	            x=df["period_str"],
	            y=df["commit_velocity_14d"],
	            mode="lines+markers+text",
	            name="Commit velocity (14d)",
	            text=df["commit_velocity_14d"].astype(str),
	            textposition="top center",
	        )
	    )
	    fig.update_layout(
	        title=f"{startup} — commit velocity trajectory",
	        xaxis_title="Quarter",
	        yaxis_title="Commit velocity (14d)",
	        height=380,
	        margin=dict(l=20, r=20, t=50, b=20),
	    )

	    latest = df.iloc[-1]
	    # Rows with a missing period sort last, so the final row may carry a period
	    # that has no label; fall back to the raw key instead of raising KeyError.
	    period_key = str(latest["period"])
	    period_label = PERIOD_LABEL.get(period_key, period_key)
	    # Two trailing spaces before "\n" form a Markdown hard line break; with a
	    # single space all fields are merged into one paragraph.
	    md = (
	        f"Sector: {latest['sector_name']}  \n"
	        f"Stage: {latest['stage']}  \n"
	        f"Geography: {latest['geography']}  \n"
	        f"Latest signal ({period_label}): `{latest['signal_type']}`  \n"
	        f"Commit velocity (14d): {latest['commit_velocity_14d']} (Δ {latest['commit_velocity_change_pct']:+.0f}%)  \n"
	        f"Contributors: {latest['contributors']} (growth {latest['contributor_growth_pct']:+.0f}%)  \n"
	        f"GitHub: [{latest['github_url']}]({latest['github_url']})"
	    )
	    return md, fig


	# Dashboard layout: a header, five tabs, and a footer. Static charts and tables
	# are computed once while the layout is built; the "Top movers" and "Startup
	# drilldown" tabs recompute their outputs through ``change`` events.
	with gr.Blocks(title="VC Deal Flow Signal — Interactive Explorer", theme=gr.themes.Soft()) as demo:
	    gr.Markdown(
	        f"""
	# 📊 VC Deal Flow Signal — Interactive Explorer

	Live engineering-velocity panel across {N_STARTUPS} venture-backed startups in {N_SECTORS} sectors over {N_QUARTERS} quarters of GitHub data.

	Source: [`the-data-nerd/vc-deal-flow-signal`](https://huggingface.co/datasets/the-data-nerd/vc-deal-flow-signal) · CC-BY-4.0 · methodology on [SSRN 6606558](https://ssrn.com/abstract=6606558) · companion chat agent: [`vc-deal-flow-deepseek`](https://huggingface.co/spaces/the-data-nerd/vc-deal-flow-deepseek)
	"""
	    )

	    with gr.Tabs():
	        with gr.Tab("Overview"):
	            gr.Markdown(overview_kpis())
	            gr.Plot(value=overview_signal_share_fig(), label="Signal-type composition over quarters")
	            gr.Markdown(f"### Top 10 movers — {PERIOD_LABEL[LATEST]}")
	            gr.Dataframe(value=overview_top10(), interactive=False, wrap=True)

	        with gr.Tab("Sector heatmap"):
	            gr.Plot(value=sector_heatmap_fig(), label="Sector × Quarter heatmap")
	            gr.Markdown(
	                "_Each cell shows the average 14-day commit velocity of the startups tracked in that sector "
	                "for that quarter. Brighter = more engineering throughput. Use it to spot sector-level rotations._"
	            )

	        with gr.Tab("Top movers"):
	            with gr.Row():
	                period_dd = gr.Dropdown(["All"] + PERIOD_ORDER, value=LATEST, label="Quarter")
	                sector_dd = gr.Dropdown(["All"] + SECTORS_SORTED, value="All", label="Sector")
	                stage_dd = gr.Dropdown(["All"] + STAGES_SORTED, value="All", label="Stage")
	                signal_dd = gr.Dropdown(["All"] + SIGNAL_TYPES, value="All", label="Signal type")
	                topn_slider = gr.Slider(5, 50, value=15, step=5, label="Top N")

	            # Initial render uses the same values as the control defaults above.
	            init_table, init_fig = filter_movers(LATEST, "All", "All", "All", 15)
	            movers_table = gr.Dataframe(value=init_table, label="Filtered movers", interactive=False, wrap=True)
	            movers_fig = gr.Plot(value=init_fig, label="Velocity-change ranking")

	            # Any control change re-runs the filter with the state of all five.
	            for control in (period_dd, sector_dd, stage_dd, signal_dd, topn_slider):
	                control.change(
	                    filter_movers,
	                    inputs=[period_dd, sector_dd, stage_dd, signal_dd, topn_slider],
	                    outputs=[movers_table, movers_fig],
	                )

	        with gr.Tab("Startup drilldown"):
	            # Guard against an empty startup list: ``STARTUPS_SORTED[0]`` would
	            # raise IndexError, whereas ``drilldown`` returns a prompt message
	            # for a falsy selection.
	            default_startup = STARTUPS_SORTED[0] if STARTUPS_SORTED else None
	            startup_dd = gr.Dropdown(STARTUPS_SORTED, value=default_startup, label="Pick a startup")
	            init_md, init_drill_fig = drilldown(default_startup)
	            drill_md = gr.Markdown(value=init_md)
	            drill_fig = gr.Plot(value=init_drill_fig, label="Commit velocity over time")
	            startup_dd.change(drilldown, inputs=[startup_dd], outputs=[drill_md, drill_fig])

	        with gr.Tab("Methodology & cite"):
	            # Table rows use bare pipes: a backslash-escaped pipe is an invalid
	            # Python escape sequence and is literal text in Markdown, which
	            # keeps the rows from being parsed as a table.
	            gr.Markdown(
	                """
	## How signals are computed

	The dataset is derived live from the [GitHub REST API v3](https://docs.github.com/en/rest). For each tracked startup we sample its most active public organisation repository on a 14-day rolling window, four times per quarter.

	Working hypothesis (testable, falsifiable): sustained engineering acceleration — commit velocity rising significantly above a startup's own baseline — tends to precede fundraise announcements by roughly 6–12 weeks.

	The classifier maps every (startup, quarter) observation onto one of four signal types:

	| Signal | Definition |
	|---|---|
	| `Engineering hiring burst` | Unique-contributor count spikes vs. trailing 90-day baseline |
	| `Infrastructure buildout` | Multiple new public repos created in the last 30 days |
	| `Deploy frequency spike` | Commit velocity ≥ 2× the trailing 90-day baseline |
	| `Framework migration` | High commit volume with low contributor growth and zero new repos |

	Full classifier source (MIT): [github.com/kindrat86/gitdealflow-signal-classifier](https://github.com/kindrat86/gitdealflow-signal-classifier)

	## Cite this dataset

	```bibtex
	@dataset{vc_deal_flow_signal_2026,
	author = {The Data Nerd},
	title = {Startup GitHub Engineering Velocity Panel},
	year = {2026},
	publisher = {Zenodo},
	doi = {10.5281/zenodo.19650920},
	url = {https://huggingface.co/datasets/the-data-nerd/vc-deal-flow-signal}
	}
	```

	## Live mirrors and related artefacts

	- HF dataset (this Space's source): https://huggingface.co/datasets/the-data-nerd/vc-deal-flow-signal
	- Zenodo (DOI'd version): https://zenodo.org/records/19650920 — concept DOI [10.5281/zenodo.19650919](https://doi.org/10.5281/zenodo.19650919)
	- Kaggle mirror: https://www.kaggle.com/datasets/thedatanerd2026/vc-deal-flow-signal
	- Data.world mirror: https://data.world/thedatanerd2026/vc-deal-flow-signal-startup-engineering-acceleration
	- SSRN preprint (methodology): https://ssrn.com/abstract=6606558
	- Live MCP server (read-only, public): https://signals.gitdealflow.com/api/mcp/rpc
	- Companion chat agent (Space): https://huggingface.co/spaces/the-data-nerd/vc-deal-flow-deepseek
	- Production web app: https://signals.gitdealflow.com
	"""
	            )

	    gr.Markdown(
	        """---
	Built with [Gradio](https://gradio.app) on top of [Hugging Face Datasets](https://huggingface.co/docs/datasets). Code: MIT. Data: CC-BY-4.0. _Past acceleration does not guarantee future outcomes — this is alternative-data research, not investment advice._
	"""
	    )


	# Launch the Gradio app only when this file is executed as a script; importing
	# the module builds ``demo`` without calling ``launch()``.
	if __name__ == "__main__":
	demo.launch()