Spaces:
Build error
Build error
Initial dashboard deployment
Browse files- Dockerfile +35 -0
- README.md +26 -4
- app.py +86 -0
- dashboard_helpers.py +61 -0
- data_pipeline.py +146 -0
- requirements.txt +0 -0
Dockerfile
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.11-slim
|
| 2 |
+
|
| 3 |
+
# System deps for GDAL/geopandas
|
| 4 |
+
RUN apt-get update && apt-get install -y --no-install-recommends \
|
| 5 |
+
libgdal-dev \
|
| 6 |
+
gdal-bin \
|
| 7 |
+
libgeos-dev \
|
| 8 |
+
libproj-dev \
|
| 9 |
+
&& rm -rf /var/lib/apt/lists/*
|
| 10 |
+
|
| 11 |
+
WORKDIR /app
|
| 12 |
+
|
| 13 |
+
# Install Python deps
|
| 14 |
+
COPY requirements.txt .
|
| 15 |
+
RUN pip install --no-cache-dir -r requirements.txt
|
| 16 |
+
|
| 17 |
+
# Copy application files
|
| 18 |
+
COPY data_pipeline.py .
|
| 19 |
+
COPY dashboard_helpers.py .
|
| 20 |
+
COPY app.py .
|
| 21 |
+
|
| 22 |
+
# Pre-download databases and bake the data
|
| 23 |
+
RUN python -c "import leafmap; \
|
| 24 |
+
leafmap.download_file( \
|
| 25 |
+
'https://opengeos.org/data/duckdb/nyc_data.db.zip', \
|
| 26 |
+
unzip=True, \
|
| 27 |
+
overwrite=True \
|
| 28 |
+
)"
|
| 29 |
+
RUN python data_pipeline.py
|
| 30 |
+
|
| 31 |
+
# Expose the port HF Spaces expects
|
| 32 |
+
EXPOSE 7860
|
| 33 |
+
|
| 34 |
+
# Launch Solara
|
| 35 |
+
CMD ["solara", "run", "app.py", "--host=0.0.0.0", "--port=7860"]
|
README.md
CHANGED
|
@@ -1,11 +1,33 @@
|
|
| 1 |
---
|
| 2 |
-
title:
|
| 3 |
-
emoji: 🏆
|
| 4 |
colorFrom: blue
|
| 5 |
-
colorTo:
|
| 6 |
sdk: docker
|
| 7 |
pinned: false
|
| 8 |
license: mit
|
| 9 |
---
|
| 10 |
|
| 11 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
+
title: NYC Taxi Mobility Equity Dashboard
|
|
|
|
| 3 |
colorFrom: blue
|
| 4 |
+
colorTo: green
|
| 5 |
sdk: docker
|
| 6 |
pinned: false
|
| 7 |
license: mit
|
| 8 |
---
|
| 9 |
|
| 10 |
+
# NYC Taxi Mobility Equity Dashboard
|
| 11 |
+
|
| 12 |
+
An interactive Solara dashboard analyzing whether **FHV** and **Yellow Taxi**
|
| 13 |
+
services are equitably distributed across NYC neighborhoods relative to
|
| 14 |
+
demographic baselines.
|
| 15 |
+
|
| 16 |
+
## Features
|
| 17 |
+
|
| 18 |
+
- Pickup & drop-off choropleth maps for FHV and Yellow Taxi
|
| 19 |
+
- Diverging demographic maps showing deviation from citywide baseline
|
| 20 |
+
- Top-10 stats tables split by service and direction
|
| 21 |
+
- Month selector to explore Jan-Mar 2025 data
|
| 22 |
+
|
| 23 |
+
## Data Sources
|
| 24 |
+
|
| 25 |
+
- NYC Taxi Zone geometry: data.source.coop
|
| 26 |
+
- NYC Census Blocks: opengeos.org
|
| 27 |
+
- Trip Records: NYC TLC (Jan-Mar 2025)
|
| 28 |
+
|
| 29 |
+
## Architecture
|
| 30 |
+
|
| 31 |
+
- `data_pipeline.py`: one-time data processing, writes to `processed_dashboard.db`
|
| 32 |
+
- `dashboard_helpers.py`: reusable map and stats functions
|
| 33 |
+
- `app.py`: Solara UI components and layout
|
app.py
ADDED
|
@@ -0,0 +1,86 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import solara
|
| 2 |
+
import duckdb
|
| 3 |
+
from dashboard_helpers import build_trip_map, build_demo_map, build_stats
|
| 4 |
+
|
| 5 |
+
# Open the pre-baked database; read-only because the dashboard only queries it.
con = duckdb.connect('processed_dashboard.db', read_only=True)
# The helper queries use ST_Transform / ST_AsText from the spatial extension.
con.install_extension('spatial')
con.load_extension('spatial')

# Citywide baselines are stored as percentages; keep them here as fractions.
baseline_df = con.sql("SELECT * FROM city_baselines").df()
_baseline_row = baseline_df.iloc[0]
baseline_white = float(_baseline_row['baseline_white_pct']) / 100.0
baseline_black = float(_baseline_row['baseline_black_pct']) / 100.0

# Month currently selected in the UI, read by the month-dependent components.
selected_month = solara.reactive('Feb2025')
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
@solara.component
def TaxiMap(service, metric, cmap, title):
    """Render a card containing the trip map for one service and direction.

    Args:
        service: Service name passed through to ``build_trip_map``.
        metric: ``'pu'`` or ``'do'``, passed through to ``build_trip_map``.
        cmap: Colormap name used to shade the map.
        title: Card title prefix; also used as the map legend prefix.

    The month comes from the module-level ``selected_month`` reactive value.
    """
    active_month = selected_month.value
    trip_map = build_trip_map(con, service, active_month, metric, cmap, title)
    card_title = f'{title} ({active_month})'
    with solara.Card(title=card_title):
        solara.display(trip_map)
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
@solara.component
def DemographicMap(column, baseline_val, title):
    """Render a card containing the demographic deviation map for one column.

    Args:
        column: Column of ``zone_demographics`` to compare with the baseline.
        baseline_val: Baseline value subtracted from ``column`` in the query.
        title: Used both as the card title and as the map legend title.
    """
    deviation_map = build_demo_map(con, column, baseline_val, title)
    with solara.Card(title=title):
        solara.display(deviation_map)
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
@solara.component
def ServiceStatsCard(service, cmap_pu, cmap_do):
    """Render one card with the pickup and drop-off top-10 tables side by side.

    Args:
        service: Service name passed through to ``build_stats``.
        cmap_pu: Colormap name for the pickups table gradient.
        cmap_do: Colormap name for the drop-offs table gradient.

    The month comes from the module-level ``selected_month`` reactive value.
    """
    active_month = selected_month.value
    # Both tables are built before any layout is opened, pickups first.
    panels = [
        build_stats(con, service, active_month, 'pu', cmap_pu, 'Pickups'),
        build_stats(con, service, active_month, 'do', cmap_do, 'Drop-offs'),
    ]
    with solara.Card():
        with solara.Row():
            for heading, table in panels:
                with solara.Column():
                    solara.Markdown(heading, style={'text-align': 'center'})
                    solara.display(table)
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
@solara.component
def Page():
    """Lay out the dashboard: header, month selector, maps and stats tables."""
    solara.Title('NYC Taxi Mobility Equity Dashboard')

    intro_html = (
        '<div style="text-align:center;">This interactive dashboard analyzes whether '
        '<b>FHV</b> and <b>Yellow Taxi</b> services are over- or under-represented '
        'in areas with different demographic compositions relative to the NYC baseline.</div>'
    )
    # (service, metric, cmap, title) for each trip map, in display order.
    trip_panels = (
        ('FHV', 'pu', 'Blues', 'FHV Pickups'),
        ('Yellow', 'pu', 'Blues', 'Yellow Pickups'),
        ('FHV', 'do', 'Greens', 'FHV Drop-offs'),
        ('Yellow', 'do', 'Greens', 'Yellow Drop-offs'),
    )
    # The baselines are held as fractions; the maps and titles use percentages.
    white_baseline_pct = baseline_white * 100
    black_baseline_pct = baseline_black * 100

    with solara.Column():
        with solara.Column(align='center'):
            solara.Markdown('# NYC Taxi Mobility Equity Dashboard',
                            style={'text-align': 'center'})
            solara.Markdown(intro_html)
            solara.Select(
                label='Select Month',
                value=selected_month,
                values=['Jan2025', 'Feb2025', 'Mar2025'],
            )

        # Trip maps grid
        with solara.GridFixed(columns=2):
            for service, metric, cmap, title in trip_panels:
                TaxiMap(service, metric, cmap, title)

        # Demographic baseline maps
        with solara.GridFixed(columns=2):
            DemographicMap(
                'white_pct', white_baseline_pct,
                f'White Pop. Deviation from Baseline ({white_baseline_pct:.1f}%)')
            DemographicMap(
                'black_pct', black_baseline_pct,
                f'Black Pop. Deviation from Baseline ({black_baseline_pct:.1f}%)')

        # Stats tables
        with solara.Column(align='center'):
            with solara.GridFixed(columns=2):
                ServiceStatsCard('FHV', 'Blues', 'Greens')
                ServiceStatsCard('Yellow', 'YlOrBr', 'YlOrBr')
|
dashboard_helpers.py
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import geopandas as gpd
|
| 2 |
+
import leafmap
|
| 3 |
+
|
| 4 |
+
def build_trip_map(con, service, month, metric, cmap, legend_prefix):
    """Build a leafmap choropleth of trip counts per taxi zone.

    Args:
        con: Open DuckDB connection with the spatial extension loaded.
        service: Value matched against the ``service`` column.
        month: Value matched against the ``month`` column.
        metric: ``'pu'`` selects the pickup table; any other value selects
            the drop-off table.
        cmap: Colormap name passed to ``Map.add_data``.
        legend_prefix: Text placed before ``(month)`` in the legend title.

    Returns:
        A ``leafmap.Map``. When the query returns no rows, the map contains
        only the basemap.
    """
    # Table and column names come from this fixed mapping, never from the
    # caller, so interpolating them into the SQL text is safe.
    if metric == 'pu':
        table, col = 'trip_counts_pu', 'trips_pu'
    else:
        table, col = 'trip_counts_do', 'trips_do'
    query = f"""
        SELECT
            tz.zone,
            t.{col} AS trips,
            ST_AsText(ST_Transform(tz.geometry, 'EPSG:26918', 'OGC:CRS84')) AS geometry
        FROM {table} AS t
        JOIN taxi_zones_utm AS tz ON t.LocationID = tz.LocationID
        WHERE t.service = ? AND t.month = ?
    """
    # service and month originate from the UI, so bind them as parameters
    # instead of formatting them into the statement.
    df = con.execute(query, [service, month]).df()
    gdf = gpd.GeoDataFrame(
        df, geometry=gpd.GeoSeries.from_wkt(df['geometry']), crs='EPSG:4326')
    m = leafmap.Map(center=[40.7, -73.9], zoom=10, draw_control=False)
    m.layout.height = '400px'
    m.add_basemap('CartoDB.DarkMatter')
    # With no rows there is nothing to classify or colour; skip the layer.
    if not gdf.empty:
        m.add_data(gdf, column='trips', cmap=cmap,
                   legend_title=f'{legend_prefix} ({month})')
    return m
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def build_demo_map(con, column, baseline_val, legend_title):
    """Build a leafmap choropleth of a demographic column's deviation from a baseline.

    Args:
        con: Open DuckDB connection with the spatial extension loaded.
        column: Name of the ``zone_demographics`` column to compare.
        baseline_val: Number subtracted from ``column`` for every zone.
        legend_title: Legend title for the data layer.

    Returns:
        A ``leafmap.Map``. When the query returns no rows, the map contains
        only the basemap.

    Raises:
        ValueError: If ``column`` is not a plain identifier.
    """
    # A column name cannot be bound as a parameter, so reject anything that
    # is not a bare identifier before it reaches the SQL text.
    if not isinstance(column, str) or not column.isidentifier():
        raise ValueError(f'Invalid column name: {column!r}')
    query = f"""
        SELECT
            zd.TaxiZone AS zone,
            ROUND(zd."{column}" - CAST(? AS DOUBLE), 1) AS deviation,
            ST_AsText(ST_Transform(tz.geometry, 'EPSG:26918', 'OGC:CRS84')) AS geometry
        FROM zone_demographics AS zd
        JOIN taxi_zones_utm AS tz ON zd.LocationID = tz.LocationID
        WHERE zd.TotalPop > 0
    """
    df = con.execute(query, [baseline_val]).df()
    gdf = gpd.GeoDataFrame(
        df, geometry=gpd.GeoSeries.from_wkt(df['geometry']), crs='EPSG:4326')
    m = leafmap.Map(center=[40.7, -73.9], zoom=10, draw_control=False)
    m.layout.height = '400px'
    m.add_basemap('CartoDB.DarkMatter')
    # With no rows there is nothing to classify or colour; skip the layer.
    if not gdf.empty:
        m.add_data(gdf, column='deviation', cmap='RdYlBu', legend_title=legend_title)
    return m
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
def build_stats(con, service, month, metric, cmap, label):
    """Build the heading and styled top-10 table for one service and direction.

    Args:
        con: Open DuckDB connection.
        service: Value matched against the ``service`` column.
        month: Value matched against the ``month`` column.
        metric: ``'pu'`` selects the pickup table; any other value selects
            the drop-off table.
        cmap: Colormap name for the background gradient on the count column.
        label: Human-readable direction name used in the heading.

    Returns:
        A ``(header, styled)`` tuple: a Markdown heading string and a pandas
        ``Styler`` over the ten zones with the highest counts.
    """
    # Table, column and alias come from this fixed mapping, never from the
    # caller, so interpolating them into the SQL text is safe.
    if metric == 'pu':
        table, col, alias = 'trip_counts_pu', 'trips_pu', 'Pickups'
    else:
        table, col, alias = 'trip_counts_do', 'trips_do', 'Dropoffs'
    query = f"""
        SELECT
            tz.zone AS Neighborhood,
            t.{col} AS {alias}
        FROM {table} t
        JOIN taxi_zones_utm tz ON t.LocationID = tz.LocationID
        WHERE t.service = ? AND t.month = ?
        ORDER BY t.{col} DESC LIMIT 10
    """
    # service and month originate from the UI, so bind them as parameters
    # instead of formatting them into the statement.
    df = con.execute(query, [service, month]).df()
    header = f'### {service} Top 10 {label} ({month})'
    styled = df.style.background_gradient(cmap=cmap, subset=[alias]).hide(axis='index')
    return header, styled
|
data_pipeline.py
ADDED
|
@@ -0,0 +1,146 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import pathlib
|
| 3 |
+
import duckdb
|
| 4 |
+
import pandas as pd
|
| 5 |
+
import leafmap
|
| 6 |
+
import requests
|
| 7 |
+
|
| 8 |
+
# DuckDB connection & extensions
con = duckdb.connect('processed_dashboard.db')
for _extension in ('httpfs', 'spatial'):
    con.install_extension(_extension)
    con.load_extension(_extension)

# Taxi-zone geometry, exposed as a view over the remote parquet file
TAXI_ZONES_URL = 'https://data.source.coop/cholmes/nyc-taxi-zones/taxi_zones.parquet'
con.sql(f"CREATE OR REPLACE VIEW taxi_zones AS SELECT * FROM '{TAXI_ZONES_URL}'")

# NYC census blocks: download the archive only when the database is missing
DB_PATH = pathlib.Path('nyc_data.db')
if not DB_PATH.exists():
    _census_archive_url = 'https://opengeos.org/data/duckdb/nyc_data.db.zip'
    leafmap.download_file(_census_archive_url, unzip=True, overwrite=True)

con.execute("ATTACH 'nyc_data.db' AS nyc_data (READ_ONLY)")

# Reproject taxi zones to UTM and materialise them as a table
con.sql("""
    CREATE OR REPLACE TABLE taxi_zones_utm AS
    SELECT * EXCLUDE (geometry),
        ST_Transform(geometry, 'EPSG:2263', 'EPSG:26918') AS geometry
    FROM taxi_zones
""")

# Zone demographics via spatial join
# NOTE(review): the join is on ST_Intersects, so a census block touching
# several zones is summed into each of them.
con.sql("""
    CREATE OR REPLACE TABLE zone_demographics AS
    SELECT
        tz.LocationID,
        tz.zone AS TaxiZone,
        tz.borough AS Borough,
        SUM(cb.popn_total) AS TotalPop,
        SUM(cb.popn_white) AS WhitePop,
        SUM(cb.popn_black) AS BlackPop,
        100.0 * SUM(cb.popn_white) / SUM(cb.popn_total) AS white_pct,
        100.0 * SUM(cb.popn_black) / SUM(cb.popn_total) AS black_pct
    FROM nyc_data.nyc_census_blocks AS cb
    JOIN taxi_zones_utm AS tz ON ST_Intersects(tz.geometry, cb.geom)
    GROUP BY tz.LocationID, tz.zone, tz.borough
""")

# Citywide baselines, computed over every census block
baseline_df = con.sql("""
    SELECT
        ROUND(100.0 * SUM(popn_white) / SUM(popn_total), 2) AS baseline_white_pct,
        ROUND(100.0 * SUM(popn_black) / SUM(popn_total), 2) AS baseline_black_pct
    FROM nyc_data.nyc_census_blocks
""").df()

# Held as fractions in Python; written back to the database as percentages.
_citywide = baseline_df.iloc[0]
baseline_white = float(_citywide['baseline_white_pct']) / 100.0
baseline_black = float(_citywide['baseline_black_pct']) / 100.0

# Save baselines into the db for app.py to read later
con.sql(f"""
    CREATE OR REPLACE TABLE city_baselines AS
    SELECT
        {baseline_white * 100} AS baseline_white_pct,
        {baseline_black * 100} AS baseline_black_pct
""")
|
| 73 |
+
|
| 74 |
+
# Trip data
# The pickup/drop-off column names are spelled differently per service.
pu_field = {'FHV': 'PUlocationID', 'Yellow': 'PULocationID'}
do_field = {'FHV': 'DOlocationID', 'Yellow': 'DOLocationID'}
trip_urls = {
    'FHV_Jan2025': 'https://d37ci6vzurychx.cloudfront.net/trip-data/fhv_tripdata_2025-01.parquet',
    'FHV_Feb2025': 'https://d37ci6vzurychx.cloudfront.net/trip-data/fhv_tripdata_2025-02.parquet',
    'FHV_Mar2025': 'https://d37ci6vzurychx.cloudfront.net/trip-data/fhv_tripdata_2025-03.parquet',
    'Yellow_Jan2025': 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2025-01.parquet',
    'Yellow_Feb2025': 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2025-02.parquet',
    'Yellow_Mar2025': 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2025-03.parquet',
}

# Recreated on every run, so re-running the pipeline never duplicates rows.
con.execute("CREATE OR REPLACE TABLE trip_counts_pu (service VARCHAR, month VARCHAR, LocationID INTEGER, trips_pu BIGINT)")
con.execute("CREATE OR REPLACE TABLE trip_counts_do (service VARCHAR, month VARCHAR, LocationID INTEGER, trips_do BIGINT)")

DATA_DIR = 'trip_data'
os.makedirs(DATA_DIR, exist_ok=True)

for key, url in trip_urls.items():
    local_path = os.path.join(DATA_DIR, f'{key}.parquet')
    if not os.path.exists(local_path):
        print(f'Downloading {key}...')
        # Stream to a temporary name and rename only once the download is
        # complete: the file is never held in memory as a whole, and an
        # interrupted run cannot leave a truncated file that a later run
        # would report as "Already exists".
        partial_path = f'{local_path}.part'
        with requests.get(url, stream=True, timeout=120) as r:
            r.raise_for_status()
            with open(partial_path, 'wb') as f:
                for chunk in r.iter_content(chunk_size=1024 * 1024):
                    f.write(chunk)
        os.replace(partial_path, local_path)
    else:
        print(f'Already exists: {local_path}')
    # Keys have the form '<service>_<month>'.
    service, month = key.split('_')
    pu, do = pu_field[service], do_field[service]
    # IDs 0, 264 and 265 are excluded from both tables
    # (presumably unknown / out-of-area zone codes; TODO confirm).
    con.sql(f"""
        INSERT INTO trip_counts_pu
        SELECT '{service}', '{month}', CAST({pu} AS INTEGER), COUNT(*)
        FROM '{local_path}'
        WHERE {pu} IS NOT NULL AND CAST({pu} AS INTEGER) NOT IN (0, 264, 265)
        GROUP BY {pu}
    """)
    con.sql(f"""
        INSERT INTO trip_counts_do
        SELECT '{service}', '{month}', CAST({do} AS INTEGER), COUNT(*)
        FROM '{local_path}'
        WHERE {do} IS NOT NULL AND CAST({do} AS INTEGER) NOT IN (0, 264, 265)
        GROUP BY {do}
    """)
|
| 118 |
+
|
| 119 |
+
# Representative ratios
# For each service and month: the trip-weighted mean of a zone percentage,
# divided by the citywide baseline percentage.
rr_pu_df = con.sql(f"""
    SELECT
        tp.service, tp.month,
        SUM(tp.trips_pu * zd.white_pct) * 1.0 / SUM(tp.trips_pu) / {baseline_white*100} AS RR_white_PU,
        SUM(tp.trips_pu * zd.black_pct) * 1.0 / SUM(tp.trips_pu) / {baseline_black*100} AS RR_black_PU
    FROM trip_counts_pu AS tp
    JOIN zone_demographics AS zd ON tp.LocationID = zd.LocationID
    WHERE zd.TotalPop > 0
    GROUP BY tp.service, tp.month
""").df()

rr_do_df = con.sql(f"""
    SELECT
        td.service, td.month,
        SUM(td.trips_do * zd.white_pct) * 1.0 / SUM(td.trips_do) / {baseline_white*100} AS RR_white_DO,
        SUM(td.trips_do * zd.black_pct) * 1.0 / SUM(td.trips_do) / {baseline_black*100} AS RR_black_DO
    FROM trip_counts_do AS td
    JOIN zone_demographics AS zd ON td.LocationID = zd.LocationID
    WHERE zd.TotalPop > 0
    GROUP BY td.service, td.month
""").df()

rr_combined = pd.merge(rr_pu_df, rr_do_df, on=['service', 'month'], how='outer')
# Register the DataFrame under a name that differs from the target table.
# Selecting FROM rr_combined would resolve to an already-existing
# rr_combined table on a re-run, silently copying the stale table onto
# itself instead of loading the freshly computed DataFrame.
con.register('rr_combined_src', rr_combined)
con.execute("CREATE OR REPLACE TABLE rr_combined AS SELECT * FROM rr_combined_src")
con.unregister('rr_combined_src')

print('Pipeline complete. processed_dashboard.db is ready.')
con.close()
|
requirements.txt
ADDED
|
File without changes
|