diff --git a/.github/workflows/check-empty-cells.yml b/.github/workflows/check-empty-cells.yml
index 657d9b5dab1a9d461f71e2ce62f1df6da6439956..17d49936a0d5772cea9ce9c0fcaece5be846ca51 100644
--- a/.github/workflows/check-empty-cells.yml
+++ b/.github/workflows/check-empty-cells.yml
@@ -17,6 +17,9 @@ jobs:
- name: π Checkout code
uses: actions/checkout@v4
+ - name: π Install uv
+ uses: astral-sh/setup-uv@v4
+
- name: π Set up Python
uses: actions/setup-python@v5
with:
@@ -24,7 +27,7 @@ jobs:
- name: π Check for empty cells
run: |
- python scripts/check_empty_cells.py
+ make check_empty
- name: π Report results
if: failure()
diff --git a/.gitignore b/.gitignore
index 8a15952d45fcd80c20777b1f7e6e7289412200ea..aefd87f5e1049cff5ef42706f9f7b43a1567a168 100644
--- a/.gitignore
+++ b/.gitignore
@@ -175,3 +175,13 @@ __marimo__
# Generated site content
_site/
+
+# Editors
+*~
+
+# Temporary build files
+tmp/
+example.db
+example.db.wal
+log_data_filtered/*
+log_data_filtered.*
diff --git a/.typos.toml b/.typos.toml
index 1be8d6ef5ea558f897f9df642d8249af405c812e..971fb52d2c912068abb68630c8687a07a6d5282f 100644
--- a/.typos.toml
+++ b/.typos.toml
@@ -15,7 +15,10 @@ extend-ignore-re = [
# Words to explicitly accept
[default.extend-words]
+bimap = "bimap"
pn = "pn"
+setp = "setp"
+Plas = "Plas"
# You can also exclude specific files or directories if needed
# [files]
diff --git a/Makefile b/Makefile
index 5d98314039130e75e4d88d27d6e5319cc374c8a9..f7e0801ce87b748f392ca4d3bac3f1d81e1901da 100644
--- a/Makefile
+++ b/Makefile
@@ -1,24 +1,116 @@
-# Default target.
+ROOT := .
+SITE := _site
+TMP := ./tmp
+LESSON_DATA := ${TMP}/lessons.json
+TEMPLATES := $(wildcard templates/*.html)
+
+NOTEBOOK_INDEX := $(wildcard */index.md)
+NOTEBOOK_DIR := $(patsubst %/index.md,%,${NOTEBOOK_INDEX})
+NOTEBOOK_SRC := $(foreach dir,$(NOTEBOOK_DIR),$(wildcard $(dir)/??_*.py))
+NOTEBOOK_OUT := $(patsubst %.py,${SITE}/%.html,$(NOTEBOOK_SRC))
+
+DATABASES := \
+sql/public/lab.db \
+sql/public/penguins.db \
+sql/public/survey.db
+
+MARIMO := uv run marimo
+PYTHON := uv run python
+
+# Default target
all: commands
-## commands : show all commands.
+## commands : show all commands
commands:
@grep -h -E '^##' ${MAKEFILE_LIST} | sed -e 's/## //g' | column -t -s ':'
-## install: install minimal required packages into current environment.
+## install: install required packages
install:
- uv pip install marimo jinja2 markdown
+ uv pip install -r requirements.txt
+
+## check: run all simple checks
+check:
+	-@${MAKE} check_empty
+	-@${MAKE} check_titles
+	-@${MAKE} check_typos
+	-@${MAKE} check_packages
-## build: build entire site.
-build:
- rm -rf _site
- uv run scripts/build.py
+## check_exec: run notebooks to check for runtime errors
+check_exec:
+ @if [ -z "$(NOTEBOOKS)" ]; then \
+ bash bin/run_notebooks.sh $(NOTEBOOK_SRC); \
+ else \
+ bash bin/run_notebooks.sh $(NOTEBOOKS); \
+ fi
-## serve: run local web server without rebuilding.
+## build: build website
+build: ${LESSON_DATA} ${NOTEBOOK_OUT} ${TEMPLATES}
+ ${PYTHON} bin/build.py --root ${ROOT} --output ${SITE} --data ${LESSON_DATA}
+
+## links: check links locally (while 'make serve')
+links:
+ linkchecker -F text http://localhost:8000
+
+## serve: run local web server without rebuilding
serve:
- uv run python -m http.server --directory _site
+ ${PYTHON} -m http.server --directory ${SITE}
+
+## databases: rebuild datasets for SQL lessons
+databases: ${DATABASES}
-## clean: clean up stray files.
+## ---: ---
+
+## clean: clean up stray files
clean:
@find . -name '*~' -exec rm {} +
@find . -name '.DS_Store' -exec rm {} +
+ @rm -rf ${TMP}
+ @rm -f log_data_filtered*.*
+
+## check_empty: check for empty cells
+check_empty:
+ @${PYTHON} bin/check_empty_cells.py
+
+## check_titles: check for missing titles in notebooks
+check_titles:
+ @${PYTHON} bin/check_missing_titles.py
+
+## check_packages: check for inconsistent package versions across notebooks
+check_packages:
+ @if [ -z "$(NOTEBOOKS)" ]; then \
+ ${PYTHON} bin/check_notebook_packages.py $(NOTEBOOK_SRC); \
+ else \
+ ${PYTHON} bin/check_notebook_packages.py $(NOTEBOOKS); \
+ fi
+
+## check_typos: check for typos
+check_typos:
+ @typos ${TEMPLATES} ${NOTEBOOK_INDEX} ${NOTEBOOK_SRC}
+
+## extract: extract lesson data
+extract: ${LESSON_DATA}
+
+#
+# subsidiary targets
+#
+
+${LESSON_DATA}: $(NOTEBOOK_INDEX)
+ ${PYTHON} bin/extract.py --root ${ROOT} --data ${LESSON_DATA}
+
+${SITE}/%.html: %.py
+ ${MARIMO} export html-wasm --force --mode edit $< -o $@ --sandbox
+
+sql/public/lab.db: bin/create_sql_lab.sql
+ @rm -f $@
+ @mkdir -p sql/public
+ sqlite3 $@ < $<
+
+sql/public/penguins.db: bin/create_sql_penguins.py data/penguins.csv
+ @rm -f $@
+ @mkdir -p sql/public
+ ${PYTHON} $< data/penguins.csv $@
+
+sql/public/survey.db: bin/create_sql_survey.py
+ @rm -f $@
+ @mkdir -p sql/public
+ ${PYTHON} $< $@ 192837
diff --git a/_server/README.md b/_server/README.md
index 80de9a7fec1ef68c920bdd552c41c0995971dfb6..18c3d6a2e500c4344bc3f7c87e5bc07c56499e67 100644
--- a/_server/README.md
+++ b/_server/README.md
@@ -1,8 +1,3 @@
----
-title: Readme
-marimo-version: 0.18.4
----
-
# marimo learn server
This folder contains server code for hosting marimo apps.
diff --git a/_server/main.py b/_server/main.py
index 16f3179e75ba691c48c517e6c9fdaa3db23df1e3..a518aca7524a2f3c37d729be01320bfb19b87db5 100644
--- a/_server/main.py
+++ b/_server/main.py
@@ -6,14 +6,14 @@
# "starlette",
# "python-dotenv",
# "pydantic",
-# "duckdb==1.3.2",
-# "altair==5.5.0",
+# "duckdb==1.4.4",
+# "altair==6.0.0",
# "beautifulsoup4==4.13.3",
# "httpx==0.28.1",
# "marimo",
# "nest-asyncio==1.6.0",
# "numba==0.61.0",
-# "numpy==2.1.3",
+# "numpy==2.4.3",
# "polars==1.24.0",
# ]
# ///
diff --git a/altair/01_introduction.py b/altair/01_introduction.py
new file mode 100644
index 0000000000000000000000000000000000000000..2182214b6ad43c6b115ea1ebee01bd2ac52bad19
--- /dev/null
+++ b/altair/01_introduction.py
@@ -0,0 +1,670 @@
+# /// script
+# requires-python = ">=3.11"
+# dependencies = [
+# "altair==6.0.0",
+# "marimo",
+# "pandas==3.0.1",
+# ]
+# ///
+
+import marimo
+
+__generated_with = "0.20.4"
+app = marimo.App()
+
+
+@app.cell
+def _():
+ import marimo as mo
+
+ return (mo,)
+
+
+@app.cell(hide_code=True)
+def _(mo):
+ mo.md(r"""
+ # Introduction to Altair
+
+ [Altair](https://altair-viz.github.io/) is a declarative statistical visualization library for Python. Altair offers a powerful and concise visualization grammar for quickly building a wide range of statistical graphics.
+
+ By *declarative*, we mean that you can provide a high-level specification of *what* you want the visualization to include, in terms of *data*, *graphical marks*, and *encoding channels*, rather than having to specify *how* to implement the visualization in terms of for-loops, low-level drawing commands, *etc*. The key idea is that you declare links between data fields and visual encoding channels, such as the x-axis, y-axis, color, *etc*. The rest of the plot details are handled automatically. Building on this declarative plotting idea, a surprising range of simple to sophisticated visualizations can be created using a concise grammar.
+
+ Altair is based on [Vega-Lite](https://vega.github.io/vega-lite/), a high-level grammar of interactive graphics. Altair provides a friendly Python [API (Application Programming Interface)](https://en.wikipedia.org/wiki/Application_programming_interface) that generates Vega-Lite specifications in [JSON (JavaScript Object Notation)](https://en.wikipedia.org/wiki/JSON) format. Environments such as Jupyter Notebooks, JupyterLab, and Colab can then take this specification and render it directly in the web browser. To learn more about the motivation and basic concepts behind Altair and Vega-Lite, watch the [Vega-Lite presentation video from OpenVisConf 2017](https://www.youtube.com/watch?v=9uaHRWj04D4).
+
+ This notebook will guide you through the basic process of creating visualizations in Altair. First, you will need to make sure you have the Altair package and its dependencies installed (for more, see the [Altair installation documentation](https://altair-viz.github.io/getting_started/installation.html)), or you are using a notebook environment that includes the dependencies pre-installed.
+
+ _This notebook is part of the [data visualization curriculum](https://github.com/uwdata/visualization-curriculum)._
+ """)
+ return
+
+
+@app.cell(hide_code=True)
+def _(mo):
+ mo.md(r"""
+ ## Imports
+
+ To start, we must import the necessary libraries: Pandas for data frames and Altair for visualization.
+ """)
+ return
+
+
+@app.cell
+def _():
+ import pandas as pd
+ import altair as alt
+
+ return alt, pd
+
+
+@app.cell(hide_code=True)
+def _(mo):
+ mo.md(r"""
+ ## Renderers
+
+ Depending on your environment, you may need to specify a [renderer](https://altair-viz.github.io/user_guide/display_frontends.html) for Altair. If you are using __JupyterLab__, __Jupyter Notebook__, or __Google Colab__ with a live Internet connection you should not need to do anything. Otherwise, please read the documentation for [Displaying Altair Charts](https://altair-viz.github.io/user_guide/display_frontends.html).
+ """)
+ return
+
+
+@app.cell(hide_code=True)
+def _(mo):
+ mo.md(r"""
+ ## Data
+
+ Data in Altair is built around the Pandas data frame, which consists of a set of named data *columns*. We will also regularly refer to data columns as data *fields*.
+
+ When using Altair, datasets are commonly provided as data frames. Alternatively, Altair can also accept a URL to load a network-accessible dataset. As we will see, the named columns of the data frame are an essential piece of plotting with Altair.
+
+ We will often use datasets from Altair's `datasets` sub-package. Some of these datasets are directly available as Pandas data frames:
+ """)
+ return
+
+
+@app.cell
+def _():
+ from altair.datasets import data # import vega_datasets
+ cars = data.cars() # load cars data as a Pandas data frame
+ cars.head() # display the first five rows
+ return cars, data
+
+
+@app.cell(hide_code=True)
+def _(mo):
+ mo.md(r"""
+ Altair's datasets can also be accessed via URLs:
+ """)
+ return
+
+
+@app.cell
+def _(data):
+ data.cars.url
+ return
+
+
+@app.cell(hide_code=True)
+def _(mo):
+ mo.md(r"""
+ Dataset URLs can be passed directly to Altair (for supported formats like JSON and [CSV](https://en.wikipedia.org/wiki/Comma-separated_values)), or loaded into a Pandas data frame like so:
+ """)
+ return
+
+
+@app.cell
+def _(data, pd):
+ pd.read_json(data.cars.url).head() # load JSON data into a data frame
+ return
+
+
+@app.cell(hide_code=True)
+def _(mo):
+ mo.md(r"""
+ For more information about data frames - and some useful transformations to prepare Pandas data frames for plotting with Altair! - see the [Specifying Data with Altair documentation](https://altair-viz.github.io/user_guide/data.html).
+ """)
+ return
+
+
+@app.cell(hide_code=True)
+def _(mo):
+ mo.md(r"""
+ ### Weather Data
+
+ Statistical visualization in Altair begins with ["tidy"](http://vita.had.co.nz/papers/tidy-data.html) data frames. Here, we'll start by creating a simple data frame (`df`) containing the average precipitation (`precip`) for a given `city` and `month` :
+ """)
+ return
+
+
+@app.cell
+def _(pd):
+ df = pd.DataFrame({
+ 'city': ['Seattle', 'Seattle', 'Seattle', 'New York', 'New York', 'New York', 'Chicago', 'Chicago', 'Chicago'],
+ 'month': ['Apr', 'Aug', 'Dec', 'Apr', 'Aug', 'Dec', 'Apr', 'Aug', 'Dec'],
+ 'precip': [2.68, 0.87, 5.31, 3.94, 4.13, 3.58, 3.62, 3.98, 2.56]
+ })
+
+ df
+ return (df,)
+
+
+@app.cell(hide_code=True)
+def _(mo):
+ mo.md(r"""
+ ## The Chart Object
+
+ The fundamental object in Altair is the `Chart`, which takes a data frame as a single argument:
+ """)
+ return
+
+
+@app.cell
+def _(alt, df):
+ _chart = alt.Chart(df)
+ return
+
+
+@app.cell(hide_code=True)
+def _(mo):
+ mo.md(r"""
+ So far, we have defined the `Chart` object and passed it the simple data frame we generated above. We have not yet told the chart to *do* anything with the data.
+ """)
+ return
+
+
+@app.cell(hide_code=True)
+def _(mo):
+ mo.md(r"""
+ ## Marks and Encodings
+
+ With a chart object in hand, we can now specify how we would like the data to be visualized. We first indicate what kind of graphical *mark* (geometric shape) we want to use to represent the data. We can set the `mark` attribute of the chart object using the the `Chart.mark_*` methods.
+
+ For example, we can show the data as a point using `Chart.mark_point()`:
+ """)
+ return
+
+
+@app.cell
+def _(alt, df):
+ alt.Chart(df).mark_point()
+ return
+
+
+@app.cell(hide_code=True)
+def _(mo):
+ mo.md(r"""
+ Here the rendering consists of one point per row in the dataset, all plotted on top of each other, since we have not yet specified positions for these points.
+
+ To visually separate the points, we can map various *encoding channels*, or *channels* for short, to fields in the dataset. For example, we could *encode* the field `city` of the data using the `y` channel, which represents the y-axis position of the points. To specify this, use the `encode` method:
+ """)
+ return
+
+
+@app.cell
+def _(alt, df):
+ alt.Chart(df).mark_point().encode(
+ y='city',
+ )
+ return
+
+
+@app.cell(hide_code=True)
+def _(mo):
+ mo.md(r"""
+ The `encode()` method builds a key-value mapping between encoding channels (such as `x`, `y`, `color`, `shape`, `size`, *etc.*) to fields in the dataset, accessed by field name. For Pandas data frames, Altair automatically determines an appropriate data type for the mapped column, which in this case is the *nominal* type, indicating unordered, categorical values.
+
+ Though we've now separated the data by one attribute, we still have multiple points overlapping within each category. Let's further separate these by adding an `x` encoding channel, mapped to the `'precip'` field:
+ """)
+ return
+
+
+@app.cell
+def _(alt, df):
+ alt.Chart(df).mark_point().encode(
+ x='precip',
+ y='city'
+ )
+ return
+
+
+@app.cell(hide_code=True)
+def _(mo):
+ mo.md(r"""
+ _Seattle exhibits both the least-rainiest and most-rainiest months!_
+
+ The data type of the `'precip'` field is again automatically inferred by Altair, and this time is treated as a *quantitative* type (that is, a real-valued number). We see that grid lines and appropriate axis titles are automatically added as well.
+
+ Above we have specified key-value pairs using keyword arguments (`x='precip'`). In addition, Altair provides construction methods for encoding definitions, using the syntax `alt.X('precip')`. This alternative is useful for providing more parameters to an encoding, as we will see later in this notebook.
+ """)
+ return
+
+
+@app.cell
+def _(alt, df):
+ alt.Chart(df).mark_point().encode(
+ alt.X('precip'),
+ alt.Y('city')
+ )
+ return
+
+
+@app.cell(hide_code=True)
+def _(mo):
+ mo.md(r"""
+ The two styles of specifying encodings can be interleaved: `x='precip', alt.Y('city')` is also a valid input to the `encode` function.
+
+ In the examples above, the data type for each field is inferred automatically based on its type within the Pandas data frame. We can also explicitly indicate the data type to Altair by annotating the field name:
+
+ - `'b:N'` indicates a *nominal* type (unordered, categorical data),
+ - `'b:O'` indicates an *ordinal* type (rank-ordered data),
+ - `'b:Q'` indicates a *quantitative* type (numerical data with meaningful magnitudes), and
+ - `'b:T'` indicates a *temporal* type (date/time data)
+
+ For example, `alt.X('precip:N')`.
+
+ Explicit annotation of data types is necessary when data is loaded from an external URL directly by Vega-Lite (skipping Pandas entirely), or when we wish to use a type that differs from the type that was automatically inferred.
+
+ What do you think will happen to our chart above if we treat `precip` as a nominal or ordinal variable, rather than a quantitative variable? _Modify the code above and find out!_
+
+ We will take a closer look at data types and encoding channels in the next notebook of the [data visualization curriculum](https://github.com/uwdata/visualization-curriculum#data-visualization-curriculum).
+ """)
+ return
+
+
+@app.cell(hide_code=True)
+def _(mo):
+ mo.md(r"""
+ ## Data Transformation: Aggregation
+
+ To allow for more flexibility in how data are visualized, Altair has a built-in syntax for *aggregation* of data. For example, we can compute the average of all values by specifying an aggregation function along with the field name:
+ """)
+ return
+
+
+@app.cell
+def _(alt, df):
+ alt.Chart(df).mark_point().encode(
+ x='average(precip)',
+ y='city'
+ )
+ return
+
+
+@app.cell(hide_code=True)
+def _(mo):
+ mo.md(r"""
+ Now within each x-axis category, we see a single point reflecting the *average* of the values within that category.
+
+ _Does Seattle really have the lowest average precipitation of these cities? (It does!) Still, how might this plot mislead? Which months are included? What counts as precipitation?_
+
+ Altair supports a variety of aggregation functions, including `count`, `min` (minimum), `max` (maximum), `average`, `median`, and `stdev` (standard deviation). In a later notebook, we will take a tour of data transformations, including aggregation, sorting, filtering, and creation of new derived fields using calculation formulas.
+ """)
+ return
+
+
+@app.cell(hide_code=True)
+def _(mo):
+ mo.md(r"""
+ ## Changing the Mark Type
+
+ Let's say we want to represent our aggregated values using rectangular bars rather than circular points. We can do this by replacing `Chart.mark_point` with `Chart.mark_bar`:
+ """)
+ return
+
+
+@app.cell
+def _(alt, df):
+ alt.Chart(df).mark_bar().encode(
+ x='average(precip)',
+ y='city'
+ )
+ return
+
+
+@app.cell(hide_code=True)
+def _(mo):
+ mo.md(r"""
+ Because the nominal field `a` is mapped to the `y`-axis, the result is a horizontal bar chart. To get a vertical bar chart, we can simply swap the `x` and `y` keywords:
+ """)
+ return
+
+
+@app.cell
+def _(alt, df):
+ alt.Chart(df).mark_bar().encode(
+ x='city',
+ y='average(precip)'
+ )
+ return
+
+
+@app.cell(hide_code=True)
+def _(mo):
+ mo.md(r"""
+ ## Customizing a Visualization
+
+ By default Altair / Vega-Lite make some choices about properties of the visualization, but these can be changed using methods to customize the look of the visualization. For example, we can specify the axis titles using the `axis` attribute of channel classes, we can modify scale properties using the `scale` attribute, and we can specify the color of the marking by setting the `color` keyword of the `Chart.mark_*` methods to any valid [CSS color string](https://developer.mozilla.org/en-US/docs/Web/CSS/color_value):
+ """)
+ return
+
+
+@app.cell
+def _(alt, df):
+ alt.Chart(df).mark_point(color='firebrick').encode(
+ alt.X('precip', scale=alt.Scale(type='log'), axis=alt.Axis(title='Log-Scaled Values')),
+ alt.Y('city', axis=alt.Axis(title='Category')),
+ )
+ return
+
+
+@app.cell(hide_code=True)
+def _(mo):
+ mo.md(r"""
+ A subsequent module will explore the various options available for scales, axes, and legends to create customized charts.
+ """)
+ return
+
+
+@app.cell(hide_code=True)
+def _(mo):
+ mo.md(r"""
+ ## Multiple Views
+
+ As we've seen above, the Altair `Chart` object represents a plot with a single mark type. What about more complicated diagrams, involving multiple charts or layers? Using a set of *view composition* operators, Altair can take multiple chart definitions and combine them to create more complex views.
+
+ As a starting point, let's plot the cars dataset in a line chart showing the average mileage by the year of manufacture:
+ """)
+ return
+
+
+@app.cell
+def _(alt, cars):
+ alt.Chart(cars).mark_line().encode(
+ alt.X('Year'),
+ alt.Y('average(Miles_per_Gallon)')
+ )
+ return
+
+
+@app.cell(hide_code=True)
+def _(mo):
+ mo.md(r"""
+ To augment this plot, we might like to add `circle` marks for each averaged data point. (The `circle` mark is just a convenient shorthand for `point` marks that used filled circles.)
+
+ We can start by defining each chart separately: first a line plot, then a scatter plot. We can then use the `layer` operator to combine the two into a layered chart. Here we use the shorthand `+` (plus) operator to invoke layering:
+ """)
+ return
+
+
+@app.cell
+def _(alt, cars):
+ line = alt.Chart(cars).mark_line().encode(
+ alt.X('Year'),
+ alt.Y('average(Miles_per_Gallon)')
+ )
+
+ point = alt.Chart(cars).mark_circle().encode(
+ alt.X('Year'),
+ alt.Y('average(Miles_per_Gallon)')
+ )
+
+ line + point
+ return
+
+
+@app.cell(hide_code=True)
+def _(mo):
+ mo.md(r"""
+ We can also create this chart by *reusing* and *modifying* a previous chart definition! Rather than completely re-write a chart, we can start with the line chart, then invoke the `mark_point` method to generate a new chart definition with a different mark type:
+ """)
+ return
+
+
+@app.cell
+def _(alt, cars):
+ mpg = alt.Chart(cars).mark_line().encode(
+ alt.X('Year'),
+ alt.Y('average(Miles_per_Gallon)')
+ )
+
+ mpg + mpg.mark_circle()
+ return (mpg,)
+
+
+@app.cell(hide_code=True)
+def _(mo):
+ mo.md(r"""
+ (The need to place points on lines is so common, the `line` mark also includes a shorthand to generate a new layer for you. Trying adding the argument `point=True` to the `mark_line` method!)
+
+ Now, what if we'd like to see this chart alongside other plots, such as the average horsepower over time?
+
+ We can use *concatenation* operators to place multiple charts side-by-side, either vertically or horizontally. Here, we'll use the `|` (pipe) operator to perform horizontal concatenation of two charts:
+ """)
+ return
+
+
+@app.cell
+def _(alt, cars, mpg):
+ hp = alt.Chart(cars).mark_line().encode(
+ alt.X('Year'),
+ alt.Y('average(Horsepower)')
+ )
+
+ (mpg + mpg.mark_circle()) | (hp + hp.mark_circle())
+ return
+
+
+@app.cell(hide_code=True)
+def _(mo):
+ mo.md(r"""
+ _We can see that, in this dataset, over the 1970s and early '80s the average fuel efficiency improved while the average horsepower decreased._
+
+ A later notebook will focus on *view composition*, including not only layering and concatenation, but also the `facet` operator for splitting data into sub-plots and the `repeat` operator to concisely generate concatenated charts from a template.
+ """)
+ return
+
+
+@app.cell(hide_code=True)
+def _(mo):
+ mo.md(r"""
+ ## Interactivity
+
+ In addition to basic plotting and view composition, one of Altair and Vega-Lite's most exciting features is its support for interaction.
+
+ To create a simple interactive plot that supports panning and zooming, we can invoke the `interactive()` method of the `Chart` object. In the chart below, click and drag to *pan* or use the scroll wheel to *zoom*:
+ """)
+ return
+
+
+@app.cell
+def _(alt, cars):
+ alt.Chart(cars).mark_point().encode(
+ x='Horsepower',
+ y='Miles_per_Gallon',
+ color='Origin',
+ ).interactive()
+ return
+
+
+@app.cell(hide_code=True)
+def _(mo):
+ mo.md(r"""
+ To provide more details upon mouse hover, we can use the `tooltip` encoding channel:
+ """)
+ return
+
+
+@app.cell
+def _(alt, cars):
+ alt.Chart(cars).mark_point().encode(
+ x='Horsepower',
+ y='Miles_per_Gallon',
+ color='Origin',
+ tooltip=['Name', 'Origin'] # show Name and Origin in a tooltip
+ ).interactive()
+ return
+
+
+@app.cell(hide_code=True)
+def _(mo):
+ mo.md(r"""
+ For more complex interactions, such as linked charts and cross-filtering, Altair provides a *selection* abstraction for defining interactive selections and then binding them to components of a chart. We will cover this is in detail in a later notebook.
+
+ Below is a more complex example. The upper histogram shows the count of cars per year and uses an interactive selection to modify the opacity of points in the lower scatter plot, which shows horsepower versus mileage.
+
+ _Drag out an interval in the upper chart and see how it affects the points in the lower chart. As you examine the code, **don't worry if parts don't make sense yet!** This is an aspirational example, and we will fill in all the needed details over the course of the different notebooks._
+ """)
+ return
+
+
+@app.cell
+def _(alt, cars):
+ # create an interval selection over an x-axis encoding
+ brush = alt.selection_interval(encodings=['x'])
+
+ # determine opacity based on brush
+ opacity = alt.condition(brush, alt.value(0.9), alt.value(0.1))
+
+ # an overview histogram of cars per year
+ # add the interval brush to select cars over time
+ overview = alt.Chart(cars).mark_bar().encode(
+ alt.X('Year:O', timeUnit='year', # extract year unit, treat as ordinal
+ axis=alt.Axis(title=None, labelAngle=0) # no title, no label angle
+ ),
+ alt.Y('count()', title=None), # counts, no axis title
+ opacity=opacity
+ ).add_params(
+ brush # add interval brush selection to the chart
+ ).properties(
+ width=400, # set the chart width to 400 pixels
+ height=50 # set the chart height to 50 pixels
+ )
+
+ # a detail scatterplot of horsepower vs. mileage
+ # modulate point opacity based on the brush selection
+ detail = alt.Chart(cars).mark_point().encode(
+ alt.X('Horsepower'),
+ alt.Y('Miles_per_Gallon'),
+ # set opacity based on brush selection
+ opacity=opacity
+ ).properties(width=400) # set chart width to match the first chart
+
+ # vertically concatenate (vconcat) charts using the '&' operator
+ overview & detail
+ return
+
+
+@app.cell(hide_code=True)
+def _(mo):
+ mo.md(r"""
+ ## Aside: Examining the JSON Output
+
+ As a Python API to Vega-Lite, Altair's main purpose is to convert plot specifications to a JSON string that conforms to the Vega-Lite schema. Using the `Chart.to_json` method, we can inspect the JSON specification that Altair is exporting and sending to Vega-Lite:
+ """)
+ return
+
+
+@app.cell
+def _(alt, df):
+ _chart = alt.Chart(df).mark_bar().encode(x='average(precip)', y='city')
+ print(_chart.to_json())
+ return
+
+
+@app.cell(hide_code=True)
+def _(mo):
+ mo.md(r"""
+ Notice here that `encode(x='average(precip)')` has been expanded to a JSON structure with a `field` name, a `type` for the data, and includes an `aggregate` field. The `encode(y='city')` statement has been expanded similarly.
+
+ As we saw earlier, Altair's shorthand syntax includes a way to specify the type of the field as well:
+ """)
+ return
+
+
+@app.cell
+def _(alt):
+ _x = alt.X('average(precip):Q')
+ print(_x.to_json())
+ return
+
+
+@app.cell(hide_code=True)
+def _(mo):
+ mo.md(r"""
+ This short-hand is equivalent to spelling-out the attributes by name:
+ """)
+ return
+
+
+@app.cell
+def _(alt):
+ _x = alt.X(aggregate='average', field='precip', type='quantitative')
+ print(_x.to_json())
+ return
+
+
+@app.cell(hide_code=True)
+def _(mo):
+ mo.md(r"""
+ ## Publishing a Visualization
+
+ Once you have visualized your data, perhaps you would like to publish it somewhere on the web. This can be done straightforwardly using the [vega-embed JavaScript package](https://github.com/vega/vega-embed). A simple example of a stand-alone HTML document can be generated for any chart using the `Chart.save` method:
+
+ ```python
+ chart = alt.Chart(df).mark_bar().encode(
+ x='average(precip)',
+ y='city',
+ )
+ chart.save('chart.html')
+ ```
+
+
+ The basic HTML template produces output that looks like this, where the JSON specification for your plot produced by `Chart.to_json` should be stored in the `spec` JavaScript variable:
+
+ ```html
+
+
+
+
+ Approximating the Earth as a sphere, we can denote positions using a spherical coordinate system of _latitude_ (angle in degrees north or south of the _equator_) and _longitude_ (angle in degrees specifying east-west position). In this system, a _parallel_ is a circle of constant latitude and a _meridian_ is a circle of constant longitude. The [_prime meridian_](https://en.wikipedia.org/wiki/Prime_meridian) lies at 0Β° longitude and by convention is defined to pass through the Royal Observatory in Greenwich, England.
+
+ To "flatten" a three-dimensional sphere on to a two-dimensional plane, we must apply a [projection](https://en.wikipedia.org/wiki/Map_projection) that maps (`longitude`, `latitude`) pairs to (`x`, `y`) coordinates. Similar to [scales](https://github.com/uwdata/visualization-curriculum/blob/master/altair_scales_axes_legends.ipynb), projections map from a data domain (spatial position) to a visual range (pixel position). However, the scale mappings we've seen thus far accept a one-dimensional domain, whereas map projections are inherently two-dimensional.
+
+ In this notebook, we will introduce the basics of creating maps and visualizing spatial data with Altair, including:
+
+ - Data formats for representing geographic features,
+ - Geo-visualization techniques such as point, symbol, and choropleth maps, and
+ - A review of common cartographic projections.
+
+ _This notebook is part of the [data visualization curriculum](https://github.com/uwdata/visualization-curriculum)._
+ """)
+ return
+
+
+@app.cell
+def _():
+ import pandas as pd
+ import altair as alt
+ from altair.datasets import data
+ import json
+ import urllib.request
+
+    return alt, data, json, urllib
+
+
+@app.cell(hide_code=True)
+def _(mo):
+ mo.md(r"""
+ ## Geographic Data: GeoJSON and TopoJSON
+ """)
+ return
+
+
+@app.cell(hide_code=True)
+def _(mo):
+ mo.md(r"""
+ Up to this point, we have worked with JSON and CSV formatted datasets that correspond to data tables made up of rows (records) and columns (fields). In order to represent geographic regions (countries, states, _etc._) and trajectories (flight paths, subway lines, _etc._), we need to expand our repertoire with additional formats designed to support rich geometries.
+
+ [GeoJSON](https://en.wikipedia.org/wiki/GeoJSON) models geographic features within a specialized JSON format. A GeoJSON `feature` can include geometric data – such as `longitude`, `latitude` coordinates that make up a country boundary – as well as additional data attributes.
+
+ Here is a GeoJSON `feature` object for the boundary of the U.S. state of Colorado:
+ """)
+ return
+
+
+@app.cell(hide_code=True)
+def _(mo):
+ mo.md(r"""
+ ~~~ json
+ {
+ "type": "Feature",
+ "id": 8,
+ "properties": {"name": "Colorado"},
+ "geometry": {
+ "type": "Polygon",
+ "coordinates": [
+            [[-106.32056285448942,40.998675790862656],[-106.19134826714341,40.99813863734313],[-105.27607827344248,40.99813863734313],[-104.9422739227986,40.99813863734313],[-104.05212898774828,41.00136155846029],[-103.57475287338661,41.00189871197981],[-103.38093099236758,41.00189871197981],[-102.65589358559272,41.00189871197981],[-102.62000064466328,41.00189871197981],[-102.052892177978,41.00189871197981],[-102.052892177978,40.74889940428302],[-102.052892177978,40.69733266640851],[-102.052892177978,40.44003613055551],[-102.052892177978,40.3492571857556],[-102.052892177978,40.00333031918079],[-102.04930288388505,39.57414465707943],[-102.04930288388505,39.56823596836465],[-102.0457135897921,39.1331416175485],[-102.0457135897921,39.0466599009048],[-102.0457135897921,38.69751011321283],[-102.0457135897921,38.61478847120581],[-102.0457135897921,38.268861604631],[-102.0457135897921,38.262415762396685],[-102.04212429569915,37.738153927339205],[-102.04212429569915,37.64415206142214],[-102.04212429569915,37.38900413964724],[-102.04212429569915,36.99365914927603],[-103.00046581851544,37.00010499151034],[-103.08660887674611,37.00010499151034],[-104.00905745863294,36.99580776335414],[-105.15404227428235,36.995270609834606],[-105.2222388620483,36.995270609834606],[-105.7175614468747,36.99580776335414],[-106.00829426840322,36.995270609834606],[-106.47490250048605,36.99365914927603],[-107.4224761410235,37.00010499151034],[-107.48349414060355,37.00010499151034],[-108.38081766383978,36.99903068447129],[-109.04483707103458,36.99903068447129],[-109.04483707103458,37.484617466122884],[-109.04124777694163,37.88049961001363],[-109.04124777694163,38.15283644441336],[-109.05919424740635,38.49983761802722],[-109.05201565922046,39.36680339854235],[-109.05201565922046,39.49786885730673],[-109.05201565922046,39.66062637372313],[-109.05201565922046,40.22248895514744],[-109.05201565922046,40.653823231326896],[-109.05201565922046,41.000287251421234],[-107.91779872584989,41.00189871197981],[-107.3183866123281,41.00297301901887],[-106.85895696843116,41.00189871197981],[-106.32056285448942,40.998675790862656]]
+ ]
+ }
+ }
+ ~~~
+ """)
+ return
+
+
+@app.cell(hide_code=True)
+def _(mo):
+ mo.md(r"""
+ The `feature` includes a `properties` object, which can include any number of data fields, plus a `geometry` object, which in this case contains a single polygon that consists of `[longitude, latitude]` coordinates for the state boundary. The coordinates continue off to the right for a while should you care to scroll...
+
+ To learn more about the nitty-gritty details of GeoJSON, see the [official GeoJSON specification](http://geojson.org/) or read [Tom MacWright's helpful primer](https://macwright.org/2015/03/23/geojson-second-bite).
+ """)
+ return
+
+
+@app.cell(hide_code=True)
+def _(mo):
+ mo.md(r"""
+ One drawback of GeoJSON as a storage format is that it can be redundant, resulting in larger file sizes. Consider: Colorado shares boundaries with six other states (seven if you include the corner touching Arizona). Instead of using separate, overlapping coordinate lists for each of those states, a more compact approach is to encode shared borders only once, representing the _topology_ of geographic regions. Fortunately, this is precisely what the [TopoJSON](https://github.com/topojson/topojson/blob/master/README.md) format does!
+ """)
+ return
+
+
+@app.cell(hide_code=True)
+def _(mo):
+ mo.md(r"""
+ Let's load a TopoJSON file of world countries (at 110 meter resolution):
+ """)
+ return
+
+
+@app.cell
+def _(data):
+ world = data.world_110m.url
+ world
+ return (world,)
+
+
+@app.cell
+def _(data):
+ with urllib.request.urlopen(world) as response:
+ world_topo = json.load(response)
+ return (world_topo,)
+
+
+@app.cell
+def _(world_topo):
+ world_topo.keys()
+ return
+
+
+@app.cell
+def _(world_topo):
+ world_topo['type']
+ return
+
+
+@app.cell
+def _(world_topo):
+ world_topo['objects'].keys()
+ return
+
+
+@app.cell(hide_code=True)
+def _(mo):
+ mo.md(r"""
+ _Inspect the `world_topo` TopoJSON dictionary object above to see its contents._
+
+ In the data above, the `objects` property indicates the named elements we can extract from the data: geometries for all `countries`, or a single polygon representing all `land` on Earth. Either of these can be unpacked to GeoJSON data we can then visualize.
+
+ As TopoJSON is a specialized format, we need to instruct Altair to parse the TopoJSON format, indicating which named feature object we wish to extract from the topology. The following code indicates that we want to extract GeoJSON features from the `world` dataset for the `countries` object:
+
+ ~~~ js
+ alt.topo_feature(world, 'countries')
+ ~~~
+
+ This `alt.topo_feature` method call expands to the following Vega-Lite JSON:
+
+ ~~~ json
+ {
+ "values": world,
+ "format": {"type": "topojson", "feature": "countries"}
+ }
+ ~~~
+
+ Now that we can load geographic data, we're ready to start making maps!
+ """)
+ return
+
+
+@app.cell(hide_code=True)
+def _(mo):
+ mo.md(r"""
+ ## Geoshape Marks
+ """)
+ return
+
+
+@app.cell(hide_code=True)
+def _(mo):
+ mo.md(r"""
+ To visualize geographic data, Altair provides the `geoshape` mark type. To create a basic map, we can create a `geoshape` mark and pass it our TopoJSON data, which is then unpacked into GeoJSON features, one for each country of the world:
+ """)
+ return
+
+
+@app.cell
+def _(alt, world):
+ alt.Chart(alt.topo_feature(world, 'countries')).mark_geoshape()
+ return
+
+
+@app.cell(hide_code=True)
+def _(mo):
+ mo.md(r"""
+ In the example above, Altair applies a default blue color and uses a default map projection (`mercator`). We can customize the colors and boundary stroke widths using standard mark properties. Using the `project` method we can also add our own map projection:
+ """)
+ return
+
+
+@app.cell
+def _(alt, world):
+ alt.Chart(alt.topo_feature(world, 'countries')).mark_geoshape(
+ fill='#2a1d0c', stroke='#706545', strokeWidth=0.5
+ ).project(
+ type='mercator'
+ )
+ return
+
+
+@app.cell(hide_code=True)
+def _(mo):
+ mo.md(r"""
+ By default Altair automatically adjusts the projection so that all the data fits within the width and height of the chart. We can also specify projection parameters, such as `scale` (zoom level) and `translate` (panning), to customize the projection settings. Here we adjust the `scale` and `translate` parameters to focus on Europe:
+ """)
+ return
+
+
+@app.cell
+def _(alt, world):
+ alt.Chart(alt.topo_feature(world, 'countries')).mark_geoshape(
+ fill='#2a1d0c', stroke='#706545', strokeWidth=0.5
+ ).project(
+ type='mercator', scale=400, translate=[100, 550]
+ )
+ return
+
+
+@app.cell(hide_code=True)
+def _(mo):
+ mo.md(r"""
+ _Note how the 110m resolution of the data becomes apparent at this scale. To see more detailed coast lines and boundaries, we need an input file with more fine-grained geometries. Adjust the `scale` and `translate` parameters to focus the map on other regions!_
+ """)
+ return
+
+
+@app.cell(hide_code=True)
+def _(mo):
+ mo.md(r"""
+ So far our map shows countries only. Using the `layer` operator, we can combine multiple map elements. Altair includes _data generators_ we can use to create data for additional map layers:
+
+ - The sphere generator (`{'sphere': True}`) provides a GeoJSON representation of the full sphere of the Earth. We can create an additional `geoshape` mark that fills in the shape of the Earth as a background layer.
+ - The graticule generator (`{'graticule': ...}`) creates a GeoJSON feature representing a _graticule_: a grid formed by lines of latitude and longitude. The default graticule has meridians and parallels every 10Β° between Β±80Β° latitude. For the polar regions, there are meridians every 90Β°. These settings can be customized using the `stepMinor` and `stepMajor` properties.
+
+ Let's layer sphere, graticule, and country marks into a reusable map specification:
+ """)
+ return
+
+
+@app.cell
+def _(alt, world):
+ map = alt.layer(
+ # use the sphere of the Earth as the base layer
+ alt.Chart({'sphere': True}).mark_geoshape(
+ fill='#e6f3ff'
+ ),
+ # add a graticule for geographic reference lines
+ alt.Chart({'graticule': True}).mark_geoshape(
+ stroke='#ffffff', strokeWidth=1
+ ),
+ # and then the countries of the world
+ alt.Chart(alt.topo_feature(world, 'countries')).mark_geoshape(
+ fill='#2a1d0c', stroke='#706545', strokeWidth=0.5
+ )
+ ).properties(
+ width=600,
+ height=400
+ )
+ return (map,)
+
+
+@app.cell(hide_code=True)
+def _(mo):
+ mo.md(r"""
+ We can extend the map with a desired projection and draw the result. Here we apply a [Natural Earth projection](https://en.wikipedia.org/wiki/Natural_Earth_projection). The _sphere_ layer provides the light blue background; the _graticule_ layer provides the white geographic reference lines.
+ """)
+ return
+
+
+@app.cell
+def _(map):
+ map.project(
+ type='naturalEarth1', scale=110, translate=[300, 200]
+ ).configure_view(stroke=None)
+ return
+
+
+@app.cell(hide_code=True)
+def _(mo):
+ mo.md(r"""
+ ## Point Maps
+
+ In addition to the _geometric_ data provided by GeoJSON or TopoJSON files, many tabular datasets include geographic information in the form of fields for `longitude` and `latitude` coordinates, or references to geographic regions such as country names, state names, postal codes, _etc._, which can be mapped to coordinates using a [geocoding service](https://en.wikipedia.org/wiki/Geocoding). In some cases, location data is rich enough that we can see meaningful patterns by projecting the data points alone — no base map required!
+
+ Let's look at a dataset of 5-digit zip codes in the United States, including `longitude`, `latitude` coordinates for each post office in addition to a `zip_code` field.
+ """)
+ return
+
+
+@app.cell
+def _(data):
+ zipcodes = data.zipcodes.url
+ zipcodes
+ return (zipcodes,)
+
+
+@app.cell(hide_code=True)
+def _(mo):
+ mo.md(r"""
+ We can visualize each post office location using a small (1-pixel) `square` mark. However, to set the positions we do _not_ use `x` and `y` channels. _Why is that?_
+
+    While cartographic projections map (`longitude`, `latitude`) coordinates to (`x`, `y`) coordinates, they can do so in arbitrary ways. There is no guarantee, for example, that `longitude` β `x` and `latitude` β `y`! Instead, Altair includes special `longitude` and `latitude` encoding channels to handle geographic coordinates. These channels indicate which data fields should be mapped to `longitude` and `latitude` coordinates, and then apply a projection to map those coordinates to (`x`, `y`) positions.
+ """)
+ return
+
+
+@app.cell
+def _(alt, zipcodes):
+ alt.Chart(zipcodes).mark_square(
+ size=1, opacity=1
+ ).encode(
+ longitude='longitude:Q', # apply the field named 'longitude' to the longitude channel
+ latitude='latitude:Q' # apply the field named 'latitude' to the latitude channel
+ ).project(
+ type='albersUsa'
+ ).properties(
+ width=900,
+ height=500
+ ).configure_view(
+ stroke=None
+ )
+ return
+
+
+@app.cell(hide_code=True)
+def _(mo):
+ mo.md(r"""
+ _Plotting zip codes only, we can see the outline of the United States and discern meaningful patterns in the density of post offices, without a base map or additional reference elements!_
+
+ We use the `albersUsa` projection, which takes some liberties with the actual geometry of the Earth, with scaled versions of Alaska and Hawaii in the bottom-left corner. As we did not specify projection `scale` or `translate` parameters, Altair sets them automatically to fit the visualized data.
+
+ We can now go on to ask more questions of our dataset. For example, is there any rhyme or reason to the allocation of zip codes? To assess this question we can add a color encoding based on the first digit of the zip code. We first add a `calculate` transform to extract the first digit, and encode the result using the color channel:
+ """)
+ return
+
+
+@app.cell
+def _(alt, zipcodes):
+ alt.Chart(zipcodes).transform_calculate(
+ digit='datum.zip_code[0]'
+ ).mark_square(
+ size=2, opacity=1
+ ).encode(
+ longitude='longitude:Q',
+ latitude='latitude:Q',
+ color='digit:N'
+ ).project(
+ type='albersUsa'
+ ).properties(
+ width=900,
+ height=500
+ ).configure_view(
+ stroke=None
+ )
+ return
+
+
+@app.cell(hide_code=True)
+def _(mo):
+ mo.md(r"""
+ _To zoom in on a specific digit, add a filter transform to limit the data shown! Try adding an [interactive selection](https://github.com/uwdata/visualization-curriculum/blob/master/altair_interaction.ipynb) to filter to a single digit and dynamically update the map. And be sure to use strings (\`'1'\`) instead of numbers (\`1\`) when filtering digit values!_
+
+ (This example is inspired by Ben Fry's classic [zipdecode](https://benfry.com/zipdecode/) visualization!)
+
+ We might further wonder what the _sequence_ of zip codes might indicate. One way to explore this question is to connect each consecutive zip code using a `line` mark, as done in Robert Kosara's [ZipScribble](https://eagereyes.org/zipscribble-maps/united-states) visualization:
+ """)
+ return
+
+
+@app.cell
+def _(alt, zipcodes):
+ alt.Chart(zipcodes).transform_filter(
+ '-150 < datum.longitude && 22 < datum.latitude && datum.latitude < 55'
+ ).transform_calculate(
+ digit='datum.zip_code[0]'
+ ).mark_line(
+ strokeWidth=0.5
+ ).encode(
+ longitude='longitude:Q',
+ latitude='latitude:Q',
+ color='digit:N',
+ order='zip_code:O'
+ ).project(
+ type='albersUsa'
+ ).properties(
+ width=900,
+ height=500
+ ).configure_view(
+ stroke=None
+ )
+ return
+
+
+@app.cell(hide_code=True)
+def _(mo):
+ mo.md(r"""
+ _We can now see how zip codes further cluster into smaller areas, indicating a hierarchical allocation of codes by location, but with some notable variability within local clusters._
+
+ If you were paying careful attention to our earlier maps, you may have noticed that there are zip codes being plotted in the upper-left corner! These correspond to locations such as Puerto Rico or American Samoa, which contain U.S. zip codes but are mapped to `null` coordinates (`0`, `0`) by the `albersUsa` projection. In addition, Alaska and Hawaii can complicate our view of the connecting line segments. In response, the code above includes an additional filter that removes points outside our chosen `longitude` and `latitude` spans.
+
+ _Remove the filter above to see what happens!_
+ """)
+ return
+
+
+@app.cell(hide_code=True)
+def _(mo):
+ mo.md(r"""
+ ## Symbol Maps
+ """)
+ return
+
+
+@app.cell(hide_code=True)
+def _(mo):
+ mo.md(r"""
+ Now let's combine a base map and plotted data as separate layers. We'll examine the U.S. commercial flight network, considering both airports and flight routes. To do so, we'll need three datasets.
+ For our base map, we'll use a TopoJSON file for the United States at 10m resolution, containing features for `states` or `counties`:
+ """)
+ return
+
+
+@app.cell
+def _(data):
+ usa = data.us_10m.url
+ usa
+ return (usa,)
+
+
+@app.cell(hide_code=True)
+def _(mo):
+ mo.md(r"""
+ For the airports, we will use a dataset with fields for the `longitude` and `latitude` coordinates of each airport as well as the `iata` airport code — for example, `'SEA'` for [Seattle-Tacoma International Airport](https://en.wikipedia.org/wiki/Seattle%E2%80%93Tacoma_International_Airport).
+ """)
+ return
+
+
+@app.cell
+def _(data):
+ airports = data.airports.url
+ airports
+ return (airports,)
+
+
+@app.cell(hide_code=True)
+def _(mo):
+ mo.md(r"""
+ Finally, we will use a dataset of flight routes, which contains `origin` and `destination` fields with the IATA codes for the corresponding airports:
+ """)
+ return
+
+
+@app.cell
+def _(data):
+ flights = data.flights_airport.url
+ flights
+ return (flights,)
+
+
+@app.cell(hide_code=True)
+def _(mo):
+ mo.md(r"""
+ Let's start by creating a base map using the `albersUsa` projection, and add a layer that plots `circle` marks for each airport:
+ """)
+ return
+
+
+@app.cell
+def _(airports, alt, usa):
+ alt.layer(
+ alt.Chart(alt.topo_feature(usa, 'states')).mark_geoshape(
+ fill='#ddd', stroke='#fff', strokeWidth=1
+ ),
+ alt.Chart(airports).mark_circle(size=9).encode(
+ latitude='latitude:Q',
+ longitude='longitude:Q',
+ tooltip='iata:N'
+ )
+ ).project(
+ type='albersUsa'
+ ).properties(
+ width=900,
+ height=500
+ ).configure_view(
+ stroke=None
+ )
+ return
+
+
+@app.cell(hide_code=True)
+def _(mo):
+ mo.md(r"""
+ _That's a lot of airports! Obviously, not all of them are major hubs._
+
+ Similar to our zip codes dataset, our airport data includes points that lie outside the continental United States. So we again see points in the upper-left corner. We might want to filter these points, but to do so we first need to know more about them.
+
+ _Update the map projection above to `albers` – side-stepping the idiosyncratic behavior of `albersUsa` – so that the actual locations of these additional points is revealed!_
+
+ Now, instead of showing all airports in an undifferentiated fashion, let's identify major hubs by considering the total number of routes that originate at each airport. We'll use the `routes` dataset as our primary data source: it contains a list of flight routes that we can aggregate to count the number of routes for each `origin` airport.
+
+ However, the `routes` dataset does not include the _locations_ of the airports! To augment the `routes` data with locations, we need a new data transformation: `lookup`. The `lookup` transform takes a field value in a primary dataset and uses it as a _key_ to look up related information in another table. In this case, we want to match the `origin` airport code in our `routes` dataset against the `iata` field of the `airports` dataset, then extract the corresponding `latitude` and `longitude` fields.
+ """)
+ return
+
+
+@app.cell
+def _(airports, alt, flights, usa):
+ alt.layer(
+ alt.Chart(alt.topo_feature(usa, 'states')).mark_geoshape(
+ fill='#ddd', stroke='#fff', strokeWidth=1
+ ),
+ alt.Chart(flights).mark_circle().transform_aggregate(
+ groupby=['origin'],
+ routes='count()'
+ ).transform_lookup(
+ lookup='origin',
+ from_=alt.LookupData(data=airports, key='iata',
+ fields=['state', 'latitude', 'longitude'])
+ ).transform_filter(
+ 'datum.state !== "PR" && datum.state !== "VI"'
+ ).encode(
+ latitude='latitude:Q',
+ longitude='longitude:Q',
+ tooltip=['origin:N', 'routes:Q'],
+ size=alt.Size('routes:Q', scale=alt.Scale(range=[0, 1000]), legend=None),
+ order=alt.Order('routes:Q', sort='descending')
+ )
+ ).project(
+ type='albersUsa'
+ ).properties(
+ width=900,
+ height=500
+ ).configure_view(
+ stroke=None
+ )
+ return
+
+
+@app.cell(hide_code=True)
+def _(mo):
+ mo.md(r"""
+ _Which U.S. airports have the highest number of outgoing routes?_
+
+    Now that we can see the airports, we may wish to interact with them to better understand the structure of the air traffic network. We can add a `rule` mark layer to represent paths from `origin` airports to `destination` airports, which requires two `lookup` transforms to retrieve coordinates for each end point. In addition, we can use a `single` selection to filter these routes, such that only the routes originating at the currently selected airport are shown.
+
+ _Starting from the static map above, can you build an interactive version? Feel free to skip the code below to engage with the interactive map first, and think through how you might build it on your own!_
+ """)
+ return
+
+
+@app.cell
+def _(airports, alt, flights, usa):
+ # interactive selection for origin airport
+ # select nearest airport to mouse cursor
+ origin = alt.selection_point(
+ on='mouseover', nearest=True,
+ fields=['origin'], empty='none'
+ )
+
+ # shared data reference for lookup transforms
+ foreign = alt.LookupData(data=airports, key='iata',
+ fields=['latitude', 'longitude'])
+
+ alt.layer(
+ # base map of the United States
+ alt.Chart(alt.topo_feature(usa, 'states')).mark_geoshape(
+ fill='#ddd', stroke='#fff', strokeWidth=1
+ ),
+ # route lines from selected origin airport to destination airports
+ alt.Chart(flights).mark_rule(
+ color='#000', opacity=0.35
+ ).transform_filter(
+ origin # filter to selected origin only
+ ).transform_lookup(
+ lookup='origin', from_=foreign # origin lat/lon
+ ).transform_lookup(
+ lookup='destination', from_=foreign, as_=['lat2', 'lon2'] # dest lat/lon
+ ).encode(
+ latitude='latitude:Q',
+ longitude='longitude:Q',
+ latitude2='lat2',
+ longitude2='lon2',
+ ),
+ # size airports by number of outgoing routes
+ # 1. aggregate flights-airport data set
+ # 2. lookup location data from airports data set
+ # 3. remove Puerto Rico (PR) and Virgin Islands (VI)
+ alt.Chart(flights).mark_circle().transform_aggregate(
+ groupby=['origin'],
+ routes='count()'
+ ).transform_lookup(
+ lookup='origin',
+ from_=alt.LookupData(data=airports, key='iata',
+ fields=['state', 'latitude', 'longitude'])
+ ).transform_filter(
+ 'datum.state !== "PR" && datum.state !== "VI"'
+ ).add_params(
+ origin
+ ).encode(
+ latitude='latitude:Q',
+ longitude='longitude:Q',
+ tooltip=['origin:N', 'routes:Q'],
+ size=alt.Size('routes:Q', scale=alt.Scale(range=[0, 1000]), legend=None),
+ order=alt.Order('routes:Q', sort='descending') # place smaller circles on top
+ )
+ ).project(
+ type='albersUsa'
+ ).properties(
+ width=900,
+ height=500
+ ).configure_view(
+ stroke=None
+ )
+ return
+
+
+@app.cell(hide_code=True)
+def _(mo):
+ mo.md(r"""
+ _Mouseover the map to probe the flight network!_
+ """)
+ return
+
+
+@app.cell(hide_code=True)
+def _(mo):
+ mo.md(r"""
+ ## Choropleth Maps
+ """)
+ return
+
+
+@app.cell(hide_code=True)
+def _(mo):
+ mo.md(r"""
+ A [choropleth map](https://en.wikipedia.org/wiki/Choropleth_map) uses shaded or textured regions to visualize data values. Sized symbol maps are often more accurate to read, as people tend to be better at estimating proportional differences between the area of circles than between color shades. Nevertheless, choropleth maps are popular in practice and particularly useful when too many symbols become perceptually overwhelming.
+
+ For example, while the United States only has 50 states, it has thousands of counties within those states. Let's build a choropleth map of the unemployment rate per county, back in the recession year of 2008. In some cases, input GeoJSON or TopoJSON files might include statistical data that we can directly visualize. In this case, however, we have two files: our TopoJSON file that includes county boundary features (`usa`), and a separate text file that contains unemployment statistics:
+ """)
+ return
+
+
+@app.cell
+def _(data):
+ unemp = data.unemployment.url
+ unemp
+ return (unemp,)
+
+
+@app.cell(hide_code=True)
+def _(mo):
+ mo.md(r"""
+ To integrate our data sources, we will again need to use the `lookup` transform, augmenting our TopoJSON-based `geoshape` data with unemployment rates. We can then create a map that includes a `color` encoding for the looked-up `rate` field.
+ """)
+ return
+
+
+@app.cell
+def _(alt, unemp, usa):
+ alt.Chart(alt.topo_feature(usa, 'counties')).mark_geoshape(
+ stroke='#aaa', strokeWidth=0.25
+ ).transform_lookup(
+ lookup='id', from_=alt.LookupData(data=unemp, key='id', fields=['rate'])
+ ).encode(
+ alt.Color('rate:Q',
+ scale=alt.Scale(domain=[0, 0.3], clamp=True),
+ legend=alt.Legend(format='%')),
+ alt.Tooltip('rate:Q', format='.0%')
+ ).project(
+ type='albersUsa'
+ ).properties(
+ width=900,
+ height=500
+ ).configure_view(
+ stroke=None
+ )
+ return
+
+
+@app.cell(hide_code=True)
+def _(mo):
+ mo.md(r"""
+ *Examine the unemployment rates by county. Higher values in Michigan may relate to the automotive industry. Counties in the [Great Plains](https://en.wikipedia.org/wiki/Great_Plains) and Mountain states exhibit both low **and** high rates. Is this variation meaningful, or is it possibly an [artifact of lower sample sizes](https://medium.com/@uwdata/surprise-maps-showing-the-unexpected-e92b67398865)? To explore further, try changing the upper scale domain (e.g., to `0.2`) to adjust the color mapping.*
+
+    A central concern for choropleth maps is the choice of colors. Above, we use Altair's default `'yellowgreenblue'` scheme for heatmaps. Below we compare other schemes, including a _single-hue sequential_ scheme (`tealblues`) that varies in luminance only, a _multi-hue sequential_ scheme (`viridis`) that ramps in both luminance and hue, and a _diverging_ scheme (`blueorange`) that uses a white mid-point:
+ """)
+ return
+
+
+@app.cell
+def _(alt, unemp, usa):
+ # utility function to generate a map specification for a provided color scheme
+ def map_(scheme):
+ return alt.Chart().mark_geoshape().project(type='albersUsa').encode(
+ alt.Color('rate:Q', scale=alt.Scale(scheme=scheme), legend=None)
+ ).properties(width=305, height=200)
+
+ alt.hconcat(
+ map_('tealblues'), map_('viridis'), map_('blueorange'),
+ data=alt.topo_feature(usa, 'counties')
+ ).transform_lookup(
+ lookup='id', from_=alt.LookupData(data=unemp, key='id', fields=['rate'])
+ ).configure_view(
+ stroke=None
+ ).resolve_scale(
+ color='independent'
+ )
+ return
+
+
+@app.cell(hide_code=True)
+def _(mo):
+ mo.md(r"""
+ _Which color schemes do you find to be more effective? Why might that be? Modify the maps above to use other available schemes, as described in the [Vega Color Schemes documentation](https://vega.github.io/vega/docs/schemes/)._
+ """)
+ return
+
+
+@app.cell(hide_code=True)
+def _(mo):
+ mo.md(r"""
+ ## Cartographic Projections
+ """)
+ return
+
+
+@app.cell(hide_code=True)
+def _(mo):
+ mo.md(r"""
+ Now that we have some experience creating maps, let's take a closer look at cartographic projections. As explained by [Wikipedia](https://en.wikipedia.org/wiki/Map_projection),
+
+ > _All map projections necessarily distort the surface in some fashion. Depending on the purpose of the map, some distortions are acceptable and others are not; therefore, different map projections exist in order to preserve some properties of the sphere-like body at the expense of other properties._
+
+ Some of the properties we might wish to consider include:
+
+ - _Area_: Does the projection distort region sizes?
+ - _Bearing_: Does a straight line correspond to a constant direction of travel?
+ - _Distance_: Do lines of equal length correspond to equal distances on the globe?
+ - _Shape_: Does the projection preserve spatial relations (angles) between points?
+
+ Selecting an appropriate projection thus depends on the use case for the map. For example, if we are assessing land use and the extent of land matters, we might choose an area-preserving projection. If we want to visualize shockwaves emanating from an earthquake, we might focus the map on the quake's epicenter and preserve distances outward from that point. Or, if we wish to aid navigation, the preservation of bearing and shape may be more important.
+
+ We can also characterize projections in terms of the _projection surface_. Cylindrical projections, for example, project surface points of the sphere onto a surrounding cylinder; the "unrolled" cylinder then provides our map. As we further describe below, we might alternatively project onto the surface of a cone (conic projections) or directly onto a flat plane (azimuthal projections).
+
+ *Let's first build up our intuition by interacting with a variety of projections! **[Open the online Vega-Lite Cartographic Projections notebook](https://observablehq.com/@vega/vega-lite-cartographic-projections).** Use the controls on that page to select a projection and explore projection parameters, such as the `scale` (zooming) and x/y translation (panning). The rotation ([yaw, pitch, roll](https://en.wikipedia.org/wiki/Aircraft_principal_axes)) controls determine the orientation of the globe relative to the surface being projected upon.*
+ """)
+ return
+
+
+@app.cell(hide_code=True)
+def _(mo):
+ mo.md(r"""
+ ### A Tour of Specific Projection Types
+ """)
+ return
+
+
+@app.cell(hide_code=True)
+def _(mo):
+ mo.md(r"""
+ [**Cylindrical projections**](https://en.wikipedia.org/wiki/Map_projection#Cylindrical) map the sphere onto a surrounding cylinder, then unroll the cylinder. If the major axis of the cylinder is oriented north-south, meridians are mapped to straight lines. [Pseudo-cylindrical](https://en.wikipedia.org/wiki/Map_projection#Pseudocylindrical) projections represent a central meridian as a straight line, with other meridians "bending" away from the center.
+ """)
+ return
+
+
+@app.cell
+def _(alt, map):
+ _minimap = map.properties(width=225, height=225)
+ alt.hconcat(_minimap.project(type='equirectangular').properties(title='equirectangular'), _minimap.project(type='mercator').properties(title='mercator'), _minimap.project(type='transverseMercator').properties(title='transverseMercator'), _minimap.project(type='naturalEarth1').properties(title='naturalEarth1')).properties(spacing=10).configure_view(stroke=None)
+ return
+
+
+@app.cell(hide_code=True)
+def _(mo):
+ mo.md(r"""
+ - [Equirectangular](https://en.wikipedia.org/wiki/Equirectangular_projection) (`equirectangular`): Scale `lat`, `lon` coordinate values directly.
+ - [Mercator](https://en.wikipedia.org/wiki/Mercator_projection) (`mercator`): Project onto a cylinder, using `lon` directly, but subjecting `lat` to a non-linear transformation. Straight lines preserve constant compass bearings ([rhumb lines](https://en.wikipedia.org/wiki/Rhumb_line)), making this projection well-suited to navigation. However, areas in the far north or south can be greatly distorted.
+ - [Transverse Mercator](https://en.wikipedia.org/wiki/Transverse_Mercator_projection) (`transverseMercator`): A mercator projection, but with the bounding cylinder rotated to a transverse axis. Whereas the standard Mercator projection has highest accuracy along the equator, the Transverse Mercator projection is most accurate along the central meridian.
+ - [Natural Earth](https://en.wikipedia.org/wiki/Natural_Earth_projection) (`naturalEarth1`): A pseudo-cylindrical projection designed for showing the whole Earth in one view.
+
+ """)
+ return
+
+
+@app.cell(hide_code=True)
+def _(mo):
+ mo.md(r"""
+ [**Conic projections**](https://en.wikipedia.org/wiki/Map_projection#Conic) map the sphere onto a cone, and then unroll the cone on to the plane. Conic projections are configured by two _standard parallels_, which determine where the cone intersects the globe.
+ """)
+ return
+
+
+@app.cell
+def _(alt, map):
+ _minimap = map.properties(width=180, height=130)
+ alt.hconcat(_minimap.project(type='conicEqualArea').properties(title='conicEqualArea'), _minimap.project(type='conicEquidistant').properties(title='conicEquidistant'), _minimap.project(type='conicConformal', scale=35, translate=[90, 65]).properties(title='conicConformal'), _minimap.project(type='albers').properties(title='albers'), _minimap.project(type='albersUsa').properties(title='albersUsa')).properties(spacing=10).configure_view(stroke=None)
+ return
+
+
+@app.cell(hide_code=True)
+def _(mo):
+ mo.md(r"""
+ - [Conic Equal Area](https://en.wikipedia.org/wiki/Albers_projection) (`conicEqualArea`): Area-preserving conic projection. Shape and distance are not preserved, but roughly accurate within standard parallels.
+ - [Conic Equidistant](https://en.wikipedia.org/wiki/Equidistant_conic_projection) (`conicEquidistant`): Conic projection that preserves distance along the meridians and standard parallels.
+ - [Conic Conformal](https://en.wikipedia.org/wiki/Lambert_conformal_conic_projection) (`conicConformal`): Conic projection that preserves shape (local angles), but not area or distance.
+ - [Albers](https://en.wikipedia.org/wiki/Albers_projection) (`albers`): A variant of the conic equal area projection with standard parallels optimized for creating maps of the United States.
+ - [Albers USA](https://en.wikipedia.org/wiki/Albers_projection) (`albersUsa`): A hybrid projection for the 50 states of the United States of America. This projection stitches together three Albers projections with different parameters for the continental U.S., Alaska, and Hawaii.
+
+ """)
+ return
+
+
+@app.cell(hide_code=True)
+def _(mo):
+ mo.md(r"""
+ [**Azimuthal projections**](https://en.wikipedia.org/wiki/Map_projection#Azimuthal_%28projections_onto_a_plane%29) map the sphere directly onto a plane.
+ """)
+ return
+
+
+@app.cell
+def _(alt, map):
+ _minimap = map.properties(width=180, height=180)
+ alt.hconcat(_minimap.project(type='azimuthalEqualArea').properties(title='azimuthalEqualArea'), _minimap.project(type='azimuthalEquidistant').properties(title='azimuthalEquidistant'), _minimap.project(type='orthographic').properties(title='orthographic'), _minimap.project(type='stereographic').properties(title='stereographic'), _minimap.project(type='gnomonic').properties(title='gnomonic')).properties(spacing=10).configure_view(stroke=None)
+ return
+
+
+@app.cell(hide_code=True)
+def _(mo):
+ mo.md(r"""
+ - [Azimuthal Equal Area](https://en.wikipedia.org/wiki/Lambert_azimuthal_equal-area_projection) (`azimuthalEqualArea`): Accurately projects area in all parts of the globe, but does not preserve shape (local angles).
+ - [Azimuthal Equidistant](https://en.wikipedia.org/wiki/Azimuthal_equidistant_projection) (`azimuthalEquidistant`): Preserves proportional distance from the projection center to all other points on the globe.
+ - [Orthographic](https://en.wikipedia.org/wiki/Orthographic_projection_in_cartography) (`orthographic`): Projects a visible hemisphere onto a distant plane. Approximately matches a view of the Earth from outer space.
+ - [Stereographic](https://en.wikipedia.org/wiki/Stereographic_projection) (`stereographic`): Preserves shape, but not area or distance.
+ - [Gnomonic](https://en.wikipedia.org/wiki/Gnomonic_projection) (`gnomonic`): Projects the surface of the sphere directly onto a tangent plane. [Great circles](https://en.wikipedia.org/wiki/Great_circle) around the Earth are projected to straight lines, showing the shortest path between points.
+
+ """)
+ return
+
+
+@app.cell(hide_code=True)
+def _(mo):
+ mo.md(r"""
+ ## Coda: Wrangling Geographic Data
+ """)
+ return
+
+
+@app.cell(hide_code=True)
+def _(mo):
+ mo.md(r"""
+    The examples above all draw from the vega-datasets collection, including geometric (TopoJSON) and tabular (airports, unemployment rates) data. A common challenge to getting started with geographic visualization is collecting the necessary data for your task. A number of data providers abound, including services such as the [United States Geological Survey](https://www.usgs.gov/products/data/all-data) and [U.S. Census Bureau](https://www.census.gov/data/datasets.html).
+
+ In many cases you may have existing data with a geographic component, but require additional measures or geometry. To help you get started, here is one workflow:
+
+ 1. Visit [Natural Earth Data](http://www.naturalearthdata.com/downloads/) and browse to select data for regions and resolutions of interest. Download the corresponding zip file(s).
+ 2. Go to [MapShaper](https://mapshaper.org/) and drop your downloaded zip file onto the page. Revise the data as desired, and then "Export" generated TopoJSON or GeoJSON files.
+ 3. Load the exported data from MapShaper for use with Altair!
+
+ Of course, many other tools – both open-source and proprietary – exist for working with geographic data. For more about geo-data wrangling and map creation, see Mike Bostock's tutorial series on [Command-Line Cartography](https://medium.com/@mbostock/command-line-cartography-part-1-897aa8f8ca2c).
+ """)
+ return
+
+
+@app.cell(hide_code=True)
+def _(mo):
+ mo.md(r"""
+ ## Summary
+
+ At this point, we've only dipped our toes into the waters of map-making. _(You didn't expect a single notebook to impart centuries of learning, did you?)_ For example, we left untouched topics such as [_cartograms_](https://en.wikipedia.org/wiki/Cartogram) and conveying [_topography_](https://en.wikipedia.org/wiki/Topography) — as in Imhof's illuminating book [_Cartographic Relief Presentation_](https://books.google.com/books?id=cVy1Ms43fFYC). Nevertheless, you should now be well-equipped to create a rich array of geo-visualizations. For more, MacEachren's book [_How Maps Work: Representation, Visualization, and Design_](https://books.google.com/books?id=xhAvN3B0CkUC) provides a valuable overview of map-making from the perspective of data visualization.
+ """)
+ return
+
+
+if __name__ == "__main__":
+ app.run()
diff --git a/altair/08_debugging.py b/altair/08_debugging.py
new file mode 100644
index 0000000000000000000000000000000000000000..c3195a040d5b8a9ed3e6fbaa8dc900fd61e6f72d
--- /dev/null
+++ b/altair/08_debugging.py
@@ -0,0 +1,369 @@
+# /// script
+# requires-python = ">=3.11"
+# dependencies = [
+# "altair==6.0.0",
+# "marimo",
+# "pandas==3.0.1",
+# ]
+# ///
+
+import marimo
+
+__generated_with = "0.20.4"
+app = marimo.App()
+
+
+@app.cell
+def _():
+ import marimo as mo
+
+ return (mo,)
+
+
+@app.cell(hide_code=True)
+def _(mo):
+ mo.md(r"""
+ # Altair Debugging Guide
+
+ In this notebook we show you common debugging techniques that you can use if you run into issues with Altair.
+
+ You can jump to the following sections:
+
+ * [Installation and Setup](#Installation) when Altair is not installed correctly
+ * [Display Issues](#Display-Troubleshooting) when you don't see a chart
+ * [Invalid Specifications](#Invalid-Specifications) when you get an error
+ * [Properties are Being Ignored](#Properties-are-Being-Ignored) when you don't see any errors or warnings
+ * [Asking for Help](#Asking-for-Help) when you get stuck
+ * [Reporting Issues](#Reporting-Issues) when you find a bug
+
+ In addition to this notebook, you might find the [Frequently Asked Questions](https://altair-viz.github.io/user_guide/faq.html) and [Display Troubleshooting](https://altair-viz.github.io/user_guide/troubleshooting.html) guides helpful.
+
+ _This notebook is part of the [data visualization curriculum](https://github.com/uwdata/visualization-curriculum)._
+ """)
+ return
+
+
+@app.cell(hide_code=True)
+def _(mo):
+ mo.md(r"""
+ ## Installation
+ """)
+ return
+
+
+@app.cell(hide_code=True)
+def _(mo):
+ mo.md(r"""
+ These instructions follow [the Altair documentation](https://altair-viz.github.io/getting_started/installation.html) but focus on some specifics for this series of notebooks.
+
+ In every notebook, we will import the [Altair](https://github.com/altair-viz/altair) package. If you are running this notebook on [Colab](https://colab.research.google.com), Altair should be preinstalled and ready to go. The notebooks in this series are designed for Colab but should also work in Jupyter Lab or the Jupyter Notebook (the notebook requires a bit more setup [described below](#Special-Setup-for-the-Jupyter-Notebook)) but additional packages are required.
+
+ If you are running in Jupyter Lab or Jupyter Notebooks, you have to install the necessary packages by running the following command in your terminal.
+
+ ```bash
+ pip install altair
+ ```
+
+ Or if you use [Conda](https://conda.io)
+
+ ```bash
+ conda install -c conda-forge altair
+ ```
+
+ You can run command line commands from a code cell by prefixing it with `!`. For example, to install Altair and Vega Datasets with [Pip](https://pip.pypa.io/), you can run the following cell.
+ """)
+ return
+
+
+@app.cell
+def _():
+ # packages added via marimo's package management: altair !pip install altair
+ return
+
+
+@app.cell
+def _():
+ import altair as alt
+ from altair.datasets import data
+
+ return alt, data
+
+
+@app.cell(hide_code=True)
+def _(mo):
+ mo.md(r"""
+ ### Make sure you are Using the Latest Version of Altair
+ """)
+ return
+
+
+@app.cell(hide_code=True)
+def _(mo):
+ mo.md(r"""
+ If you are running into issues with Altair, first make sure that you are running the latest version. To check the version of Altair that you have installed, run the cell below.
+ """)
+ return
+
+
+@app.cell
+def _(alt):
+ alt.__version__
+ return
+
+
+@app.cell(hide_code=True)
+def _(mo):
+ mo.md(r"""
+ To check what the latest version of altair is, go to [this page](https://pypi.org/project/altair/) or run the cell below (requires Python 3).
+ """)
+ return
+
+
+@app.cell
+def _():
+ import urllib.request, json
+ with urllib.request.urlopen("https://pypi.org/pypi/altair/json") as url:
+ print(json.loads(url.read().decode())['info']['version'])
+ return
+
+
+@app.cell(hide_code=True)
+def _(mo):
+ mo.md(r"""
+ If you are not running the latest version, you can update it with `pip`. You can update Altair and Vega Datasets by running this command in your terminal.
+
+ ```
+ pip install -U altair
+ ```
+ """)
+ return
+
+
+@app.cell(hide_code=True)
+def _(mo):
+ mo.md(r"""
+ ### Try Making a Chart
+ """)
+ return
+
+
+@app.cell(hide_code=True)
+def _(mo):
+ mo.md(r"""
+ Now you can create an Altair chart.
+ """)
+ return
+
+
+@app.cell
+def _(alt, data):
+ cars = data.cars()
+
+ alt.Chart(cars).mark_point().encode(
+ x='Horsepower',
+ y='Displacement',
+ color='Origin'
+ )
+ return (cars,)
+
+
+@app.cell(hide_code=True)
+def _(mo):
+ mo.md(r"""
+ ### Special Setup for the Jupyter Notebook
+ """)
+ return
+
+
+@app.cell(hide_code=True)
+def _(mo):
+ mo.md(r"""
+ If you are running in Jupyter Lab, Jupyter Notebook, or Colab (and have a working Internet connection) you should be seeing a chart. If you are running in another environment (or offline), you will need to tell Altair to use a different renderer.
+
+ To activate a different renderer in a notebook cell:
+
+ ```python
+ # to run in nteract, VSCode, or offline in JupyterLab
+ alt.renderers.enable('mimebundle')
+
+ ```
+
+ To run offline in Jupyter Notebook you must install an additional dependency, the `vega` package. Run this command in your terminal:
+
+ ```bash
+ pip install vega
+ ```
+
+ Then activate the notebook renderer:
+
+ ```python
+ # to run offline in Jupyter Notebook
+ alt.renderers.enable('notebook')
+
+ ```
+
+
+ These instructions follow [the instructions on the Altair website](https://altair-viz.github.io/getting_started/installation.html#installation-notebook).
+ """)
+ return
+
+
+@app.cell(hide_code=True)
+def _(mo):
+ mo.md(r"""
+ ## Display Troubleshooting
+
+ If you are having issues with seeing a chart, make sure your setup is correct by following the [debugging instructions above](#Installation). If you are still having issues, follow the [instructions about debugging display issues in the Altair documentation](https://iliatimofeev.github.io/altair-viz.github.io/user_guide/troubleshooting.html).
+ """)
+ return
+
+
+@app.cell(hide_code=True)
+def _(mo):
+ mo.md(r"""
+ ### Non Existent Fields
+
+ A common error is [accidentally using a field that does not exist](https://iliatimofeev.github.io/altair-viz.github.io/user_guide/troubleshooting.html#plot-displays-but-the-content-is-empty).
+ """)
+ return
+
+
+@app.cell
+def _(alt):
+ import pandas as pd
+
+ df = pd.DataFrame({'x': [1, 2, 3],
+ 'y': [3, 1, 4]})
+
+ alt.Chart(df).mark_point().encode(
+ x='x:Q',
+ y='y:Q',
+ color='color:Q' # <-- this field does not exist in the data!
+ )
+ return (df,)
+
+
+@app.cell(hide_code=True)
+def _(mo):
+ mo.md(r"""
+ Check the spelling of your fields and print the data source to confirm that the data and fields exist. For instance, here you see that `color` is not a valid field.
+ """)
+ return
+
+
+@app.cell
+def _(df):
+ df.head()
+ return
+
+
+@app.cell(hide_code=True)
+def _(mo):
+ mo.md(r"""
+ ## Invalid Specifications
+
+ Another common issue is creating an invalid specification and getting an error.
+ """)
+ return
+
+
+@app.cell(hide_code=True)
+def _(mo):
+ mo.md(r"""
+ ### Invalid Properties
+
+ Altair might show a `SchemaValidationError` or `ValueError`. Read the error message carefully. Usually it will tell you what is going wrong.
+ """)
+ return
+
+
+@app.cell(hide_code=True)
+def _(mo):
+ mo.md(r"""
+ For example, if you forget the mark type, you will see this `SchemaValidationError`.
+ """)
+ return
+
+
+@app.cell
+def _(alt, cars):
+ alt.Chart(cars).encode(
+ y='Horsepower'
+ )
+ return
+
+
+@app.cell(hide_code=True)
+def _(mo):
+ mo.md(r"""
+ Or if you use a non-existent channel, you get a `TypeError`.
+ """)
+ return
+
+
+@app.cell
+def _(alt, cars):
+ try:
+ alt.Chart(cars).mark_point().encode(
+ z='Horsepower'
+ )
+ except TypeError as e:
+ print(f"TypeError: {e}")
+ return
+
+
+@app.cell(hide_code=True)
+def _(mo):
+ mo.md(r"""
+ ## Properties are Being Ignored
+
+ Altair might ignore a property that you specified. In the chart below, we are using a `text` channel, which is only compatible with `mark_text`. You do not see an error or a warning about this in the notebook. However, the underlying Vega-Lite library will show a warning in the browser console. Press Alt+Cmd+I on Mac or Alt+Ctrl+I on Windows and Linux to open the developer tools and click on the `Console` tab. When you run the example in the cell below, you will see the following warning.
+
+ ```
+ WARN text dropped as it is incompatible with "bar".
+ ```
+ """)
+ return
+
+
+@app.cell
+def _(alt, cars):
+ alt.Chart(cars).mark_bar().encode(
+ y='mean(Horsepower)',
+ text='mean(Acceleration)'
+ )
+ return
+
+
+@app.cell(hide_code=True)
+def _(mo):
+ mo.md(r"""
+ If you find yourself debugging issues related to Vega-Lite, you can open the chart in the [Vega Editor](https://vega.github.io/editor/) either by clicking on the "Open in Vega Editor" link at the bottom of the chart or in the action menu (click to open) at the top right of a chart. The Vega Editor provides additional debugging but you will be writing Vega-Lite JSON instead of Altair in Python.
+
+ **Note**: The Vega Editor may be using a newer version of Vega-Lite and so the behavior may vary.
+ """)
+ return
+
+
+@app.cell(hide_code=True)
+def _(mo):
+ mo.md(r"""
+ ## Asking for Help
+
+ If you find a problem with Altair and get stuck, you can ask a question on Stack Overflow. Ask your question with the `altair` and `vega-lite` tags. You can find a list of questions people have asked before [here](https://stackoverflow.com/questions/tagged/altair).
+ """)
+ return
+
+
+@app.cell(hide_code=True)
+def _(mo):
+ mo.md(r"""
+ ## Reporting Issues
+
+ If you find a problem with Altair and believe it is a bug, please [create an issue in the Altair GitHub repo](https://github.com/altair-viz/altair/issues/new) with a description of your problem. If you believe the issue is related to the underlying Vega-Lite library, please [create an issue in the Vega-Lite GitHub repo](https://github.com/vega/vega-lite/issues/new).
+ """)
+ return
+
+
+if __name__ == "__main__":
+ app.run()
diff --git a/altair/index.md b/altair/index.md
new file mode 100644
index 0000000000000000000000000000000000000000..8ab1bf2a4b347414482716e438465e0744c19f9f
--- /dev/null
+++ b/altair/index.md
@@ -0,0 +1,14 @@
+---
+title: Learn Altair
+description: >
+ Learn the basics of Altair, a declarative visualization library,
+ using lessons developed at the University of Washington.
+---
+
+## Acknowledgments
+
+These notebooks were created by Jeffrey Heer, Dominik Moritz, Jake VanderPlas, and Brock Craft
+as part of the [Visualization Curriculum](https://uwdata.github.io/visualization-curriculum/intro.html)
+at the University of Washington.
+Our thanks to the authors for making their work available under an open license:
+if we all share a little, we all get a lot.
diff --git a/assets/styles.css b/assets/styles.css
new file mode 100644
index 0000000000000000000000000000000000000000..ca85ec2c8e79d94affa672e8c8f2f51fe1aa8205
--- /dev/null
+++ b/assets/styles.css
@@ -0,0 +1,51 @@
+:root {
+ --primary-green: #10B981;
+ --dark-green: #047857;
+ --light-green: #D1FAE5;
+}
+.bg-primary { background-color: var(--primary-green); }
+.text-primary { color: var(--primary-green); }
+.border-primary { border-color: var(--primary-green); }
+.bg-light { background-color: var(--light-green); }
+.hover-grow { transition: transform 0.2s ease; }
+.hover-grow:hover { transform: scale(1.02); }
+.card-shadow { box-shadow: 0 4px 6px rgba(0, 0, 0, 0.05), 0 1px 3px rgba(0, 0, 0, 0.1); }
+
+/* Prose styles for markdown-generated content */
+.prose h1 { font-size: 1.875rem; font-weight: 700; color: #1f2937; margin: 1.5rem 0 0.75rem; }
+.prose h2 { font-size: 1.5rem; font-weight: 700; color: #1f2937; margin: 1.5rem 0 0.75rem; }
+.prose h3 { font-size: 1.25rem; font-weight: 600; color: #1f2937; margin: 1.25rem 0 0.5rem; }
+.prose h4 { font-size: 1.125rem; font-weight: 600; color: #1f2937; margin: 1rem 0 0.5rem; }
+.prose p { color: #4b5563; margin-bottom: 1rem; line-height: 1.75; }
+.prose ul { list-style-type: disc; padding-left: 1.25rem; margin-bottom: 1rem; color: #4b5563; }
+.prose ol { list-style-type: decimal; padding-left: 1.25rem; margin-bottom: 1rem; color: #4b5563; }
+.prose li { margin-bottom: 0.25rem; line-height: 1.75; }
+.prose a { color: var(--primary-green); }
+.prose a:hover { color: var(--dark-green); }
+.prose strong { font-weight: 600; }
+.prose code { font-family: ui-monospace, monospace; font-size: 0.875em;
+ background-color: #f3f4f6; padding: 0.1em 0.3em; border-radius: 0.25rem; }
+.prose pre { background-color: #f3f4f6; color: #1f2937; padding: 1rem;
+ border-radius: 0.5rem; overflow-x: auto; margin-bottom: 1rem; }
+.prose pre code { background: none; padding: 0; font-size: 0.875rem; color: inherit; }
+
+/* Component classes */
+.logo-container { background-color: var(--light-green); padding: 0.25rem; border-radius: 0.5rem; }
+.card-accent { height: 0.5rem; background-color: var(--primary-green); }
+.feature-card { background-color: #ffffff; padding: 1.5rem; border-radius: 0.5rem;
+ box-shadow: 0 4px 6px rgba(0, 0, 0, 0.05), 0 1px 3px rgba(0, 0, 0, 0.1); }
+.content-card { background-color: #ffffff; border: 1px solid #e5e7eb; border-radius: 0.5rem;
+ overflow: hidden; box-shadow: 0 4px 6px rgba(0, 0, 0, 0.05), 0 1px 3px rgba(0, 0, 0, 0.1); }
+.icon-container { width: 3rem; height: 3rem; background-color: var(--light-green);
+ border-radius: 9999px; display: flex; align-items: center;
+ justify-content: center; margin-bottom: 1rem; }
+
+.link-primary { color: var(--primary-green); }
+.link-primary:hover { color: var(--dark-green); }
+
+.btn-primary { background-color: var(--primary-green); color: #ffffff; font-weight: 500;
+ border-radius: 0.375rem; transition: background-color 300ms ease-in-out; }
+.btn-primary:hover { background-color: var(--dark-green); }
+
+.footer-link { color: #d1d5db; transition: color 300ms ease-in-out; }
+.footer-link:hover { color: #ffffff; }
diff --git a/bin/build.py b/bin/build.py
new file mode 100644
index 0000000000000000000000000000000000000000..fe9cb56f6205291a2064f6dc1774fe155b27efdc
--- /dev/null
+++ b/bin/build.py
@@ -0,0 +1,93 @@
+#!/usr/bin/env python
+"""Generate a static site from Jinja2 templates and lesson data."""
+
+import argparse
+import datetime
+import json
+import re
+import shutil
+from pathlib import Path
+
+import frontmatter
+import markdown as md
+from jinja2 import Environment, FileSystemLoader
+
+from utils import get_notebook_title
+
+
+def transform_lessons(data: dict, root: Path) -> dict:
+ """Transform raw lesson data into template-ready form."""
+ for course_id, course in data.items():
+ desc = course.get("description", "").strip()
+ course["description_html"] = f"
-
-
-
-
-
-
-
-
-
diff --git a/sql/01_basic_select.py b/sql/01_basic_select.py
new file mode 100644
index 0000000000000000000000000000000000000000..c19fbbc0038ea14f3bb2291437ee50b74702f51d
--- /dev/null
+++ b/sql/01_basic_select.py
@@ -0,0 +1,382 @@
+# /// script
+# requires-python = ">=3.13"
+# dependencies = [
+# "marimo",
+# "marimo-learn>=0.7.0",
+# "polars==1.24.0",
+# "sqlalchemy",
+# ]
+# ///
+
+import marimo
+
+__generated_with = "0.20.4"
+app = marimo.App(width="medium")
+
+with app.setup:
+ import marimo as mo
+ import marimo_learn as mol
+ from marimo_learn import MultipleChoiceWidget, OrderingWidget
+ import sqlalchemy
+
+ db_path = mol.localize_file("penguins.db")
+ DATABASE_URL = f"sqlite:///{db_path}"
+ engine = sqlalchemy.create_engine(DATABASE_URL)
+
+
+@app.cell(hide_code=True)
+def _():
+ mo.md(r"""
+ # Basic Selection
+
+ This tutorial shows how to select values from a single table in a database using SQL. We have already made a connection between this notebook and our `penguins.db` database—we'll show you how to do that later—so let's have a look at the data in the `penguins` table.
+ """)
+ return
+
+
+@app.cell
+def _():
+ _df = mo.sql(
+ f"""
+ select * from penguins;
+ """,
+ engine=engine
+ )
+ return
+
+
+@app.cell(hide_code=True)
+def _():
+ mo.md(r"""
+ Almost every **query** in SQL starts with the word `select`. The value immediately after it tells the database manager what we want to see. In this case, we use the shorthand `*` to mean "all the columns". We then say `from penguins` to tell the database manager which table we want to get the data from. The semi-colon at the end marks the end of the query.
+
+ Note that the database manager doesn't format the output nicely, draw the little distribution histograms above columns, or give us the page-forward/page-backward controls: all the credit for that belongs to the Marimo notebook.
+ """)
+ return
+
+
+@app.cell(hide_code=True)
+def _():
+ mo.md(r"""
+ ## Choosing Columns
+
+ We don't have to select all of the columns every time we get data from a table. If we only want specific columns, we give their names instead of using `*` to mean "all". As the output below shows, the columns are displayed in the order in which we gave their names.
+ """)
+ return
+
+
+@app.cell
+def _():
+ _df = mo.sql(
+ f"""
+ select sex, island, species from penguins;
+ """,
+ engine=engine
+ )
+ return
+
+
+@app.cell(hide_code=True)
+def _():
+ mo.md(r"""
+ > Try editing the SQL in the query cell to change the column order, or to get the `bill_length_mm` column.
+ """)
+ return
+
+
+@app.cell(hide_code=True)
+def _():
+ mo.md(r"""
+ ## Upper and Lower Case
+
+ We can write the query above in any mixture of upper and lower case and get the same result.
+ """)
+ return
+
+
+@app.cell
+def _():
+ _df = mo.sql(
+ f"""
+ SELECT Sex, island, SPECIES frOM pEnGuInS;
+ """,
+ engine=engine
+ )
+ return
+
+
+@app.cell(hide_code=True)
+def _():
+ mo.md(r"""
+ Please don't do this: it makes your queries very hard to read. It *is* common to use upper case for keywords like `SELECT` and `FROM`, and lower case for column names like `penguins` and `island`; whatever you choose, the most important thing is to be consistent.
+ """)
+ return
+
+
+@app.cell(hide_code=True)
+def _():
+ mo.md(r"""
+ ## Sorting
+
+ When we look at a spreadsheet or a printed table, the rows are in a particular order. A database manager, on the other hand, might rearrange rows for the sake of efficiency as data is added or deleted, which means the rows displayed by `select` can be in whatever order it wants. If we want a particular order, we can add `order by` and the names of one or more columns to our query.
+
+ Note that we have split the query below across several lines to make it easier to read. Just as SQL doesn't care about upper and lower case, it doesn't care about line breaks. As our queries become larger and more complicated, formatting them like this will make them a lot easier to understand.
+ """)
+ return
+
+
+@app.cell
+def _():
+ _df = mo.sql(
+ f"""
+ select island, species, sex
+ from penguins
+ order by island, species;
+ """,
+ engine=engine
+ )
+ return
+
+
+@app.cell(hide_code=True)
+def _():
+ mo.md(r"""
+ If you page through the output from the query above, you'll see that our penguins have been ordered by island: Biscoe before Dream, and Dream before Torgersen. Within each of those groups, the penguins are sub-ordered by species (Adelie, Chinstrap, and then Gentoo). The penguins aren't ordered by sex, but they could be: as with island and species, the sorting goes from left to right.
+ """)
+ return
+
+
+@app.cell(hide_code=True)
+def _():
+ mo.md(r"""
+ > Try rearranging the order of columns in the `select` while leaving the order in `order by` alone and vice versa. Notice that you don't have to sort in the order in which the columns are displayed (but you usually should to make the output easier to understand).
+ """)
+ return
+
+
+@app.cell(hide_code=True)
+def _():
+ mo.md(r"""
+ > What do you think will happen if you select `island` and `species` but `order by sex`? How can you tell if your prediction is correct?
+ """)
+ return
+
+
+@app.cell(hide_code=True)
+def _():
+ mo.md(r"""
+ ## Limiting Output
+
+ The `penguins` table has 344 rows. If we only want to see the first five, we can add a `limit` clause to our query, which specifies the maximum number of rows we want.
+ """)
+ return
+
+
+@app.cell
+def _():
+ _df = mo.sql(
+ f"""
+ select * from penguins limit 5;
+ """,
+ engine=engine
+ )
+ return
+
+
+@app.cell(hide_code=True)
+def _():
+ mo.md(r"""
+ What if we want the next five? Or the five after that? To get those, we can add an offset, which is the number of rows to skip before selecting as many rows as we've asked for.
+ """)
+ return
+
+
+@app.cell
+def _():
+ _df = mo.sql(
+ f"""
+ select * from penguins
+ limit 5 offset 5;
+ """,
+ engine=engine
+ )
+ return
+
+
+@app.cell(hide_code=True)
+def _():
+ mo.md(r"""
+ Selecting one chunk of data after another is called **paging**. Applications frequently do this in order to save memory and bandwidth: people can't look at 100,000 rows at once, so there's usually no point grabbing that many in one gulp.
+ """)
+ return
+
+
+@app.cell(hide_code=True)
+def _():
+ mo.md(r"""
+ > Add a cell below to get rows 12 through 17 from the `penguins` table. Think carefully about what the `offset` and `limit` need to be to get precisely these rows.
+ """)
+ return
+
+
+@app.cell(hide_code=True)
+def _():
+ mo.md(r"""
+ > Try changing the query above to be `offset 5 limit 5`. Do you understand the result?
+ """)
+ return
+
+
+@app.cell(hide_code=True)
+def _():
+ mo.md(r"""
+ > 1. What happens if you specify a limit that is greater than the number of rows in the table?
+ > 1. What happens if you specify an offset that is greater than the number of rows in the table?
+ """)
+ return
+
+
+@app.cell(hide_code=True)
+def _():
+ mo.md(r"""
+ > Suppose your program is paging through a table while another application is adding and deleting rows. What would you want to happen? What do you think will happen?
+ """)
+ return
+
+
+@app.cell(hide_code=True)
+def _():
+ mo.md(r"""
+ ## Removing Duplicates
+
+ Suppose we want to find out which kinds of penguins were seen on which islands. We could scroll through the data, taking note of each unique (species, island) pair we see, but SQL will do this for us if we add the `distinct` keyword to our query.
+
+ Note that the query below includes a comment explaining what it does. While comments in Python start with `#`, comments in SQL start with `--` and run to the end of the line.
+ """)
+ return
+
+
+@app.cell
+def _():
+ _df = mo.sql(
+ f"""
+ -- Show unique (species, island) pairs.
+ select distinct species, island
+ from penguins;
+ """,
+ engine=engine
+ )
+ return
+
+
+@app.cell(hide_code=True)
+def _():
+ mo.md(r"""
+ > Modify the query above to show (island, species) instead of (species, island), and to sort by island name and then by species name.
+ """)
+ return
+
+
+@app.cell(hide_code=True)
+def _():
+ mo.md(r"""
+ ## Doing Calculations
+
+ The `penguins` table records the penguins' masses in grams (at least, that's what we think the `_g` suffix on the column name means). If we want the mass in kilograms, we can divide the given values by 1000.
+ """)
+ return
+
+
+@app.cell
+def _():
+ _df = mo.sql(
+ f"""
+ select species, sex, body_mass_g, body_mass_g / 1000
+ from penguins
+ limit 10;
+ """,
+ engine=engine
+ )
+ return
+
+
+@app.cell(hide_code=True)
+def _():
+ mo.md(r"""
+ The query above shows both the mass in grams and the mass in kilograms so that we can check the latter against the former. However, the name that the database manager automatically gives the calculated column isn't particularly readable. Let's use `as` to fix that.
+ """)
+ return
+
+
+@app.cell
+def _():
+ _df = mo.sql(
+ f"""
+ select species, sex, body_mass_g, body_mass_g / 1000 as mass_kg
+ from penguins
+ limit 10;
+ """,
+ engine=engine
+ )
+ return
+
+
+@app.cell(hide_code=True)
+def _():
+ mo.md(r"""
+ > Can you use `as` to select a column from the table but display it with a different name? Should you?
+ """)
+ return
+
+
+@app.cell(hide_code=True)
+def _():
+ mo.md(r"""
+ > Write a query to calculate the ratio of bill length and bill height for every penguin. Call the calculated column `bill_ratio`.
+ """)
+ return
+
+
+@app.cell(hide_code=True)
+def _():
+ mo.md(r"""
+ ## Check Understanding
+
+ 
+ """)
+ return
+
+
+@app.cell(hide_code=True)
+def _():
+ _widget = mo.ui.anywidget(
+ OrderingWidget(
+ question="Arrange these SQL clauses in the order they must appear in a query.",
+ items=["SELECT", "FROM", "ORDER BY", "LIMIT"],
+ )
+ )
+ _widget
+ return
+
+
+@app.cell(hide_code=True)
+def _():
+ _widget = mo.ui.anywidget(
+ MultipleChoiceWidget(
+ question="What does `SELECT *` mean in a SQL query?",
+ options=[
+ "Select only the first row of the table",
+ "Select all columns from the table",
+ "Select all rows but only the first column",
+ "Count the total number of rows",
+ ],
+ correct_answer=1,
+ explanation="* is shorthand for 'all columns'. SELECT * retrieves every column; the number of rows returned depends on whether you add WHERE, LIMIT, or other clauses.",
+ )
+ )
+ _widget
+ return
+
+
+if __name__ == "__main__":
+ app.run()
diff --git a/sql/02_filter.py b/sql/02_filter.py
new file mode 100644
index 0000000000000000000000000000000000000000..1b6be537f99c32c9836a5683d83144342971bd72
--- /dev/null
+++ b/sql/02_filter.py
@@ -0,0 +1,320 @@
+# /// script
+# requires-python = ">=3.13"
+# dependencies = [
+# "marimo",
+# "marimo-learn>=0.7.0",
+# "polars==1.24.0",
+# "sqlalchemy",
+# ]
+# ///
+import marimo
+
+__generated_with = "0.20.4"
+app = marimo.App(width="medium")
+
+
+@app.cell(hide_code=True)
+def _():
+ import marimo as mo
+ import marimo_learn as mol
+ import sqlalchemy
+
+ db_path = mol.localize_file("penguins.db")
+ DATABASE_URL = f"sqlite:///{db_path}"
+ engine = sqlalchemy.create_engine(DATABASE_URL)
+ return engine, mo, mol
+
+
+@app.cell(hide_code=True)
+def _():
+ from marimo_learn import MatchingWidget, MultipleChoiceWidget
+ return MatchingWidget, MultipleChoiceWidget
+
+
+@app.cell(hide_code=True)
+def _():
+ mo.md(r"""
+ # Filtering
+
+ The previous tutorial showed how to select specific columns from a database table, and how to page through the data that a query returns. However, people almost always **filter** data based on its properties rather than on its position in a table. To see how this works, let's look at the combinations of species, island, and sex in the `penguins` table.
+ """)
+ return
+
+
+@app.cell
+def _(penguins):
+ _df = mo.sql(
+ f"""
+ select distinct species, island, sex from penguins;
+ """,
+ engine=engine
+ )
+ return
+
+
+@app.cell(hide_code=True)
+def _():
+ mo.md(r"""
+ ## Equality
+
+ Suppose we only want to see penguins from Dream island, regardless of their species or sex. To get this, we add a `where` clause to our query.
+ """)
+ return
+
+
+@app.cell
+def _(penguins):
+ _df = mo.sql(
+ f"""
+ select distinct species, island, sex
+ from penguins
+ where island = "Dream";
+ """,
+ engine=engine
+ )
+ return
+
+
+@app.cell(hide_code=True)
+def _():
+ mo.md(r"""
+ There are several noteworthy things in this query:
+
+ 1. We don't have to use `distinct`. If we leave it out, we get *all* the penguins on Dream island. (We included it to make the output easier to read without paging.)
+ 2. The `where` clause *must* come after the `from` clause. SQL is very picky about ordering…
+ 3. We don't put quotation marks around `island` because it's the name of a column. We *do* put quotes around `"Dream"` because it's an actual literal piece of text.
+ 4. We use a single equals sign `=` to check for equality. This is different from most programming languages, which use `==`.
+ """)
+ return
+
+
+@app.cell(hide_code=True)
+def _():
+ mo.md(r"""
+ > Write a query to select all the Chinstrap penguins regardless of what island they're on.
+ """)
+ return
+
+
+@app.cell(hide_code=True)
+def _():
+ mo.md(r"""
+ > 1. Change the column name `island` to `ISLAND` and re-run the query: what happens?
+ > 2. Change the text value `"Dream"` to `"DREAM"`: what happens?
+ """)
+ return
+
+
+@app.cell(hide_code=True)
+def _():
+ mo.md(r"""
+ ## Comparisons
+
+ We can do all of the usual comparisons in SQL:
+
+ | name | symbol | example |
+ | :--- | ------ | :------ |
+ | less than | `<` | `body_mass_g < 3300` |
+ | less than or equal | `<=` | `flipper_length_mm < 200.0` |
+ | equal | `=` | `species = "Gentoo"` |
+ | not equal | `!=` or `<>` | `species != "Gentoo"` |
+ | greater than or equal | `>=` | `flipper_length_mm >= 200.0` |
+ | greater than | `>` | `body_mass_g > 3300` |
+
+ Comparing numbers is straightforward. When we compare text, the comparison uses dictionary order: A is less than B, AA is less than AB, and so on.
+ """)
+ return
+
+
+@app.cell(hide_code=True)
+def _():
+ mo.md(r"""
+ > Find all the penguins that _aren't_ on Torgersen island.
+ """)
+ return
+
+
+@app.cell(hide_code=True)
+def _():
+ mo.md(r"""
+ > Use `where`, `order by`, and `limit` to find the heaviest penguin. Use it again to find the lightest.
+ """)
+ return
+
+
+@app.cell(hide_code=True)
+def _():
+ mo.md(r"""
+ > What happens if we accidentally compare a number to text? For example, what happens if we select penguins where `species` is less than 3000, or where `body_mass_g` is greater than the letter 'M'?
+ """)
+ return
+
+
+@app.cell(hide_code=True)
+def _():
+ mo.md(r"""
+ ## Combining Conditions
+
+ We can combine conditions using `and` and `or`. `and` is the simpler of the two: when we write `where condition_1 and condition_2`, we get the rows where *both* conditions are true.
+ """)
+ return
+
+
+@app.cell
+def _(penguins):
+ _df = mo.sql(
+ f"""
+ select * from penguins
+ where species = 'Gentoo' and body_mass_g > 6000.0;
+ """,
+ engine=engine
+ )
+ return
+
+
+@app.cell(hide_code=True)
+def _():
+ mo.md(r"""
+ If we use `or`, we get rows where *either or both* condition is true. This is different from common English usage: if you tell a child that they can have an ice cream cone or a chocolate bar, you mean "either/or". When you use `or` in SQL, on the other hand, it means "if any of the conditions is true". For example, the query below gets all of the penguins on Biscoe island, as well as all of the Gentoo penguins. Some penguins satisfy both conditions (the Adelie penguins on Biscoe island), some satisfy just one (the Adelies on Torgersen and the Gentoos on Biscoe). Penguins that don't satisfy either, like Chinstrap penguins on Dream island, don't show up at all.
+ """)
+ return
+
+
+@app.cell
+def _(penguins):
+ _df = mo.sql(
+ f"""
+ select distinct species, island from penguins
+ where species = 'Adelie' or island = 'Biscoe';
+ """,
+ engine=engine
+ )
+ return
+
+
+@app.cell(hide_code=True)
+def _():
+ mo.md(r"""
+ We have written our `where` conditions as we would say them. Many programmers would wrap each condition in parentheses to make them easier to read.
+ """)
+ return
+
+
+@app.cell
+def _(penguins):
+ _df = mo.sql(
+ f"""
+ select distinct species, island from penguins
+ where (species = 'Adelie') or (island = 'Biscoe');
+ """,
+ engine=engine
+ )
+ return
+
+
+@app.cell(hide_code=True)
+def _():
+ mo.md(r"""
+ The more complex our conditions are, the more important it is to use parentheses to make sure everyone reading the query (including ourselves) understands what it means. The query below shows an example.
+ """)
+ return
+
+
+@app.cell
+def _(penguins):
+ _df = mo.sql(
+ f"""
+ select distinct species, island from penguins
+ where ((species = 'Adelie') and (island = 'Biscoe')) or (species = 'Chinstrap');
+ """,
+ engine=engine
+ )
+ return
+
+
+@app.cell(hide_code=True)
+def _():
+ mo.md(r"""
+ > Explain in simple terms what the condition in the query above is selecting.
+ """)
+ return
+
+
+@app.cell(hide_code=True)
+def _():
+ mo.md(r"""
+ > We can put `not` in front of a condition to invert its meaning. Use this to write a query that fetches the same rows as one with the condition `species != 'Chinstrap'`, but which uses `=` instead of `!=`.
+ """)
+ return
+
+
+@app.cell(hide_code=True)
+def _():
+ mo.md(r"""
+ > Does the expression `species not = 'Gentoo'` work?
+ """)
+ return
+
+
+@app.cell(hide_code=True)
+def _():
+ mo.md(r"""
+ > 1. Write a query to find all of the penguins whose bill length is greater than their bill depth.
+ > 2. Write another query to find all of the penguins whose bill length is less than their bill depth. What do you notice about the output of this query?
+ """)
+ return
+
+
+@app.cell(hide_code=True)
+def _():
+ mo.md(r"""
+ > The previous tutorial showed how to do calculations on the fly to (for example) produce a column called `mass_kg` showing the body mass of each penguin in kilograms. Can these on-the-fly columns be used in `where` conditions? To find out, write a query that finds all of the penguins that weigh more than 4.0 kg.
+ """)
+ return
+
+
+@app.cell(hide_code=True)
+def _():
+ mo.md(r"""
+ ## Check Understanding
+
+ 
+ """)
+ return
+
+
+@app.cell(hide_code=True)
+def _(MatchingWidget, mo):
+ _widget = mo.ui.anywidget(
+ MatchingWidget(
+ question="Match each SQL comparison operator to its meaning.",
+ left=["<", "!=", ">=", "="],
+ right=["equal to", "not equal to", "less than", "greater than or equal to"],
+ correct_matches={0: 2, 1: 1, 2: 3, 3: 0},
+ )
+ )
+ _widget
+ return
+
+
+@app.cell(hide_code=True)
+def _(MultipleChoiceWidget, mo):
+ _widget = mo.ui.anywidget(
+ MultipleChoiceWidget(
+ question="A query uses `WHERE species = 'Adelie' OR island = 'Biscoe'`. Which rows does it return?",
+ options=[
+ "Only rows where both conditions are true (Adelie penguins on Biscoe)",
+ "Rows where either condition is true, or both",
+ "Rows where species is Adelie but island is not Biscoe",
+ "Rows where island is Biscoe but species is not Adelie",
+ ],
+ correct_answer=1,
+ explanation="In SQL, OR returns every row where at least one condition is true. This includes rows satisfying just the first condition, just the second, or both simultaneously.",
+ )
+ )
+ _widget
+ return
+
+
+if __name__ == "__main__":
+ app.run()
diff --git a/sql/03_aggregate_group.py b/sql/03_aggregate_group.py
new file mode 100644
index 0000000000000000000000000000000000000000..b696881a7b3d0a2ebac6dc9df8c6cfc685750f04
--- /dev/null
+++ b/sql/03_aggregate_group.py
@@ -0,0 +1,418 @@
+# /// script
+# requires-python = ">=3.13"
+# dependencies = [
+# "marimo",
+# "marimo-learn>=0.7.0",
+# "polars==1.24.0",
+# "sqlalchemy",
+# ]
+# ///
+import marimo
+
+__generated_with = "0.20.4"
+app = marimo.App(width="medium")
+
+
+@app.cell(hide_code=True)
+def _():
+ import marimo as mo
+ import marimo_learn as mol
+ import sqlalchemy
+
+ db_path = mol.localize_file("penguins.db")
+ DATABASE_URL = f"sqlite:///{db_path}"
+ engine = sqlalchemy.create_engine(DATABASE_URL)
+ return engine, mo, mol
+
+
+@app.cell(hide_code=True)
+def _():
+ from marimo_learn import FlashcardWidget, LabelingWidget
+ return FlashcardWidget, LabelingWidget
+
+
+@app.cell(hide_code=True)
+def _():
+ mo.md(r"""
+ # Aggregating and Grouping
+
+ The queries we wrote in the previous two tutorials operated on each row separately. We often want to ask questions about groups of rows, such as "how heavy is the largest penguin we weighed?" or "how many Gentoo penguins did we see?" This tutorial looks first at how to write queries that **aggregate** data, and then at how to calculate aggregate values for several subsets of our data simultaneously.
+ """)
+ return
+
+
+@app.cell(hide_code=True)
+def _():
+ mo.md(r"""
+ ## Aggregation
+
+ Let's start by finding out how heavy the heaviest penguin in our dataset is. To do this, we use a function called `max`, and give it the name of the column it is to get data from. To make the result more readable, we will use `as` to call the result `heaviest`.
+ """)
+ return
+
+
+@app.cell
+def _(penguins):
+ _df = mo.sql(
+ f"""
+ select max(body_mass_g) as heaviest from penguins;
+ """,
+ engine=engine
+ )
+ return
+
+
+@app.cell(hide_code=True)
+def _():
+ mo.md(r"""
+ The query below shows the five most commonly used aggregation functions in SQL applied to different columns of the penguins data.
+ """)
+ return
+
+
+@app.cell
+def _(penguins):
+ _df = mo.sql(
+ f"""
+ select
+ avg(flipper_length_mm) as averagest,
+ count(species) as num_penguins,
+ max(body_mass_g) as heaviest,
+ min(flipper_length_mm) as shortest,
+ sum(body_mass_g) as total_mass
+ from penguins;
+ """,
+ engine=engine
+ )
+ return
+
+
+@app.cell(hide_code=True)
+def _():
+ mo.md(r"""
+ > How much do the penguins weigh in total?
+ """)
+ return
+
+
+@app.cell(hide_code=True)
+def _():
+ mo.md(r"""
+ > The function `length` calculates the number of characters in a piece of text. Write a query that returns the length of the longest island name in the database.
+ """)
+ return
+
+
+@app.cell(hide_code=True)
+def _():
+ mo.md(r"""
+ > The function `round` rounds off a number, e.g., `round(1.234, 1)` produces `1.2`. Use this to display the average flipper length of all the penguins rounded to one decimal place.
+ """)
+ return
+
+
+@app.cell(hide_code=True)
+def _():
+ mo.md(r"""
+ Note: rather than writing `count(species)` or `count(island)`, we often write `count(*)` to count the total number of rows. However, as we will see in the next tutorial, `count(species)` and `count(*)` can sometimes produce slightly different answers.
+ """)
+ return
+
+
+@app.cell(hide_code=True)
+def _():
+ mo.md(r"""
+ ## Grouping
+
+ The query shown above applies the aggregation function to all of the rows in the table. What happens if we combine aggregation with `limit`?
+ """)
+ return
+
+
+@app.cell
+def _(penguins):
+ _df = mo.sql(
+ f"""
+ select avg(body_mass_g) as avg_mass
+ from penguins
+ limit 10;
+ """,
+ engine=engine
+ )
+ return
+
+
+@app.cell(hide_code=True)
+def _():
+ mo.md(r"""
+ The order of operations here is important. `limit` applies to the *output* of a query rather than to its input, so SQL calculates the average of every row and only then limits the result — and since the aggregation produces a single row, the `limit` here has no effect. Filtering with `where` behaves differently: the filtering happens before SQL applies the function, which lets us do things like calculate the average mass of all the Gentoo penguins.
+ """)
+ return
+
+
+@app.cell
+def _(penguins):
+ _df = mo.sql(
+ f"""
+ select avg(body_mass_g) as avg_mass
+ from penguins
+ where species = 'Gentoo';
+ """,
+ engine=engine
+ )
+ return
+
+
+@app.cell(hide_code=True)
+def _():
+ mo.md(r"""
+ But what if we want to calculate the average mass for all of the species? We could write three queries, one for each species, but (a) that would be annoying and (b) if someone adds Emperor penguins to the data and we don't remember to update our query, we won't get the full picture.
+
+ What we should do instead is tell SQL to group the data based on the values in one or more columns, and then calculate the aggregate value within each group.
+ """)
+ return
+
+
+@app.cell
+def _(penguins):
+ _df = mo.sql(
+ f"""
+ select avg(body_mass_g) as avg_mass
+ from penguins
+ group by species;
+ """,
+ engine=engine
+ )
+ return
+
+
+@app.cell(hide_code=True)
+def _():
+ mo.md(r"""
+ Since there are three species, we get three rows of output. Unfortunately, we don't know which average corresponds to which species. To get that, we add the `species` column to the `select` clause.
+ """)
+ return
+
+
+@app.cell
+def _(penguins):
+ _df = mo.sql(
+ f"""
+ select species, avg(body_mass_g) as avg_mass
+ from penguins
+ group by species;
+ """,
+ engine=engine
+ )
+ return
+
+
+@app.cell(hide_code=True)
+def _():
+ mo.md(r"""
+ And just as we can order data by multiple columns at once, we can group by multiple columns. When we do, we get one bucket for each unique combination of grouping values.
+ """)
+ return
+
+
+@app.cell
+def _(penguins):
+ _df = mo.sql(
+ f"""
+ select species, sex, avg(body_mass_g) as avg_mass
+ from penguins
+ group by species, sex;
+ """,
+ engine=engine
+ )
+ return
+
+
+@app.cell(hide_code=True)
+def _():
+ mo.md(r"""
+ We will explain what the blanks in the `sex` column mean in the next tutorial.
+ """)
+ return
+
+
+@app.cell(hide_code=True)
+def _():
+ mo.md(r"""
+ > How many penguins of each sex were found on each island?
+ """)
+ return
+
+
+@app.cell(hide_code=True)
+def _():
+ mo.md(r"""
+ > What is the difference in weight between the heaviest female penguin and the lightest female penguin within each species?
+ """)
+ return
+
+
+@app.cell(hide_code=True)
+def _():
+ mo.md(r"""
+ > Explain what the query below is calculating, and when its result would be useful.
+ >
+ > ```sql
+ > select round(body_mass_g/1000, 1) as weight, count(*)
+ > from penguins
+ > group by weight;
+ > ```
+ """)
+ return
+
+
+@app.cell(hide_code=True)
+def _():
+ mo.md(r"""
+ ## Arbitrary Choice in Aggregation
+
+ The query shown below is legal SQL, but probably not what anyone would want.
+ """)
+ return
+
+
+@app.cell
+def _(penguins):
+ _df = mo.sql(
+ f"""
+ select sex, species, body_mass_g
+ from penguins
+ group by species;
+ """,
+ engine=engine
+ )
+ return
+
+
+@app.cell(hide_code=True)
+def _():
+ mo.md(r"""
+ The rule that SQL follows is this: if we have created groups using `group by`, and we _don't_ specify how to combine the values in a group for a particular column, then the database picks one of the values for that column in that group arbitrarily. For example, since we only grouped by `species`, but we're asking to show `sex`, the result shows one of the values for `sex` for each species. Similarly, since we didn't specify how to combine the various body masses for each species, the three values shown each come from a penguin of that species, but we don't know (and can't control) which one.
+
+ We used this behavior earlier when we selected `species` and `avg(body_mass_g)` after grouping by `species`. Since all of the penguins within a group are of the same species, it doesn't matter which `species` value the database shows us for that group: they're all the same. If we forget to choose an aggregation function by accident, though, the answer will be plausible (because it's an actual value) but wrong.
+ """)
+ return
+
+
+@app.cell(hide_code=True)
+def _():
+ mo.md(r"""
+ ## Filtering After Aggregation
+
+ Just as we can use `where` to filter individual rows before aggregating (or if we're not aggregating at all), we can use `having` to filter aggregated values. For example, the query below finds those combinations of sex and species whose average weight is 4kg or more.
+ """)
+ return
+
+
+@app.cell
+def _(penguins):
+ _df = mo.sql(
+ f"""
+ select sex, species, avg(body_mass_g) as avg_mass
+ from penguins
+ group by sex, species
+ having avg_mass >= 4000.0;
+ """,
+ engine=engine
+ )
+ return
+
+
+@app.cell(hide_code=True)
+def _():
+ mo.md(r"""
+ > Explain what the query below is calculating.
+ >
+ > ```sql
+ > select max(flipper_length_mm) as long_flipper, species, sex
+ > from penguins
+ > where sex = 'FEMALE'
+ > group by species, sex
+ > having long_flipper > 210.0;
+ > ```
+ """)
+ return
+
+
+@app.cell(hide_code=True)
+def _():
+ mo.md(r"""
+ What we *can't* do with the tools we've seen so far is compare individual values to aggregates. For example, we can't use a query like the one below to find penguins that are heavier than average.
+ """)
+ return
+
+
+@app.cell
+def _(penguins):
+ _df = mo.sql(
+ f"""
+ select * from penguins
+ where body_mass_g > avg(body_mass_g);
+ """,
+ engine=engine
+ )
+ return
+
+
+@app.cell(hide_code=True)
+def _():
+ mo.md(r"""
+ We will see how to write this query in a couple of tutorials.
+ """)
+ return
+
+
+@app.cell(hide_code=True)
+def _():
+ mo.md(r"""
+ ## Check Understanding
+
+ 
+ """)
+ return
+
+
+@app.cell(hide_code=True)
+def _(FlashcardWidget, mo):
+ _widget = mo.ui.anywidget(
+ FlashcardWidget(
+ question="SQL Aggregation Functions",
+ cards=[
+ {"front": "avg(column)", "back": "Returns the average of all non-null values in the column"},
+ {"front": "count(*)", "back": "Counts the total number of rows, including rows with null values"},
+ {"front": "count(column)", "back": "Counts the number of non-null values in the column (rows with null are skipped)"},
+ {"front": "max(column)", "back": "Returns the largest non-null value in the column"},
+ {"front": "min(column)", "back": "Returns the smallest non-null value in the column"},
+ {"front": "sum(column)", "back": "Adds up all non-null values in the column"},
+ ],
+ )
+ )
+ _widget
+ return
+
+
+@app.cell(hide_code=True)
+def _(LabelingWidget, mo):
+ _widget = mo.ui.anywidget(
+ LabelingWidget(
+ question="Drag each label to the line of the query it best describes.",
+ labels=["aggregation function", "alias", "source table", "grouping column"],
+ text_lines=[
+ "select species, avg(body_mass_g) as avg_mass",
+ "from penguins",
+ "group by species;",
+ ],
+ correct_labels={0: [0, 1], 1: [2], 2: [3]},
+ )
+ )
+ _widget
+ return
+
+
+if __name__ == "__main__":
+ app.run()
diff --git a/sql/04_null.py b/sql/04_null.py
new file mode 100644
index 0000000000000000000000000000000000000000..beb11772c941454c95c03a139ddc8710fb16710a
--- /dev/null
+++ b/sql/04_null.py
@@ -0,0 +1,397 @@
+# /// script
+# requires-python = ">=3.13"
+# dependencies = [
+# "marimo",
+# "marimo-learn>=0.7.0",
+# "polars==1.24.0",
+# "sqlalchemy",
+# ]
+# ///
+import marimo
+
+__generated_with = "0.20.4"
+app = marimo.App(width="medium")
+
+
+@app.cell(hide_code=True)
+def _():
+ import marimo as mo
+ import marimo_learn as mol
+ import sqlalchemy
+
+ db_path = mol.localize_file("penguins.db")
+ DATABASE_URL = f"sqlite:///{db_path}"
+ engine = sqlalchemy.create_engine(DATABASE_URL)
+ return engine, mo, mol
+
+
+@app.cell(hide_code=True)
+def _():
+ from marimo_learn import ConceptMapWidget, MatchingWidget
+ return ConceptMapWidget, MatchingWidget
+
+
+@app.cell(hide_code=True)
+def _():
+ mo.md(r"""
+ # Missing Data
+
+ The biggest challenge people face when using databases isn't remembering the order of clauses in a SQL query. The biggest challenge is handling missing data. This tutorial builds on the filtering introduced in the previous one to show how to manage this in our queries.
+ """)
+ return
+
+
+@app.cell(hide_code=True)
+def _():
+ mo.md(r"""
+ ## Null
+
+ Here are all of the distinct combinations of island, species, and sex in the `penguins` table.
+ """)
+ return
+
+
+@app.cell
+def _(penguins):
+ _df = mo.sql(
+ f"""
+ select distinct island, species, sex from penguins;
+ """,
+ engine=engine
+ )
+ return
+
+
+@app.cell(hide_code=True)
+def _():
+ mo.md(r"""
+ Notice the two blanks in the `sex` column, and the fact that its subtitle says there are 3 unique values. Those blanks show the special value `null`, which SQL uses to mean "I don't know". In this case, those values tell us that the scientists who collected the penguins didn't record the sex of some of the Adelie penguins on Dream and Torgersen islands.
+
+ The most important thing about **null values** is that almost any operation that involves a `null` produces `null` as an answer. For example, we can use SQL as a very complicated desk calculator and ask, "What is 1 + 2?"
+ """)
+ return
+
+
+@app.cell
+def _():
+ _df = mo.sql(
+ f"""
+ select 1 + 2 as result;
+ """,
+ engine=engine
+ )
+ return
+
+
+@app.cell(hide_code=True)
+def _():
+ mo.md(r"""
+ If we ask, "What is 1 + `null`?", the answer is `null`, because one plus "I don't know" is "I don't know".
+ """)
+ return
+
+
+@app.cell
+def _():
+ _df = mo.sql(
+ f"""
+ select 1 + null as result;
+ """,
+ engine=engine
+ )
+ return
+
+
+@app.cell(hide_code=True)
+def _():
+ mo.md(r"""
+ We get the same thing if we subtract `null`, multiply by it, and so on. (As the saying goes, "Garbage in, garbage out.") We also get the same thing if we do comparisons. Is `null` equal to 3? Again, the answer is `null`.
+ """)
+ return
+
+
+@app.cell
+def _():
+ _df = mo.sql(
+ f"""
+ select null = 3 as result;
+ """,
+ engine=engine
+ )
+ return
+
+
+@app.cell(hide_code=True)
+def _():
+ mo.md(r"""
+ We get the same thing if we ask if `null` is *not* equal to 3, because if we don't know the value, we don't know if it *isn't* 3.
+ """)
+ return
+
+
+@app.cell
+def _():
+ _df = mo.sql(
+ f"""
+ select null != 3 as result;
+ """,
+ engine=engine
+ )
+ return
+
+
+@app.cell(hide_code=True)
+def _():
+ mo.md(r"""
+ What about `null = null`? If we have two numbers, and we don't know what either is, we don't know if they're the same or not, so the answer is once again `null`, *not* `true`.
+ """)
+ return
+
+
+@app.cell
+def _():
+ _df = mo.sql(
+ f"""
+ select null = null as result;
+ """,
+ engine=engine
+ )
+ return
+
+
+@app.cell(hide_code=True)
+def _():
+ mo.md(r"""
+ > 1. Where does SQL put `null` values when sorting: at the start, at the end, or somewhere else?
+ > 2. Does it follow the same rule for both numbers and text?
+ """)
+ return
+
+
+@app.cell(hide_code=True)
+def _():
+ mo.md(r"""
+ ## Aggregating Nulls
+
+ If 1 + `null` is `null`, then 1 + 2 + `null` should be `null` as well. Continuing this line of thought, the sum of a column that includes one or more nulls ought to be `null`; so should the `max`, `min`, and so on, because if we don't know all of the inputs, we can't know the output.
+
+ SQL isn't this strict because it wouldn't be useful. Instead, its aggregation functions ignore `null` values. If we calculate a sum, for example, we get the sum of all the numbers that we actually know. If we calculate an average, we get the sum of the known values divided by the number of known values (rather than by the total number of known and unknown values), and so on.
+
+ There is one exception to this rule. If we ask for `count(sex)` in the penguins database, we get the number of penguins whose sex is known:
+ """)
+ return
+
+
+@app.cell(hide_code=True)
+def _(penguins):
+ _df = mo.sql(
+ f"""
+ select count(sex) from penguins;
+ """,
+ engine=engine
+ )
+ return
+
+
+@app.cell(hide_code=True)
+def _():
+ mo.md(r"""
+ If we use `count(*)`, on the other hand, we get the total number of rows regardless of whether some values are `null` or not:
+ """)
+ return
+
+
+@app.cell
+def _(penguins):
+ _df = mo.sql(
+ f"""
+ select count(*) from penguins;
+ """,
+ engine=engine
+ )
+ return
+
+
+@app.cell(hide_code=True)
+def _():
+ mo.md(r"""
+ > Compare `sum(body_mass_g) / count(body_mass_g)` with `sum(body_mass_g) / count(*)` and with `avg(body_mass_g)`. Are the results consistent with the explanation above?
+ """)
+ return
+
+
+@app.cell(hide_code=True)
+def _():
+ mo.md(r"""
+ ## Handling Nulls
+
+ There are only two things we can do with `null` that don't produce `null` as a result: ask if a value is `null`, and ask if it isn't. If we're interested in the `sex` column, the first is written `sex is null`, while the second is written `sex is not null`. Note that `is null` and `is not null` are written as multiple words, but are a single test; it's confusing, but we're stuck with it.
+
+ Let's have a look at some practical examples. If we select the distinct values of `sex` from the `penguins` table, we get `"FEMALE"`, `"MALE"`, and `null`. (The first line of output is blank, which is how Marimo shows null values.)
+ """)
+ return
+
+
+@app.cell
+def _(penguins):
+ _df = mo.sql(
+ f"""
+ select distinct sex from penguins order by sex;
+ """,
+ engine=engine
+ )
+ return
+
+
+@app.cell(hide_code=True)
+def _():
+ mo.md(r"""
+ If we want to get all the rows that have a null value for `sex`, we *cannot* do this:
+ """)
+ return
+
+
+@app.cell
+def _(penguins):
+ _df = mo.sql(
+ f"""
+ select sex from penguins
+ where (sex != 'MALE') and (sex != 'FEMALE');
+ """,
+ engine=engine
+ )
+ return
+
+
+@app.cell(hide_code=True)
+def _():
+ mo.md(r"""
+ That doesn't produce any output because the rows with null values for `sex` don't pass the test. If we want the rows with missing sex, we have to ask for them explicitly. This query gives us 11 rows.
+ """)
+ return
+
+
+@app.cell
+def _(penguins):
+ _df = mo.sql(
+ f"""
+ select sex from penguins
+ where sex is null;
+ """,
+ engine=engine
+ )
+ return
+
+
+@app.cell(hide_code=True)
+def _():
+ mo.md(r"""
+ How many times did the scientists fail to record a penguin's mass or flipper length? The answer is "twice", and in both cases they didn't record *any* of the physical measurements.
+ """)
+ return
+
+
+@app.cell
+def _(penguins):
+ _df = mo.sql(
+ f"""
+ select * from penguins
+ where (body_mass_g is null) or (flipper_length_mm is null);
+ """,
+ engine=engine
+ )
+ return
+
+
+@app.cell(hide_code=True)
+def _():
+ mo.md(r"""
+ > 1. Write a query to find penguins whose body mass is known but whose sex is not.
+ > 2. Write another query to find penguins whose sex is known but whose body mass is not.
+ """)
+ return
+
+
+@app.cell(hide_code=True)
+def _():
+ mo.md(r"""
+ > Explain why the query shown earlier (and reproduced below) does not produce any rows:
+ >
+ > ```sql
+ > select sex from penguins
+ > where (sex != 'MALE') and (sex != 'FEMALE');
+ > ```
+ """)
+ return
+
+
+@app.cell(hide_code=True)
+def _():
+ mo.md(r"""
+ Some programmers find `null` very annoying. Instead of putting it in their tables, they use marker values like -1 or `"NA"` to signal missing data. Doing this almost always leads to problems. For example, if we are calculating the average age of people who are 17, 19, 21, and an unknown number of years old, the sensible thing to do is add the values we know (the 17, 19, and 21) and then divide by 3. As we saw earlier in this tutorial, SQL will do this for us automatically _if_ we have used `null` to represent the unknown age. If we use -1, on the other hand, it's all too easy to calculate (17 + 19 + 21 - 1) / 4 and get an average age of 14. We could use `where` to filter out the -1 ages before doing the sum, but (a) we'd have to know to do that and (b) we'd have to know that this programmer used -1 instead of -999999 or something else to mean "I don't know". While it takes a bit of getting used to, it's (almost) always better to use `null` when there are holes in our data.
+ """)
+ return
+
+
+@app.cell(hide_code=True)
+def _():
+ mo.md(r"""
+ ## Ternary Logic
+
+ These tutorials avoid theory when they can, but a little bit will help understand how `null` works. In conventional logic, a statement is either true or false. If we have two statements `A` and `B`, then `A and B` is true when both are true, while `A or B` is true if either or both are true. These rules are sometimes referred to as **binary logic** (also called **Boolean logic**) because there are only two possible values.
+
+ SQL is unusual among programming languages in using **ternary logic**, in which any statement can be true, false, or null. Since `null` is not `true`, `where` drops rows if the filter expression produces `null`.
+ """)
+ return
+
+
+@app.cell(hide_code=True)
+def _():
+ mo.md(r"""
+ ## Check Understanding
+
+ 
+ """)
+ return
+
+
+@app.cell(hide_code=True)
+def _(MatchingWidget, mo):
+ _widget = mo.ui.anywidget(
+ MatchingWidget(
+ question="Match each SQL expression involving null to its result.",
+ left=["1 + null", "null = null", "null is null", "null != 3"],
+ right=[
+ "null β arithmetic with an unknown is unknown",
+ "null β comparing unknowns yields unknown, not true",
+ "true β the only test that reliably works on null",
+ "null β even inequality checks return unknown for null",
+ ],
+ correct_matches={0: 0, 1: 1, 2: 2, 3: 3},
+ )
+ )
+ _widget
+ return
+
+
+@app.cell(hide_code=True)
+def _(ConceptMapWidget, mo):
+ _widget = mo.ui.anywidget(
+ ConceptMapWidget(
+ question="Connect these null-related concepts by selecting a relationship term and clicking two concepts.",
+ concepts=["null", "unknown value", "is null", "ternary logic", "aggregation functions"],
+ terms=["means", "tested with", "uses", "ignore"],
+ correct_edges=[
+ {"from": "null", "to": "unknown value", "label": "means"},
+ {"from": "null", "to": "is null", "label": "tested with"},
+ {"from": "ternary logic", "to": "null", "label": "uses"},
+ {"from": "aggregation functions", "to": "null", "label": "ignore"},
+ ],
+ )
+ )
+ _widget
+ return
+
+
+if __name__ == "__main__":
+ app.run()
diff --git a/sql/05_join.py b/sql/05_join.py
new file mode 100644
index 0000000000000000000000000000000000000000..fc1ad08cb14401cb769c415e6e268a3009a48c58
--- /dev/null
+++ b/sql/05_join.py
@@ -0,0 +1,317 @@
+# /// script
+# requires-python = ">=3.13"
+# dependencies = [
+# "marimo",
+# "marimo-learn>=0.7.0",
+# "polars==1.24.0",
+# "sqlalchemy",
+# ]
+# ///
+import marimo
+
+__generated_with = "0.20.4"
+app = marimo.App(width="medium")
+
+
+@app.cell(hide_code=True)
+def _():
+ import marimo as mo
+ import marimo_learn as mol
+ import sqlalchemy
+
+ db_path = mol.localize_file("lab.db")
+ DATABASE_URL = f"sqlite:///{db_path}"
+ engine = sqlalchemy.create_engine(DATABASE_URL)
+ return engine, mo, mol
+
+
+@app.cell(hide_code=True)
+def _():
+ from marimo_learn import LabelingWidget, OrderingWidget
+ return LabelingWidget, OrderingWidget
+
+
+@app.cell(hide_code=True)
+def _():
+ mo.md(r"""
+ # Combining Tables
+
+ Relational databases get their name from the fact that they store the relations between tables. This tutorial shows how to connect and combine information from multiple tables. We will save most of the exercises for the next tutorial, where we start working with our first complex database.
+ """)
+ return
+
+
+@app.cell(hide_code=True)
+def _():
+ mo.md(r"""
+ ## Basic Joins
+
+ The `lab` database has two tables. The first, called `job`, shows the credits that students can earn doing different kinds of jobs, and has two rows and two columns:
+
+ | name | credits |
+ | :--- | ------: |
+ | calibrate | 1.5 |
+ | clean | 0.5 |
+
+ The other table, `work`, keeps track of who has done which jobs:
+
+ | person | job |
+ | :----- | :-- |
+ | Amal | calibrate |
+ | Amal | clean |
+ | Amal | complain |
+ | Gita | clean |
+ | Gita | clean |
+ | Gita | complain |
+ | Madhi | complain |
+
+ We want to know how many credits each student has earned. The first step in answering this is to **join** the tables together.
+ """)
+ return
+
+
+@app.cell
+def _(job, work):
+ _df = mo.sql(
+ f"""
+ select *
+ from job join work;
+ """,
+ engine=engine
+ )
+ return
+
+
+@app.cell(hide_code=True)
+def _():
+ mo.md(r"""
+ The `join` operation creates a temporary table in memory by combining every row of `job` with every row of `work`. Since `job` has two rows and `work` has seven, the temporary table has 2Γ7=14 rows.
+
+ Some of these rows are useful: the first, for example, tells us that Amal did some calibration, and that calibrating is worth 1.5 credits. The second, however, combines information about calibrating with the fact that Amal did some cleaning. We can get rid of the rows that aren't useful by filtering with `where`.
+ """)
+ return
+
+
+@app.cell
+def _(job, work):
+ _df = mo.sql(
+ f"""
+ select *
+ from job join work
+ where job.name = work.job;
+ """,
+ engine=engine
+ )
+ return
+
+
+@app.cell(hide_code=True)
+def _():
+ mo.md(r"""
+ This query demonstrates two things:
+
+ 1. When we are working with two or more tables, we refer to columns using `table_name.column_name`, as in `job.name` or `work.job`. We don't absolutely need to do this in this query, since columns' names are all unique, but it's very common to have columns with the same names in different tables. In those cases the two-part names are required to avoid ambiguity; it is therefore good practice to *always* use two-part names when working with multiple tables.
+ 2. There isn't an entry in `job` for `complain`, so `job.name = work.job` isn't true for any of the combined rows that involve complaining. On the other hand, Gita cleaned up the lab twice, so there are two records in the result for that. This shows that `join` doesn't automatically remove duplicates.
+
+ While we can use `where`, the SQL standard encourages us to use a different keyword `on`:
+ """)
+ return
+
+
+@app.cell
+def _(job, work):
+ _df = mo.sql(
+ f"""
+ select *
+ from job join work
+ on job.name = work.job;
+ """,
+ engine=engine
+ )
+ return
+
+
+@app.cell(hide_code=True)
+def _():
+ mo.md(r"""
+ Many years ago, using `on` sometimes gave slightly higher performance. Today, though, the two forms are equivalent from the database manager's point of view. Many people still prefer `on` for readability: it shows how the rows are being combined, while `where` shows how combined rows are being filtered. As with almost everything in programming, what matters most is to pick one and stick to it so that your queries are consistent.
+
+ The standard also encourages us to write our join as `inner join`, because as we will see in a moment, other kinds of joins exist. People often skip this and just write `join`, or even use a simple comma between the table names, but from now on we will be pedantic to make what we're doing clearer.
+
+ We are now able to answer our original question: how many credits has each student earned?
+ """)
+ return
+
+
+@app.cell
+def _(job, work):
+ _df = mo.sql(
+ f"""
+ select work.person, sum(job.credits) as total -- add up the credits for each person
+ from job inner join work -- notice: inner join
+ on job.name = work.job
+ group by work.person; -- put all the credits for each person into a separate bucket
+ """,
+ engine=engine
+ )
+ return
+
+
+@app.cell(hide_code=True)
+def _():
+ mo.md(r"""
+ ## Left Joins
+
+ The query above shows us how many credits Amal and Gita have earned, but doesn't show anything for Madhi. Ideally, we'd like a row showing that she has earned zero credits. To get this, we need to use a different kind of join called a **left join**. A left join is created by following these rules:
+
+ 1. If the row from the left-hand table matches one or more rows from the right-hand table, combine them as an inner join would.
+ 2. If the row from the left-hand table _doesn't_ match any rows from the right-hand table, create one row in the result with the values from the left row and `null` where the values from the right-hand table would be.
+
+ An example will make this clearer.
+ """)
+ return
+
+
+@app.cell
+def _(job, work):
+ _df = mo.sql(
+ f"""
+ select *
+ from work left join job
+ on work.job = job.name;
+ """,
+ engine=engine
+ )
+ return
+
+
+@app.cell(hide_code=True)
+def _():
+ mo.md(r"""
+ Let's trace this query's execution step by step:
+
+ 1. The `(Amal, calibrate)` row from `work` matches the `(calibrate, 1.5)` row from `job`, so that is the first row of output.
+ 2. Similarly, the `(Amal, clean)` row matches the `(clean, 0.5)` row, so we get the second row of output.
+ 3. But `(Amal, complain)` _doesn't_ match anything from `job`, so we get a row with the values from the left table (`Amal` and `complain`) and `null` for `name` and `credits`.
+ 4. We then get two rows for Gita cleaning because there's a matchβ¦
+ 5. β¦and two rows with `null` values for Gita and Madhi complaining because there isn't.
+ """)
+ return
+
+
+@app.cell(hide_code=True)
+def _():
+ mo.md(r"""
+ > What do we get if we invert the order of the tables, i.e., do `job left join work`? Why?
+ """)
+ return
+
+
+@app.cell(hide_code=True)
+def _():
+ mo.md(r"""
+ ## Coalesce
+
+ We can now sum up everyone's credits:
+ """)
+ return
+
+
+@app.cell
+def _(job, work):
+ _df = mo.sql(
+ f"""
+ select work.person, sum(job.credits) as total
+ from work left join job -- notice: left join
+ on work.job = job.name
+ group by work.person;
+ """,
+ engine=engine
+ )
+ return
+
+
+@app.cell(hide_code=True)
+def _():
+ mo.md(r"""
+ This is *almost* what we want: we have a row for Madhi, but her `total` is `null` because that's what `sum` produces when all of the values it's adding up are `null`. We can fix this using a built-in SQL function called `coalesce`:
+ """)
+ return
+
+
+@app.cell
+def _(job, work):
+ _df = mo.sql(
+ f"""
+ select
+ work.person,
+ coalesce(sum(job.credits), 0) as total
+ from
+ work left join job
+ on
+ work.job = job.name
+ group by
+ work.person;
+ """,
+ engine=engine
+ )
+ return
+
+
+@app.cell(hide_code=True)
+def _():
+ mo.md(r"""
+ `coalesce` takes two (or more) inputs and returns the first one that is not `null`. If the first is not `null`, `coalesce` returns that. If the first value *is* `null`, on the other hand, `coalesce` moves on to its second input. In simpler terms, it gives us a value or a default if the value is `null`.
+
+ Note that we have split this query across several lines with the keywords at the left margin and the parts of the query that belong to them indented below them. As our queries become more complex, this style makes them easier to read. As with `join` versus `inner join`, the most important thing is to be consistent so that the reader isn't distracted by stylistic differences.
+ """)
+ return
+
+
+@app.cell(hide_code=True)
+def _():
+ mo.md(r"""
+ ## Check Understanding
+
+ 
+ """)
+ return
+
+
+@app.cell(hide_code=True)
+def _(OrderingWidget, mo):
+ _widget = mo.ui.anywidget(
+ OrderingWidget(
+ question="Arrange the steps SQL follows when executing an INNER JOIN.",
+ items=[
+ "Combine every row from the left table with every row from the right table",
+ "Apply the ON condition to keep only matching row pairs",
+ "Apply any WHERE clause to filter the matched rows further",
+ "Apply SELECT to return only the requested columns",
+ ],
+ )
+ )
+ _widget
+ return
+
+
+@app.cell(hide_code=True)
+def _(LabelingWidget, mo):
+ _widget = mo.ui.anywidget(
+ LabelingWidget(
+ question="Drag each label to the line of the query it best describes.",
+ labels=["left table", "right table", "join condition", "fallback for null"],
+ text_lines=[
+ "from work left join job",
+ "on work.job = job.name",
+ "coalesce(sum(job.credits), 0) as total",
+ ],
+ correct_labels={0: [0, 1], 1: [2], 2: [3]},
+ )
+ )
+ _widget
+ return
+
+
+if __name__ == "__main__":
+ app.run()
diff --git a/sql/06_keys.py b/sql/06_keys.py
new file mode 100644
index 0000000000000000000000000000000000000000..bc5b33fa571a268bdfa2ef508c2fc1a8c19e3ec5
--- /dev/null
+++ b/sql/06_keys.py
@@ -0,0 +1,483 @@
+# /// script
+# requires-python = ">=3.13"
+# dependencies = [
+# "marimo",
+# "marimo-learn>=0.7.0",
+# "polars==1.24.0",
+# "sqlalchemy",
+# ]
+# ///
+import marimo
+
+__generated_with = "0.20.4"
+app = marimo.App(width="medium")
+
+
+@app.cell(hide_code=True)
+def _():
+ import marimo as mo
+ import marimo_learn as mol
+ import sqlalchemy
+
+ db_path = mol.localize_file("survey.db")
+ DATABASE_URL = f"sqlite:///{db_path}"
+ engine = sqlalchemy.create_engine(DATABASE_URL)
+ return engine, mo, mol
+
+
+@app.cell(hide_code=True)
+def _():
+ from marimo_learn import ConceptMapWidget, FlashcardWidget
+ return ConceptMapWidget, FlashcardWidget
+
+
+@app.cell(hide_code=True)
+def _():
+ mo.md(r"""
+ # Primary and Foreign Keys
+
+ The previous tutorial explained how to combine information from two tables using `inner join` and `left join`. This tutorial will explain how we can tell when it makes sense to do this, and introduce our first complex database. To start, let's look at a diagram showing the four tables in the `survey` database.
+
+ 
+ """)
+ return
+
+
+@app.cell(hide_code=True)
+def _():
+ mo.md(r"""
+    Let's start with `person`, which has four columns: `person_id`, `personal`, `family`, and `supervisor_id` (which we will discuss in the next section). `person_id` is shown in __*bold italics*__ to indicate that it is the table's **primary key**: each row in the table has a non-`null` `person_id`, and each of those values is unique. These values can therefore be used to uniquely identify specific rows in the table. We can check that by selecting all of the people and inspecting the `person_id` values by eye:
+ """)
+ return
+
+
+@app.cell
+def _(person):
+ _df = mo.sql(
+ f"""
+ select person_id from person;
+ """,
+ engine=engine
+ )
+ return
+
+
+@app.cell(hide_code=True)
+def _():
+ mo.md(r"""
+ A better way is to count the number of rows in the table, the number of non-`null` `person_id` values, and the number of distinct person ID values. Remember, `count(*)` counts rows, while `count(column_name)` counts the number of non-`null` values in that particular column. We haven't seen `count(distinct column_name)` before, but as you might guess, it counts the number of distinct values in a particular column.
+ """)
+ return
+
+
+@app.cell
+def _(person):
+ _df = mo.sql(
+ f"""
+ select
+ count(*) as num_rows,
+ count(person_id) as num_non_null,
+ count(distinct person_id) as num_distinct
+ from person;
+ """,
+ engine=engine
+ )
+ return
+
+
+@app.cell(hide_code=True)
+def _():
+ mo.md(r"""
+ Now let's take a look at the `survey` table. Each survey has a survey ID, the ID of the person who did the survey, and the survey's start and end dates. `survey_id` is in __*bold italics*__, which tells us that each survey has a unique ID. `person_id`, on the other hand, is just in *italics*, and there's an arrow connecting it to the `person` table's primary key, which is also called `person_id`. The use of italics and the arrow signals that `survey.person_id` is a **foreign key**, i.e., a value stored in one table that references the primary key of another table. This relationship tells us that:
+
+ 1. It makes sense to use `survey.person_id = person.person_id` as a condition in a join because every `survey.person_id` is guaranteed to refer to an existing `person.person_id`.
+ 2. Several surveys might refer to the same person (or equivalently, one person might have done several surveys). This is called a **one-to-many relationship**.
+
+ Let's write some queries. Who is in the `person` table?
+ """)
+ return
+
+
+@app.cell
+def _(person):
+ _df = mo.sql(
+ f"""
+ select * from person;
+ """,
+ engine=engine
+ )
+ return
+
+
+@app.cell(hide_code=True)
+def _():
+ mo.md(r"""
+    How many surveys has Ascensión Sierra done? Her `person_id` is `P001`, so we can answer the question by filtering the `survey` table and then aggregating.
+ """)
+ return
+
+
+@app.cell
+def _(survey):
+ _df = mo.sql(
+ f"""
+ select count(*) as num_surveys from survey
+ where person_id = 'P001';
+ """,
+ engine=engine
+ )
+ return
+
+
+@app.cell(hide_code=True)
+def _():
+ mo.md(r"""
+    What if we want Ascensión's name displayed along with this count? To get that, we need to join the tables.
+ """)
+ return
+
+
+@app.cell
+def _(person, survey):
+ _df = mo.sql(
+ f"""
+ select person.personal, person.family, count(*)
+ from person join survey
+ on person.person_id = survey.person_id
+ where person.person_id = 'P001';
+ """,
+ engine=engine
+ )
+ return
+
+
+@app.cell(hide_code=True)
+def _():
+ mo.md(r"""
+    What if we want to get Ascensión's full name in a single column? We can do that by concatenating her personal and family name using the `||` operator (which is sometimes called "glue"). As the output of the query below shows, `||` does for text what `+` does for numbers.
+ """)
+ return
+
+
+@app.cell
+def _(person, survey):
+ _df = mo.sql(
+ f"""
+ select person.personal || person.family as full_name, count(*)
+ from person join survey
+ on person.person_id = survey.person_id
+ where person.person_id = 'P001';
+ """,
+ engine=engine
+ )
+ return
+
+
+@app.cell(hide_code=True)
+def _():
+ mo.md(r"""
+    Whoops: we probably want a space between Ascensión's personal and family names, so we will glue her personal name to a space and then glue that to her family name (just as we would write 1 + 2 + 3).
+ """)
+ return
+
+
+@app.cell
+def _(person, survey):
+ _df = mo.sql(
+ f"""
+ select person.personal || ' ' || person.family as full_name, count(*)
+ from person join survey
+ on person.person_id = survey.person_id
+ where person.person_id = 'P001';
+ """,
+ engine=engine
+ )
+ return
+
+
+@app.cell(hide_code=True)
+def _():
+ mo.md(r"""
+ Now, what if we want the number of surveys done by each person ordered by family and personal name?
+ """)
+ return
+
+
+@app.cell
+def _(person, survey):
+ _df = mo.sql(
+ f"""
+ select person.personal || ' ' || person.family as full_name, count(*)
+ from person join survey
+ on person.person_id = survey.person_id
+ group by person.person_id
+ order by person.family, person.personal;
+ """,
+ engine=engine
+ )
+ return
+
+
+@app.cell(hide_code=True)
+def _():
+ mo.md(r"""
+ Notice that "Γguila" (with an acute accent) comes after "Sierra". Correcting this mistake is out of the scope of this tutorial, but can be done by installing the [International Components for Unicode](https://icu.unicode.org/) and writing the query like this:
+
+ ```sql
+ select * from person order by family, personal collate 'es_ES';
+ ```
+ """)
+ return
+
+
+@app.cell(hide_code=True)
+def _():
+ mo.md(r"""
+ > When did the earliest survey done by each person start?
+ """)
+ return
+
+
+@app.cell(hide_code=True)
+def _():
+ mo.md(r"""
+ > Which people have done 17 or more surveys?
+ """)
+ return
+
+
+@app.cell(hide_code=True)
+def _():
+ mo.md(r"""
+    > Just as `sum` adds up all the values in a column, `group_concat` concatenates all the text in a column. For example, if the column is called `name`, then `select group_concat(name, ':')` joins all the values in `name` with colons. Use this to write a query that generates two columns: a person's full name, and a comma-separated list of the IDs of the surveys that person has done.
+ """)
+ return
+
+
+@app.cell(hide_code=True)
+def _():
+ mo.md(r"""
+ > Explain what the following query produces and why.
+ >
+ > ```sql
+ > select person.personal || ' ' || person.family
+ > from person left join survey
+ > on person.person_id = survey.person_id
+ > where survey.survey_id is null;
+ > ```
+ """)
+ return
+
+
+@app.cell(hide_code=True)
+def _():
+ mo.md(r"""
+ ## Self-Joins
+
+ As a reminder, here's the structure of the survey database.
+ """)
+ return
+
+
+@app.cell
+def _():
+ mo.image(src="./public/survey_tables.svg", alt="table diagram of survey database")
+ return
+
+
+@app.cell(hide_code=True)
+def _():
+ mo.md(r"""
+ Notice that the `person` table has a foreign key called `supervisor_id` that refers back to the table's own primary key, `person_id`. This relationship makes sense: supervisors are people, so they're stored in the same table as everyone else. However, if we want to generate a list of people's names and their supervisors' names, we _can't_ just join `person` to `person`.
+ """)
+ return
+
+
+@app.cell
+def _(person):
+ _df = mo.sql(
+ f"""
+ select *
+ from person inner join person
+ on person.person_id = person.supervisor_id;
+ """,
+ engine=engine
+ )
+ return
+
+
+@app.cell(hide_code=True)
+def _():
+ mo.md(r"""
+ The problem is that `person.person_id` and `person.supervisor_id` are ambiguous: are we referring to the left-hand use of the `person` table or the right-hand use? To resolve this, we give each copy of the table an **alias** using `as`, just as we gave columns names using `as`. We also have to specify the columns that we want using two-part `table.column` notation.
+ """)
+ return
+
+
+@app.cell
+def _(person):
+ _df = mo.sql(
+ f"""
+ select
+ pa.personal as pa_personal,
+ pa.family as pa_family,
+ pb.personal as pb_personal,
+ pb.family as pb_family
+ from person pa join person pb
+ on pa.person_id = pb.supervisor_id;
+ """,
+ engine=engine
+ )
+ return
+
+
+@app.cell(hide_code=True)
+def _():
+ mo.md(r"""
+ Joining a table to itself is called a **self join**. The hard part is figuring out whether `pa` is the minion and `pb` is the supervisor or vice versa. The logic is that the supervisor of person `pb` is person `pa`, which means `pa` is the supervisor and `pb` is the minion. (Alternatively, we can inspect the first couple of rows, check back against the `person` table, and decide that way.) Let's rewrite the query to show the relationship explicitly.
+ """)
+ return
+
+
+@app.cell
+def _(person):
+ _df = mo.sql(
+ f"""
+ select
+ pa.personal || ' ' || pa.family as supervisor,
+ pb.personal || ' ' || pb.family as minion
+ from person pa join person pb
+ on pa.person_id = pb.supervisor_id
+ order by pa.family, pa.personal;
+ """,
+ engine=engine
+ )
+ return
+
+
+@app.cell(hide_code=True)
+def _():
+ mo.md(r"""
+ > Write a query that finds the full names of everyone who doesn't have a supervisor. (Hint: you do not need to use a `join`.)
+ """)
+ return
+
+
+@app.cell(hide_code=True)
+def _():
+ mo.md(r"""
+ > Write a query to find all the people who supervise someone who supervises someone. (Hint: you will need to join three copies of `person` to get the person, their boss, and their grand-boss.)
+ """)
+ return
+
+
+@app.cell(hide_code=True)
+def _():
+ mo.md(r"""
+ ## Many-to-Many Relationships
+
+ Each survey is done by one person, which means that people have a one-to-many relationship with surveys. However, any number of people can have ratings for any number of machines and vice versa, which means these two tables have a **many-to-many relationship**. These relationships can be hard to express in a table: if, for example, we knew that people never have ratings for more than three machines, we could add `machine_1`, `machine_2`, and `machine_3` columns to `person`, but (a) we would have to check several columns if we wanted to find a particular machine, and (b) we would have to redesign our table if the rules changed and people could have ratings for four or five machines.
+
+ A better approach is to create another intermediate table that stores the relationship between the two tables we're interested in. Such a table is sometimes called a **join table** because its main purpose is to allow us to join two other tables. The `rating` table in our database is an example of a join table. Each row stores a foreign key into `person` and a foreign key into `machine`, which shows that the person has some relationship to the machine. The table also stores `level`, which is the actual rating (or `null`), but it is quite common for join tables to only store pairs of foreign keys.
+
+ So, which people have ratings for which machines?
+ """)
+ return
+
+
+@app.cell
+def _(machine, person, rating):
+ _df = mo.sql(
+ f"""
+ select
+ person.personal, person.family, machine.machine_type, rating.level
+ from
+ person join rating join machine
+ on
+ person.person_id = rating.person_id
+ and
+ rating.machine_id = machine.machine_id
+ where
+ rating.level is not null
+ order by
+ person.family, person.personal, machine.machine_type
+ ;
+ """,
+ engine=engine
+ )
+ return
+
+
+@app.cell(hide_code=True)
+def _():
+ mo.md(r"""
+ > Which people have a level of 3 or more on at least one machine?
+ """)
+ return
+
+
+@app.cell(hide_code=True)
+def _():
+ mo.md(r"""
+ > Write a query that generates a comma-separated list of the machines that Asensio Amaya is rated on, even if the level is `null`. (Hint: use `group_concat`.)
+ """)
+ return
+
+
+@app.cell(hide_code=True)
+def _():
+ mo.md(r"""
+ > Many of the `level` values in `rating` are `null`. What do you think this might mean?
+ """)
+ return
+
+
+@app.cell(hide_code=True)
+def _():
+ mo.md(r"""
+ ## Check Understanding
+
+ 
+ """)
+ return
+
+
+@app.cell(hide_code=True)
+def _(FlashcardWidget, mo):
+ _widget = mo.ui.anywidget(
+ FlashcardWidget(
+ question="Database Key and Relationship Concepts",
+ cards=[
+ {"front": "Primary key", "back": "A column (or set of columns) whose values are unique and non-null for every row, used to uniquely identify each row in a table"},
+ {"front": "Foreign key", "back": "A column in one table whose values reference the primary key of another table, establishing a link between the two tables"},
+ {"front": "One-to-many relationship", "back": "A relationship where one row in table A can be referenced by many rows in table B β e.g., one person can have many surveys"},
+ {"front": "Many-to-many relationship", "back": "A relationship where rows in table A can relate to many rows in table B and vice versa β requires a join table to represent"},
+ {"front": "Join table", "back": "An intermediate table storing pairs of foreign keys to represent a many-to-many relationship between two other tables"},
+ {"front": "Self-join", "back": "Joining a table to itself using two aliases, used when rows in a table relate to other rows in the same table (e.g., supervisors and employees)"},
+ ],
+ )
+ )
+ _widget
+ return
+
+
+@app.cell(hide_code=True)
+def _(ConceptMapWidget, mo):
+ _widget = mo.ui.anywidget(
+ ConceptMapWidget(
+ question="Connect these database design concepts by selecting a relationship term and clicking two concepts.",
+ concepts=["primary key", "foreign key", "one-to-many", "many-to-many", "join table"],
+ terms=["referenced by", "implemented with", "requires"],
+ correct_edges=[
+ {"from": "primary key", "to": "foreign key", "label": "referenced by"},
+ {"from": "many-to-many", "to": "join table", "label": "implemented with"},
+ {"from": "one-to-many", "to": "foreign key", "label": "requires"},
+ ],
+ )
+ )
+ _widget
+ return
+
+
+if __name__ == "__main__":
+ app.run()
diff --git a/sql/index.md b/sql/index.md
new file mode 100644
index 0000000000000000000000000000000000000000..85ad2de4ccbf63a92a74c0f11ea6ff6df8f8a99f
--- /dev/null
+++ b/sql/index.md
@@ -0,0 +1,14 @@
+---
+title: Learn SQL
+description: >
+ Learn the basics of SQL, the industry standard for interacting
+ with relational databases. These notebooks also show how easy
+ it is to work with relational data in marimo.
+tracking: 133
+---
+
+## Contributors
+
+Thanks to our notebook authors:
+
+* [Greg Wilson](https://github.com/gvwilson)
diff --git a/sql/public/01_concepts.svg b/sql/public/01_concepts.svg
new file mode 100644
index 0000000000000000000000000000000000000000..2b88ebd9191d488f8273ebd69910aa53d15a5c41
--- /dev/null
+++ b/sql/public/01_concepts.svg
@@ -0,0 +1,4 @@
+
+
+
+
\ No newline at end of file
diff --git a/sql/public/02_concepts.svg b/sql/public/02_concepts.svg
new file mode 100644
index 0000000000000000000000000000000000000000..0a4576b374c69132a57ef6d9cbd4982f7885132d
--- /dev/null
+++ b/sql/public/02_concepts.svg
@@ -0,0 +1,4 @@
+
+
+
+
\ No newline at end of file
diff --git a/sql/public/03_concepts.svg b/sql/public/03_concepts.svg
new file mode 100644
index 0000000000000000000000000000000000000000..6374a0d2f8434d997ec52110925b2ede75a2f47a
--- /dev/null
+++ b/sql/public/03_concepts.svg
@@ -0,0 +1,4 @@
+
+
+
+
\ No newline at end of file
diff --git a/sql/public/04_concepts.svg b/sql/public/04_concepts.svg
new file mode 100644
index 0000000000000000000000000000000000000000..47e22aa800504ea601c692602eecef1bbf027dd6
--- /dev/null
+++ b/sql/public/04_concepts.svg
@@ -0,0 +1,4 @@
+
+
+
+
\ No newline at end of file
diff --git a/sql/public/05_concepts.svg b/sql/public/05_concepts.svg
new file mode 100644
index 0000000000000000000000000000000000000000..cb625864aed67ae1de3a41e04ab4a9ae6769490b
--- /dev/null
+++ b/sql/public/05_concepts.svg
@@ -0,0 +1,4 @@
+
+
+
+
\ No newline at end of file
diff --git a/sql/public/06_concepts.svg b/sql/public/06_concepts.svg
new file mode 100644
index 0000000000000000000000000000000000000000..8885b7393a466e597d84db8e99a96863aea4417a
--- /dev/null
+++ b/sql/public/06_concepts.svg
@@ -0,0 +1,4 @@
+
+
+
+
\ No newline at end of file
diff --git a/sql/public/lab.db b/sql/public/lab.db
new file mode 100644
index 0000000000000000000000000000000000000000..63514ababaf069406baf4b5d2d2b8ad1c8152d9a
Binary files /dev/null and b/sql/public/lab.db differ
diff --git a/sql/public/penguins.db b/sql/public/penguins.db
new file mode 100644
index 0000000000000000000000000000000000000000..2e43ca6e97fd73b6dd287e4ba62f60c26916709d
Binary files /dev/null and b/sql/public/penguins.db differ
diff --git a/sql/public/survey.db b/sql/public/survey.db
new file mode 100644
index 0000000000000000000000000000000000000000..d5f3f859705f958ea1c3a81510c82b0a2b2416d8
Binary files /dev/null and b/sql/public/survey.db differ
diff --git a/sql/public/survey_tables.svg b/sql/public/survey_tables.svg
new file mode 100644
index 0000000000000000000000000000000000000000..b7ea2d9981aed208c941651542f0bcfcbd799859
--- /dev/null
+++ b/sql/public/survey_tables.svg
@@ -0,0 +1,4 @@
+
+
+
+
\ No newline at end of file
diff --git a/templates/base.html b/templates/base.html
new file mode 100644
index 0000000000000000000000000000000000000000..c371441c103aabea653568b9b5e5cc67499b9b3b
--- /dev/null
+++ b/templates/base.html
@@ -0,0 +1,47 @@
+
+
+
+
+
+ {% block title %}Marimo Learn{% endblock %}
+
+
+
+
+
+
+
+{% block content %}{% endblock %}
+
+{% include "contribute.html" %}
+
+
+
+
+
+
diff --git a/templates/contribute.html b/templates/contribute.html
new file mode 100644
index 0000000000000000000000000000000000000000..cb2777fe7992e3dcdf56194b12daf51d34c64618
--- /dev/null
+++ b/templates/contribute.html
@@ -0,0 +1,10 @@
+
+
+
Want to Contribute?
+
Help us improve these learning materials by contributing to the GitHub repository. We welcome new content, bug fixes, and improvements!