Baby Names - SSA database
Social Security Administration's baby names database
In this cookbook recipe, you will:
- Download the full
`babynames_yob`babynames_yobdata from the Social Security Administration website. - ingest all the CSV files at once using DuckDB.
- Create and populate rank and popularity for each year/sex combination.
- See how to export the data to popular file formats CSV and parquet
- See a few analyses queries
- See the data in action in a Shiny R app
Download the names.zip file from catalog.data.gov website
I saved mine into `D:\opendata\SSA\. Just change line 11 below if you save it elsewhere.babynames_yob_2024`babynames_yob_2024
-- DROP TABLE IF EXISTS babynames;
CREATE TABLE babynames AS
SELECT
name,
sex,
cnt,
-- filename() gives full path → split out the year
regexp_extract(filename, 'yob([0-9]{4})', 1)::INT AS year
FROM read_csv(
'D:/opendata/SSA/babynames_yob_2024/yob*.txt',
columns = {
'name': 'TEXT',
'sex': 'VARCHAR',
'cnt': 'INT'
},
header = false,
delim = ',',
auto_detect = false,
filename = true -- << important
);
Add rank (rnk) and popularity_index columns and populate them.
-- ================================================================================
-- Add rnk and popularity_index
ALTER TABLE main.babynames ADD COLUMN rnk INT NULL;
ALTER TABLE main.babynames ADD COLUMN popularity_index DOUBLE NULL;
-- ================================================================================
-- populate rank
UPDATE babynames AS b
SET rnk = x.rnk
FROM (
SELECT name, sex, year,
ROW_NUMBER() OVER (PARTITION BY year, sex ORDER BY cnt DESC) AS rnk
FROM babynames
) AS x
WHERE b.name = x.name
AND b.sex = x.sex
AND b.year = x.year;
-- ================================================================================
-- populate popularity_index
UPDATE babynames AS b
SET popularity_index = sub.popularity_index
FROM (
SELECT name, sex, year,
100.0 * cnt / MAX(cnt) OVER (PARTITION BY sex, year) AS popularity_index
FROM babynames
) AS sub
WHERE b.name = sub.name
AND b.sex = sub.sex
AND b.year = sub.year;
Export to files for use with other tools
-- Export to csv and parquet
COPY babynames TO 'D:/opendata/SSA/babynames_yob_2024/babynames.csv' (FORMAT CSV, HEADER TRUE, DELIMITER ',', QUOTE '"');
COPY babynames TO 'D:/opendata/SSA/babynames_yob_2024/babynames.parquet' (FORMAT PARQUET, COMPRESSION 'ZSTD');
Top ranked names for both male and female over the years
-- ================================================================================
-- #1 M & F names over the years
SELECT * FROM babynames WHERE sex = 'F' AND rnk = 1 ORDER BY YEAR DESC;
SELECT * FROM babynames WHERE sex = 'M' AND rnk = 1 ORDER BY YEAR DESC;
Number of unique names per year/sex
-- ================================================================================
-- Unique name counts by year/sex
SELECT *
FROM (SELECT YEAR, sex, COUNT(*) AS N FROM babynames GROUP BY ALL)
PIVOT (
SUM(N) FOR sex IN ('M', 'F')
)
ORDER BY year;
Top 10 # 1 Streaks for both Male and Female
-- ================================================================================
-- Top 10 #1 streaks for M and F
WITH ranked AS (
SELECT name,sex,year,rnk
FROM babynames
WHERE rnk = 1
),
islands AS (
SELECT name,sex,year,
/* grouping key for consecutive years */
year - ROW_NUMBER() OVER (
PARTITION BY name, sex
ORDER BY year
) AS grp
FROM ranked
),
streaks AS (
SELECT name, sex,
MIN(year) AS start_year,
MAX(year) AS end_year,
COUNT(*) AS streak_length
FROM islands
GROUP BY name, sex, grp
),
top10_f AS (
SELECT * FROM streaks WHERE sex = 'F' ORDER BY streak_length DESC LIMIT 10
),
top10_m AS (
SELECT * FROM streaks WHERE sex = 'M' ORDER BY streak_length DESC LIMIT 10
)
SELECT 'F' AS sex, * FROM top10_f
UNION ALL
SELECT 'M' AS sex, * FROM top10_m
ORDER BY sex, streak_length DESC;