Skip to main content

Baby Names - SSA database

Social Security Administration's baby names database

In this cookbook recipe, you will:

  1. Download the full `babynames_yob` data from the Social Security Administration website. 
  2. ingest all the CSV files at once using DuckDB.
  3. Create and populate rank and popularity for each year/sex combination.
  4. See how to export the data to popular file formats CSV and parquet
  5. See a few analyses queries
  6. See the data in action in a Shiny R app

Download the names.zip file from catalog.data.gov website

I saved mine into `D:\opendata\SSA\babynames_yob_2024`.  Just change line 11 below if you save it elsewhere.

-- DROP TABLE IF EXISTS babynames;

CREATE TABLE babynames AS
SELECT
    name,
    sex,
    cnt,
    -- filename() gives full path → split out the year
    regexp_extract(filename, 'yob([0-9]{4})', 1)::INT AS year
FROM read_csv(
    'D:/opendata/SSA/babynames_yob_2024/yob*.txt',
    columns = {
        'name': 'TEXT',
        'sex': 'VARCHAR',
        'cnt': 'INT'
    },
    header = false,
    delim = ',',
    auto_detect = false,
    filename = true    -- << important
);
Add rank (rnk) and popularity_index columns and populate them.
-- ================================================================================
-- Add rnk and popularity_index
ALTER TABLE main.babynames ADD COLUMN rnk INT NULL;
ALTER TABLE main.babynames ADD COLUMN popularity_index DOUBLE NULL;

-- ================================================================================
-- populate rank
UPDATE babynames AS b
SET rnk = x.rnk
FROM (
    SELECT name, sex, year,
        ROW_NUMBER() OVER (PARTITION BY year, sex ORDER BY cnt DESC) AS rnk
    FROM babynames
) AS x
WHERE b.name = x.name
  AND b.sex = x.sex
  AND b.year = x.year;

-- ================================================================================
-- populate popularity_index
UPDATE babynames AS b
SET popularity_index = sub.popularity_index
FROM (
    SELECT name, sex, year,
        100.0 * cnt / MAX(cnt) OVER (PARTITION BY sex, year) AS popularity_index
    FROM babynames
) AS sub
WHERE b.name = sub.name
  AND b.sex  = sub.sex
  AND b.year = sub.year;
Export to files for use with other tools
-- Export to csv and parquet
COPY babynames TO 'D:/opendata/SSA/babynames_yob_2024/babynames.csv' (FORMAT CSV, HEADER TRUE, DELIMITER ',', QUOTE '"');
COPY babynames TO 'D:/opendata/SSA/babynames_yob_2024/babynames.parquet' (FORMAT PARQUET, COMPRESSION 'ZSTD');
Top ranked names for both male and female over the years
-- ================================================================================
-- #1 M & F names over the years
SELECT * FROM babynames WHERE sex = 'F' AND rnk = 1 ORDER BY YEAR DESC;
SELECT * FROM babynames WHERE sex = 'M' AND rnk = 1 ORDER BY YEAR DESC;
Top 10 # 1 Streaks for both Male and Female
-- ================================================================================
-- Top 10 #1 streaks for M and F
WITH ranked AS (
    SELECT name,sex,year,rnk
    FROM babynames
    WHERE rnk = 1
),

islands AS (
    SELECT name,sex,year,
        /* grouping key for consecutive years */
        year - ROW_NUMBER() OVER (
            PARTITION BY name, sex
            ORDER BY year
        ) AS grp
    FROM ranked
),

streaks AS (
    SELECT name, sex,
        MIN(year) AS start_year,
        MAX(year) AS end_year,
        COUNT(*) AS streak_length
    FROM islands
    GROUP BY name, sex, grp
),

top10_f AS (
    SELECT * FROM streaks WHERE sex = 'F' ORDER BY streak_length DESC LIMIT 10
),
top10_m AS (
    SELECT * FROM streaks WHERE sex = 'M' ORDER BY streak_length DESC LIMIT 10
)

SELECT 'F' AS sex, * FROM top10_f
UNION ALL
SELECT 'M' AS sex, * FROM top10_m
ORDER BY sex, streak_length DESC;