Skip to main content

Building SELECT lists with SQL

If you are using SQL for ingestion and you want to rename columns

PascalCase to snake_case

-- PascalCase to snake_case
-- , OtherNFLStats AS other_nfl_stats

SELECT
    CONCAT(', ', column_name, ' AS ', LOWER(
      REGEXP_REPLACE(
        REGEXP_REPLACE(column_name, '([a-z0-9])([A-Z])', '$1_$2'), -- Pass 1: Lower to Upper
        '([A-Z])([A-Z][a-z])', '$1_$2'                             -- Pass 2: Acronym to Word
      )
    )) AS formatted_column
FROM information_schema.columns
WHERE table_name = 'my_table'
    AND table_schema = 'my_schema'
ORDER BY ordinal_position;

 

PascalCase to snake_case with substitution

-- PascalCase to snake_case
-- , OverageAmount AS overage_amt
WITH RawColumns AS (
    SELECT  column_name
        , LOWER(
            REGEXP_REPLACE(
                REGEXP_REPLACE(column_name, '([a-z0-9])([A-Z])', '$1_$2')
                , '([A-Z])([A-Z][a-z])', '$1_$2'
            )
        ) AS snake_name
    FROM information_schema.columns
    WHERE table_name = 'my_table'
    AND table_schema = 'my_schema'
)
SELECT CONCAT(', ', column_name, ' AS ', 
        AGGREGATE( -- AGGREGATE is Spark-specific (thus applicable to databricks)
            -- Define the list of [pattern, replacement]
            ARRAY(
                  STRUCT('_amount$'      AS pat, '_amt' AS rep)
                , STRUCT('_percent$'     AS pat, '_pct' AS rep)
                , STRUCT('_flag$'        AS pat, '_flg' AS rep)
                , STRUCT('_code$'        AS pat, '_cd'  AS rep)
                , STRUCT('_number$'      AS pat, '_nbr' AS rep)
                , STRUCT('_name$'        AS pat, '_nm'  AS rep)
                , STRUCT('_description$' AS pat, '_dsc' AS rep)
                , STRUCT('_desc$'        AS pat, '_dsc' AS rep)
                , STRUCT('_date$'        AS pat, '_dts' AS rep)
            )
            , snake_name -- Initial value
            , (acc, x) -> REGEXP_REPLACE(acc, x.pat, x.rep) -- Iterative function
        )
    ) AS formatted_column
FROM RawColumns
ORDER BY column_name;