Skip to main content

Building SELECT lists with SQL

If you are using SQL for ingestion and you want to rename columns

PascalCase to snake_case

-- PascalCase to snake_case
-- , OtherNFLStats AS other_nfl_stats

SELECT
    CONCAT(', ', column_name, ' AS ', LOWER(
      REGEXP_REPLACE(
        REGEXP_REPLACE(column_name, '([a-z0-9])([A-Z])', '$1_$2'), -- Pass 1: Lower to Upper
        '([A-Z])([A-Z][a-z])', '$1_$2'                             -- Pass 2: Acronym to Word
      )
    )) AS formatted_column
FROM information_schema.columns
WHERE table_name = 'my_table'
    AND table_schema = 'my_schema'
ORDER BY ordinal_position;

PascalCase to snake_case with substitution

-- PascalCase to snake_case
-- , OverageAmount AS overage_amt
WITH RawColumns AS (
    SELECT  column_name
        , LOWER(
            REGEXP_REPLACE(
                REGEXP_REPLACE(column_name, '([a-z0-9])([A-Z])', '$1_$2')
                , '([A-Z])([A-Z][a-z])', '$1_$2'
            )
        ) AS snake_name
    FROM information_schema.columns
    WHERE table_name = 'my_table'
    AND table_schema = 'my_schema'
)
SELECT CONCAT(', ', column_name, ' AS ', 
        AGGREGATE( -- AGGREGATE is Spark-specific (thus applicable to databricks)
            -- Define the list of [pattern, replacement]
            ARRAY(
                  STRUCT('_amount$'      AS pat, '_amt' AS rep)
                , STRUCT('_percent$'     AS pat, '_pct' AS rep)
                , STRUCT('_flag$'        AS pat, '_flg' AS rep)
                , STRUCT('_code$'        AS pat, '_cd'  AS rep)
                , STRUCT('_number$'      AS pat, '_nbr' AS rep)
                , STRUCT('_name$'        AS pat, '_nm'  AS rep)
                , STRUCT('_description$' AS pat, '_dsc' AS rep)
                , STRUCT('_desc$'        AS pat, '_dsc' AS rep)
                , STRUCT('_date$'        AS pat, '_dts' AS rep)
            )
            , snake_name -- Initial value
            , (acc, x) -> REGEXP_REPLACE(acc, x.pat, x.rep) -- Iterative function
        )
    ) AS formatted_column
FROM RawColumns
ORDER BY column_name;

Samesnake_case Caseto withPascalCase + custom rules + suffix substitution

-- Customsnake_case rulesto +PascalCase with suffix substitution
WITH base AS (
    SELECT
          column_name
        , CASE 
            WHEN column_name =LIKE 'meta_updated'%\_%' THEN  CONCAT('ARRAY_JOIN( TRANSFORM(SPLIT(LOWER(column_name), '_'), column_name,x '-> ASINITCAP(x) '), 'EDWLastModifiedDTS'' )  -- customunderscore renameto PascalCase
            ELSE INITCAP(column_name)
          END AS pascal_name
        , ordinal_position
    FROM my_catalog.information_schema.columns
    WHERE table_schema = 'my_schema'
        AND table_name = 'my_table'
)
SELECT
    CASE 
        -- Custom
        WHEN column_name = 'hcsys_file_name' THEN CONCAT(', ', regexp_extract(column_name, '[^/]+$', 0), ' AS FileNM') -- SourceHCSystemFileNM?
        WHEN LEFT(column_name, 5)column_name = 'meta_'hcsys_row_num' THEN CONCAT(', ', column_name, ' AS src_', column_name)    -- prepend src_ to all meta_ columns  
        ELSE 
            CASESourceHCSystemRowNBR')
        WHEN column_name ILIKE= '%Amount'meta_updated' THEN CONCAT(', ', column_name, ' AS  EDWLastModifiedDTS')
        WHEN column_name = 'meta_deleted' THEN CONCAT(', ', column_name, ' AS MetaDeletedFLG')
        WHEN column_name = 'meta_surrogate_key' THEN CONCAT(', ', column_name, ' AS MetaSurrogateKeyID')
        WHEN column_name = 'meta_checksum' THEN CONCAT(', ', column_name, ' AS MetaChecksumID')
        WHEN column_name = 'meta_location' THEN CONCAT(', ', column_name, ' AS MetaLocationDSC')       
        WHEN column_name = '_ROW_NUM' THEN CONCAT(', ', column_name, ' AS RowNBR')
        -- Replace
        WHEN pascal_name RLIKE '(?i)Amount$' THEN CONCAT(', ', column_name, ' AS ', REGEXP_REPLACE(column_name,pascal_name, '(?i)Amount$', 'AMT'))
        WHEN column_namepascal_name ILIKERLIKE '%Percent'(?i)Percent$' THEN CONCAT(', ', column_name, ' AS ', REGEXP_REPLACE(column_name,pascal_name, '(?i)Percent$', 'PCT'))
        WHEN column_namepascal_name ILIKERLIKE '%Flag'(?i)Flag$' THEN CONCAT(', ', column_name, ' AS ', REGEXP_REPLACE(column_name,pascal_name, '(?i)Flag$', 'FLG'))
        WHEN column_namepascal_name ILIKERLIKE '%Code'(?i)Code$' THEN CONCAT(', ', column_name, ' AS ', REGEXP_REPLACE(column_name,pascal_name, '(?i)Code$', 'CD'))
        WHEN column_namepascal_name ILIKERLIKE '%Number'(?i)Number$' THEN CONCAT(', ', column_name, ' AS ', REGEXP_REPLACE(column_name,pascal_name, '(?i)Number$', 'NBR'))
        WHEN column_namepascal_name ILIKERLIKE '%Name'(?i)Name$' THEN CONCAT(', ', column_name, ' AS ', REGEXP_REPLACE(column_name,pascal_name, '(?i)Name$', 'NM'))
        WHEN column_namepascal_name ILIKERLIKE '%Description'(?i)Description$' THEN CONCAT(', ', column_name, ' AS ', REGEXP_REPLACE(column_name,pascal_name, '(?i)Description$', 'DSC'))
        WHEN column_namepascal_name ILIKERLIKE '%Desc'(?i)Desc$' THEN CONCAT(', ', column_name, ' AS ', REGEXP_REPLACE(column_name,pascal_name, '(?i)Desc$', 'DSC'))
        WHEN column_namepascal_name ILIKERLIKE '%Date'(?i)Date$' THEN CONCAT(', try_to_timestamp(', column_name, ', \'yyyy-MM-dd HH:mm:ss\') AS ', REGEXP_REPLACE(pascal_name, '(?i)Date$', 'DTS'))  -- May need to chg to try_to_date if no time data
        WHEN pascal_name RLIKE '(?i)Count$' THEN CONCAT(', ', column_name, ' AS ', REGEXP_REPLACE(column_name,pascal_name, '(?i)Date$Count$', 'DTS'CNT'))
        WHEN column_namepascal_name ILIKERLIKE '%Count'(?i)Id$' THEN CONCAT(', ', column_name, ' AS ', REGEXP_REPLACE(column_name,pascal_name, '(?i)Count$Id$', 'CNT'ID'))
        WHEN pascal_name RLIKE '(?i)Address1$' THEN CONCAT(', ', column_name, ' AS ', REGEXP_REPLACE(pascal_name, '(?i)Address1$', 'Address01TXT'))
        WHEN pascal_name RLIKE '(?i)Address2$' THEN CONCAT(', ', column_name, ' AS ', REGEXP_REPLACE(pascal_name, '(?i)Address2$', 'Address02TXT'))
        -- Appended suffixes
        WHEN pascal_name RLIKE '(?i)Gender$' THEN CONCAT(', ', column_name, ' AS ', CONCAT(pascal_name, 'CD'))
        WHEN pascal_name RLIKE '(?i)Age$' THEN CONCAT(', ', column_name, ' AS ', CONCAT(pascal_name, 'NBR'))
        WHEN pascal_name RLIKE '(?i)Address$' THEN CONCAT(', ', column_name, ' AS ', CONCAT(pascal_name, 'TXT'))
        WHEN pascal_name RLIKE '(?i)City$' THEN CONCAT(', ', column_name, ' AS ', CONCAT(pascal_name, 'NN'))
        WHEN pascal_name RLIKE '(?i)State$' THEN CONCAT(', ', column_name, ' AS ', CONCAT(pascal_name, 'NN'))
        WHEN pascal_name RLIKE '(?i)Zip$' THEN CONCAT(', ', column_name, ' AS ', CONCAT(pascal_name, 'CD'))          --REGEXP_REPLACE(pascal_name, '(?i)Zip$', 'ZipCD'))
        WHEN pascal_name RLIKE '(?i)Phone$' THEN CONCAT(', ', column_name, ' AS ', CONCAT(pascal_name, 'NBR'))
        WHEN pascal_name RLIKE '(?i)Fax$' THEN CONCAT(', ', column_name, ' AS ', CONCAT(pascal_name, 'NBR'))
        WHEN pascal_name RLIKE '(?i)Active$' THEN CONCAT(', ', column_name, ' AS ', CONCAT(pascal_name, 'FLG'))

        ELSE CONCAT(', ', column_name, ' AS ', column_name)pascal_name, END'TXT')

    END AS c
FROM information_schema.columns
WHERE table_schema = 'my_schema'
    AND table_name = 'my_table'base
ORDER BY ordinal_position;