PART I - Getting Started

Chapter 1 - Quick Start Guide

WITH vwMETA AS (
  SELECT LOWER(TABLE_NAME) AS TABLE_NAME,
         LOWER(COLUMN_NAME) AS COLUMN_NAME,
         COLUMN_ID AS COLUMN_ID,
         LOWER(DATA_TYPE) AS DATA_TYPE,
         ROW_NUMBER() OVER (PARTITION BY TABLE_NAME ORDER BY COLUMN_ID) AS RNBR,
         COUNT(*) OVER (PARTITION BY TABLE_NAME) AS TOT_RNBR
  FROM ALL_TAB_COLUMNS
  WHERE OWNER='PROD_SCHEMA'
    AND TABLE_NAME IN ('DIM_POSTAL_CODE')
)
SELECT TABLE_NAME, LISTAGG(CT,' ') WITHIN GROUP (ORDER BY RNBR) AS CT_FINAL
FROM (
  SELECT TABLE_NAME, RNBR,
         CASE WHEN RNBR=1 AND RNBR<>TOT_RNBR THEN 'create table prod_schema.' || TABLE_NAME || '(' || COLUMN_NAME || ' ' || DT || ','
              WHEN RNBR=1 AND RNBR=TOT_RNBR  THEN 'create table prod_schema.' || TABLE_NAME || '(' || COLUMN_NAME || ' ' || DT
              WHEN RNBR>1 AND RNBR<>TOT_RNBR THEN COLUMN_NAME || ' ' || DT || ','
              WHEN RNBR>1 AND RNBR=TOT_RNBR  THEN COLUMN_NAME || ' ' || DT || ');'
         END AS CT
  FROM (
    SELECT COLUMN_ID,TABLE_NAME,COLUMN_NAME,RNBR,TOT_RNBR,
           CASE WHEN DATA_TYPE IN ('char','varchar','varchar2') THEN 'string'
                WHEN DATA_TYPE='date' THEN 'timestamp'
                WHEN DATA_TYPE='number' AND COLUMN_NAME IN ('latitude','longitude') THEN 'double'
                WHEN DATA_TYPE='number' THEN 'bigint'
                ELSE '?????'
           END AS DT
    FROM vwMETA
  )
)
GROUP BY TABLE_NAME
ORDER BY TABLE_NAME;

create table prod_schema.dim_postal_code(postal_code string, city string, state_code string, latitude double, longitude double);

sqoop import --hive-table TMP_DIM_POSTAL_CODE --connect <legacy-jdbc-url> --table DIM_POSTAL_CODE --target-dir /data/prod/teams/prod_schema/TMP_DIM_POSTAL_CODE

[hdpserver:21000] prod_schema> select * from prod_schema.tmp_dim_postal_code order by 1 limit 5;
+-------------+------------+------------+-----------+------------+
| postal_code | city       | state_code | latitude  | longitude  |
+-------------+------------+------------+-----------+------------+
| 00210       | PORTSMOUTH | NH         | 43.005895 | -71.013202 |
| 00211       | PORTSMOUTH | NH         | 43.005895 | -71.013202 |
| 00212       | PORTSMOUTH | NH         | 43.005895 | -71.013202 |
| 00213       | PORTSMOUTH | NH         | 43.005895 | -71.013202 |
| 00214       | PORTSMOUTH | NH         | 43.005895 | -71.013202 |
+-------------+------------+------------+-----------+------------+

[hdpserver:21000] prod_schema> desc tmp_dim_postal_code;
+-------------+--------+---------+
| name        | type   | comment |
+-------------+--------+---------+
| postal_code | string |         |
| city        | string |         |
| state_code  | string |         |
| latitude    | string |         |
| longitude   | string |         |
+-------------+--------+---------+

[hdpserver:21000] prod_schema> drop table if exists prod_schema.dim_postal_code purge;

[hdpserver:21000] prod_schema> create table prod_schema.dim_postal_code(
                             >   postal_code string, city string, state_code string,
                             >   latitude double, longitude double);

[hdpserver:21000] prod_schema> insert into prod_schema.dim_postal_code
                             > select postal_code, city, state_code,
                             >        cast(latitude as double) as latitude,
                             >        cast(longitude as double) as longitude
                             > from prod_schema.tmp_dim_postal_code;

[hdpserver:21000] prod_schema> desc prod_schema.dim_postal_code;
+-------------+--------+---------+
| name        | type   | comment |
+-------------+--------+---------+
| postal_code | string |         |
| city        | string |         |
| state_code  | string |         |
| latitude    | double |         |
| longitude   | double |         |
+-------------+--------+---------+
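One optional check before dropping the staging table below: Impala returns NULL (rather than raising an error) when a string can't be cast to DOUBLE, so a silent data problem would simply show up as missing coordinates. A minimal sanity check, assuming the staging table is still in place:

SELECT COUNT(*) AS bad_casts
FROM prod_schema.tmp_dim_postal_code
WHERE (latitude  IS NOT NULL AND CAST(latitude  AS DOUBLE) IS NULL)
   OR (longitude IS NOT NULL AND CAST(longitude AS DOUBLE) IS NULL);

If bad_casts comes back nonzero, inspect those rows first; once the staging table is purged, the raw strings are gone.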
[hdpserver:21000] prod_schema> drop table if exists prod_schema.tmp_dim_postal_code purge;

state_code,state_name
aa,u.s. armed forces - americas
ae,u.s. armed forces - europe
ak,alaska
al,alabama
ap,u.s. armed forces - pacific
ar,arkansas
as,american samoa
az,arizona
ca,california
co,colorado
...snip...

hadoop fs -mkdir /data/prod/teams/prod_schema/tmp_us_state_mapping
hadoop fs -copyFromLocal /home/smithbob/us_state_mapping.csv /data/prod/teams/prod_schema/tmp_us_state_mapping/tmp_us_state_mapping.csv
hadoop fs -ls -R /data/prod/teams/prod_schema/tmp_us_state_mapping

[hdpserver:21000] prod_schema> drop table if exists prod_schema.tmp_us_state_mapping purge;

[hdpserver:21000] prod_schema> create external table prod_schema.tmp_us_state_mapping(state_code string, state_name string)
                             > row format delimited fields terminated by ','
                             > stored as textfile
                             > location '/data/prod/teams/prod_schema/tmp_us_state_mapping'
                             > tblproperties('skip.header.line.count'='1');

[hdpserver:21000] prod_schema> select * from prod_schema.tmp_us_state_mapping limit 10;
+------------+------------------------------+
| state_code | state_name                   |
+------------+------------------------------+
| aa         | u.s. armed forces - americas |
| ae         | u.s. armed forces - europe   |
| ak         | alaska                       |
| al         | alabama                      |
| ap         | u.s. armed forces - pacific  |
| ar         | arkansas                     |
| as         | american samoa               |
| az         | arizona                      |
| ca         | california                   |
| co         | colorado                     |
+------------+------------------------------+

[hdpserver:21000] prod_schema> drop table if exists prod_schema.dim_us_state_mapping purge;

create table prod_schema.dim_us_state_mapping(state_code string, state_name string);

insert into prod_schema.dim_us_state_mapping
select upper(trim(state_code)) as state_code,
       upper(trim(state_name)) as state_name
from prod_schema.tmp_us_state_mapping;

[hdpserver:21000] prod_schema> drop table if exists prod_schema.tmp_us_state_mapping purge;

[hdpserver:21000] prod_schema> select * from prod_schema.dim_us_state_mapping limit 10;
+------------+------------------------------+
| state_code | state_name                   |
+------------+------------------------------+
| AA         | U.S. ARMED FORCES - AMERICAS |
| AE         | U.S. ARMED FORCES - EUROPE   |
| AK         | ALASKA                       |
| AL         | ALABAMA                      |
| AP         | U.S. ARMED FORCES - PACIFIC  |
| AR         | ARKANSAS                     |
| AS         | AMERICAN SAMOA               |
| AZ         | ARIZONA                      |
| CA         | CALIFORNIA                   |
| CO         | COLORADO                     |
+------------+------------------------------+

[hdpserver:21000] prod_schema> drop table if exists prod_schema.bigmike_output purge;

create external table prod_schema.bigmike_output(postal_code string, city string, state_code string, latitude double, longitude double, state_name string)
row format delimited fields terminated by '\t'
stored as textfile
tblproperties('serialization.null.format'='');

[hdpserver:21000] prod_schema> insert into prod_schema.bigmike_output
                             > select A.postal_code, A.city, A.state_code, A.latitude, A.longitude, B.state_name
                             > from prod_schema.dim_postal_code A
                             > left join prod_schema.dim_us_state_mapping B on A.state_code=B.state_code;

hadoop fs -getmerge /data/prod/teams/prod_schema/bigmike_output /home/smithbob/bigmike_output.tsv

00623  CABO ROJO    PR  18.08643   -67.15222    PUERTO RICO
00633  CAYEY        PR  18.194527  -66.1834669  PUERTO RICO
00640  COAMO        PR  18.077197  -66.359104   PUERTO RICO
00676  MOCA         PR  18.37956   -67.0842399  PUERTO RICO
00728  PONCE        PR  18.013353  -66.65218    PUERTO RICO
00734  PONCE        PR  17.999499  -66.643934   PUERTO RICO
00735  CEIBA        PR  18.258444  -65.65987    PUERTO RICO
00748  FAJARDO      PR  18.326732  -65.652484   PUERTO RICO
00766  VILLALBA     PR  18.126023  -66.48208    PUERTO RICO
00771  LAS PIEDRAS  PR  18.18744   -65.87088    PUERTO RICO
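One operational note on the external-table pattern above: Impala caches a table's file listing, so if you keep a staging table like this around and later overwrite its CSV in place (another hadoop fs -copyFromLocal), Impala won't see the new rows until you ask it to re-scan the directory. A one-line fix:

REFRESH prod_schema.tmp_us_state_mapping;

REFRESH re-reads the file metadata for that single table, which is far cheaper than a cluster-wide INVALIDATE METADATA.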
Chapter 2 - Hadoop Administrator E-Mail

Hadoop Administrators: Tally Ho!

My name is Bob Smith, and I work for the department. As you may have heard, I've been tasked with moving data off our legacy database and onto the Hadoop database. I was hoping that you could be my contact for the duration of this conversion. First, thank you up-front for helping out, since this Hadoop shizz is new to me and my team. Second, you probably won't be surprised that I have about a bazillion questions for you, which I've placed below. Your responses will go a long way in helping me and my team move to Hadoop as quickly (and painlessly!) as possible. Here goes...

* Do you have a Linux edge node server that my team can use? If so, what's the server's host name? My team and I will be automating some processes using Linux scripts, so access to a Linux edge node server will help us out greatly.
* My team and I plan to use PuTTY to connect to the Linux edge node server. I just want to confirm that we must use port 22 (SSH) when setting up a connection to the edge node server. Do you recommend something other than PuTTY?
* On our legacy database, the schema we use is named <legacy-schema-name>. Can you please set up the same schema name on the Hadoop database?
* Since my team and I will use the edge node server as well as the Hadoop database, can you please set up the following individuals with an account on the Linux edge node server as well as access to the Hadoop database schema requested above? Also, the following team members should be given privileged access to run Hadoop commands via hadoop/hdfs from the Linux command line:
* Not all of my team members are highly technical, but they would like to run simple queries against the Hadoop database. Do you have the Hadoop database web interface Hue set up and accessible? If so, what's the URL?
* In order to kill runaway SQL queries, can you please list the URLs to the Hadoop query webpages? I believe these URLs generally use port 25000 (/queries), but don't hold me to that...I'm new to these parts.
* Can you recommend a SQL client application (such as Toad Data Point, DBeaver, SQuirreL, etc.) for use with Hadoop? What do you use?
* Do you have Hive and Impala ODBC (32-bit/64-bit) and JDBC drivers available on the corporate network? If so, I'd like to access them so that I may set up my team's SQL client software (among other things). If not, can you recommend where I may download these drivers?
* Speaking of ODBC and JDBC drivers, can you please provide example connection information/strings for both ODBC and JDBC connections to Hive (port 10000?) as well as Impala (port 21050?)? We'll be using the ODBC connection information with applications such as Microsoft Excel, PowerBI, Tableau, etc. The JDBC connection strings will be used with client software that uses JDBC rather than ODBC, such as DBeaver, SQuirreL, etc.
* Does our corporate network run Kerberos? If so, when creating cron jobs to run automatically, we may need to create a keytab file containing Kerberos-related information. Which encryption types do you suggest we include in the keytab file? arcfour-hmac-md5? aes256-cts? rc4-hmac? Any others? Also, what's our Kerberos Realm and Host FQDN? If not Kerberos, then LDAP?
* We would like the ability to access our legacy database from the Linux edge node server for use with sqoop and other tools. Can you please install the software necessary so that my team and I may access the legacy database from there?
* Is there a generic account available on the Linux edge node server for me and a few of my team members to use?
We'd like a single account to execute our production code. If so, can you please forward the username and password? If not, can you please create an account on the Linux edge node server whose password is static? Also, please give this account access to the appropriate schemas as well as hadoop/hdfs privileges.
* Is HPL/SQL available from the Linux edge node server? If not, can you please install it so that my team and I can create and execute procedures on the Linux edge node server against the Hadoop database? Also, where is the file hplsql-site.xml located?
* Is there a directory on the Linux edge node server where we can store the team's production code? If not, can you please create a directory accessible by my team as well as the generic account?
* Can you please create a directory in HDFS specifically for me and my team for use with external tables? Something like hdfs://hdpserver/data/prod/teams/ or whatever your standard is.
* I feel completely comfortable downloading and maintaining many of my department's dimension tables, but some of the fact tables are quite large. I'm hoping you can take over the process involved in importing the fact tables and incorporate it into your own processes. Can we have a conversation about that?
* What are the version numbers for the following?
  * Linux (on the edge node server)
  * Apache Hadoop
  * Hive
  * Impala
  * HPL/SQL
  * Hive ODBC Driver
  * Impala ODBC Driver
  * Hive JDBC Driver
  * Impala JDBC Driver
* Can you please install the Linux utility dos2unix on the Linux edge node server? Since our laptops are Windows-based, we may need to convert files using dos2unix.
* Which Thrift Transport Mode should we be using? SASL? Binary? HTTP?
* Does the Hadoop database use Secure Sockets Layer (SSL) for connections? When I go to set up an ODBC connection, there's an option asking whether I should enable SSL. Should I?
* My team and I will be using the storage formats TEXTFILE, PARQUET and KUDU almost exclusively. Can you please indicate the SQL CREATE TABLE options required to use the KUDU storage format, if any? Can you recommend the number of partitions we should use with KUDU tables? Do we have to include the table property kudu.master_addresses in our SQL code? If so, can you include an example of this?
* In our legacy database, we have access to useful metadata such as table names, column names, data types, etc. within the database via ALL_TABLES, ALL_TAB_COLUMNS, INFORMATION_SCHEMA, etc. Can you create a view or views to mimic this from within the Hadoop database, accessible from our new database schema? If not, can you give us read-only access to the underlying MetaStore database's metadata tables/views?
* Does the version of ImpalaSQL installed on the Hadoop database include the extensions to GROUP BY such as CUBE, ROLLUP, GROUPING SETS, etc.?
* Is Apache Spark installed on the Linux edge node server? If so, what's the version number? I want to use Spark with Python; is pyspark available to use?
* My team and I may create one or more user-defined functions (UDFs) for Impala. Can you create a directory in HDFS where we may place our Java .jar files? Also, can you update the PATH and CLASSPATH so that we have access to java and javac?

Thanks,
Bob Smith
Chapter 3 - Recommended Windows Client Software

C:\Program Files (x86)\Java\jre7\bin> java -version
java version "1.7.0_65"
Java(TM) SE Runtime Environment (build 1.7.0_65-b19)
Java HotSpot(TM) Client VM (build 24.65-b04, mixed mode, sharing)

Driver=Cloudera ODBC Driver for Impala;
Host=hdpserver;
Port=21050;
AuthMech=3;
KrbRealm=;
KrbHostFQDN=;
KrbServiceName=impala;
UseNativeQuery=0;

AuthMech  Description
0         No authentication.
1         Kerberos authentication.
2         User name authentication.
3         User name and password authentication.
4         User name and password authentication with SSL enabled.

java -jar squirrel-sql-#.#.#-standard.jar

jdbc:impala://hdpserver:21050;AuthMech=3;KrbRealm=REALM.COMPANY.COM;KrbHostFQDN=hdpserver;KrbServiceName=impala;UseNativeQuery=1

jdbc:hive2://hdpserver:10000/default;authMech=3;

https://www.cloudera.com/downloads/connectors/hive/jdbc/2-6-15.html

Chapter 4 - A Teensy-Weensy Chat about Hadoop

[hdpserver:21000] prod_schema> show files in prod_schema.dim_postal_code;
+------------------------------------------------------------------------------------------------------------------------+----------+
| Path                                                                                                                     | Size     |
+------------------------------------------------------------------------------------------------------------------------+----------+
| hdfs://hdpserver/data/prod/teams/prod_schema/dim_postal_code/a04dfbd11c2688bf-f6b48dc300000000_1117506544_data.0.parq   | 132.33MB |
| hdfs://hdpserver/data/prod/teams/prod_schema/dim_postal_code/a04dfbd11c2688bf-f6b48dc300000001_1011983093_data.0.parq   | 24.68MB  |
+------------------------------------------------------------------------------------------------------------------------+----------+

[hdpserver:21000] prod_schema> desc formatted dim_postal_code;
+------------------------------+------------------------------------------------------------+----------------------+
| name                         | type                                                       | comment              |
+------------------------------+------------------------------------------------------------+----------------------+
| # col_name                   | data_type                                                  | comment              |
|                              | NULL                                                       | NULL                 |
| postal_code                  | string                                                     | NULL                 |
| city                         | string                                                     | NULL                 |
| state_code                   | string                                                     | NULL                 |
| latitude                     | double                                                     | NULL                 |
| longitude                    | double                                                     | NULL                 |
|                              | NULL                                                       | NULL                 |
| # Detailed Table Information | NULL                                                       | NULL                 |
| Database:                    | prod_schema                                                | NULL                 |
| OwnerType:                   | USER                                                       | NULL                 |
| Owner:                       | smithbob                                                   | NULL                 |
| CreateTime:                  | Thu Sep 09 09:35:23 CDT 2021                               | NULL                 |
| LastAccessTime:              | UNKNOWN                                                    | NULL                 |
| Retention:                   | 0                                                          | NULL                 |
| Location:                    | hdfs://hdpserver/data/prod/teams/prod_schema/dim_postal_code | NULL               |
| Table Type:                  | MANAGED_TABLE                                              | NULL                 |
| Table Parameters:            | NULL                                                       | NULL                 |
|                              | transient_lastDdlTime                                      | 1631198123           |
|                              | NULL                                                       | NULL                 |
| # Storage Information        | NULL                                                       | NULL                 |
| SerDe Library:               | org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe         | NULL                 |
| InputFormat:                 | org.apache.hadoop.mapred.TextInputFormat                   | NULL                 |
| OutputFormat:                | org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat | NULL                 |
| Compressed:                  | No                                                         | NULL                 |
| Num Buckets:                 | 0                                                          | NULL                 |
| Bucket Columns:              | []                                                         | NULL                 |
| Sort Columns:                | []                                                         | NULL                 |
+------------------------------+------------------------------------------------------------+----------------------+
[hdpserver:21000] prod_schema> desc formatted bigmike_output;
+------------------------------+------------------------------------------------------------+----------------------+
| name                         | type                                                       | comment              |
+------------------------------+------------------------------------------------------------+----------------------+
| # col_name                   | data_type                                                  | comment              |
|                              | NULL                                                       | NULL                 |
| postal_code                  | string                                                     | NULL                 |
| city                         | string                                                     | NULL                 |
| state_code                   | string                                                     | NULL                 |
| latitude                     | double                                                     | NULL                 |
| longitude                    | double                                                     | NULL                 |
| state_name                   | string                                                     | NULL                 |
|                              | NULL                                                       | NULL                 |
| # Detailed Table Information | NULL                                                       | NULL                 |
| Database:                    | prod_schema                                                | NULL                 |
| OwnerType:                   | USER                                                       | NULL                 |
| Owner:                       | smithbob                                                   | NULL                 |
| CreateTime:                  | Thu Sep 09 13:38:59 CDT 2021                               | NULL                 |
| LastAccessTime:              | UNKNOWN                                                    | NULL                 |
| Retention:                   | 0                                                          | NULL                 |
| Location:                    | hdfs://hdpserver/data/prod/teams/prod_schema/bigmike_output | NULL                |
| Table Type:                  | EXTERNAL_TABLE                                             | NULL                 |
| Table Parameters:            | NULL                                                       | NULL                 |
|                              | EXTERNAL                                                   | TRUE                 |
|                              | OBJCAPABILITIES                                            | EXTREAD,EXTWRITE     |
|                              | serialization.null.format                                  |                      |
|                              | transient_lastDdlTime                                      | 1631212739           |
|                              | NULL                                                       | NULL                 |
| # Storage Information        | NULL                                                       | NULL                 |
| SerDe Library:               | org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe         | NULL                 |
| InputFormat:                 | org.apache.hadoop.mapred.TextInputFormat                   | NULL                 |
| OutputFormat:                | org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat | NULL                 |
| Compressed:                  | No                                                         | NULL                 |
| Num Buckets:                 | 0                                                          | NULL                 |
| Bucket Columns:              | []                                                         | NULL                 |
| Sort Columns:                | []                                                         | NULL                 |
| Storage Desc Params:         | NULL                                                       | NULL                 |
|                              | field.delim                                                | \t                   |
|                              | serialization.format                                       | \t                   |
+------------------------------+------------------------------------------------------------+----------------------+

| SerDe Library:               | org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe         | NULL                 |
| InputFormat:                 | org.apache.hadoop.mapred.TextInputFormat                   | NULL                 |
| OutputFormat:                | org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat | NULL                 |

INVALIDATE METADATA PROD_SCHEMA.MY_NEW_TABLE;
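A rule of thumb for the two metadata commands, since they're easy to confuse: REFRESH is for new or changed data files in a table Impala already knows about, while INVALIDATE METADATA is for tables created or altered outside Impala (through Hive, or a sqoop --hive-import, for example). A hedged sketch; the first table name is hypothetical:

REFRESH PROD_SCHEMA.MY_EXISTING_TABLE;        -- pick up new data files in a known table
INVALIDATE METADATA PROD_SCHEMA.MY_NEW_TABLE; -- pick up a table created outside Impala

INVALIDATE METADATA forces a full metadata reload, so prefer the targeted REFRESH whenever the table already exists in Impala's catalog.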
Chapter 5 - Creating Your Very Own Hadoop Playground

https://docs.docker.com/desktop/windows/install/
https://kudu.apache.org/docs/quickstart.html

kudu-master-1:
  image: apache/kudu:${KUDU_QUICKSTART_VERSION:-latest}
  ports:
    - "7051:7051"
    - "8051:8051"
...snip...
--stderrthreshold=0 --use_hybrid_clock=false --unlock-unsafe-flags

docker-compose -f docker\quickstart.yml up -d

docker run -d --name kudu-impala --network="docker_default" -p 21000:21000 -p 21050:21050 -p 25000:25000 -p 25010:25010 -p 25020:25020 --memory=4096m apache/kudu:impala-latest impala

docker exec -it kudu-impala impala-shell

docker pull cloudera/quickstart:latest

C:\Users\smithbob>docker pull cloudera/quickstart:latest
latest: Pulling from cloudera/quickstart
1d00652ce734: Pull complete
Digest: sha256:f91...snip...b63
Status: Downloaded newer image for cloudera/quickstart:latest
docker.io/cloudera/quickstart:latest

[wsl2]
kernelCommandLine = vsyscall=emulate

docker run -m 8G --memory-reservation 3G --memory-swap 8G --hostname=quickstart.cloudera --privileged=true -t -i -p 80:80 -p 7180:7180 -p 8888:8888 -d docker.io/cloudera/quickstart:latest /usr/bin/docker-quickstart

PART II - Querying the Hadoop Database

Chapter 6 - Introduction to SQL

PATIENT_KEY  PAT_GENDER  PAT_DEAD  PAT_WEIGHT  PAT_HEIGHT
1            M           Y         130         70.52
2            F           N         180         81.04
3            F           N         230         91.56
4            M           N         280         102.08
5            F           Y         330         112.60
6            M           N         380         123.12

SELECT columns
FROM tables
WHERE/ON statements
GROUP BY columns
HAVING criteria
ORDER BY columns

Our First SQL DML Example (SELECT/FROM/WHERE/ORDER BY)

PATIENT_KEY  PAT_GENDER  PAT_DEAD  PAT_WEIGHT  PAT_HEIGHT
1            M           Y         130         70.52
2            F           N         180         81.04
3            F           N         230         91.56
4            M           N         280         102.08
5            F           Y         330         112.6
6            M           N         380         123.12

SELECT PATIENT_KEY,PAT_GENDER FROM PATIENTMASTER

SELECT * FROM PATIENTMASTER

SELECT * FROM PATIENTMASTER WHERE PAT_GENDER='M'

PATIENT_KEY  PAT_GENDER  PAT_DEAD  PAT_WEIGHT  PAT_HEIGHT
1            M           Y         130         70.52
6            M           N         380         123.12
4            M           N         280         102.08

SELECT * FROM PATIENTMASTER WHERE PAT_GENDER='M' AND PAT_DEAD='N'

PATIENT_KEY  PAT_GENDER  PAT_DEAD  PAT_WEIGHT  PAT_HEIGHT
6            M           N         380         123.12
4            M           N         280         102.08

SELECT * FROM PATIENTMASTER WHERE PAT_GENDER='M' AND PAT_DEAD='N' AND PAT_WEIGHT>300

PATIENT_KEY  PAT_GENDER  PAT_DEAD  PAT_WEIGHT  PAT_HEIGHT
6            M           N         380         123.12

SELECT PATIENT_KEY, PAT_GENDER, PAT_DEAD, PAT_WEIGHT, PAT_HEIGHT,
       703*PAT_WEIGHT/(PAT_HEIGHT*PAT_HEIGHT) AS BMI
FROM PATIENTMASTER

PATIENT_KEY  PAT_GENDER  PAT_DEAD  PAT_WEIGHT  PAT_HEIGHT  BMI
1            M           Y         130         70.52       18.3769769
2            F           N         180         81.04       19.2676596
3            F           N         230         91.56       19.287307
4            M           N         280         102.08      18.8900033
5            F           Y         330         112.6       18.2975307
6            M           N         380         123.12      17.6230758

SELECT PATIENT_KEY, PAT_GENDER, PAT_DEAD, PAT_WEIGHT, PAT_HEIGHT,
       703*PAT_WEIGHT/(PAT_HEIGHT*PAT_HEIGHT) AS BMI
FROM PATIENTMASTER
WHERE 703*PAT_WEIGHT/(PAT_HEIGHT*PAT_HEIGHT)<=19

PATIENT_KEY  PAT_GENDER  PAT_DEAD  PAT_WEIGHT  PAT_HEIGHT  BMI
1            M           Y         130         70.52       18.3769769
4            M           N         280         102.08      18.8900033
5            F           Y         330         112.6       18.2975307
6            M           N         380         123.12      17.6230758

SELECT * FROM PATIENTMASTER WHERE PATIENT_KEY=1 OR PATIENT_KEY=3 OR PATIENT_KEY=5

PATIENT_KEY  PAT_GENDER  PAT_DEAD  PAT_WEIGHT  PAT_HEIGHT
1            M           Y         130         70.52
3            F           N         230         91.56
5            F           Y         330         112.6

SELECT * FROM PATIENTMASTER WHERE PATIENT_KEY IN (1,3,5)

SELECT * FROM PATIENTMASTER WHERE PATIENT_KEY IS NOT NULL

PATIENT_KEY  PAT_GENDER  PAT_DEAD  PAT_WEIGHT  PAT_HEIGHT
1            M           Y         130         70.52
2            F           N         180         81.04
3            F           N         230         91.56
4            M           N         280         102.08
5            F           Y         330         112.6
6            M           N         380         123.12

SELECT PATIENT_KEY,PAT_GENDER,PAT_DEAD,PAT_WEIGHT,PAT_HEIGHT
FROM PATIENTMASTER
WHERE PAT_DEAD='N'
ORDER BY PAT_WEIGHT

PATIENT_KEY  PAT_GENDER  PAT_DEAD  PAT_WEIGHT  PAT_HEIGHT
2            F           N         180         81.04
3            F           N         230         91.56
4            M           N         280         102.08
6            M           N         380         123.12

SELECT * FROM PATIENTMASTER WHERE PAT_DEAD='N' ORDER BY PAT_WEIGHT DESC

SELECT DISTINCT PATIENT_KEY FROM PATIENTMASTER ORDER BY 1

PATIENT_KEY
1
2
3
4
...snip...
SELECT DISTINCT PAT_GENDER,PAT_DEAD FROM PATIENTMASTER ORDER BY 1,2

PAT_GENDER  PAT_DEAD
F           N
F           Y
M           N
M           Y

SELECT PATIENT_KEY, PAT_GENDER, PAT_DEAD, PAT_WEIGHT, PAT_HEIGHT,
       703*PAT_WEIGHT/(PAT_HEIGHT*PAT_HEIGHT) AS BMI
FROM PATIENTMASTER
WHERE PATIENT_KEY IN (1,2,3,4,5,6)
  AND PAT_GENDER IS NOT NULL
  AND PAT_DEAD='N'
  AND 703*PAT_WEIGHT/(PAT_HEIGHT*PAT_HEIGHT)<=20
ORDER BY PAT_GENDER,PAT_WEIGHT DESC

PATIENT_KEY  PAT_GENDER  PAT_DEAD  PAT_WEIGHT  PAT_HEIGHT  BMI
3            F           N         230         91.56       19.287307
2            F           N         180         81.04       19.2676596
6            M           N         380         123.12      17.6230758
4            M           N         280         102.08      18.8900033

Our Second SQL DML Example (FROM/JOIN/ON)

PATIENT_KEY  PAT_ADDR        PAT_CITY      PAT_STATE
1            123 Main St.    Philadelphia  PA
2            234 Second St.  Neward        NJ
3            567 Third St.   Blobby        DE
7            890 Fourth St.  Crazzi        CA

SELECT A.PATIENT_KEY, A.PAT_GENDER, A.PAT_DEAD, A.PAT_WEIGHT, A.PAT_HEIGHT,
       B.PAT_ADDR, B.PAT_CITY, B.PAT_STATE
FROM PATIENTMASTER A
INNER JOIN PATADDRINFO B ON A.PATIENT_KEY=B.PATIENT_KEY

PATIENT_KEY  PAT_GENDER  PAT_DEAD  PAT_WEIGHT  PAT_HEIGHT  PAT_ADDR        PAT_CITY      PAT_STATE
1            M           Y         130         70.52       123 Main St.    Philadelphia  PA
2            F           N         180         81.04       234 Second St.  Neward        NJ
3            F           N         230         91.56       567 Third St.   Blobby        DE

ON PATIENTMASTER.PATIENT_KEY=PATADDRINFO.PATIENT_KEY

SELECT A.PATIENT_KEY, A.PAT_GENDER, A.PAT_DEAD, A.PAT_WEIGHT, A.PAT_HEIGHT,
       B.PAT_ADDR, B.PAT_CITY, B.PAT_STATE
FROM PATIENTMASTER A
LEFT JOIN PATADDRINFO B ON A.PATIENT_KEY=B.PATIENT_KEY

PATIENT_KEY  PAT_GENDER  PAT_DEAD  PAT_WEIGHT  PAT_HEIGHT  PAT_ADDR        PAT_CITY      PAT_STATE
1            M           Y         130         70.52       123 Main St.    Philadelphia  PA
2            F           N         180         81.04       234 Second St.  Neward        NJ
3            F           N         230         91.56       567 Third St.   Blobby        DE
4            M           N         280         102.08
5            F           Y         330         112.6
6            M           N         380         123.12

SELECT A.PATIENT_KEY, A.PAT_GENDER, A.PAT_DEAD, A.PAT_WEIGHT, A.PAT_HEIGHT,
       B.PAT_ADDR, B.PAT_CITY, B.PAT_STATE
FROM PATIENTMASTER A
RIGHT JOIN PATADDRINFO B ON A.PATIENT_KEY=B.PATIENT_KEY

PATIENT_KEY  PAT_GENDER  PAT_DEAD  PAT_WEIGHT  PAT_HEIGHT  PAT_ADDR        PAT_CITY      PAT_STATE
1            M           Y         130         70.52       123 Main St.    Philadelphia  PA
2            F           N         180         81.04       234 Second St.  Neward        NJ
3            F           N         230         91.56       567 Third St.   Blobby        DE
                                                           890 Fourth St.  Crazzi        CA

SELECT A.PATIENT_KEY, A.PAT_GENDER, A.PAT_DEAD, A.PAT_WEIGHT, A.PAT_HEIGHT,
       B.PAT_ADDR, B.PAT_CITY, B.PAT_STATE
FROM PATIENTMASTER A
FULL JOIN PATADDRINFO B ON A.PATIENT_KEY=B.PATIENT_KEY

PATIENT_KEY  PAT_GENDER  PAT_DEAD  PAT_WEIGHT  PAT_HEIGHT  PAT_ADDR        PAT_CITY      PAT_STATE
1            M           Y         130         70.52       123 Main St.    Philadelphia  PA
2            F           N         180         81.04       234 Second St.  Neward        NJ
3            F           N         230         91.56       567 Third St.   Blobby        DE
6            M           N         380         123.12
4            M           N         280         102.08
5            F           Y         330         112.6
                                                           890 Fourth St.  Crazzi        CA

PATIENT_KEY  PAT_FUNGUS
1            Y
3            N
8            Y

SELECT A.PATIENT_KEY, A.PAT_GENDER, A.PAT_DEAD, A.PAT_WEIGHT, A.PAT_HEIGHT,
       B.PAT_ADDR, B.PAT_CITY, B.PAT_STATE, C.PAT_FUNGUS
FROM PATIENTMASTER A
INNER JOIN PATADDRINFO B ON A.PATIENT_KEY=B.PATIENT_KEY
INNER JOIN PATFUNGUSINFO C ON A.PATIENT_KEY=C.PATIENT_KEY

PATIENT_KEY  PAT_GENDER  PAT_DEAD  PAT_WEIGHT  PAT_HEIGHT  PAT_ADDR       PAT_CITY      PAT_STATE  PAT_FUNGUS
1            M           Y         130         70.52       123 Main St.   Philadelphia  PA         Y
3            F           N         230         91.56       567 Third St.  Blobby        DE         N

SELECT A.PATIENT_KEY, A.PAT_GENDER, A.PAT_DEAD, A.PAT_WEIGHT, A.PAT_HEIGHT,
       B.PAT_ADDR, B.PAT_CITY, B.PAT_STATE, C.PAT_FUNGUS
FROM PATIENTMASTER A
INNER JOIN PATADDRINFO B ON A.PATIENT_KEY=B.PATIENT_KEY
RIGHT JOIN PATFUNGUSINFO C ON A.PATIENT_KEY=C.PATIENT_KEY

PATIENT_KEY  PAT_GENDER  PAT_DEAD  PAT_WEIGHT  PAT_HEIGHT  PAT_ADDR       PAT_CITY      PAT_STATE  PAT_FUNGUS
1            M           Y         130         70.52       123 Main St.   Philadelphia  PA         Y
3            F           N         230         91.56       567 Third St.  Blobby        DE         N
                                                                                                   Y
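One caution before we start filtering joined results, as the next example does: a WHERE condition on a column from the optional side of an outer join silently discards the NULL-extended rows, collapsing the outer join back into an inner join. If you want to keep unmatched rows, move the condition into the ON clause. A small sketch using the tables above:

-- Keeps only patients whose address is in PA (the LEFT JOIN behaves like an INNER JOIN):
SELECT A.PATIENT_KEY, B.PAT_STATE
FROM PATIENTMASTER A
LEFT JOIN PATADDRINFO B ON A.PATIENT_KEY=B.PATIENT_KEY
WHERE B.PAT_STATE='PA'

-- Keeps every patient; address columns are NULL unless the address is in PA:
SELECT A.PATIENT_KEY, B.PAT_STATE
FROM PATIENTMASTER A
LEFT JOIN PATADDRINFO B ON A.PATIENT_KEY=B.PATIENT_KEY AND B.PAT_STATE='PA'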
SELECT A.PATIENT_KEY, A.PAT_GENDER, A.PAT_DEAD, A.PAT_WEIGHT, A.PAT_HEIGHT,
       B.PAT_ADDR, B.PAT_CITY, B.PAT_STATE, C.PAT_FUNGUS
FROM PATIENTMASTER A
INNER JOIN PATADDRINFO B ON A.PATIENT_KEY=B.PATIENT_KEY
RIGHT JOIN PATFUNGUSINFO C ON A.PATIENT_KEY=C.PATIENT_KEY
WHERE A.PAT_GENDER='M' AND B.PAT_STATE='PA' AND C.PAT_FUNGUS='Y'

PATIENT_KEY  PAT_GENDER  PAT_DEAD  PAT_WEIGHT  PAT_HEIGHT  PAT_ADDR      PAT_CITY      PAT_STATE  PAT_FUNGUS
1            M           Y         130         70.52       123 Main St.  Philadelphia  PA         Y

SELECT A.PATIENT_KEY, A.PAT_GENDER, A.PAT_DEAD, A.PAT_WEIGHT, A.PAT_HEIGHT,
       B.PAT_ADDR, B.PAT_CITY, B.PAT_STATE, C.PAT_FUNGUS,
       703*PAT_WEIGHT/(PAT_HEIGHT*PAT_HEIGHT) AS BMI
FROM PATIENTMASTER A
INNER JOIN PATADDRINFO B ON A.PATIENT_KEY=B.PATIENT_KEY
RIGHT JOIN PATFUNGUSINFO C ON A.PATIENT_KEY=C.PATIENT_KEY
WHERE A.PAT_GENDER='M' AND B.PAT_STATE='PA' AND C.PAT_FUNGUS='Y'
  AND A.PATIENT_KEY IN (1,2,3,4,5,6)
  AND 703*PAT_WEIGHT/(PAT_HEIGHT*PAT_HEIGHT)>=10
ORDER BY A.PAT_GENDER,A.PAT_WEIGHT DESC

PATIENT_KEY  PAT_GENDER  PAT_DEAD  PAT_WEIGHT  PAT_HEIGHT  PAT_ADDR      PAT_CITY      PAT_STATE  PAT_FUNGUS  BMI
1            M           Y         130         70.52       123 Main St.  Philadelphia  PA         Y           18.37

Our Third SQL DML Example (GROUP BY/HAVING)

SELECT COUNT(A.PATIENT_KEY) as FUNGUS_PATIENT_COUNT
FROM PATIENTMASTER A
INNER JOIN PATFUNGUSINFO B ON A.PATIENT_KEY=B.PATIENT_KEY
WHERE B.PAT_FUNGUS='Y'

FUNGUS_PATIENT_COUNT
1

SELECT COUNT(DISTINCT A.PATIENT_KEY) as FUNGUS_PATIENT_COUNT
FROM PATIENTMASTER A
INNER JOIN PATFUNGUSINFO B ON A.PATIENT_KEY=B.PATIENT_KEY
WHERE B.PAT_FUNGUS='Y'

FUNGUS_PATIENT_COUNT
1

SELECT COUNT(*) as ROWS_IN_PATIENTMASTER FROM PATIENTMASTER A

ROWS_IN_PATIENTMASTER
6

SELECT SUM(A.PAT_WEIGHT) as TOTAL_FATNESS,
       MIN(A.PAT_WEIGHT) as MIN_FATNESS,
       MAX(A.PAT_WEIGHT) as MAX_FATNESS
FROM PATIENTMASTER A

TOTAL_FATNESS  MIN_FATNESS  MAX_FATNESS
1530           130          380

SELECT A.PAT_GENDER,
       SUM(A.PAT_WEIGHT) as TOTAL_FATNESS,
       MIN(A.PAT_WEIGHT) as MIN_FATNESS,
       MAX(A.PAT_WEIGHT) as MAX_FATNESS
FROM PATIENTMASTER A
GROUP BY A.PAT_GENDER

PAT_GENDER  TOTAL_FATNESS  MIN_FATNESS  MAX_FATNESS
M           790            130          380
F           740            180          330

SELECT A.PAT_GENDER, A.PAT_DEAD,
       SUM(A.PAT_WEIGHT) as TOTAL_FATNESS,
       MIN(A.PAT_WEIGHT) as MIN_FATNESS,
       MAX(A.PAT_WEIGHT) as MAX_FATNESS
FROM PATIENTMASTER A
GROUP BY A.PAT_GENDER,A.PAT_DEAD

PAT_GENDER  PAT_DEAD  TOTAL_FATNESS  MIN_FATNESS  MAX_FATNESS
M           Y         130            130          130
M           N         660            280          380
F           N         410            180          230
F           Y         330            330          330

SELECT A.PAT_GENDER, A.PAT_DEAD,
       SUM(A.PAT_WEIGHT) as TOTAL_FATNESS,
       MIN(A.PAT_WEIGHT) as MIN_FATNESS,
       MAX(A.PAT_WEIGHT) as MAX_FATNESS
FROM PATIENTMASTER A
GROUP BY A.PAT_GENDER,A.PAT_DEAD
HAVING SUM(A.PAT_WEIGHT) >= 400

PAT_GENDER  PAT_DEAD  TOTAL_FATNESS  MIN_FATNESS  MAX_FATNESS
M           N         660            280          380
F           N         410            180          230

SELECT A.PAT_GENDER,C.PAT_FUNGUS,
       AVG(703*PAT_WEIGHT/(PAT_HEIGHT*PAT_HEIGHT)) AS AVG_BMI,
       COUNT(DISTINCT A.PATIENT_KEY) AS DISTINCT_PATS,
       COUNT(*) AS NBR_OF_ROWS
FROM PATIENTMASTER A
INNER JOIN PATADDRINFO B ON A.PATIENT_KEY=B.PATIENT_KEY
RIGHT JOIN PATFUNGUSINFO C ON A.PATIENT_KEY=C.PATIENT_KEY
WHERE A.PAT_GENDER='M' AND B.PAT_STATE='PA' AND C.PAT_FUNGUS='Y'
  AND A.PATIENT_KEY IN (1,2,3,4,5,6)
GROUP BY A.PAT_GENDER,C.PAT_FUNGUS
HAVING AVG(703*PAT_WEIGHT/(PAT_HEIGHT*PAT_HEIGHT))>1.0
ORDER BY A.PAT_GENDER,C.PAT_FUNGUS DESC

PAT_GENDER  PAT_FUNGUS  AVG_BMI     DISTINCT_PATS  NBR_OF_ROWS
M           Y           18.3769769  1              1
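A habit worth forming from the examples above: conditions on raw columns belong in WHERE, and conditions on aggregates belong in HAVING. WHERE prunes rows before grouping (cheaper, and it changes what gets aggregated); HAVING filters the groups afterward. A small sketch combining both against PATIENTMASTER:

SELECT PAT_GENDER, SUM(PAT_WEIGHT) AS TOTAL_FATNESS
FROM PATIENTMASTER
WHERE PAT_DEAD='N'                -- row-level condition: filtered before grouping
GROUP BY PAT_GENDER
HAVING SUM(PAT_WEIGHT) >= 400     -- aggregate condition: filtered after grouping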
Our Fourth SQL DML Example (Subqueries)

SELECT * FROM PATIENTMASTER WHERE PATIENT_KEY IN (1,3,5)

SELECT A.*
FROM PATIENTMASTER A
INNER JOIN DESIREDPATIENTS B ON A.PATIENT_KEY=B.PATIENT_KEY
ORDER BY A.PATIENT_KEY

PATIENT_KEY  PAT_GENDER  PAT_DEAD  PAT_WEIGHT  PAT_HEIGHT
1            M           Y         130         70.52
3            F           N         230         91.56
5            F           Y         330         112.6

SELECT A.*
FROM PATIENTMASTER A
WHERE A.PATIENT_KEY IN (SELECT DISTINCT PATIENT_KEY FROM DESIREDPATIENTS)

SELECT A.PATIENT_KEY,A.PAT_GENDER,B.PAT_FUNGUS
FROM (SELECT PATIENT_KEY,PAT_GENDER FROM PATIENTMASTER) A
INNER JOIN (SELECT PATIENT_KEY,PAT_FUNGUS FROM PATFUNGUSINFO WHERE PAT_FUNGUS='Y') B
ON A.PATIENT_KEY=B.PATIENT_KEY

WITH vwPATGENDER AS (SELECT PATIENT_KEY,PAT_GENDER FROM PATIENTMASTER),
     vwPATFUNGUS AS (SELECT PATIENT_KEY,PAT_FUNGUS FROM PATFUNGUSINFO WHERE PAT_FUNGUS='Y')
SELECT A.PATIENT_KEY,A.PAT_GENDER,B.PAT_FUNGUS
FROM vwPATGENDER A
INNER JOIN vwPATFUNGUS B ON A.PATIENT_KEY=B.PATIENT_KEY

WITH vwPATMSTR AS (SELECT * FROM PATIENTMASTER
                   WHERE PAT_GENDER='M'
                     AND PATIENT_KEY IN (SELECT DISTINCT PATIENT_KEY FROM PATIENTMASTER)),
     vwPATADDR AS (SELECT * FROM PATADDRINFO WHERE PAT_STATE='PA'),
     vwPATFUN  AS (SELECT * FROM PATFUNGUSINFO WHERE PAT_FUNGUS='Y')
SELECT A.PAT_GENDER,C.PAT_FUNGUS,
       AVG(703*PAT_WEIGHT/(PAT_HEIGHT*PAT_HEIGHT)) AS AVG_BMI,
       COUNT(A.PATIENT_KEY) AS DISTINCT_PATS
FROM vwPATMSTR A
INNER JOIN vwPATADDR B ON A.PATIENT_KEY=B.PATIENT_KEY
RIGHT JOIN vwPATFUN C ON A.PATIENT_KEY=C.PATIENT_KEY
GROUP BY A.PAT_GENDER,C.PAT_FUNGUS
HAVING AVG(703*PAT_WEIGHT/(PAT_HEIGHT*PAT_HEIGHT))>1.0
ORDER BY A.PAT_GENDER,C.PAT_FUNGUS DESC

PAT_GENDER  PAT_FUNGUS  AVG_BMI     DISTINCT_PATS
M           Y           18.3769769  1

Our Fifth SQL DML Example (CASE Statement and Functions)

SELECT PATIENT_KEY, PAT_GENDER, PAT_WEIGHT,
       CASE WHEN PAT_GENDER='M' AND PAT_WEIGHT <= 250 THEN 1
            WHEN PAT_GENDER='M' AND PAT_WEIGHT > 250  THEN 2
            WHEN PAT_GENDER='F' AND PAT_WEIGHT <= 250 THEN 3
            WHEN PAT_GENDER='F' AND PAT_WEIGHT > 250  THEN 4
            ELSE 5
       END AS PAT_WEIGHT_CODING
FROM PATIENTMASTER
ORDER BY 4

PATIENT_KEY  PAT_GENDER  PAT_WEIGHT  PAT_WEIGHT_CODING
1            M           130         1
4            M           280         2
6            M           380         2
2            F           180         3
3            F           230         3
5            F           330         4

SELECT PAT_GENDER,
       SUM( CASE WHEN PAT_WEIGHT < 250 THEN 1 ELSE 0 END ) AS NBR_PATS_UNDER_250
FROM PATIENTMASTER
GROUP BY PAT_GENDER

SELECT PAT_ADDR,LENGTH(PAT_ADDR) AS ADDR_LENGTH FROM PATADDRINFO

PAT_ADDR        ADDR_LENGTH
123 Main St.    12
234 Second St.  14
567 Third St.   13
890 Fourth St.  14

SELECT PAT_ADDR,SUBSTR(PAT_ADDR,1,3) AS HOUSE_NBR FROM PATADDRINFO

PAT_ADDR        HOUSE_NBR
123 Main St.    123
234 Second St.  234
567 Third St.   567
890 Fourth St.  890

SELECT PATIENT_KEY, PAT_GENDER, PAT_DEAD, PAT_WEIGHT, PAT_HEIGHT,
       703*PAT_WEIGHT/(PAT_HEIGHT*PAT_HEIGHT) AS BMI_OLD_WAY,
       703*PAT_WEIGHT/POWER(PAT_HEIGHT,2) AS BMI_NEW_WAY
FROM PATIENTMASTER

PATIENT_KEY  PAT_GENDER  PAT_DEAD  PAT_WEIGHT  PAT_HEIGHT  BMI_OLD_WAY  BMI_NEW_WAY
1            M           Y         130         70.52       18.3769769   18.3769769
2            F           N         180         81.04       19.2676596   19.2676596
5            F           Y         330         112.6       18.2975307   18.2975307
6            M           N         380         123.12      17.6230758   17.6230758
3            F           N         230         91.56       19.287307    19.287307
4            M           N         280         102.08      18.8900033   18.8900033

SELECT PATIENT_KEY, PAT_GENDER, PAT_DEAD, PAT_WEIGHT, PAT_HEIGHT,
       ROUND(703*PAT_WEIGHT/POWER(PAT_HEIGHT,2),2) AS BMI_NEW_WAY_ROUNDED
FROM PATIENTMASTER

PATIENT_KEY  PAT_GENDER  PAT_DEAD  PAT_WEIGHT  PAT_HEIGHT  BMI_NEW_WAY_ROUNDED
1            M           Y         130         70.52       18.38
2            F           N         180         81.04       19.27
...snip...
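Since functions nest freely inside one another, one defensive pattern is worth knowing before the capstone query below: wrapping a divisor in NULLIF, which returns NULL when its two arguments are equal. A hedged sketch applied to the BMI formula:

SELECT PATIENT_KEY,
       703*PAT_WEIGHT/POWER(NULLIF(PAT_HEIGHT,0),2) AS BMI_SAFE
FROM PATIENTMASTER

A zero height now yields a NULL BMI rather than a divide-by-zero surprise (what raw division by zero returns varies by engine and data type, so making the NULL explicit is the safer habit).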
WITH vwPATMSTR AS (SELECT * FROM PATIENTMASTER
                   WHERE PAT_GENDER IN ('M','F')
                     AND PATIENT_KEY IN (SELECT DISTINCT PATIENT_KEY FROM PATIENTMASTER)),
     vwPATADDR AS (SELECT * FROM PATADDRINFO),
     vwPATFUN  AS (SELECT * FROM PATFUNGUSINFO WHERE PAT_FUNGUS IN ('N','Y'))
SELECT A.PAT_GENDER,C.PAT_FUNGUS,
       CASE WHEN ROUND(AVG(703*PAT_WEIGHT/POWER(PAT_HEIGHT,2)),2) <= 19 THEN 'TEENSY'
            WHEN ROUND(AVG(703*PAT_WEIGHT/POWER(PAT_HEIGHT,2)),2) > 19  THEN 'FAATSY'
       END AS AVG_BMI_RANK
FROM vwPATMSTR A
INNER JOIN vwPATADDR B ON A.PATIENT_KEY=B.PATIENT_KEY
RIGHT JOIN vwPATFUN C ON A.PATIENT_KEY=C.PATIENT_KEY
GROUP BY A.PAT_GENDER,C.PAT_FUNGUS
HAVING AVG(703*PAT_WEIGHT/POWER(PAT_HEIGHT,2))>1.0

PAT_GENDER  PAT_FUNGUS  AVG_BMI_RANK
F           N           FAATSY
M           Y           TEENSY

Our Sixth SQL DML Example (UNION/INTERSECT/MINUS/EXCEPT)

To code a union in SQL, do this:

SELECT FRUIT FROM SETA
UNION
SELECT FRUIT FROM SETB

To code an intersection in SQL, do this:

SELECT FRUIT FROM SETA
INTERSECT
SELECT FRUIT FROM SETB

To code a minus (or except) in SQL, do this:

SELECT FRUIT FROM SETA
MINUS
SELECT FRUIT FROM SETB

SELECT FRUIT FROM SETA
UNION ALL
SELECT FRUIT FROM SETB

SELECT PATIENT_KEY,703*PAT_WEIGHT/POWER(PAT_HEIGHT,2) AS BMI
FROM PATIENTMASTER WHERE PAT_GENDER='M'
UNION ALL
SELECT PATIENT_KEY,703*PAT_WEIGHT/POWER(PAT_HEIGHT,2) AS BMI
FROM PATIENTMASTER WHERE PAT_GENDER='F'

WITH vwPATMSTR AS (SELECT * FROM PATIENTMASTER
                   WHERE PAT_GENDER IN ('M','F')
                     AND PATIENT_KEY IN (SELECT DISTINCT PATIENT_KEY FROM PATIENTMASTER)),
     vwPATADDR AS (SELECT * FROM PATADDRINFO),
     vwPATFUN  AS (SELECT * FROM PATFUNGUSINFO WHERE PAT_FUNGUS IN ('N','Y'))
SELECT 'ALL GENDERS' AS TITLE,A.PAT_GENDER,C.PAT_FUNGUS,
       CASE WHEN ROUND(AVG(703*PAT_WEIGHT/POWER(PAT_HEIGHT,2)),2) <= 19 THEN 'TEENSY'
            WHEN ROUND(AVG(703*PAT_WEIGHT/POWER(PAT_HEIGHT,2)),2) > 19  THEN 'FAATSY'
       END AS AVG_BMI_RANK
FROM vwPATMSTR A
INNER JOIN vwPATADDR B ON A.PATIENT_KEY=B.PATIENT_KEY
RIGHT JOIN vwPATFUN C ON A.PATIENT_KEY=C.PATIENT_KEY
GROUP BY A.PAT_GENDER,C.PAT_FUNGUS
HAVING AVG(703*PAT_WEIGHT/POWER(PAT_HEIGHT,2))>1.0
UNION ALL
SELECT 'MALE GENDER' AS TITLE,A.PAT_GENDER,C.PAT_FUNGUS,
       CASE WHEN ROUND(AVG(703*PAT_WEIGHT/POWER(PAT_HEIGHT,2)),2) <= 19 THEN 'TEENSY'
            WHEN ROUND(AVG(703*PAT_WEIGHT/POWER(PAT_HEIGHT,2)),2) > 19  THEN 'FAATSY'
       END AS AVG_BMI_RANK
FROM vwPATMSTR A
INNER JOIN vwPATADDR B ON A.PATIENT_KEY=B.PATIENT_KEY
RIGHT JOIN vwPATFUN C ON A.PATIENT_KEY=C.PATIENT_KEY
WHERE A.PAT_GENDER='M'
GROUP BY A.PAT_GENDER,C.PAT_FUNGUS
HAVING AVG(703*PAT_WEIGHT/POWER(PAT_HEIGHT,2))>1.0

Our Seventh SQL DML Example (Correlated Subqueries and EXISTS)

SELECT A.*
FROM PATIENTMASTER A
WHERE A.PATIENT_KEY IN (SELECT DISTINCT PATIENT_KEY FROM DESIREDPATIENTS)

SELECT DISTINCT A.PATIENT_KEY
FROM PATIENTMASTER A
WHERE A.PATIENT_KEY IN (SELECT B.PATIENT_KEY FROM PATFUNGUSINFO B
                        WHERE B.PATIENT_KEY=A.PATIENT_KEY AND B.PAT_FUNGUS='Y')

PATIENT_KEY
1

SELECT DISTINCT A.PATIENT_KEY
FROM PATIENTMASTER A
WHERE EXISTS (SELECT B.PATIENT_KEY FROM PATFUNGUSINFO B
              WHERE B.PATIENT_KEY=A.PATIENT_KEY AND B.PAT_FUNGUS='Y')

PATIENT_KEY
1

SELECT DISTINCT A.PATIENT_KEY
FROM PATIENTMASTER A
WHERE NOT EXISTS (SELECT 1 FROM PATFUNGUSINFO B
                  WHERE B.PATIENT_KEY=A.PATIENT_KEY AND B.PAT_FUNGUS='Y')

PATIENT_KEY
6
2
5
4
3

Our Eighth SQL DML Example (LIKE)

SELECT BRAND,LABEL FROM DRUG_MASTER WHERE LABEL LIKE '%MG%TABLET%'

BRAND    LABEL
SIMPLEX  SIMPLEX 5MG TABLET
SIMPLEX  SIMPLEX 10MG TABLET
SIMPLEX  SIMPLEX 15MG TABLET SAMPLE

SELECT BRAND,LABEL FROM DRUG_MASTER WHERE LABEL LIKE '% _MG TABLET%';

BRAND    LABEL
SIMPLEX  SIMPLEX 5MG TABLET
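A caveat on the set-operation examples above before we move on to DDL: Impala supports UNION and UNION ALL but, as of the versions discussed in this book, not INTERSECT or MINUS/EXCEPT. Both are easy to emulate; a sketch of each, using the same SETA/SETB tables:

-- INTERSECT: fruits that appear in both sets
SELECT DISTINCT A.FRUIT
FROM SETA A
WHERE EXISTS (SELECT 1 FROM SETB B WHERE B.FRUIT=A.FRUIT)

-- MINUS/EXCEPT: fruits in SETA but not in SETB
SELECT DISTINCT A.FRUIT
FROM SETA A
WHERE NOT EXISTS (SELECT 1 FROM SETB B WHERE B.FRUIT=A.FRUIT)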
Our First SQL DDL Example (CREATE/DROP/TRUNCATE)

CREATE TABLE PATIENTMASTER(PATIENT_KEY BIGINT, PAT_GENDER STRING, PAT_DEAD STRING, PAT_WEIGHT DOUBLE, PAT_HEIGHT DOUBLE)

CREATE TABLE PATIENTMASTER_WITH_DOB(PATIENT_KEY BIGINT, PAT_GENDER STRING, PAT_DEAD STRING, PAT_WEIGHT DOUBLE, PAT_HEIGHT DOUBLE, DATE_OF_BIRTH TIMESTAMP)

DROP TABLE PATIENTMASTER_WITH_DOB

TRUNCATE TABLE PATIENTMASTER_WITH_DOB

Our Second SQL DDL Example (DELETE/INSERT)

DELETE FROM PATIENTMASTER WHERE PAT_GENDER='M'

CREATE TABLE GENDER(PAT_GENDER STRING, GENDER_DESC STRING)

INSERT INTO GENDER VALUES('M','Male')
INSERT INTO GENDER VALUES('F','Female')
INSERT INTO GENDER VALUES('U','Unknown')

CREATE TABLE PATIENTMASTER_BACKUP(PATIENT_KEY BIGINT, PAT_GENDER STRING, PAT_DEAD STRING, PAT_WEIGHT DOUBLE, PAT_HEIGHT DOUBLE)

INSERT INTO PATIENTMASTER_BACKUP
SELECT PATIENT_KEY,PAT_GENDER,PAT_DEAD,PAT_WEIGHT,PAT_HEIGHT
FROM PATIENTMASTER

CREATE TABLE PATIENTMASTER_BACKUP AS SELECT * FROM PATIENTMASTER

Our Third SQL DDL Example (UPDATE)

UPDATE PATIENTMASTER SET PAT_DEAD='Y' WHERE PATIENT_KEY=6
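Worth flagging for Hadoop newcomers: on HDFS-backed Impala tables (TEXTFILE/PARQUET), UPDATE and DELETE aren't available; they only work against Kudu tables, as Chapter 8 covers. On an HDFS-backed table, the usual workaround is to rewrite the data, for example with CREATE TABLE AS and a CASE expression (the _FIXED table name below is hypothetical):

CREATE TABLE PATIENTMASTER_FIXED AS
SELECT PATIENT_KEY, PAT_GENDER,
       CASE WHEN PATIENT_KEY=6 THEN 'Y' ELSE PAT_DEAD END AS PAT_DEAD,
       PAT_WEIGHT, PAT_HEIGHT
FROM PATIENTMASTER;

You would then drop the original and rename the new table into place (ALTER TABLE ... RENAME TO, also covered in Chapter 8).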
Chapter 7 - Querying the Hadoop Database (Hue and SQL Clients)

Colleagues:

As many of you are aware by now, the department has moved off of the legacy database to our new, technologically advanced Hadoop database. With great advancements come great features, such as faster query runtimes, the ability to run more detailed analytics and much, much more. But the feature that will impress you the most is the database web interface, Hue. This gives you the ability to run SQL queries against the Hadoop database yourself using nothing but a web browser...all without having to wait in line for my department to run your requests. Naturally, more complicated requests should proceed through our standard request process. Below, we introduce Hue, but in the next few weeks, we'll have a meeting to discuss how to use Hue as well as work with the tables in our schema (named prod_schema) in more detail.

To access Hue, follow these instructions:

1. Start your web browser.
2. Insert the following URL in the Address Bar at the top of the web browser: http://<hue-server>/hue/accounts/login
3. When the Sign In web page appears, enter your Windows username and password into the appropriate input boxes and click the Sign In button. Note that your username/password should be the same as those used to log into your company laptop each morning.
4. On the left side, click Impala under Sources.
5. On the left side, click the schema name prod_schema under Databases.
6. At this point, on the left side, you'll see a list of tables you can query. If you click on a table, the columns will appear just below the table name. You should see something like this:
7. To query a table, enter a SQL query in the textbox at the top-center of the web page. Then, click the arrow to the left of the textbox to run the query. The results will appear below the SQL query textbox. You will see something like this:
8. You can export these results by clicking on the Export results button (last button to the left of the query results) and selecting the desired option from the popup menu: CSV, Excel, Clipboard, Export. Your results will be available almost immediately.

Any additional questions, please feel free to contact me at any time.

Thanks,
Bob

USE PROD_SCHEMA;

USE PROD_SCHEMA;
SELECT * FROM DIM_US_STATE_MAPPING;

SELECT * FROM PROD_SCHEMA.DIM_US_STATE_MAPPING;

Chapter 8 - The One About ImpalaSQL

[smithbob@lnxserver ~]$ impala-shell
Starting Impala Shell without Kerberos authentication
Opened TCP connection to lnxserver.com:21000
Connected to lnxserver.com:21000
Server version: impalad version 3.4.0-SNAPSHOT RELEASE (build 68b919fc8a5907648349aa48eefc894e15a5a1a5)
******************************************************************************
Welcome to the Impala shell.
(Impala Shell v3.4.0-SNAPSHOT (27b919f) built on Tue Aug 3 21:19:39 UTC 2021)
The SET command shows the current value of all shell and query options.
******************************************************************************
[lnxserver.com:21000] default>

[smithbob@lnxserver ~]$ impala-shell --impalad=lnxserver.com:21000
[smithbob@lnxserver ~]$ impala-shell -i lnxserver.com:21000
[smithbob@lnxserver ~]$ impala-shell -i lnxserver.com:21000 -d prod_schema
[smithbob@lnxserver ~]$ impala-shell -i lnxserver.com:21000 -d prod_schema_export -u smithbob
[smithbob@lnxserver ~]$ impala-shell -k -i lnxserver.com:21000 -d prod_schema

Data Manipulation Language (DML)

WITH Clause
SELECT col1,col2,...
FROM tbl_name
WHERE subsetting_conditions
GROUP BY col1,... [ CUBE() | ROLLUP() | GROUPING SETS() ]
HAVING post_subsetting_conditions
ORDER BY col1,...
LIMIT # OFFSET #
TABLESAMPLE SYSTEM(percent) REPEATABLE(seed)

WITH vwPC AS (
  SELECT POSTAL_CODE,STATE_CODE
  FROM PROD_SCHEMA.DIM_POSTAL_CODE
  WHERE STATE_CODE IN ('NJ','PA')
),
vwUSM AS (
  SELECT STATE_CODE,STATE_NAME
  FROM PROD_SCHEMA.DIM_US_STATE_MAPPING
  WHERE STATE_CODE IN ('NJ','PA')
)
SELECT A.POSTAL_CODE,A.STATE_CODE,B.STATE_NAME
FROM vwPC A
INNER JOIN vwUSM B ON A.STATE_CODE=B.STATE_CODE;

SELECT COUNT(*) AS ROW_COUNTS,5*2 AS TEN FROM PROD_SCHEMA.DIM_US_STATE_MAPPING;

SELECT .25 * 35000;

SELECT * FROM PROD_SCHEMA.DIM_POSTAL_CODE WHERE STATE_CODE IN ('HI','GU') AND LONGITUDE>100;

SELECT STATE_CODE,COUNT(*) AS NBR_ZIPS
FROM PROD_SCHEMA.DIM_POSTAL_CODE
GROUP BY STATE_CODE;

SELECT STATE_CODE,COUNT(*) AS NBR_ZIPS
FROM PROD_SCHEMA.DIM_POSTAL_CODE
GROUP BY STATE_CODE
HAVING COUNT(*) >= 1000;

SELECT COL1
FROM (
  SELECT 1 AS COL1
  UNION
  SELECT 2 AS COL1
  UNION
  SELECT NULL AS COL1
) A
ORDER BY COL1 DESC NULLS LAST;

+------+
| col1 |
+------+
| 2    |
| 1    |
| NULL |
+------+

SELECT * FROM PROD_SCHEMA.DIM_POSTAL_CODE
WHERE STATE_CODE IN ('HI','GU') AND LONGITUDE>100
ORDER BY POSTAL_CODE
LIMIT 10;

+-------------+---------------+------------+----------+-----------+
| postal_code | city          | state_code | latitude | longitude |
+-------------+---------------+------------+----------+-----------+
| 96910       | HAGATNA       | GU         | 13.47    | 144.74    |
| 96912       | DEDEDO        | GU         | 13.51    | 144.83    |
| 96913       | BARRIGADA     | GU         | 13.46    | 144.79    |
| 96915       | SANTA RITA    | GU         | 13.38    | 144.66    |
| 96916       | MERIZO        | GU         | 13.26    | 144.66    |
| 96917       | INARAJAN      | GU         | 13.27    | 144.74    |
| 96919       | AGANA HEIGHTS | GU         | 13.46    | 144.74    |
| 96921       | BARRIGADA     | GU         | 13.46    | 144.79    |
| 96923       | MANGILAO      | GU         | 13.43    | 144.79    |
| 96928       | AGAT          | GU         | 13.38    | 144.65    |
+-------------+---------------+------------+----------+-----------+

SELECT * FROM PROD_SCHEMA.DIM_POSTAL_CODE
WHERE STATE_CODE IN ('HI','GU') AND LONGITUDE>100
ORDER BY POSTAL_CODE
LIMIT 10 OFFSET 5;

+-------------+---------------+------------+----------+-----------+
| postal_code | city          | state_code | latitude | longitude |
+-------------+---------------+------------+----------+-----------+
| 96917       | INARAJAN      | GU         | 13.27    | 144.74    |
| 96919       | AGANA HEIGHTS | GU         | 13.46    | 144.74    |
| 96921       | BARRIGADA     | GU         | 13.46    | 144.79    |
| 96923       | MANGILAO      | GU         | 13.43    | 144.79    |
| 96928       | AGAT          | GU         | 13.38    | 144.65    |
| 96929       | YIGO          | GU         | 13.53    | 144.88    |
| 96931       | TAMUNING      | GU         | 13.48    | 144.77    |
| 96932       | HAGATNA       | GU         | 13.47    | 144.74    |
+-------------+---------------+------------+----------+-----------+
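Note that both paging examples keep the ORDER BY clause. That's not optional: Impala only accepts OFFSET in a query that has an ORDER BY, and without a deterministic sort order the "pages" could overlap or skip rows anyway. A typical paging pattern (page size 10, hypothetical page number 3):

SELECT * FROM PROD_SCHEMA.DIM_POSTAL_CODE
ORDER BY POSTAL_CODE
LIMIT 10 OFFSET 20;   -- page 3 = skip (3-1)*10 rows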
SELECT COUNT(*) AS TOTAL_ROWS FROM DIM_POSTAL_CODE;
+------------+
| total_rows |
+------------+
| 43689      |
+------------+

SELECT COUNT(*) AS SAMPLE_ROWS FROM DIM_POSTAL_CODE TABLESAMPLE SYSTEM(10);
+-------------+
| sample_rows |
+-------------+
| 43689       |
+-------------+

SELECT COUNT(*) AS TOTAL_ROWS FROM DIM_US_STATE_MAPPING;
+------------+
| total_rows |
+------------+
| 65         |
+------------+

SELECT COUNT(*) AS TOTAL_ROWS FROM DIM_US_STATE_MAPPING TABLESAMPLE SYSTEM(10);
+------------+
| total_rows |
+------------+
| 7          |
+------------+

(TABLESAMPLE selects whole data files, not individual rows. DIM_POSTAL_CODE sits in a single file, so a 10% sample still returns every one of its 43,689 rows, while DIM_US_STATE_MAPPING is spread across several small files, so a 10% sample can genuinely skip some. The SHOW FILES output below makes the difference visible.)

SHOW FILES IN DIM_POSTAL_CODE;
+-----------------------------------------------------------------------------------------------+--------+
| Path                                                                                          | Size   |
+-----------------------------------------------------------------------------------------------+--------+
| ...snip.../dim_postal_code/delta_1_1/534a24bb518304f4-4750ea1b00000000_273855379_data.0.parq  | 1.28MB |
+-----------------------------------------------------------------------------------------------+--------+

SHOW FILES IN DIM_US_STATE_MAPPING;
+------------------------------------------------------------------------------------------------------+--------+
| Path                                                                                                 | Size   |
+------------------------------------------------------------------------------------------------------+--------+
| ...snip.../dim_us_state_mapping/delta_10_10/cb43aac629622559-3e83bb3e00000000_1993903573_data.0.parq | 643B   |
| ...snip.../dim_us_state_mapping/delta_11_11/dc4a57323621dc6f-c63ac57b00000000_2051461713_data.0.parq | 664B   |
...snip...
| ...snip.../dim_us_state_mapping/delta_12_12/6442b559135f45ac-4526597b00000000_693829157_data.0.parq  | 708B   |
| ...snip.../dim_us_state_mapping/delta_13_13/b34bd5a18f3edb85-e4cf27e600000000_716542679_data.0.parq  | 729B   |
| ...snip.../dim_us_state_mapping/delta_9_9/a14856e144701a23-1b4bd2e200000000_1973402835_data.0.parq   | 657B   |
+------------------------------------------------------------------------------------------------------+--------+

SELECT * FROM DIM_US_STATE_MAPPING TABLESAMPLE SYSTEM(10) REPEATABLE(31415) ORDER BY STATE_CODE;
+------------+--------------------------+
| state_code | state_name               |
+------------+--------------------------+
| CO         | COLORADO                 |
| KY         | KENTUCKY                 |
| LA         | LOUISIANA                |
| MH         | MARSHALL ISLANDS         |
| MI         | MICHIGAN                 |
| MP         | NORTHERN MARIANA ISLANDS |
| SD         | SOUTH DAKOTA             |
+------------+--------------------------+

SELECT A.POSTAL_CODE,A.STATE_CODE,B.STATE_NAME
FROM (
  SELECT POSTAL_CODE,STATE_CODE
  FROM PROD_SCHEMA.DIM_POSTAL_CODE
  WHERE STATE_CODE IN ('NJ','PA')
) A
INNER JOIN (
  SELECT STATE_CODE,STATE_NAME
  FROM PROD_SCHEMA.DIM_US_STATE_MAPPING
  WHERE STATE_CODE IN ('NJ','PA')
) B ON A.STATE_CODE=B.STATE_CODE;

SELECT STATE_CODE,POSTAL_CODE
FROM PROD_SCHEMA.DIM_POSTAL_CODE
WHERE STATE_CODE IN (
  SELECT STATE_CODE FROM DIM_US_STATE_MAPPING WHERE SUBSTR(STATE_CODE,1,1)='P'
);

SELECT COUNT(*) FROM PROD_SCHEMA.DIM_US_STATE_MAPPING, PROD_SCHEMA.DIM_US_STATE_MAPPING;

SELECT COUNT(*) FROM PROD_SCHEMA.DIM_US_STATE_MAPPING CROSS JOIN PROD_SCHEMA.DIM_US_STATE_MAPPING;

WITH vwPC AS (
  SELECT POSTAL_CODE,STATE_CODE
  FROM PROD_SCHEMA.DIM_POSTAL_CODE
  WHERE STATE_CODE IN ('NJ','PA')
),
vwUSM AS (
  SELECT STATE_CODE,STATE_NAME
  FROM PROD_SCHEMA.DIM_US_STATE_MAPPING
  WHERE STATE_CODE IN ('NJ','PA')
)
SELECT A.POSTAL_CODE,A.STATE_CODE,B.STATE_NAME
FROM vwPC A
INNER JOIN vwUSM B USING (STATE_CODE);

WITH vwNJ AS (
  SELECT POSTAL_CODE FROM PROD_SCHEMA.DIM_POSTAL_CODE WHERE STATE_CODE='NJ'
),
vwPA AS (
  SELECT POSTAL_CODE FROM PROD_SCHEMA.DIM_POSTAL_CODE WHERE STATE_CODE='PA'
)
SELECT POSTAL_CODE FROM vwNJ
UNION ALL
SELECT POSTAL_CODE FROM vwPA;

SELECT * FROM PROD_SCHEMA.DIM_US_STATE_MAPPING WHERE STATE_NAME LIKE '%ARMED%FORCES%';
+------------+------------------------------+
| state_code | state_name                   |
+------------+------------------------------+
| AA         | U.S. ARMED FORCES - AMERICAS |
| AE         | U.S. ARMED FORCES - EUROPE   |
| AP         | U.S. ARMED FORCES - PACIFIC  |
+------------+------------------------------+
SELECT POSTAL_CODE,STATE_CODE
FROM PROD_SCHEMA.DIM_POSTAL_CODE
WHERE STATE_CODE IN ('NJ','PA')

SELECT A.*
FROM PROD_SCHEMA.DIM_POSTAL_CODE A
WHERE NOT EXISTS (
  SELECT B.* FROM PROD_SCHEMA.DIM_US_STATE_MAPPING B
  WHERE A.STATE_CODE=B.STATE_CODE
)

SELECT STATE_CODE,COUNT(*) AS NBR_ZIPS
FROM PROD_SCHEMA.DIM_POSTAL_CODE
GROUP BY STATE_CODE
HAVING COUNT(*) >= 10 AND COUNT(*) <= 20;

…is equivalent to…

SELECT STATE_CODE,COUNT(*) AS NBR_ZIPS
FROM PROD_SCHEMA.DIM_POSTAL_CODE
GROUP BY STATE_CODE
HAVING COUNT(*) BETWEEN 10 AND 20;

SELECT STATE_CODE,POSTAL_CODE,
       CASE WHEN POSTAL_CODE='90027' THEN 'Hollywood'
            WHEN POSTAL_CODE='10018' THEN 'Broadway'
            ELSE 'No Culture'
       END AS CULTURE
FROM PROD_SCHEMA.DIM_POSTAL_CODE;

SELECT CASE WHEN POSTAL_CODE='90027' THEN 'Hollywood'
            WHEN POSTAL_CODE='10018' THEN 'Broadway'
            ELSE 'No Culture'
       END AS CULTURE,
       COUNT(*) AS ROW_CNTS
FROM PROD_SCHEMA.DIM_POSTAL_CODE
GROUP BY CASE WHEN POSTAL_CODE='90027' THEN 'Hollywood'
              WHEN POSTAL_CODE='10018' THEN 'Broadway'
              ELSE 'No Culture'
         END;

Data Definition Language (DDL)

CREATE TABLE Statement (TEXTFILE/PARQUET)

CREATE EXTERNAL TABLE database_name.table_name (
  column_name_1 data_type_1 COMMENT 'column comment 1',
  column_name_2 data_type_2 COMMENT 'column comment 2',
  ...
  column_name_n data_type_n COMMENT 'column comment n'
)
PARTITIONED BY (
  column_name_p1 data_type_p1 COMMENT 'column comment p1',
  column_name_p2 data_type_p2 COMMENT 'column comment p2',
  ...
  column_name_pk data_type_pk COMMENT 'column comment pk'
)
SORT BY (column_name_i, column_name_j, ...)
COMMENT 'table-comment'
ROW FORMAT row-format
WITH SERDEPROPERTIES (
  'key-1','value-1',
  'key-2','value-2',
  ...
  'key-m','value-m'
)
STORED AS storage-format
LOCATION 'HDFS-path-to-data-file-directory'
CACHED IN 'cache-pool-name' WITH REPLICATION = replication-value | UNCACHED
TBLPROPERTIES (
  'key-1','value-1',
  'key-2','value-2',
  ...
  'key-r','value-r'
)
;

CREATE TABLE PROD_SCHEMA.DIM_POSTAL_CODE(
  POSTAL_CODE STRING,
  CITY STRING,
  STATE_CODE STRING,
  LATITUDE DOUBLE,
  LONGITUDE DOUBLE
);

CREATE TABLE PROD_SCHEMA.DIM_POSTAL_CODE(
  POSTAL_CODE STRING,
  CITY STRING,
  STATE_CODE STRING,
  LATITUDE DOUBLE,
  LONGITUDE DOUBLE
) STORED AS PARQUET;

USE PROD_SCHEMA;
CREATE TABLE DIM_POSTAL_CODE(
  POSTAL_CODE STRING COMMENT '5-DIGIT POSTAL CODE',
  CITY STRING COMMENT 'CITY NAME',
  STATE_CODE STRING COMMENT '2-LETTER STATE CODE',
  LATITUDE DOUBLE COMMENT 'LATITUDE',
  LONGITUDE DOUBLE COMMENT 'LONGITUDE'
)
COMMENT 'UNITED STATES POSTAL CODE TABLE'
STORED AS PARQUET;

+-------------+--------+---------------------+
| name        | type   | comment             |
+-------------+--------+---------------------+
| postal_code | string | 5-DIGIT POSTAL CODE |
| city        | string | CITY NAME           |
| state_code  | string | 2-LETTER STATE CODE |
| latitude    | double | LATITUDE            |
| longitude   | double | LONGITUDE           |
+-------------+--------+---------------------+

CREATE TABLE AS Statement (TEXTFILE/PARQUET)

CREATE EXTERNAL TABLE database_name.table_name
PARTITIONED BY (column_name_p1,column_name_p2...column_name_pk)
SORT BY (column_name_i, column_name_j, ...)
COMMENT 'table-comment'
ROW FORMAT row-format
WITH SERDEPROPERTIES (
  'key-1','value-1',
  'key-2','value-2',
  ...
  'key-m','value-m'
)
STORED AS storage-format
LOCATION 'HDFS-path-to-data-file-directory'
CACHED IN 'cache-pool-name' WITH REPLICATION = replication-value | UNCACHED
TBLPROPERTIES (
  'key-1','value-1',
  'key-2','value-2',
  ...
  'key-r','value-r'
)
AS select-statement
;
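Both skeletons above include a PARTITIONED BY clause; a concrete instance may help. The sketch below (the _BY_STATE table name is hypothetical) creates a Parquet table partitioned by state and loads it with Impala's dynamic partitioning, where the partition column goes last in the select list:

CREATE TABLE PROD_SCHEMA.DIM_POSTAL_CODE_BY_STATE(
  POSTAL_CODE STRING,
  CITY STRING,
  LATITUDE DOUBLE,
  LONGITUDE DOUBLE
)
PARTITIONED BY (STATE_CODE STRING)
STORED AS PARQUET;

INSERT INTO PROD_SCHEMA.DIM_POSTAL_CODE_BY_STATE PARTITION(STATE_CODE)
SELECT POSTAL_CODE, CITY, LATITUDE, LONGITUDE, STATE_CODE
FROM PROD_SCHEMA.DIM_POSTAL_CODE;

Queries that filter on STATE_CODE can then skip entire partition directories, which is the main payoff of partitioning.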
CREATE TABLE PROD_SCHEMA.DIM_POSTAL_CODE_BKUP AS
SELECT * FROM PROD_SCHEMA.DIM_POSTAL_CODE;

CREATE TABLE PROD_SCHEMA.DIM_POSTAL_CODE_BKUP STORED AS PARQUET AS
SELECT * FROM PROD_SCHEMA.DIM_POSTAL_CODE;

CREATE TABLE PROD_SCHEMA.DIM_POSTAL_CODE_BKUP AS
SELECT * FROM PROD_SCHEMA.DIM_POSTAL_CODE WHERE FALSE;

DROP TABLE Statement (TEXTFILE/PARQUET/KUDU)

DROP TABLE IF EXISTS database_name.table_name PURGE;

CREATE EXTERNAL TABLE PROD_SCHEMA.DIM_POSTAL_CODE(
  POSTAL_CODE STRING,
  CITY STRING,
  STATE_CODE STRING,
  LATITUDE DOUBLE,
  LONGITUDE DOUBLE
)
STORED AS PARQUET
TBLPROPERTIES('external.table.purge'='true');

INSERT Statement (TEXTFILE/PARQUET/KUDU)

INSERT INTO PROD_SCHEMA.DIM_POSTAL_CODE SELECT * FROM PROD_SCHEMA.MISSING_POSTAL_CODES;

INSERT OVERWRITE PROD_SCHEMA.DIM_POSTAL_CODE SELECT * FROM PROD_SCHEMA.DIM_POSTAL_CODE_NOT_CRAP_LIKE_THE_OTHER_TABLE;

INSERT INTO PROD_SCHEMA.DIM_POSTAL_CODE VALUES('99997','???','??',NULL,NULL);
INSERT INTO PROD_SCHEMA.DIM_POSTAL_CODE VALUES('99998','???','??',NULL,NULL);
INSERT INTO PROD_SCHEMA.DIM_POSTAL_CODE VALUES('99999','???','??',NULL,NULL);

INSERT INTO|OVERWRITE database_name.table_name_1(column-1, column-2, ..., column-n)
SELECT column-1,column-2,...,column-n
FROM database_name.table_name_2
WHERE where-condition
;

INSERT INTO|OVERWRITE database_name.table_name(column-1, column-2, ..., column-n)
VALUES(value-1,value-2,...,value-n);

TRUNCATE TABLE (TEXTFILE/PARQUET)

TRUNCATE TABLE IF EXISTS database_name.table_name;

TRUNCATE TABLE PROD_SCHEMA.DIM_POSTAL_CODE;

CREATE TABLE Statement (KUDU)

CREATE TABLE database_name.table_name (
  column_name_1 data_type_1 kudu_column_attr_1 COMMENT 'column-comment-1',
  column_name_2 data_type_2 kudu_column_attr_2 COMMENT 'column-comment-2',
  ...
  column_name_n data_type_n kudu_column_attr_n COMMENT 'column-comment-n',
  PRIMARY KEY (column_name_i,...,column_name_k)
)
PARTITION BY kudu_partition_clause
COMMENT 'table-comment'
STORED AS KUDU
TBLPROPERTIES (
  'key-1','value-1',
  'key-2','value-2',
  ...
  'key-r','value-r'
)
;

USE PROD_SCHEMA;
CREATE TABLE DIM_POSTAL_CODE_PART(
  POSTAL_CODE STRING,
  CITY STRING,
  STATE_CODE STRING,
  LATITUDE DOUBLE,
  LONGITUDE DOUBLE,
  PRIMARY KEY (POSTAL_CODE)
)
PARTITION BY HASH (POSTAL_CODE) PARTITIONS 4
STORED AS KUDU;

INSERT INTO DIM_POSTAL_CODE_PART SELECT * FROM DIM_POSTAL_CODE;

| SerDe Library: | org.apache.hadoop.hive.kudu.KuduSerDe        | NULL |
| InputFormat:   | org.apache.hadoop.hive.kudu.KuduInputFormat  | NULL |
| OutputFormat:  | org.apache.hadoop.hive.kudu.KuduOutputFormat | NULL |

DELETE, UPDATE and UPSERT (KUDU ONLY)

DELETE FROM PROD_SCHEMA.DIM_POSTAL_CODE;

DELETE FROM PROD_SCHEMA.DIM_POSTAL_CODE WHERE SUBSTR(POSTAL_CODE,1,3)='999';

DELETE FROM database_name.table_name WHERE where-condition;

DELETE database_name.table_name_1
FROM database_name.table_name_1
JOIN database_name.table_name_2 ON join-criteria
WHERE where-condition;

DELETE PROD_SCHEMA.DIM_POSTAL_CODE
FROM PROD_SCHEMA.DIM_POSTAL_CODE A
JOIN PROD_SCHEMA.BAD_POSTAL_CODES B ON A.POSTAL_CODE=B.POSTAL_CODE;

DELETE PROD_SCHEMA.DIM_POSTAL_CODE
FROM PROD_SCHEMA.DIM_POSTAL_CODE A
JOIN PROD_SCHEMA.BAD_POSTAL_CODES B ON A.POSTAL_CODE=B.POSTAL_CODE
WHERE B.OK_TO_DELETE='Y';

UPDATE PROD_SCHEMA.BAD_POSTAL_CODES SET OK_TO_DELETE='Y';

UPDATE PROD_SCHEMA.BAD_POSTAL_CODES SET OK_TO_DELETE='Y' WHERE UNATTRACTIVE_CITIZENS='Y';
UPDATE database_name.table_name SET
  column-1 = value-1,
  column-2 = value-2,
  ...
  column-n = value-n
WHERE where-condition;

UPDATE database_name.table_name_1 SET
  column-1 = value-1,
  column-2 = value-2,
  ...
  column-n = value-n
FROM database_name.table_name_1
JOIN database_name.table_name_2 ON join-criteria
WHERE where-condition;

UPDATE PROD_SCHEMA.DIM_POSTAL_CODE
SET A.STATE_CODE=B.CORRECTED_TWO_LETTER_STATE_CODE
FROM PROD_SCHEMA.DIM_POSTAL_CODE A
JOIN PROD_SCHEMA.BAD_POSTAL_CODES B ON A.POSTAL_CODE=B.POSTAL_CODE
WHERE B.UNATTRACTIVE_CITIZENS IN ('Y','N');

UPSERT INTO PROD_SCHEMA.DIM_POSTAL_CODE SELECT * FROM DIM_POSTAL_CODE_NEW_AND_FIXES;

UPSERT INTO database_name.table_name_1(column-1, column-2, ..., column-n)
SELECT * FROM database_name.table_name_2
WHERE where-condition;

UPSERT INTO database_name.table_name_1(column-1, column-2, ..., column-n)
VALUES(value-1,value-2,...,value-n);

ALTER TABLE Statement (TEXTFILE/PARQUET/KUDU)

ALTER TABLE original-table-name RENAME TO new-table-name;

[hdpserver.com:21000] prod_schema> ALTER TABLE DIM_POSTAL_CODE2 RENAME TO DIM_POSTAL_CODE_BACKUP_1;
Query: ALTER TABLE DIM_POSTAL_CODE2 RENAME TO DIM_POSTAL_CODE_BACKUP_1
+--------------------------+
| summary                  |
+--------------------------+
| Renaming was successful. |
+--------------------------+

[hdpserver.com:21000] prod_schema> DESC FORMATTED DIM_POSTAL_CODE_BACKUP_1;
...snip...
Location:   hdfs://lnxserver.com:8020/warehouse/tablespace/managed/hive/dim_postal_code_backup_1
Table Type: MANAGED_TABLE
...snip...

CREATE EXTERNAL TABLE EXT_POSTAL_CODE STORED AS PARQUET AS SELECT * FROM DIM_POSTAL_CODE;

Location: hdfs://lnxserver.com:8020/warehouse/tablespace/external/hive/EXT_POSTAL_CODE

[hdpserver.com:21000] prod_schema> ALTER TABLE EXT_POSTAL_CODE RENAME TO TMP_POSTAL_CODE;
Query: ALTER TABLE EXT_POSTAL_CODE RENAME TO TMP_POSTAL_CODE
+--------------------------+
| summary                  |
+--------------------------+
| Renaming was successful. |
+--------------------------+

Location: hdfs://lnxserver.com:8020/warehouse/tablespace/external/hive/EXT_POSTAL_CODE

+-------------+--------+---------------------+
| name        | type   | comment             |
+-------------+--------+---------------------+
| postal_code | string | 5-DIGIT POSTAL CODE |
| city        | string | CITY NAME           |
| state_code  | string | 2-LETTER STATE CODE |
| latitude    | double | LATITUDE            |
| longitude   | double | LONGITUDE           |
+-------------+--------+---------------------+

[hdpserver.com:21000] prod_schema> ALTER TABLE DIM_POSTAL_CODE_BACKUP_2 CHANGE LONGITUDE LONGITUDE_CENTROID DOUBLE;
Query: ALTER TABLE DIM_POSTAL_CODE_BACKUP_2 CHANGE LONGITUDE LONGITUDE_CENTROID DOUBLE
+--------------------------+
| summary                  |
+--------------------------+
| Column has been altered. |
+--------------------------+

[hdpserver.com:21000] prod_schema> ALTER TABLE DIM_POSTAL_CODE_BACKUP_2 CHANGE LATITUDE LATITUDE_CENTROID DOUBLE;
Query: ALTER TABLE DIM_POSTAL_CODE_BACKUP_2 CHANGE LATITUDE LATITUDE_CENTROID DOUBLE
+--------------------------+
| summary                  |
+--------------------------+
| Column has been altered. |
+--------------------------+
+--------------------+--------+---------+
| name               | type   | comment |
+--------------------+--------+---------+
| postal_code        | string |         |
| city               | string |         |
| state_code         | string |         |
| latitude_centroid  | double |         |
| longitude_centroid | double |         |
+--------------------+--------+---------+

ALTER TABLE table-name CHANGE column-name new_column-name column-data-type;

ALTER TABLE table-name DROP COLUMN column-name;

[hdpserver.com:21000] prod_schema> ALTER TABLE DIM_POSTAL_CODE_BACKUP_2 DROP COLUMN STATE_CODE;
Query: ALTER TABLE DIM_POSTAL_CODE_BACKUP_2 DROP COLUMN STATE_CODE
+--------------------------+
| summary                  |
+--------------------------+
| Column has been dropped. |
+--------------------------+

+--------------------+--------+---------+
| name               | type   | comment |
+--------------------+--------+---------+
| postal_code        | string |         |
| city               | string |         |
| latitude_centroid  | double |         |
| longitude_centroid | double |         |
+--------------------+--------+---------+

ALTER TABLE table-name ADD COLUMNS (column-name-1 data-type-1, column-name-2 data-type-2, ..., column-name-n data-type-n);

[hdpserver.com:21000] prod_schema> ALTER TABLE DIM_POSTAL_CODE_BACKUP_2 ADD COLUMNS (CITY_NAME STRING, POP_IN_POSTAL_CODE BIGINT);
Query: ALTER TABLE DIM_POSTAL_CODE_BACKUP_2 ADD COLUMNS (CITY_NAME STRING,POP_IN_POSTAL_CODE BIGINT)
+---------------------------------------------+
| summary                                     |
+---------------------------------------------+
| New column(s) have been added to the table. |
+---------------------------------------------+

+--------------------+--------+---------+
| name               | type   | comment |
+--------------------+--------+---------+
| postal_code        | string |         |
| city               | string |         |
| latitude_centroid  | double |         |
| longitude_centroid | double |         |
| city_name          | string |         |
| pop_in_postal_code | bigint |         |
+--------------------+--------+---------+

COMMENT ON TABLE table-name IS 'table-comment';
COMMENT ON COLUMN table-name.column-name IS 'column-comment';

[hdpserver.com:21000] prod_schema> COMMENT ON TABLE DIM_POSTAL_CODE_BACKUP_2 IS 'BACKUP TABLE #2';
+----------------+
| summary        |
+----------------+
| Updated table. |
+----------------+

[hdpserver.com:21000] prod_schema> COMMENT ON COLUMN DIM_POSTAL_CODE_BACKUP_2.POP_IN_POSTAL_CODE IS 'POPULATION (000s)';
+--------------------------+
| summary                  |
+--------------------------+
| Column has been altered. |
+--------------------------+
+------------------------------+------------+--------------------+
| name                         | type       | comment            |
+------------------------------+------------+--------------------+
| # col_name                   | data_type  | comment            |
|                              | NULL       | NULL               |
| postal_code                  | string     | NULL               |
| city                         | string     | NULL               |
| latitude_centroid            | double     | NULL               |
| longitude_centroid           | double     | NULL               |
| city_name                    | string     | NULL               |
| pop_in_postal_code           | bigint     | POPULATION (000s)  |
|                              | comment    | BACKUP TABLE #2    |
+------------------------------+------------+--------------------+

CREATE VIEW Statement

CREATE VIEW IF NOT EXISTS view-name AS SQL-select-query;

CREATE VIEW PROD_SCHEMA.V_POSTAL_CODE_INFO AS
SELECT A.POSTAL_CODE,A.CITY,A.LATITUDE,A.LONGITUDE,A.STATE_CODE,B.STATE_NAME
FROM PROD_SCHEMA.DIM_POSTAL_CODE A
LEFT JOIN PROD_SCHEMA.DIM_US_STATE_MAPPING B
ON A.STATE_CODE=B.STATE_CODE
WHERE A.STATE_CODE NOT IN ('AE','AP','AS','FM','GU','MH','MP','PW','VI');

[hdpserver.com:21000] prod_schema> DESC V_POSTAL_CODE_INFO;
+-------------+--------+---------+
| name        | type   | comment |
+-------------+--------+---------+
| postal_code | string |         |
| city        | string |         |
| latitude    | double |         |
| longitude   | double |         |
| state_code  | string |         |
| state_name  | string |         |
+-------------+--------+---------+

[hdpserver.com:21000] prod_schema> SELECT *
> FROM V_POSTAL_CODE_INFO
> WHERE STATE_CODE='GU';
Fetched 0 row(s) in 0.13s

DROP VIEW IF EXISTS view-name;

Using the SHOW and SET Statements

[hdpserver.com:21000] prod_schema> SHOW DATABASES;
+------------------+----------------------------------------------+
| name             | comment                                      |
+------------------+----------------------------------------------+
| _impala_builtins | System database for Impala builtin functions |
| default          | Default Hive database                        |
| prod_schema      | Bob Smith's Department database              |
+------------------+----------------------------------------------+

[hdpserver.com:21000] prod_schema> SHOW TABLES;
+---------------------------+
| name                      |
+---------------------------+
| bob1                      |
| bob2                      |
| bob3                      |
| bob4                      |
| candybar_consumption_data |
| dim_calendar              |
| dim_postal_code           |
...snip...
| state_code_jamboree       |
| state_code_jamboree2      |
| state_code_jamboree3      |
| tacobellinfo              |
| tmp_postal_code           |
| zzz1                      |
| zzz4                      |
| zzz5                      |
+---------------------------+

[hdpserver.com:21000] prod_schema> SHOW TABLES LIKE 'dim*';
+--------------------------+
| name                     |
+--------------------------+
| dim_calendar             |
| dim_postal_code          |
| dim_postal_code_backup_1 |
| dim_postal_code_backup_2 |
| dim_postal_code_part     |
| dim_us_state_mapping     |
+--------------------------+

[hdpserver.com:21000] prod_schema> SHOW CREATE TABLE DIM_CALENDAR;
CREATE TABLE prod_schema.dim_calendar (
date_id DATE, day TINYINT, month TINYINT, year INT, quarter TINYINT,
yyyyddd STRING, ddd STRING, first_day_of_month DATE, first_day_of_quarter DATE,
first_day_of_year DATE, month_name STRING, weekday_name STRING, yyyyqq STRING,
yyyymm STRING, yyyymmdd STRING, date_long STRING, date_short STRING
)
STORED AS PARQUET
LOCATION 'hdfs://lnxserver.com:8020/warehouse/tablespace/managed/hive/dim_calendar'
TBLPROPERTIES(
'OBJCAPABILITIES'='HIVEMANAGEDINSERTREAD,HIVEMANAGEDINSERTWRITE',
'STATS_GENERATED'='TASK',
'impala.events.catalogServiceId'='---:---',
'impala.events.catalogVersion'='41',
'impala.lastComputeStatsTime'='1648475816',
'numRows'='31',
'totalSize'='5892',
'transactional'='true',
'transactional_properties'='insert_only'
)

[hdpserver.com:21000] prod_schema> SHOW CREATE TABLE V_POSTAL_CODE_INFO;
CREATE VIEW `prod_schema`.v_postal_code_info AS
SELECT A.POSTAL_CODE, A.CITY, A.LATITUDE, A.LONGITUDE, A.STATE_CODE, B.STATE_NAME
FROM `prod_schema`.dim_postal_code a
LEFT OUTER JOIN `prod_schema`.dim_us_state_mapping b
ON A.STATE_CODE = B.STATE_CODE
WHERE A.STATE_CODE NOT IN ( 'AE', 'AP', 'AS', 'FM', 'GU', 'MH', 'MP', 'PW', 'VI' )

[hdpserver.com:21000] prod_schema> SHOW PARTITIONS DIM_POSTAL_CODE_PART;
+------------+-------+--------+---------+----------------------------------------+
| state_code | #Rows | #Files | Size    | Location                               |
+------------+-------+--------+---------+----------------------------------------+
| AE         | -1    | 1      | 1.14KB  | .../dim_postal_code_part/state_code=AE |
| AK         | -1    | 1      | 9.96KB  | .../dim_postal_code_part/state_code=AK |
...snip...
| WV         | -1    | 1      | 29.84KB | .../dim_postal_code_part/state_code=WV |
| WY         | -1    | 1      | 7.61KB  | .../dim_postal_code_part/state_code=WY |
| Total      | -1    | 61     | 1.31MB  |                                        |
+------------+-------+--------+---------+----------------------------------------+

[hdpserver:21000] prod_schema> SHOW FILES IN PROD_SCHEMA.DIM_POSTAL_CODE;
+--------------------------------------------------------------------------------------+----------+
| Path                                                                                 | Size     |
+--------------------------------------------------------------------------------------+----------+
| hdfs://hdpserver/data/prod/teams/prod_schema/dim_postal_code/a1117506544_data.0.parq | 132.33MB |
| hdfs://hdpserver/data/prod/teams/prod_schema/dim_postal_code/a1011983093_data.0.parq | 24.68MB  |
+--------------------------------------------------------------------------------------+----------+

[hdpserver.com:21000] prod_schema> SET;
Query options (defaults shown in []):
ABORT_ON_ERROR: [0]
COMPRESSION_CODEC: []
DEFAULT_FILE_FORMAT: [PARQUET]
...snip...
THREAD_RESERVATION_AGGREGATE_LIMIT: [0]
THREAD_RESERVATION_LIMIT: [3000]
TIMEZONE: [America/New_York]

Advanced Query Options:
APPX_COUNT_DISTINCT: [0]
BROADCAST_BYTES_LIMIT: [34359738368]
BUFFER_POOL_LIMIT: []
...snip...
SHUFFLE_DISTINCT_EXPRS: [1]
SUPPORT_START_OVER: [false]
TOPN_BYTES_LIMIT: [536870912]

Shell Options
WRITE_DELIMITED: False
VERBOSE: True
LIVE_SUMMARY: False
OUTPUT_FILE: None
DELIMITER: \t
LIVE_PROGRESS: True

Variables:
No variables defined.

[hdpserver.com:21000] prod_schema> SET ALL;
...snip...
Development Query Options:
ALLOW_ERASURE_CODED_FILES: [0]
BATCH_SIZE: [0]
CPU_LIMIT_S: [0]
...snip...
PLANNER_TESTCASE_MODE: [0]
SPOOL_QUERY_RESULTS: [0]
STRICT_MODE: [0]
...snip...

SET option=value;

set compression_codec=snappy;
set sync_ddl=1;
set disable_codegen=true;

Chapter 9 - ImpalaSQL Functions Parade

select state_code,count(*) as nbr_zips
from prod_schema.dim_postal_code
group by state_code;

select state_code,count(distinct postal_code) as nbr_zips
from prod_schema.dim_postal_code
group by state_code;

select count(distinct case when postal_code='90027' then 'Hollywood'
                           when postal_code='10018' then 'Broadway'
                           else null
                      end) as culture_count
from prod_schema.dim_postal_code;

select state_code,group_concat(postal_code,',') as zip_in_state
from prod_schema.dim_postal_code
where state_code in ('NJ','PA')
group by state_code;
+------------+------------------------------+
| state_code | zip_in_state                 |
+------------+------------------------------+
| NJ         | 07024,07054,07088,...snip... |
| PA         | 15004,15009,15030,...snip... |
+------------+------------------------------+

with vwDAT as
(
select state_code,postal_code
from prod_schema.dim_postal_code
where state_code in ('NJ','PA')
order by state_code,postal_code
limit 10000000
)
select state_code,group_concat(postal_code,',') as zip_in_state
from vwDAT
group by state_code;
+------------+------------------------------+
| state_code | zip_in_state                 |
+------------+------------------------------+
| PA         | 15001,15003,15004,...snip... |
| NJ         | 07001,07002,07003,...snip... |
+------------+------------------------------+

Chapter 10 - Voyage of the Damned (Dates & Times - ImpalaSQL Edition)

SELECT DATE '1962-03-21' AS TBDAY;
+------------+
| tbday      |
+------------+
| 1962-03-21 |
+------------+

SELECT '1962-03-21 12:00:00' AS TBTIME;
+---------------------+
| tbtime              |
+---------------------+
| 1962-03-21 12:00:00 |
+---------------------+

SELECT CAST('1962-03-21' AS DATE) AS TBDAY;
+------------+
| tbday      |
+------------+
| 1962-03-21 |
+------------+

SELECT CAST('1962-03-21 12:00:00' AS TIMESTAMP) AS TBTIME;
+---------------------+
| tbtime              |
+---------------------+
| 1962-03-21 12:00:00 |
+---------------------+

CREATE TABLE TACOBELLINFO STORED AS PARQUET AS
SELECT CAST('1962-03-21' AS DATE) AS TBDAY,
       CAST('1962-03-21 12:00:00' AS TIMESTAMP) AS TBTIME;

DESC TACOBELLINFO;
+--------+-----------+---------+
| name   | type      | comment |
+--------+-----------+---------+
| tbday  | date      |         |
| tbtime | timestamp |         |
+--------+-----------+---------+

SELECT CAST('March 21, 1963' AS DATE FORMAT 'Month dd, yyyy') AS TBDAY;
+------------+
| tbday      |
+------------+
| 1963-03-21 |
+------------+

SELECT CAST('March 21, 1963 11:30 P.M.' AS TIMESTAMP FORMAT 'Month dd, yyyy hh:mi p.m.') AS TBTIME;
+---------------------+
| tbtime              |
+---------------------+
| 1963-03-21 23:30:00 |
+---------------------+

SELECT CAST('1963080' AS DATE FORMAT 'yyyyddd') AS TBDAY;
+------------+
| tbday      |
+------------+
| 1963-03-21 |
+------------+

SELECT CAST('1963-03' AS DATE FORMAT 'yyyy-mm') AS TBDAY;
ERROR: UDF ERROR: String to Date parse failed. Invalid string val: "1963-03"

SELECT CAST(concat('1963-03','-01') AS DATE FORMAT 'yyyy-mm-dd') AS TBDAY;
+------------+
| tbday      |
+------------+
| 1963-03-01 |
+------------+

SELECT CAST('1963-1' AS DATE FORMAT 'yyyy-q') AS TBDAY;
ERROR: PARSE_ERROR: Quarter token is not allowed in a string to datetime conversion

SELECT CAST(
         CAST('1963080' AS DATE FORMAT 'yyyyddd')
         AS STRING FORMAT 'yyyy:q'
       ) AS TBQTR;
+--------+
| tbqtr  |
+--------+
| 1963:1 |
+--------+

SELECT CAST(
         CAST('1963080' AS DATE FORMAT 'yyyyddd')
         AS STRING FORMAT 'yyyy:q/ww/DAY/Dy'
       ) AS TBFRANKENDATE;
+-------------------------+
| tbfrankendate           |
+-------------------------+
| 1963:1/12/THURSDAY /Thu |
+-------------------------+

SELECT CAST(
         CAST('1963077' AS DATE FORMAT 'yyyyddd')
         AS STRING FORMAT 'yyyy:q/ww/DAY/Dy'
       ) AS TBFRANKENDATE;
+-------------------------+
| tbfrankendate           |
+-------------------------+
| 1963:1/11/MONDAY   /Mon |
+-------------------------+

SELECT CAST(
         CAST('1963077' AS DATE FORMAT 'yyyyddd')
         AS STRING FORMAT 'yyyy:q/ww/fmDAY/Dy'
       ) AS TBFRANKENDATE;
+----------------------+
| tbfrankendate        |
+----------------------+
| 1963:1/11/MONDAY/Mon |
+----------------------+

SELECT TO_TIMESTAMP('Mar 21, 1963 12:00:00', 'MMM dd, yyyy HH:mm:ss') AS TS1;
+---------------------+
| ts1                 |
+---------------------+
| 1963-03-21 12:00:00 |
+---------------------+

select FROM_TIMESTAMP(
         TO_TIMESTAMP('Mar 21, 1963 12:00:00', 'MMM dd, yyyy HH:mm:ss'),
         'MMM dd, yyyy') AS STR1;
+--------------+
| str1         |
+--------------+
| Mar 21, 1963 |
+--------------+

SELECT CURRENT_DATE();
+----------------+
| current_date() |
+----------------+
| 2022-02-12     |
+----------------+

SELECT CURRENT_TIMESTAMP(),NOW();
+-------------------------------+-------------------------------+
| current_timestamp()           | now()                         |
+-------------------------------+-------------------------------+
| 2022-02-12 15:21:18.886155000 | 2022-02-12 15:21:18.886155000 |
+-------------------------------+-------------------------------+

SELECT TIMEOFDAY();
+------------------------------+
| timeofday()                  |
+------------------------------+
| Sat Feb 12 15:22:35 2022 EST |
+------------------------------+

SELECT EXTRACT(YEAR FROM NOW()) AS YYYY, EXTRACT(QUARTER FROM NOW()) AS QUARTER;
+------+---------+
| yyyy | quarter |
+------+---------+
| 2022 | 1       |
+------+---------+

SELECT YEAR(NOW()) AS YYYY, QUARTER(NOW()) AS QUARTER;
+------+---------+
| yyyy | quarter |
+------+---------+
| 2022 | 1       |
+------+---------+

SELECT CAST('March 21, 1963 11:30:35 A.M.' AS TIMESTAMP FORMAT 'Month dd, yyyy HH:MI:SS AM') AS TB_FOUNDERS_DT;
+---------------------+
| tb_founders_dt      |
+---------------------+
| 1963-03-21 11:30:35 |
+---------------------+

SELECT TRUNC(
         CAST('March 21, 1963 11:30:35 A.M.' AS TIMESTAMP FORMAT 'Month dd, yyyy HH:MI:SS AM'),
         'MONTH') AS TB_FOUNDERS_MNTH;
+---------------------+
| tb_founders_mnth    |
+---------------------+
| 1963-03-01 00:00:00 |
+---------------------+
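TRUNC() understands more units than MONTH; 'Q', for example, snaps a timestamp back to the first day of its quarter. A minimal sketch (the unit name is per Impala's TRUNC() documentation, the alias is ours, and the expected value is inferred from the examples above rather than captured from a live session):

SELECT TRUNC(
         CAST('March 21, 1963 11:30:35 A.M.' AS TIMESTAMP FORMAT 'Month dd, yyyy HH:MI:SS AM'),
         'Q') AS TB_FOUNDERS_QTR;
-- expected: 1963-01-01 00:00:00 (March sits in Q1, which began January 1st)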
SELECT TRUNC(
         CAST('March 21, 1963 11:30:35 A.M.' AS TIMESTAMP FORMAT 'Month dd, yyyy HH:MI:SS AM'),
         'YEAR') AS TB_FOUNDERS_YEAR;
+---------------------+
| tb_founders_year    |
+---------------------+
| 1963-01-01 00:00:00 |
+---------------------+

SELECT DATE_CMP(
         CAST('April 15, 1955' AS DATE FORMAT 'Month, dd, yyyy'),
         CAST('March 21, 1963' AS DATE FORMAT 'Month, dd, yyyy')
       ) AS AND_THE_WINNER_IS;
+-------------------+
| and_the_winner_is |
+-------------------+
| -1                |
+-------------------+

SELECT MONTHS_BETWEEN(DATE '2022-01-01',DATE '2022-02-01');
+------------------------------------------------------+
| months_between(date '2022-01-01', date '2022-02-01') |
+------------------------------------------------------+
| -1                                                   |
+------------------------------------------------------+

SELECT MONTHS_BETWEEN(DATE '2022-02-01',DATE '2022-01-01');
+------------------------------------------------------+
| months_between(date '2022-02-01', date '2022-01-01') |
+------------------------------------------------------+
| 1                                                    |
+------------------------------------------------------+

SELECT MONTHS_BETWEEN(DATE '2022-02-02',DATE '2022-01-01');
+------------------------------------------------------+
| months_between(date '2022-02-02', date '2022-01-01') |
+------------------------------------------------------+
| 1.032258064516129                                    |
+------------------------------------------------------+
1/31 = 0.032258064516129

SELECT INT_MONTHS_BETWEEN(DATE '2022-02-02',DATE '2022-01-01');
+----------------------------------------------------------+
| int_months_between(date '2022-02-02', date '2022-01-01') |
+----------------------------------------------------------+
| 1                                                        |
+----------------------------------------------------------+

SELECT DATEDIFF(DATE '2022-02-02',DATE '2022-01-01');
+------------------------------------------------+
| datediff(date '2022-02-02', date '2022-01-01') |
+------------------------------------------------+
| 32                                             |
+------------------------------------------------+

SELECT DATEDIFF(DATE '2022-02-02',DATE '2022-01-01') + 1;
+----------------------------------------------------+
| datediff(date '2022-02-02', date '2022-01-01') + 1 |
+----------------------------------------------------+
| 33                                                 |
+----------------------------------------------------+

SELECT UNIX_TIMESTAMP('2022-02-02 00:00:00') - UNIX_TIMESTAMP('2022-01-01 00:00:00') AS SS;
+---------+
| ss      |
+---------+
| 2764800 |
+---------+

INTERVAL amount units

SELECT CURRENT_DATE() AS TODAY,
       CURRENT_DATE() + INTERVAL 10 YEARS AS TEN_YEARS_IN_THE_FUTURE,
       CURRENT_DATE() - INTERVAL 10 YEARS AS TEN_YEAR_IN_THE_PAST;
+------------+-------------------------+----------------------+
| today      | ten_years_in_the_future | ten_year_in_the_past |
+------------+-------------------------+----------------------+
| 2022-02-13 | 2032-02-13              | 2012-02-13           |
+------------+-------------------------+----------------------+

SELECT NOW() AS TODAY,
       NOW() + INTERVAL 10 YEARS AS TEN_YEARS_IN_THE_FUTURE,
       NOW() - INTERVAL 10 YEARS AS TEN_YEAR_IN_THE_PAST;
+-------------------------------+-------------------------------+-------------------------------+
| today                         | ten_years_in_the_future       | ten_year_in_the_past          |
+-------------------------------+-------------------------------+-------------------------------+
| 2022-02-13 14:54:09.636130000 | 2032-02-13 14:54:09.636130000 | 2012-02-13 14:54:09.636130000 |
+-------------------------------+-------------------------------+-------------------------------+

SELECT NOW() AS TODAY,
       NOW() + INTERVAL 10 YEARS + INTERVAL 5 MINUTES + INTERVAL 32 SECONDS AS TEN_YEARS_5_MINS_32_SECS_IN_THE_FUTURE;
+-------------------------------+----------------------------------------+
| today                         | ten_years_5_mins_32_secs_in_the_future |
+-------------------------------+----------------------------------------+
| 2022-02-13 15:00:41.484874000 | 2032-02-13 15:06:13.484874000          |
+-------------------------------+----------------------------------------+
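Interval arithmetic also composes with TRUNC(); a common chore is "midnight on the first day of next month." A minimal sketch (the alias is ours, and the result shown is what the arithmetic implies for a February 2022 run, not captured output):

SELECT TRUNC(NOW(),'MONTH') + INTERVAL 1 MONTH AS FIRST_OF_NEXT_MONTH;
-- run during 2022-02, expected: 2022-03-01 00:00:00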
SELECT NOW() AS TODAY,
       DATE_ADD(NOW(),INTERVAL 10 YEARS) AS TEN_YEARS_IN_THE_FUTURE;
+-------------------------------+-------------------------------+
| today                         | ten_years_in_the_future       |
+-------------------------------+-------------------------------+
| 2022-02-14 16:38:25.208342000 | 2032-02-14 16:38:25.208342000 |
+-------------------------------+-------------------------------+

SELECT NOW() AS TODAY,
       DATE_SUB(NOW(),INTERVAL 10 YEARS) AS TEN_YEARS_IN_THE_PAST;
+-------------------------------+-------------------------------+
| today                         | ten_years_in_the_past         |
+-------------------------------+-------------------------------+
| 2022-02-14 16:44:34.571261000 | 2012-02-14 16:44:34.571261000 |
+-------------------------------+-------------------------------+

SELECT NOW() AS TODAY,NEXT_DAY(NOW(),'Friday') AS THIS_FRIDAY;
+-------------------------------+-------------------------------+
| today                         | this_friday                   |
+-------------------------------+-------------------------------+
| 2022-02-15 10:00:34.820998000 | 2022-02-18 10:00:34.820998000 |
+-------------------------------+-------------------------------+

SELECT NOW() AS TODAY,LAST_DAY(NOW()) AS LAST_DAY_OF_MONTH;
+-------------------------------+---------------------+
| today                         | last_day_of_month   |
+-------------------------------+---------------------+
| 2022-02-15 10:03:44.365151000 | 2022-02-28 00:00:00 |
+-------------------------------+---------------------+

SELECT NOW() AS TODAY,
       ADD_MONTHS(NOW(),120) AS TEN_YEARS_IN_THE_FUTURE;
+-------------------------------+-------------------------------+
| today                         | ten_years_in_the_future       |
+-------------------------------+-------------------------------+
| 2022-02-14 17:29:50.284031000 | 2032-02-14 17:29:50.284031000 |
+-------------------------------+-------------------------------+

SELECT NOW() AS TODAY,
       HOURS_ADD( MINUTES_ADD(NOW(),5), 1) AS ONE_HOUR_FIVE_MINUTES_FROM_NOW;
+-------------------------------+--------------------------------+
| today                         | one_hour_five_minutes_from_now |
+-------------------------------+--------------------------------+
| 2022-02-15 09:53:29.243206000 | 2022-02-15 10:58:29.243206000  |
+-------------------------------+--------------------------------+

SELECT NOW() AS TODAY,TO_DATE(NOW()) AS NICE_LOOKING_TODAY;
+-------------------------------+--------------------+
| today                         | nice_looking_today |
+-------------------------------+--------------------+
| 2022-02-15 10:16:39.950483000 | 2022-02-15         |
+-------------------------------+--------------------+

SELECT NOW() AS TODAY,UTC_TIMESTAMP() AS UTC_TODAY;
+-------------------------------+-------------------------------+
| today                         | utc_today                     |
+-------------------------------+-------------------------------+
| 2022-02-15 10:20:36.732856000 | 2022-02-15 15:20:36.732856000 |
+-------------------------------+-------------------------------+

SELECT FROM_UTC_TIMESTAMP('2022-02-15 15:20:36.732856000','EST') AS TODAY;
+-------------------------------+
| today                         |
+-------------------------------+
| 2022-02-15 10:20:36.732856000 |
+-------------------------------+

Chapter 11 - Regular Expressions in ImpalaSQL

AllianceBernstein Income Fund, Inc.;ACG
Avenue Income Credit
Strategies;ACP The Adams Express Company;ADX AllianceBernstein National Municipal Income Fund, Inc;AFB Apollo Senior Floating Rate Fund, Inc.;AFT Advent Claymore Convertible Security;AGC Alpine Global Dynamic Dividend Fund;AGD Alliance California Municipal Income Fund Inc.;AKP Alpine Total Dynamic Dividend Fund;AOD Asia Pacific Fund Inc.;APB Morgan Stanley Asia-Pacific Fund Inc.;APF Ares Dynamic Credit Allocation;ARD BlackRock Senior High Income Fund, Inc.;ARK ASA Gold and Precious Metals Limited;ASA Liberty All Star Growth Fund Inc.;ASG American Strategic Income Portfolio Inc.;ASP ACG;AllianceBernstein Income Fund, Inc. ACP;Avenue Income Credit Strategies ADX;The Adams Express Company AFB;AllianceBernstein National Municipal Income Fund, Inc AFT;Apollo Senior Floating Rate Fund, Inc. AGC;Advent Claymore Convertible Security AGD;Alpine Global Dynamic Dividend Fund AKP;Alliance California Municipal Income Fund Inc. AOD;Alpine Total Dynamic Dividend Fund APB;Asia Pacific Fund Inc. APF;Morgan Stanley Asia-Pacific Fund Inc. ARD;Ares Dynamic Credit Allocation ARK;BlackRock Senior High Income Fund, Inc. ASA;ASA Gold and Precious Metals Limited ASG;Liberty All Star Growth Fund Inc. ASP;American Strategic Income Portfolio Inc. INSERT INTO MYTABLE VALUES\('\2','\1'\); INSERT INTO MYTABLE VALUES('ACG','AllianceBernstein Income Fund, Inc.'); INSERT INTO MYTABLE VALUES('ACP','Avenue Income Credit Strategies'); INSERT INTO MYTABLE VALUES('ADX','The Adams Express Company'); INSERT INTO MYTABLE VALUES('AFB','AllianceBernstein National Municipal Income Fund, Inc'); INSERT INTO MYTABLE VALUES('AFT','Apollo Senior Floating Rate Fund, Inc.'); INSERT INTO MYTABLE VALUES('AGC','Advent Claymore Convertible Security'); INSERT INTO MYTABLE VALUES('AGD','Alpine Global Dynamic Dividend Fund'); INSERT INTO MYTABLE VALUES('AKP','Alliance California Municipal Income Fund Inc.'); INSERT INTO MYTABLE VALUES('AOD','Alpine Total Dynamic Dividend Fund'); INSERT INTO MYTABLE VALUES('APB','Asia Pacific Fund Inc.'); INSERT INTO MYTABLE VALUES('APF','Morgan Stanley Asia-Pacific Fund Inc.'); INSERT INTO MYTABLE VALUES('ARD','Ares Dynamic Credit Allocation'); INSERT INTO MYTABLE VALUES('ARK','BlackRock Senior High Income Fund, Inc.'); INSERT INTO MYTABLE VALUES('ASA','ASA Gold and Precious Metals Limited'); INSERT INTO MYTABLE VALUES('ASG','Liberty All Star Growth Fund Inc.'); INSERT INTO MYTABLE VALUES('ASP','American Strategic Income Portfolio Inc.'); dir bunny*.jpg SELECT IMAGE_NAME FROM IMAGE_NAME_TABLE WHERE IMAGE_NAME LIKE 'bunny%.jpg'; SELECT IMAGE_NAME FROM IMAGE_NAME_TABLE WHERE IMAGE_NAME LIKE 'bunny_.jpg'; 123 Main Street, Apt #A1, AnyCity, AnyState 12345-6789 123 Main Street, Apt #A1, AnyCity, AnyState 12345-6789 123 Main St, Apt #A1, AnyCity, AnyState 12345-6789 123 Main St, Apt A1, AnyCity, AnyState 12345-6789 123 Main St, Apt A1, AnyCity, AnyState 12345 123A Main St, Apt A1, AnyCity, AnyState 12345 Example #1 - Non-Metacharacters Act As Themselves SELECT SOURCE_STRING, REGEXP_INSTR(SOURCE_STRING,'Main') AS RESULT FROM REGEX_DATA; SOURCE_STRING RESULT 123 Main Street, Apt #A1, AnyCity, AnyState 12345-6789 5 123 Main St, Apt #A1, AnyCity, AnyState 12345-6789 5 123 Main St, Apt A1, AnyCity, AnyState 12345-6789 5 123 Main St, Apt A1, AnyCity, AnyState 12345 5 123A Main St, Apt A1, AnyCity, AnyState 12345 6 Example #2 - Using the Period (.) 
and Asterisk (*)

SELECT SOURCE_STRING, REGEXP_INSTR(SOURCE_STRING,'.*') AS RESULT FROM REGEX_DATA;

SOURCE_STRING                                            RESULT
123 Main Street, Apt #A1, AnyCity, AnyState 12345-6789   1
123 Main St, Apt #A1, AnyCity, AnyState 12345-6789       1
123 Main St, Apt A1, AnyCity, AnyState 12345-6789        1
123 Main St, Apt A1, AnyCity, AnyState 12345             1
123A Main St, Apt A1, AnyCity, AnyState 12345            1

Example #3 - Using the Period (.) and Asterisk (*) Again

SELECT SOURCE_STRING, REGEXP_INSTR(SOURCE_STRING,'t.*') AS RESULT FROM REGEX_DATA;

SOURCE_STRING                                            RESULT
123 Main Street, Apt #A1, AnyCity, AnyState 12345-6789   11
123 Main St, Apt #A1, AnyCity, AnyState 12345-6789       11
123 Main St, Apt A1, AnyCity, AnyState 12345-6789        11
123 Main St, Apt A1, AnyCity, AnyState 12345             11
123A Main St, Apt A1, AnyCity, AnyState 12345            12

Example #4 - Finding the Zipcode Using a Character List ([])

SELECT SOURCE_STRING, REGEXP_INSTR(SOURCE_STRING,'[0123456789]{5}') AS RESULT FROM REGEX_DATA;

SOURCE_STRING                                            RESULT
123 Main Street, Apt #A1, AnyCity, AnyState 12345-6789   45
123 Main St, Apt #A1, AnyCity, AnyState 12345-6789       41
123 Main St, Apt A1, AnyCity, AnyState 12345-6789        40
123 Main St, Apt A1, AnyCity, AnyState 12345             40
123A Main St, Apt A1, AnyCity, AnyState 12345            41

Example #5 - Finding the Zipcode Using a Character List ([]) Again

SELECT SOURCE_STRING, REGEXP_INSTR(SOURCE_STRING,'[0123456789]{5}-?[0123456789]{4}') AS RESULT FROM REGEX_DATA;

SOURCE_STRING                                            RESULT
123 Main Street, Apt #A1, AnyCity, AnyState 12345-6789   45
123 Main St, Apt #A1, AnyCity, AnyState 12345-6789       41
123 Main St, Apt A1, AnyCity, AnyState 12345-6789        40
123 Main St, Apt A1, AnyCity, AnyState 12345             0
123A Main St, Apt A1, AnyCity, AnyState 12345            0

Example #6 - Finding the Zipcode Using a Character List ([]) Again Again

SELECT SOURCE_STRING, REGEXP_INSTR(SOURCE_STRING,'[0123456789]{5}(-?[0123456789]{4})?') AS RESULT FROM REGEX_DATA;

SOURCE_STRING                                            RESULT
123 Main Street, Apt #A1, AnyCity, AnyState 12345-6789   45
123 Main St, Apt #A1, AnyCity, AnyState 12345-6789       41
123 Main St, Apt A1, AnyCity, AnyState 12345-6789        40
123 Main St, Apt A1, AnyCity, AnyState 12345             40
123A Main St, Apt A1, AnyCity, AnyState 12345            41

Example #7 - Finding Alternatives

SELECT SOURCE_STRING, REGEXP_INSTR(SOURCE_STRING,'(Street|St){1}') AS RESULT FROM REGEX_DATA;

SOURCE_STRING                                            RESULT
123 Main Street, Apt #A1, AnyCity, AnyState 12345-6789   10
123 Main St, Apt #A1, AnyCity, AnyState 12345-6789       10
123 Main St, Apt A1, AnyCity, AnyState 12345-6789        10
123 Main St, Apt A1, AnyCity, AnyState 12345             10
123A Main St, Apt A1, AnyCity, AnyState 12345            11

Example #8 - Matching the Address Number

SELECT SOURCE_STRING, REGEXP_INSTR(SOURCE_STRING,'^[0-9]+[A-Z]?') AS RESULT FROM REGEX_DATA;

SOURCE_STRING                                            RESULT
123 Main Street, Apt #A1, AnyCity, AnyState 12345-6789   1
123 Main St, Apt #A1, AnyCity, AnyState 12345-6789       1
123 Main St, Apt A1, AnyCity, AnyState 12345-6789        1
123 Main St, Apt A1, AnyCity, AnyState 12345             1
123A Main St, Apt A1, AnyCity, AnyState 12345            1

SELECT * FROM REGEX_DATA WHERE REGEXP_LIKE(SOURCE_STRING,'[0-9]{5}-[0-9]{4}');
+--------------------------------------------------------+
| source_string                                          |
+--------------------------------------------------------+
| 123 Main St, Apt A1, AnyCity, AnyState 12345-6789      |
| 123 Main St, Apt #A1, AnyCity, AnyState 12345-6789     |
| 123 Main Street, Apt #A1, AnyCity, AnyState 12345-6789 |
+--------------------------------------------------------+
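REGEXP_LIKE() also accepts an optional third argument of match options; 'i' makes the match case-insensitive. A minimal sketch (the literals are ours, and the behavior is per Impala's regexp_like() documentation rather than a captured session):

SELECT REGEXP_LIKE('123 MAIN STREET','main')     AS CASE_SENSITIVE,
       REGEXP_LIKE('123 MAIN STREET','main','i') AS CASE_INSENSITIVE;
-- expected: false, true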
SELECT REGEXP_REPLACE(SOURCE_STRING,'([0-9]+[A-Z]?) ([a-zA-Z]+) ((Street|St)+).*', '\\1') AS HOUSE_NUMBER,
       REGEXP_REPLACE(SOURCE_STRING,'([0-9]+[A-Z]?) ([a-zA-Z]+) ((Street|St)+).*', '\\2') AS STREET_NAME,
       REGEXP_REPLACE(SOURCE_STRING,'([0-9]+[A-Z]?) ([a-zA-Z]+) ((Street|St)+).*', '\\3') AS STREET_TYPE
FROM REGEX_DATA;
+--------------+-------------+-------------+
| house_number | street_name | street_type |
+--------------+-------------+-------------+
| 123          | Main        | St          |
| 123A         | Main        | St          |
| 123          | Main        | St          |
| 123          | Main        | St          |
| 123          | Main        | Street      |
+--------------+-------------+-------------+

SELECT REGEXP_EXTRACT(SOURCE_STRING,'([0-9]+[A-Z]?) ([a-zA-Z]+) ((Street|St)+).*',1) AS HOUSE_NUMBER,
       REGEXP_EXTRACT(SOURCE_STRING,'([0-9]+[A-Z]?) ([a-zA-Z]+) ((Street|St)+).*',2) AS STREET_NAME,
       REGEXP_EXTRACT(SOURCE_STRING,'([0-9]+[A-Z]?) ([a-zA-Z]+) ((Street|St)+).*',3) AS STREET_TYPE
FROM REGEX_DATA;
+--------------+-------------+-------------+
| house_number | street_name | street_type |
+--------------+-------------+-------------+
| 123          | Main        | St          |
| 123A         | Main        | St          |
| 123          | Main        | St          |
| 123          | Main        | St          |
| 123          | Main        | Street      |
+--------------+-------------+-------------+

select '123 MAIN STREET' regexp '^[0-9]{3}.*';

SELECT REGEXP_EXTRACT('boulevard ard ard','(b.*d).*',1);
+----------------------------------------------------+
| regexp_extract('boulevard ard ard', '(b.*d).*', 1) |
+----------------------------------------------------+
| boulevard ard ard                                  |
+----------------------------------------------------+

SELECT REGEXP_EXTRACT('boulevard ard ard','(b.*?d).*',1);
+-----------------------------------------------------+
| regexp_extract('boulevard ard ard', '(b.*?d).*', 1) |
+-----------------------------------------------------+
| boulevard                                           |
+-----------------------------------------------------+

Chapter 12 - SQL Analytic (Windowing) Functions in ImpalaSQL

+-----------+--------+---------------------+--------+--------+
| firstname | gender | birthdate           | height | weight |
+-----------+--------+---------------------+--------+--------+
| ROSEMARY  | F      | 2000-05-08 00:00:00 | 35     | 123    |
| TOMMY     | M      | 1998-12-11 00:00:00 | 78     | 167    |
| BUDDY     | M      | 1998-10-02 00:00:00 | 45     | 189    |
| ALBERT    | M      | 2000-08-02 00:00:00 | 45     | 150    |
| SIMON     | M      | 1999-01-03 00:00:00 | 87     | 256    |
| FARQUAR   | M      | 1998-11-05 00:00:00 | 76     | 198    |
| LAUREN    | F      | 2000-06-10 00:00:00 | 54     | 876    |
+-----------+--------+---------------------+--------+--------+

SELECT GENDER,COUNT(*) AS KID_COUNT
FROM FATKIDS
GROUP BY GENDER
ORDER BY GENDER;
+--------+-----------+
| gender | kid_count |
+--------+-----------+
| F      | 2         |
| M      | 5         |
+--------+-----------+

WITH vwTOT AS
(
SELECT GENDER,COUNT(*) AS KID_COUNT
FROM FATKIDS
GROUP BY GENDER
)
SELECT A.FIRSTNAME,A.GENDER,A.BIRTHDATE,A.HEIGHT,A.WEIGHT,
       B.KID_COUNT
FROM FATKIDS A
INNER JOIN vwTOT B
ON A.GENDER=B.GENDER
ORDER BY KID_COUNT;
+-----------+--------+---------------------+--------+--------+-----------+
| firstname | gender | birthdate           | height | weight | kid_count |
+-----------+--------+---------------------+--------+--------+-----------+
| ROSEMARY  | F      | 2000-05-08 00:00:00 | 35     | 123    | 2         |
| LAUREN    | F      | 2000-06-10 00:00:00 | 54     | 876    | 2         |
| SIMON     | M      | 1999-01-03 00:00:00 | 87     | 256    | 5         |
| BUDDY     | M      | 1998-10-02 00:00:00 | 45     | 189    | 5         |
| ALBERT    | M      | 2000-08-02 00:00:00 | 45     | 150    | 5         |
| TOMMY     | M      | 1998-12-11 00:00:00 | 78     | 167    | 5         |
| FARQUAR   | M      | 1998-11-05 00:00:00 | 76     | 198    | 5         |
+-----------+--------+---------------------+--------+--------+-----------+

SELECT
A.FIRSTNAME,A.GENDER,A.BIRTHDATE,A.HEIGHT,A.WEIGHT, COUNT(*) OVER (PARTITION BY A.GENDER) AS KID_COUNT FROM FATKIDS A; SELECT A.GENDER,A.FIRSTNAME,A.WEIGHT, SUM(A.WEIGHT) OVER (PARTITION BY A.GENDER ORDER BY A.WEIGHT) AS WT_RUN FROM FATKIDS A ORDER BY A.GENDER,A.WEIGHT; +--------+-----------+--------+--------+ | gender | firstname | weight | wt_run | +--------+-----------+--------+--------+ | F | ROSEMARY | 123 | 123 | | F | LAUREN | 876 | 999 | | M | ALBERT | 150 | 150 | | M | TOMMY | 167 | 317 | | M | BUDDY | 189 | 506 | | M | FARQUAR | 198 | 704 | | M | SIMON | 256 | 960 | +--------+-----------+--------+--------+ function(…) OVER (PARTITION BY col1,col2,…) SELECT A.FIRSTNAME,A.GENDER,A.BIRTHDATE,A.HEIGHT,A.WEIGHT, COUNT(*) OVER () AS TOTAL_ROWS FROM FATKIDS A; +-----------+--------+---------------------+--------+--------+------------+ | firstname | gender | birthdate | height | weight | total_rows | +-----------+--------+---------------------+--------+--------+------------+ | FARQUAR | M | 1998-11-05 00:00:00 | 76 | 198 | 7 | | ALBERT | M | 2000-08-02 00:00:00 | 45 | 150 | 7 | | BUDDY | M | 1998-10-02 00:00:00 | 45 | 189 | 7 | | TOMMY | M | 1998-12-11 00:00:00 | 78 | 167 | 7 | | LAUREN | F | 2000-06-10 00:00:00 | 54 | 876 | 7 | | SIMON | M | 1999-01-03 00:00:00 | 87 | 256 | 7 | | ROSEMARY | F | 2000-05-08 00:00:00 | 35 | 123 | 7 | +-----------+--------+---------------------+--------+--------+------------+ SELECT A.FIRSTNAME,A.GENDER,A.BIRTHDATE,A.HEIGHT,A.WEIGHT, COUNT(*) OVER (PARTITION BY GENDER,EXTRACT(A.BIRTHDATE,'YEAR')) AS TOTAL_ROWS FROM FATKIDS A; +-----------+--------+---------------------+--------+--------+------------+ | firstname | gender | birthdate | height | weight | total_rows | +-----------+--------+---------------------+--------+--------+------------+ | LAUREN | F | 2000-06-10 00:00:00 | 54 | 876 | 2 | | ROSEMARY | F | 2000-05-08 00:00:00 | 35 | 123 | 2 | | TOMMY | M | 1998-12-11 00:00:00 | 78 | 167 | 3 | | FARQUAR | M | 1998-11-05 00:00:00 | 76 | 198 | 3 | | BUDDY | M | 1998-10-02 00:00:00 | 45 | 189 | 3 | | SIMON | M | 1999-01-03 00:00:00 | 87 | 256 | 1 | | ALBERT | M | 2000-08-02 00:00:00 | 45 | 150 | 1 | +-----------+--------+---------------------+--------+--------+------------+ function(…) OVER ( … ORDER BY col1,col2, … ) SELECT A.*, ROW_NUMBER() OVER (ORDER BY A.FIRSTNAME) AS RNUM FROM FATKIDS A; +-----------+--------+---------------------+--------+--------+------+ | firstname | gender | birthdate | height | weight | rnum | +-----------+--------+---------------------+--------+--------+------+ | ALBERT | M | 2000-08-02 00:00:00 | 45 | 150 | 1 | | BUDDY | M | 1998-10-02 00:00:00 | 45 | 189 | 2 | | FARQUAR | M | 1998-11-05 00:00:00 | 76 | 198 | 3 | | LAUREN | F | 2000-06-10 00:00:00 | 54 | 876 | 4 | | ROSEMARY | F | 2000-05-08 00:00:00 | 35 | 123 | 5 | | SIMON | M | 1999-01-03 00:00:00 | 87 | 256 | 6 | | TOMMY | M | 1998-12-11 00:00:00 | 78 | 167 | 7 | +-----------+--------+---------------------+--------+--------+------+ SELECT A.*, ROW_NUMBER() OVER (PARTITION BY A.GENDER ORDER BY A.FIRSTNAME) AS RNUM FROM FATKIDS A; +-----------+--------+---------------------+--------+--------+------+ | firstname | gender | birthdate | height | weight | rnum | +-----------+--------+---------------------+--------+--------+------+ | LAUREN | F | 2000-06-10 00:00:00 | 54 | 876 | 1 | | ROSEMARY | F | 2000-05-08 00:00:00 | 35 | 123 | 2 | | ALBERT | M | 2000-08-02 00:00:00 | 45 | 150 | 1 | | BUDDY | M | 1998-10-02 00:00:00 | 45 | 189 | 2 | | FARQUAR | M | 1998-11-05 00:00:00 | 76 | 198 
| 3 | | SIMON | M | 1999-01-03 00:00:00 | 87 | 256 | 4 | | TOMMY | M | 1998-12-11 00:00:00 | 78 | 167 | 5 | +-----------+--------+---------------------+--------+--------+------+ +--------+-----------+--------+ | gender | firstname | weight | +--------+-----------+--------+ | F | ROSEMARY | 123 | | F | LAUREN | 876 | | M | ALBERT | 150 | <-- Lag 1 Row | M | TOMMY | 167 | <--Current Row | M | BUDDY | 189 | | M | FARQUAR | 198 | <-- Lead 2 Rows | M | SIMON | 256 | +--------+-----------+--------+ LEAD(column-name,nbr-rows-to-lead,default-value) OVER (…) LAG(column-name,nbr-rows-to-lag,default-value) OVER (…) SELECT A.FIRSTNAME,A.WEIGHT, LEAD(A.WEIGHT,1,-1) OVER (ORDER BY A.WEIGHT) AS LEAD_1_WT, LAG(A.WEIGHT,2,-1) OVER (ORDER BY A.WEIGHT) AS LAG_2_WT FROM FATKIDS A ORDER BY A.WEIGHT; +-----------+--------+-----------+----------+ | firstname | weight | lead_1_wt | lag_2_wt | +-----------+--------+-----------+----------+ | ROSEMARY | 123 | 150 | -1 | | ALBERT | 150 | 167 | -1 | | TOMMY | 167 | 189 | 123 | | BUDDY | 189 | 198 | 150 | | FARQUAR | 198 | 256 | 167 | | SIMON | 256 | 876 | 189 | | LAUREN | 876 | -1 | 198 | +-----------+--------+-----------+----------+ SELECT A.FIRSTNAME,A.GENDER,A.WEIGHT, LEAD(A.WEIGHT,1,-1) OVER (PARTITION BY A.GENDER ORDER BY A.WEIGHT) AS LEAD_1_WT, LAG(A.WEIGHT,2,-1) OVER (PARTITION BY A.GENDER ORDER BY A.WEIGHT) AS LAG_2_WT FROM FATKIDS A ORDER BY A.GENDER,A.WEIGHT; +-----------+--------+--------+-----------+----------+ | firstname | gender | weight | lead_1_wt | lag_2_wt | +-----------+--------+--------+-----------+----------+ | ROSEMARY | F | 123 | 876 | -1 | | LAUREN | F | 876 | -1 | -1 | | ALBERT | M | 150 | 167 | -1 | | TOMMY | M | 167 | 189 | -1 | | BUDDY | M | 189 | 198 | 150 | | FARQUAR | M | 198 | 256 | 167 | | SIMON | M | 256 | -1 | 189 | +-----------+--------+--------+-----------+----------+ RANK() OVER ( … ORDER BY col1,col2, … ) DENSE_RANK() OVER ( … ORDER BY col1,col2, … ) SELECT A.FIRSTNAME,A.GENDER,A.HEIGHT, RANK() OVER (PARTITION BY A.GENDER ORDER BY A.HEIGHT) AS HT_RANK, DENSE_RANK() OVER (PARTITION BY A.GENDER ORDER BY A.HEIGHT) AS HT_DENSERANK FROM FATKIDS A ORDER BY A.GENDER,A.HEIGHT; +-----------+--------+--------+---------+--------------+ | firstname | gender | height | ht_rank | ht_denserank | +-----------+--------+--------+---------+--------------+ | ROSEMARY | F | 35 | 1 | 1 | | LAUREN | F | 54 | 2 | 2 | | ALBERT | M | 45 | 1 | 1 | | BUDDY | M | 45 | 1 | 1 | | FARQUAR | M | 76 | 3 | 2 | | TOMMY | M | 78 | 4 | 3 | | SIMON | M | 87 | 5 | 4 | +-----------+--------+--------+---------+--------------+ FIRST_VALUE(column-name) OVER ( … ORDER BY col1, col2, … ) LAST_VALUE(column-name) OVER ( … ORDER BY col1, col2, … ) SELECT A.FIRSTNAME,A.GENDER,A.WEIGHT, FIRST_VALUE(A.FIRSTNAME) OVER (PARTITION BY A.GENDER ORDER BY A.WEIGHT) AS LT_CHILD, LAST_VALUE(A.FIRSTNAME) OVER (PARTITION BY A.GENDER ORDER BY A.WEIGHT) AS HV_CHILD FROM FATKIDS A ORDER BY A.GENDER,A.WEIGHT; +-----------+--------+--------+----------+----------+ | firstname | gender | weight | lt_child | hv_child | +-----------+--------+--------+----------+----------+ | ROSEMARY | F | 123 | ROSEMARY | ROSEMARY | | LAUREN | F | 876 | ROSEMARY | LAUREN | | ALBERT | M | 150 | ALBERT | ALBERT | | TOMMY | M | 167 | ALBERT | TOMMY | | BUDDY | M | 189 | ALBERT | BUDDY | | FARQUAR | M | 198 | ALBERT | FARQUAR | | SIMON | M | 256 | ALBERT | SIMON | +-----------+--------+--------+----------+----------+ SELECT A.GENDER,A.FIRSTNAME,A.WEIGHT, SUM(A.WEIGHT) OVER (PARTITION BY A.GENDER ORDER BY A.WEIGHT) AS 
WT_RUN FROM FATKIDS A ORDER BY A.GENDER,A.WEIGHT; +--------+-----------+--------+--------+ | gender | firstname | weight | wt_run | +--------+-----------+--------+--------+ | F | ROSEMARY | 123 | 123 | | F | LAUREN | 876 | 999 | | M | ALBERT | 150 | 150 | | M | TOMMY | 167 | 317 | | M | BUDDY | 189 | 506 | | M | FARQUAR | 198 | 704 | | M | SIMON | 256 | 960 | +--------+-----------+--------+--------+ SELECT A.GENDER,A.FIRSTNAME,A.WEIGHT, SUM(A.WEIGHT) OVER (PARTITION BY A.GENDER ORDER BY A.WEIGHT ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS WT_RUN FROM FATKIDS A ORDER BY A.GENDER,A.WEIGHT; SELECT A.FIRSTNAME,A.GENDER,A.WEIGHT, FIRST_VALUE(A.FIRSTNAME) OVER (PARTITION BY A.GENDER ORDER BY A.WEIGHT ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS LT_CHILD, LAST_VALUE(A.FIRSTNAME) OVER (PARTITION BY A.GENDER ORDER BY A.WEIGHT ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS HV_CHILD FROM FATKIDS A ORDER BY A.GENDER,A.WEIGHT; +-----------+--------+--------+----------+----------+ | firstname | gender | weight | lt_child | hv_child | +-----------+--------+--------+----------+----------+ | ROSEMARY | F | 123 | ROSEMARY | ROSEMARY | | LAUREN | F | 876 | ROSEMARY | LAUREN | | ALBERT | M | 150 | ALBERT | ALBERT | | TOMMY | M | 167 | ALBERT | TOMMY | | BUDDY | M | 189 | ALBERT | BUDDY | | FARQUAR | M | 198 | ALBERT | FARQUAR | | SIMON | M | 256 | ALBERT | SIMON | +-----------+--------+--------+----------+----------+ SELECT A.FIRSTNAME,A.GENDER,A.WEIGHT, FIRST_VALUE(A.FIRSTNAME) OVER (PARTITION BY A.GENDER ORDER BY A.WEIGHT ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS LT_CHILD, LAST_VALUE(A.FIRSTNAME) OVER (PARTITION BY A.GENDER ORDER BY A.WEIGHT ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) AS HV_CHILD FROM FATKIDS A ORDER BY A.GENDER,A.WEIGHT; +-----------+--------+--------+----------+----------+ | firstname | gender | weight | lt_child | hv_child | +-----------+--------+--------+----------+----------+ | ROSEMARY | F | 123 | ROSEMARY | LAUREN | | LAUREN | F | 876 | ROSEMARY | LAUREN | | ALBERT | M | 150 | ALBERT | SIMON | | TOMMY | M | 167 | ALBERT | SIMON | | BUDDY | M | 189 | ALBERT | SIMON | | FARQUAR | M | 198 | ALBERT | SIMON | | SIMON | M | 256 | ALBERT | SIMON | +-----------+--------+--------+----------+----------+ ROWS BETWEEN m PRECEDING | UNBOUNDED PRECEDING | CURRENT ROW AND CURRENT ROW | UNBOUNDED FOLLOWING | n FOLLOWING SELECT A.FIRSTNAME,A.GENDER,A.WEIGHT, AVG(A.WEIGHT) OVER (PARTITION BY A.GENDER ORDER BY A.WEIGHT ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING) AS AVG_3 FROM FATKIDS A ORDER BY A.GENDER,A.WEIGHT; +-----------+--------+--------+-------------------+ | firstname | gender | weight | avg_3 | +-----------+--------+--------+-------------------+ | ROSEMARY | F | 123 | 499.5 | | LAUREN | F | 876 | 499.5 | | ALBERT | M | 150 | 158.5 | | TOMMY | M | 167 | 168.6666666666667 | | BUDDY | M | 189 | 184.6666666666667 | | FARQUAR | M | 198 | 214.3333333333333 | | SIMON | M | 256 | 227 | +-----------+--------+--------+-------------------+ SELECT A.FIRSTNAME,A.HEIGHT,A.WEIGHT, AVG(A.WEIGHT) OVER (ORDER BY A.HEIGHT ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS SUM_10_5 FROM FATKIDS A ORDER BY A.HEIGHT; +-----------+--------+--------+-------------------+ | firstname | height | weight | sum_10_5 | +-----------+--------+--------+-------------------+ | ROSEMARY | 35 | 123 | 123 | | ALBERT | 45 | 150 | 136.5 | | BUDDY | 45 | 189 | 154 | | LAUREN | 54 | 876 | 334.5 | | FARQUAR | 76 | 198 | 307.2 | | TOMMY | 78 | 167 | 283.8333333333333 | | SIMON | 87 | 256 | 
279.8571428571428 |
+-----------+--------+--------+-------------------+

SELECT A.FIRSTNAME,A.HEIGHT,A.WEIGHT,
       AVG(A.WEIGHT) OVER (ORDER BY A.HEIGHT
                           RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS SUM_10_5
FROM FATKIDS A
ORDER BY A.HEIGHT;
+-----------+--------+--------+-------------------+
| firstname | height | weight | sum_10_5          |
+-----------+--------+--------+-------------------+
| ROSEMARY  | 35     | 123    | 123               |
| ALBERT    | 45     | 150    | 154               |
| BUDDY     | 45     | 189    | 154               |
| LAUREN    | 54     | 876    | 334.5             |
| FARQUAR   | 76     | 198    | 307.2             |
| TOMMY     | 78     | 167    | 283.8333333333333 |
| SIMON     | 87     | 256    | 279.8571428571428 |
+-----------+--------+--------+-------------------+

NTILE(buckets) OVER ( … ORDER BY col1,col2, … )

SELECT A.FIRSTNAME,A.HEIGHT,
       NTILE(4) OVER (ORDER BY A.HEIGHT) AS GRP4_HT
FROM FATKIDS A
ORDER BY A.HEIGHT;
+-----------+--------+---------+
| firstname | height | grp4_ht |
+-----------+--------+---------+
| ROSEMARY  | 35     | 1       |
| ALBERT    | 45     | 1       |
| BUDDY     | 45     | 2       |
| LAUREN    | 54     | 2       |
| FARQUAR   | 76     | 3       |
| TOMMY     | 78     | 3       |
| SIMON     | 87     | 4       |
+-----------+--------+---------+

PERCENT_RANK() OVER ( … ORDER BY col1,col2,… )
100*PERCENT_RANK() OVER ( … ORDER BY col1,col2,… )

SELECT A.FIRSTNAME,A.HEIGHT,
       RANK() OVER (ORDER BY A.HEIGHT) AS RANK_HEIGHT,
       PERCENT_RANK() OVER (ORDER BY A.HEIGHT) AS PCTDIST_HEIGHT
FROM FATKIDS A
ORDER BY A.HEIGHT;
+-----------+--------+-------------+--------------------+
| firstname | height | rank_height | pctdist_height     |
+-----------+--------+-------------+--------------------+
| ROSEMARY  | 35     | 1           | 0                  |
| BUDDY     | 45     | 2           | 0.1666666666666667 |
| ALBERT    | 45     | 2           | 0.1666666666666667 |
| LAUREN    | 54     | 4           | 0.5                |
| FARQUAR   | 76     | 5           | 0.6666666666666666 |
| TOMMY     | 78     | 6           | 0.8333333333333334 |
| SIMON     | 87     | 7           | 1                  |
+-----------+--------+-------------+--------------------+

CUME_DIST() OVER ( … ORDER BY col1,col2,… )

SELECT A.FIRSTNAME,A.HEIGHT,
       CUME_DIST() OVER (ORDER BY A.HEIGHT) AS CUMDIST_HEIGHT
FROM FATKIDS A
ORDER BY A.HEIGHT;
+-----------+--------+--------------------+
| firstname | height | cumdist_height     |
+-----------+--------+--------------------+
| ROSEMARY  | 35     | 0.1428571428571428 |
| ALBERT    | 45     | 0.4285714285714285 |
| BUDDY     | 45     | 0.4285714285714285 |
| LAUREN    | 54     | 0.5714285714285714 |
| FARQUAR   | 76     | 0.7142857142857143 |
| TOMMY     | 78     | 0.8571428571428571 |
| SIMON     | 87     | 1                  |
+-----------+--------+--------------------+

SELECT FIRSTNAME,GENDER,BIRTHDATE,HEIGHT,WEIGHT,
       COUNT(*) OVER (PARTITION BY GENDER) AS KID_COUNT
FROM FATKIDS;

SELECT A.*
FROM (
SELECT FIRSTNAME,GENDER,BIRTHDATE,HEIGHT,WEIGHT,
       COUNT(*) OVER (PARTITION BY GENDER) AS KID_COUNT
FROM FATKIDS
) A
WHERE KID_COUNT=2;

SELECT FIRSTNAME,GENDER,BIRTHDATE,HEIGHT,WEIGHT,
       COUNT(*) OVER (PARTITION BY GENDER) AS KID_COUNT
FROM FATKIDS
QUALIFY KID_COUNT=2;

SELECT FIRSTNAME,GENDER,BIRTHDATE,HEIGHT,WEIGHT,
       COUNT(*) OVER (PARTITION BY GENDER) AS KID_COUNT
FROM FATKIDS
QUALIFY COUNT(*) OVER (PARTITION BY GENDER)=2;
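QUALIFY pairs especially well with ROW_NUMBER() for "keep one row per group" chores. A minimal sketch against FATKIDS, keeping each gender's heaviest kid (works wherever QUALIFY is supported; the expected rows are inferred from the data above):

SELECT FIRSTNAME,GENDER,WEIGHT
FROM FATKIDS
QUALIFY ROW_NUMBER() OVER (PARTITION BY GENDER ORDER BY WEIGHT DESC)=1;
-- expected: LAUREN (F, 876) and SIMON (M, 256)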
Chapter 13 - Extensions to the GROUP BY Clause in ImpalaSQL

[hdpserver.com:21000] prod_schema> desc candybar_consumption_data;
+----------------------+----------+---------+
| name                 | type     | comment |
+----------------------+----------+---------+
| consumer_id          | tinyint  |         |
| candybar_name        | string   |         |
| survey_year          | smallint |         |
| gender               | string   |         |
| overall_rating       | tinyint  |         |
| number_bars_consumed | smallint |         |
+----------------------+----------+---------+

+-------------+---------------+-------------+--------+----------------+----------------------+
| consumer_id | candybar_name | survey_year | gender | overall_rating | number_bars_consumed |
+-------------+---------------+-------------+--------+----------------+----------------------+
| 1           | MARS BAR      | 2009        | M      | 10             | 252                  |
| 1           | MARS BAR      | 2010        | M      | 10             | 352                  |
| 1           | MARS BAR      | 2011        | M      | 10             | 452                  |
| 1           | TWIX BAR      | 2009        | M      | 10             | 6                    |
| 1           | TWIX BAR      | 2010        | M      | 7              | 60                   |
| 1           | TWIX BAR      | 2011        | M      | 8              | 600                  |
| 2           | HERSHEY BAR   | 2009        | F      | 5              | 2                    |
| 2           | HERSHEY BAR   | 2010        | F      | 5              | 3                    |
| 2           | HERSHEY BAR   | 2011        | F      | 5              | 1                    |
| 2           | MARS BAR      | 2009        | F      | 8              | 25                   |
| 2           | MARS BAR      | 2010        | F      | 8              | 12                   |
| 2           | MARS BAR      | 2011        | F      | 8              | 13                   |
| 3           | MARS BAR      | 2009        | M      | 8              | 25                   |
| 3           | MARS BAR      | 2010        | M      | 7              | 12                   |
| 3           | MARS BAR      | 2011        | M      | 8              | 13                   |
| 3           | TWIX BAR      | 2009        | M      | 7              | 6                    |
| 3           | TWIX BAR      | 2010        | M      | 8              | 60                   |
| 3           | TWIX BAR      | 2011        | M      | 9              | 600                  |
| 4           | HERSHEY BAR   | 2009        | F      | 7              | 20                   |
| 4           | HERSHEY BAR   | 2010        | F      | 7              | 30                   |
| 4           | HERSHEY BAR   | 2011        | F      | 7              | 10                   |
| 4           | MARS BAR      | 2009        | F      | 7              | 25                   |
| 4           | MARS BAR      | 2010        | F      | 7              | 35                   |
| 4           | MARS BAR      | 2011        | F      | 7              | 15                   |
| 4           | TWIX BAR      | 2009        | F      | 7              | 20                   |
| 4           | TWIX BAR      | 2010        | F      | 7              | 30                   |
| 4           | TWIX BAR      | 2011        | F      | 7              | 10                   |
| 5           | HERSHEY BAR   | 2009        | M      | 8              | 15                   |
| 5           | HERSHEY BAR   | 2010        | M      | 8              | 15                   |
| 5           | HERSHEY BAR   | 2011        | M      | 6              | 5                    |
| 5           | SNICKERS BAR  | 2009        | M      | 8              | 55                   |
| 5           | SNICKERS BAR  | 2010        | M      | 8              | 65                   |
| 5           | SNICKERS BAR  | 2011        | M      | 8              | 75                   |
| 5           | TWIX BAR      | 2009        | M      | 9              | 75                   |
| 5           | TWIX BAR      | 2010        | M      | 9              | 85                   |
| 5           | TWIX BAR      | 2011        | M      | 9              | 95                   |
+-------------+---------------+-------------+--------+----------------+----------------------+

SELECT SURVEY_YEAR,CANDYBAR_NAME,GENDER,OVERALL_RATING,
       SUM(NUMBER_BARS_CONSUMED) AS TOTAL_BARS_CONSUMED
FROM CANDYBAR_CONSUMPTION_DATA
GROUP BY SURVEY_YEAR,CANDYBAR_NAME,GENDER,OVERALL_RATING
ORDER BY 1,2,3,4;
+-------------+---------------+--------+----------------+---------------------+
| survey_year | candybar_name | gender | overall_rating | total_bars_consumed |
+-------------+---------------+--------+----------------+---------------------+
| 2009        | HERSHEY BAR   | F      | 5              | 2                   |
| 2009        | HERSHEY BAR   | F      | 7              | 20                  |
| 2009        | HERSHEY BAR   | M      | 8              | 15                  |
| 2009        | MARS BAR      | F      | 7              | 25                  |
| 2009        | MARS BAR      | F      | 8              | 25                  |
...snip...
| 2011        | TWIX BAR      | F      | 7              | 10                  |
| 2011        | TWIX BAR      | M      | 8              | 600                 |
| 2011        | TWIX BAR      | M      | 9              | 695                 |
+-------------+---------------+--------+----------------+---------------------+

SELECT SURVEY_YEAR,CANDYBAR_NAME,GENDER,OVERALL_RATING,
       SUM(NUMBER_BARS_CONSUMED) AS TOTAL_BARS_CONSUMED
FROM CANDYBAR_CONSUMPTION_DATA
GROUP BY SURVEY_YEAR,CANDYBAR_NAME,GENDER,OVERALL_RATING
UNION ALL
SELECT SURVEY_YEAR,CANDYBAR_NAME,GENDER,NULL AS OVERALL_RATING,
       SUM(NUMBER_BARS_CONSUMED) AS TOTAL_BARS_CONSUMED
FROM CANDYBAR_CONSUMPTION_DATA
GROUP BY SURVEY_YEAR,CANDYBAR_NAME,GENDER
UNION ALL
SELECT SURVEY_YEAR,CANDYBAR_NAME,NULL AS GENDER,NULL AS OVERALL_RATING,
       SUM(NUMBER_BARS_CONSUMED) AS TOTAL_BARS_CONSUMED
FROM CANDYBAR_CONSUMPTION_DATA
GROUP BY SURVEY_YEAR,CANDYBAR_NAME
UNION ALL
SELECT SURVEY_YEAR,NULL AS CANDYBAR_NAME,NULL AS GENDER,NULL AS OVERALL_RATING,
       SUM(NUMBER_BARS_CONSUMED) AS TOTAL_BARS_CONSUMED
FROM CANDYBAR_CONSUMPTION_DATA
GROUP BY SURVEY_YEAR
UNION ALL
SELECT NULL AS SURVEY_YEAR,NULL AS CANDYBAR_NAME,NULL AS GENDER,NULL AS OVERALL_RATING,
       SUM(NUMBER_BARS_CONSUMED) AS TOTAL_BARS_CONSUMED
FROM CANDYBAR_CONSUMPTION_DATA;
+-------------+---------------+--------+----------------+---------------------+
| survey_year | candybar_name | gender | overall_rating | total_bars_consumed |
+-------------+---------------+--------+----------------+---------------------+
| 2009        | HERSHEY BAR   | F      | 5              | 2                   |
...snip...
| 2010        | TWIX BAR      | M      | NULL           | 205                 |
...snip...
| 2009        | MARS BAR      | NULL   | NULL           | 327                 |
...snip...
| 2010        | NULL          | NULL   | NULL           | 759                 |
...snip...
| NULL        | NULL          | NULL   | NULL           | 3174                |
+-------------+---------------+--------+----------------+---------------------+

SELECT SURVEY_YEAR,CANDYBAR_NAME,GENDER,OVERALL_RATING,
       SUM(NUMBER_BARS_CONSUMED) AS TOTAL_BARS_CONSUMED
FROM CANDYBAR_CONSUMPTION_DATA
GROUP BY ROLLUP(SURVEY_YEAR,CANDYBAR_NAME,GENDER,OVERALL_RATING);

GROUP BY GROUPING SETS(A,B,C,…)
GROUP BY A
UNION ALL
GROUP BY B
UNION ALL
GROUP BY C
…and so on…

SELECT SURVEY_YEAR,CANDYBAR_NAME,GENDER,OVERALL_RATING,
       SUM(NUMBER_BARS_CONSUMED) AS TOTAL_BARS_CONSUMED
FROM CANDYBAR_CONSUMPTION_DATA
GROUP BY GROUPING SETS(
(SURVEY_YEAR,CANDYBAR_NAME,GENDER,OVERALL_RATING),
(SURVEY_YEAR,CANDYBAR_NAME,GENDER),
(SURVEY_YEAR,CANDYBAR_NAME),
(SURVEY_YEAR),
()
);

SELECT SURVEY_YEAR,CANDYBAR_NAME,GENDER,OVERALL_RATING,
       SUM(NUMBER_BARS_CONSUMED) AS TOTAL_BARS_CONSUMED
FROM CANDYBAR_CONSUMPTION_DATA
GROUP BY GROUPING SETS(
(SURVEY_YEAR,CANDYBAR_NAME,GENDER),
(SURVEY_YEAR),
()
);

GROUP BY ROLLUP(A,B,C,…)
GROUPING SETS( (), (A), (A,B), (A,B,C), … )

SELECT SURVEY_YEAR,CANDYBAR_NAME,GENDER,OVERALL_RATING,
       SUM(NUMBER_BARS_CONSUMED) AS TOTAL_BARS_CONSUMED
FROM CANDYBAR_CONSUMPTION_DATA
GROUP BY ROLLUP(SURVEY_YEAR,CANDYBAR_NAME,GENDER,OVERALL_RATING);

GROUP BY CUBE(A,B,C,…)
GROUPING SETS( (), (A),(B),(C),… (A,B),(A,C),(B,C),… (A,B,C),… … )

SELECT SURVEY_YEAR,CANDYBAR_NAME,GENDER,OVERALL_RATING,
       SUM(NUMBER_BARS_CONSUMED) AS TOTAL_BARS_CONSUMED
FROM CANDYBAR_CONSUMPTION_DATA
GROUP BY CUBE(SURVEY_YEAR,CANDYBAR_NAME,GENDER,OVERALL_RATING);

SELECT SURVEY_YEAR,CANDYBAR_NAME,GENDER,OVERALL_RATING,
       SUM(NUMBER_BARS_CONSUMED) AS TBC,
       GROUPING(SURVEY_YEAR) AS gS,
       GROUPING(CANDYBAR_NAME) AS gC,
       GROUPING(GENDER) AS gG,
       GROUPING(OVERALL_RATING) AS gO
FROM CANDYBAR_CONSUMPTION_DATA
GROUP BY ROLLUP(SURVEY_YEAR,CANDYBAR_NAME,GENDER,OVERALL_RATING);
+-------------+---------------+--------+----------------+--------+------+------+------+------+
| survey_year | candybar_name | gender | overall_rating | tbc    | gS   | gC   | gG   | gO   |
+-------------+---------------+--------+----------------+--------+------+------+------+------+
...snip...
| 2009        | HERSHEY BAR   | F      | 5              | 2      | 0    | 0    | 0    | 0    |
| 2010        | TWIX BAR      | M      | NULL           | 205    | 0    | 0    | 0    | 1    |
| 2009        | MARS BAR      | NULL   | NULL           | 327    | 0    | 0    | 1    | 1    |
| 2010        | NULL          | NULL   | NULL           | 759    | 0    | 1    | 1    | 1    |
| NULL        | NULL          | NULL   | NULL           | 3174   | 1    | 1    | 1    | 1    |
+-------------+---------------+--------+----------------+--------+------+------+------+------+

+------+------+------+------+
| gS   | gC   | gG   | gO   |
+------+------+------+------+
| 0    | 0    | 0    | 0    | --> 0000
| 0    | 0    | 0    | 1    | --> 0001
| 0    | 0    | 1    | 1    | --> 0011
| 0    | 1    | 1    | 1    | --> 0111
| 1    | 1    | 1    | 1    | --> 1111
+------+------+------+------+

+------+------+------+------+
| gS   | gC   | gG   | gO   |
+------+------+------+------+
| 0    | 0    | 0    | 0    | --> 0000 --> 0
| 0    | 0    | 0    | 1    | --> 0001 --> 1
| 0    | 0    | 1    | 1    | --> 0011 --> 3
| 0    | 1    | 1    | 1    | --> 0111 --> 7
| 1    | 1    | 1    | 1    | --> 1111 --> 15
+------+------+------+------+
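Those four GROUPING() bits collapse into the single decimal "grouping ID" of the second table with plain positional arithmetic. A minimal sketch (the GROUPING_ID alias and the 8/4/2/1 weights are ours):

SELECT SURVEY_YEAR,CANDYBAR_NAME,GENDER,OVERALL_RATING,
       SUM(NUMBER_BARS_CONSUMED) AS TBC,
       8*GROUPING(SURVEY_YEAR) + 4*GROUPING(CANDYBAR_NAME) +
       2*GROUPING(GENDER) + 1*GROUPING(OVERALL_RATING) AS GROUPING_ID
FROM CANDYBAR_CONSUMPTION_DATA
GROUP BY ROLLUP(SURVEY_YEAR,CANDYBAR_NAME,GENDER,OVERALL_RATING);
-- the grand-total row scores 8+4+2+1 = 15, matching 1111 above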
Chapter 14 - The One About HiveQL

[smithbob@lnxserver ~]$ beeline
beeline>
beeline> show tables;
No current connection

[smithbob@lnxserver ~]$ beeline -u jdbc:hive2://hdpserver:10000/schema username password

beeline -u jdbc:hive2://hdpserver:10000/schema;principal=... -n username -p password

WITH Clause
SELECT col1,col2,... FROM tbl_name
WHERE subsetting_conditions
GROUP BY col1,... [ CUBE() | ROLLUP() | GROUPING SETS() ]
HAVING post_subsetting_conditions
ORDER BY col1,...
LIMIT offset,rows
TABLESAMPLE(# ROWS)

beeline> select category_id from categories limit 10;
+--------------+
| category_id  |
+--------------+
| 1            |
| 2            |
| 3            |
| 4            |
| 5            |
| 6            |
| 7            |
| 8            |
| 9            |
| 10           |
+--------------+

beeline> select category_id from categories limit 1,10;
+--------------+
| category_id  |
+--------------+
| 2            |
| 3            |
| 4            |
| 5            |
| 6            |
| 7            |
| 8            |
| 9            |
| 10           |
| 11           |
+--------------+

beeline> select category_id from categories tablesample(10 rows);
+--------------+
| category_id  |
+--------------+
| 1            |
| 2            |
| 3            |
| 4            |
| 5            |
| 6            |
| 7            |
| 8            |
| 9            |
| 10           |
+--------------+

beeline> select category_id from categories order by rand() limit 10;
+--------------+
| category_id  |
+--------------+
| 13           |
| 29           |
| 10           |
| 52           |
| 19           |
| 9            |
| 41           |
| 6            |
| 48           |
| 36           |
+--------------+

with vwBOYS as
(
select survey_year,candybar_name,
       avg(number_bars_consumed) as avgbars_boys
from candybar_consumption_data
where gender='M'
group by survey_year,candybar_name
),
vwGALS as
(
select survey_year,candybar_name,
       avg(number_bars_consumed) as avgbars_gals
from candybar_consumption_data
where gender='F'
group by survey_year,candybar_name
),
vwDATA as
(
select A.survey_year,A.candybar_name,
       A.avgbars_boys,B.avgbars_gals
from vwBOYS A
inner join vwGALS B
on A.survey_year=B.survey_year
and A.candybar_name=B.candybar_name
)
select corr(avgbars_boys,avgbars_gals) as corr_avgbars_boys_gals
from vwDATA;
+-------------------------+
| corr_avgbars_boys_gals  |
+-------------------------+
| -0.10161457962249516    |
+-------------------------+

create table candybar_consumption_data_new(consumer_id tinyint,
candybar_name string, survey_year smallint, gender string,
overall_rating tinyint, number_bars_consumed smallint,
primary key(consumer_id,candybar_name,survey_year) disable novalidate)
stored as parquet;

| # Primary Key    | NULL                                       | NULL          |
| Table:           | prod_schema.candybar_consumption_data_new  | NULL          |
| Constraint Name: | pk_1809580168_1654543315847_0              | NULL          |
| Column Names:    | consumer_id                                | candybar_name |
+-------------------------------+----------------------------------------------------+-----------------------------+

CREATE INDEX index_name
ON TABLE table_name (col1,col2,...)
AS index_type
WITH DEFERRED REBUILD

CREATE INDEX IX_CCD_GENDER ON TABLE CANDYBAR_CONSUMPTION_DATA(GENDER) AS 'BITMAP' WITH DEFERRED REBUILD;

beeline> show index on candybar_consumption_data;
+-----------------------+----------------------------+-----------------------+-----------------------+----------+
| idx_name              | tab_name                   | col_names             | idx_type              | comment  |
+-----------------------+----------------------------+-----------------------+-----------------------+----------+
| ix_ccd_gender         | candybar_consumption_data  | gender                | bitmap                |          |
+-----------------------+----------------------------+-----------------------+-----------------------+----------+

ALTER INDEX index_name ON table-name REBUILD;
ALTER INDEX IX_CCD_GENDER ON CANDYBAR_CONSUMPTION_DATA REBUILD;
DROP INDEX IX_CCD_GENDER ON CANDYBAR_CONSUMPTION_DATA;

ANALYZE TABLE table-name COMPUTE STATISTICS;
ANALYZE TABLE table-name COMPUTE STATISTICS FOR COLUMNS;
ANALYZE TABLE table-name PARTITION(partition-column-1=value-1,
                                   partition-column-2=value-2,
                                   ...
                                   partition-column-n=value-n) COMPUTE STATISTICS FOR COLUMNS;

beeline> analyze table candybar_consumption_data compute statistics for columns;
INFO : OK
No rows affected (39.507 seconds)

beeline> analyze table candybar_consumption_data_part partition(gender='M') compute statistics for columns;

Chapter 15 - Complex Data Types in HiveQL and ImpalaSQL

SELECT ARRAY('A','B') AS COL1;
+------------+--+
| col1       |
+------------+--+
| ["A","B"]  |
+------------+--+

SELECT MAP('A',1,'B',2,'C',3) AS COL1;
+----------------------+--+
| col1                 |
+----------------------+--+
| {"A":1,"B":2,"C":3}  |
+----------------------+--+

SELECT STRUCT('PA','PENNSYLVANIA') AS COL1;
+--------------------------------------+--+
| col1                                 |
+--------------------------------------+--+
| {"col1":"PA","col2":"PENNSYLVANIA"}  |
+--------------------------------------+--+

SELECT NAMED_STRUCT('STATE_CODE','PA', 'STATE_NAME','PENNSYLVANIA') AS COL1;
+--------------------------------------------------+--+
| col1                                             |
+--------------------------------------------------+--+
| {"state_code":"PA","state_name":"PENNSYLVANIA"}  |
+--------------------------------------------------+--+

SELECT SPLIT('PA,NJ,DE,TX,ME',',') AS COL1;
+-----------------------------+--+
| col1                        |
+-----------------------------+--+
| ["PA","NJ","DE","TX","ME"]  |
+-----------------------------+--+

SELECT STR_TO_MAP('A:1,B:2,C:3',',',':') AS COL1;
+----------------------------+--+
| col1                       |
+----------------------------+--+
| {"A":"1","B":"2","C":"3"}  |
+----------------------------+--+

CREATE TABLE COMPLEX_TYPES(aSTRINGTHINGS ARRAY<STRING>,
                           mDICTIONARY MAP<STRING,STRING>,
                           aSTRUCT STRUCT<STATE_CODE:STRING,STATE_NAME:STRING>);

CREATE TABLE COMPLEX_TYPES(ROW_ID SMALLINT,
                           aSTRINGTHINGS ARRAY<STRING>,
                           mDICTIONARY MAP<STRING,STRING>,
                           aSTRUCT STRUCT<STATE_CODE:STRING,STATE_NAME:STRING>);

CREATE TABLE COMPLEX_TAB_1(ROW_ID SMALLINT, aSTATECODES ARRAY<STRING>) STORED AS PARQUET;

INSERT INTO COMPLEX_TAB_1 SELECT 1,ARRAY('AK','AL','PA','TX');

SELECT * FROM COMPLEX_TAB_1;
+-----------------------+----------------------------+--+
| complex_tab_1.row_id  | complex_tab_1.astatecodes  |
+-----------------------+----------------------------+--+
| 1                     | ["AK","AL","PA","TX"]      |
+-----------------------+----------------------------+--+
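SPLIT() turned a delimited string into an ARRAY; CONCAT_WS() goes the other way, gluing an ARRAY<STRING> back into one delimited string. A minimal HiveQL sketch against COMPLEX_TAB_1 (the alias is ours; the expected value follows from the row just inserted):

SELECT ROW_ID, CONCAT_WS('|',aSTATECODES) AS STATES_PIPED
FROM COMPLEX_TAB_1;
-- expected: 1, AK|AL|PA|TX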
'ND','MIDWEST', 'NE','MIDWEST'); SELECT * FROM COMPLEX_TAB_2; +-----------------------+----------------------------------------+--+ | complex_tab_2.row_id | complex_tab_2.mdictionary | +-----------------------+----------------------------------------+--+ | 1 | {"IA":"MIDWEST","IL":"MIDWEST",...} | +-----------------------+----------------------------------------+--+ CREATE TABLE COMPLEX_TAB_3(ROW_ID INT, aSTRUCT STRUCT<STATE_CODE:STRING, STATE_NAME:STRING>); INSERT INTO COMPLEX_TAB_3 SELECT 0, NAMED_STRUCT('state_code','ZZ','state_name','ZORBA'); SELECT * FROM COMPLEX_TAB_3; +-----------------------+---------------------------------------------+--+ | complex_tab_3.row_id | complex_tab_3.astruct | +-----------------------+---------------------------------------------+--+ | 0 | {"state_code":"ZZ","state_name":"ZORBA"} | +-----------------------+---------------------------------------------+--+ SELECT ROW_NUMBER() OVER (ORDER BY STRUCT_COL) AS ROW_ID,STRUCT_COL FROM ( SELECT NAMED_STRUCT('STATE_CODE',STATE_CODE, 'STATE_NAME',STATE_NAME) AS STRUCT_COL FROM DIM_US_STATE_MAPPING ) A; +---------+--------------------------------------------------------------+--+ | row_id | struct_col | +---------+--------------------------------------------------------------+--+ | 1 | {"state_code":"AA","state_name":"U.S. ARMED FORCES - AMERICAS"} | | 2 | {"state_code":"AE","state_name":"U.S. ARMED FORCES - EUROPE"} | | 3 | {"state_code":"AK","state_name":"ALASKA"} | | 4 | {"state_code":"AL","state_name":"ALABAMA"} | | 5 | {"state_code":"AP","state_name":"U.S. ARMED FORCES - PACIFIC"} | | 6 | {"state_code":"AR","state_name":"ARKANSAS"} | | 7 | {"state_code":"AS","state_name":"AMERICAN SAMOA"} | | 8 | {"state_code":"AZ","state_name":"ARIZONA"} | | 9 | {"state_code":"CA","state_name":"CALIFORNIA"} | | 10 | {"state_code":"CO","state_name":"COLORADO"} | ...snip... SELECT COLLECT_SET(STATE_CODE) FROM DIM_US_STATE_MAPPING; +----------------------------------------------------+--+ | _c0 | +----------------------------------------------------+--+ | ["SA","WA","NT","AB","BC","MB","NB",...snip...] | +----------------------------------------------------+--+ SELECT ROW_ID,aSTATECODES[0] AS STATE_CODE FROM COMPLEX_TAB_1; +---------+-------------+--+ | row_id | state_code | +---------+-------------+--+ | 1 | AK | +---------+-------------+--+ SELECT 'NE' AS STATE_CODE,mDICTIONARY['NE'] AS REGION FROM COMPLEX_TAB_2; +-------------+---------+--+ | state_code | region | +-------------+---------+--+ | NE | MIDWEST | +-------------+---------+--+ SELECT ROW_ID,aSTRUCT.STATE_CODE,aSTRUCT.STATE_NAME FROM COMPLEX_TAB_3; +---------+-------------+-----------------------------------------+--+ | row_id | state_code | state_name | +---------+-------------+-----------------------------------------+--+ | 0 | ZZ | ZORBA | +---------+-------------+-----------------------------------------+--+ SELECT ROW_ID,aSTRUCT.STATE_CODE,aSTRUCT.STATE_NAME FROM COMPLEX_TAB_3 WHERE aSTRUCT.STATE_CODE IN ('CT','PA','ZZ'); SELECT MAP_KEYS(mDICTIONARY) AS KEYS FROM COMPLEX_TAB_2; +------------------------------------------------------+--+ | keys | +------------------------------------------------------+--+ | ["IA","IL","IN","KS","MI","MN","MO","ND",...snip...] | +------------------------------------------------------+--+ SELECT MAP_VALUES(mDICTIONARY) AS VALUES FROM COMPLEX_TAB_2; +------------------------------------------------------+--+ | values | +------------------------------------------------------+--+ | ["MIDWEST","MIDWEST","MIDWEST","MIDWEST",...snip...]
| +------------------------------------------------------+--+ SELECT SIZE(mDICTIONARY) AS NBR_OF_KEYS FROM COMPLEX_TAB_2; +--------------+--+ | nbr_of_keys | +--------------+--+ | 51 | +--------------+--+ SELECT SORT_ARRAY(aSTATECODES) AS aSTATECODES_ASC FROM COMPLEX_TAB_1; +------------------------+--+ | astatecodes_asc | +------------------------+--+ | ["AK","AL","PA","TX"] | +------------------------+--+ SELECT ARRAY_CONTAINS(aSTATECODES,'ZZ') AS ZZ_IN, ARRAY_CONTAINS(aSTATECODES,'TX') AS TX_IN FROM COMPLEX_TAB_1; +--------+--------+--+ | zz_in | tx_in | +--------+--------+--+ | false | true | +--------+--------+--+ SELECT EXPLODE(mDICTIONARY) FROM COMPLEX_TAB_2; +------+------------+--+ | key | value | +------+------------+--+ | IA | MIDWEST | | IL | MIDWEST | | IN | MIDWEST | | KS | MIDWEST | | MI | MIDWEST | | MN | MIDWEST | ...snip... SELECT POSEXPLODE(aSTATECODES) AS (INDX,STATE_CODE) FROM COMPLEX_TAB_1; +-------+-------------+--+ | indx | state_code | +-------+-------------+--+ | 0 | AK | | 1 | AL | | 2 | PA | | 3 | TX | +-------+-------------+--+ SELECT A.RNBR FROM ( SELECT POSEXPLODE(SPLIT(REPEAT("X`",20),"`")) AS (RNBR,COL) ) A WHERE A.RNBR>=1 ORDER BY A.RNBR; +---------+--+ | a.rnbr | +---------+--+ | 1 | | 2 | | 3 | ...snip... | 18 | | 19 | | 20 | +---------+--+ CREATE TABLE TEST_RNBR STORED AS PARQUET AS SELECT A.RNBR FROM ( SELECT POSEXPLODE(SPLIT(REPEAT("X`",20),"`")) AS (RNBR,COL) ) A WHERE A.RNBR>=1; CREATE TABLE COMPLEX_TAB_4(asSTUFF ARRAY<STRUCT<STATE_CODE:STRING, STATE_NAME:STRING>>); +-----------+----------------------------------------------------+----------+--+ | col_name | data_type | comment | +-----------+----------------------------------------------------+----------+--+ | asSTUFF | array<struct<state_code:string,state_name:string>> | | +-----------+----------------------------------------------------+----------+--+ SELECT COLLECT_LIST(mySTRUCT) AS myARRAY FROM ( SELECT NAMED_STRUCT('STATE_CODE',A.STATE_CODE, 'STATE_NAME',A.STATE_NAME) AS mySTRUCT FROM ( SELECT STATE_CODE,STATE_NAME FROM DIM_US_STATE_MAPPING ) A ) B; +----------------------------------------------------------------------+--+ | myarray | +----------------------------------------------------------------------+--+ | [{"state_code":"AK","state_name":"ALASKA"},...snip...] | +----------------------------------------------------------------------+--+ INSERT INTO COMPLEX_TAB_4 SELECT COLLECT_LIST(mySTRUCT) AS myARRAY FROM ( SELECT NAMED_STRUCT('STATE_CODE',A.STATE_CODE, 'STATE_NAME',A.STATE_NAME) AS mySTRUCT FROM ( SELECT STATE_CODE,STATE_NAME FROM DIM_US_STATE_MAPPING ) A ) B; Chapter 16 - SQL Performance Improvements COMPUTE STATS prod_schema.table_name; INSERT INTO MYTABLE VALUES(1,2,3); INSERT INTO MYTABLE VALUES(4,5,6); INSERT INTO MYTABLE VALUES(7,8,9); INSERT INTO MYTABLE VALUES(1,2,3), (4,5,6), (7,8,9); SET COMPRESSION_CODEC=codec_option; SET COMPRESSION_CODEC=snappy; INSERT /*+ APPEND */ INTO ...
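To see how these pieces fit together from the edge node, here is a minimal sketch (the table MYTABLE and the file path are illustrative, not from a real cluster): the session setting, the batched insert, and a stats refresh go into one semicolon-delimited file that impala-shell runs with -f, the same pattern Chapter 21 demonstrates.

#!/bin/bash
# Hypothetical load sketch: set the Parquet compression codec for the
# session, insert the rows in one batched statement instead of three
# singletons, then recompute stats so the planner sees the new rows.
cat > /tmp/batch_load.sql <<'EOF'
SET COMPRESSION_CODEC=snappy;
INSERT INTO MYTABLE VALUES(1,2,3), (4,5,6), (7,8,9);
COMPUTE STATS MYTABLE;
EOF
impala-shell -i hdpserver -d prod_schema -f /tmp/batch_load.sql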
SELECT STRAIGHT_JOIN A.COLUMN_1, B.COLUMN_2, C.COLUMN_3 FROM TABLE_A A LEFT JOIN TABLE_B B ON A.COLUMN_1 = B.COLUMN_2 LEFT JOIN TABLE_C C ON A.COLUMN_1 = C.COLUMN_1 LEFT JOIN TABLE_D D ON C.COLUMN_2 = D.COLUMN_2 LEFT JOIN /* +SHUFFLE */ TABLE_E E ON A.COLUMN_1 = E.COLUMN_1; CREATE TABLE DIM_POSTAL_CODE_SORTED(POSTAL_CODE STRING, CITY STRING, STATE_CODE STRING, LATITUDE DOUBLE, LONGITUDE DOUBLE) SORT BY (STATE_CODE,POSTAL_CODE) STORED AS PARQUET; INSERT INTO DIM_POSTAL_CODE_SORTED SELECT * FROM DIM_POSTAL_CODE; COMPUTE STATS DIM_POSTAL_CODE_SORTED; [hdpserver.com:21000] prod_schema> SELECT * FROM DIM_POSTAL_CODE_SORTED LIMIT 20; +-------------+-----------+------------+-----------+-------------+ | postal_code | city | state_code | latitude | longitude | +-------------+-----------+------------+-----------+-------------+ | 09323 | APO | AE | -44.25 | 33.53 | | 99501 | ANCHORAGE | AK | 61.216799 | -149.87828 | | 99502 | ANCHORAGE | AK | 61.153693 | -149.95932 | | 99503 | ANCHORAGE | AK | 61.19026 | -149.89341 | | 99504 | ANCHORAGE | AK | 61.204466 | -149.74633 | | 99505 | JBER | AK | 61.261518 | -149.66336 | | 99506 | JBER | AK | 61.224384 | -149.77461 | | 99507 | ANCHORAGE | AK | 61.154834 | -149.82865 | | 99508 | ANCHORAGE | AK | 61.203953 | -149.8144 | | 99509 | ANCHORAGE | AK | 61.108864 | -149.440311 | | 99510 | ANCHORAGE | AK | 61.144568 | -149.878418 | | 99511 | ANCHORAGE | AK | 61.068324 | -149.800476 | | 99512 | ANCHORAGE | AK | 61.203954 | -149.808426 | | 99513 | ANCHORAGE | AK | 61.214877 | -149.88617 | | 99514 | ANCHORAGE | AK | 61.108864 | -149.440311 | | 99515 | ANCHORAGE | AK | 61.122943 | -149.88852 | | 99516 | ANCHORAGE | AK | 61.101142 | -149.77311 | | 99517 | ANCHORAGE | AK | 61.188276 | -149.93438 | | 99518 | ANCHORAGE | AK | 61.156565 | -149.88335 | | 99519 | ANCHORAGE | AK | 61.108864 | -149.440311 | +-------------+-----------+------------+-----------+-------------+ [hdpserver.com:21000] prod_schema> SELECT * FROM DIM_POSTAL_CODE LIMIT 20; +-------------+-------------+------------+-----------+--------------------+ | postal_code | city | state_code | latitude | longitude | +-------------+-------------+------------+-----------+--------------------+ | 00623 | CABO ROJO | PR | 18.08643 | -67.15222 | | 00633 | CAYEY | PR | 18.194527 | -66.18346699999999 | | 00640 | COAMO | PR | 18.077197 | -66.359104 | | 00676 | MOCA | PR | 18.37956 | -67.08423999999999 | | 00728 | PONCE | PR | 18.013353 | -66.65218 | | 00734 | PONCE | PR | 17.999499 | -66.643934 | | 00735 | CEIBA | PR | 18.258444 | -65.65987 | | 00748 | FAJARDO | PR | 18.326732 | -65.652484 | | 00766 | VILLALBA | PR | 18.126023 | -66.48208 | | 00771 | LAS PIEDRAS | PR | 18.18744 | -65.87088 | | 00791 | HUMACAO | PR | 18.147257 | -65.82268999999999 | | 00901 | SAN JUAN | PR | 18.465426 | -66.10786 | | 00906 | SAN JUAN | PR | 18.46454 | -66.10079 | | 00909 | SAN JUAN | PR | 18.442282 | -66.06764 | | 00922 | SAN JUAN | PR | 18.410462 | -66.06053300000001 | | 00924 | SAN JUAN | PR | 18.401917 | -66.01194 | | 00961 | BAYAMON | PR | 18.412462 | -66.16033 | | 01704 | FRAMINGHAM | MA | 42.446396 | -71.459405 | | 01731 | HANSCOM AFB | MA | 42.459085 | -71.27556 | | 01746 | HOLLISTON | MA | 42.196065 | -71.43797000000001 | +-------------+-------------+------------+-----------+--------------------+ hdfs://lnxserver.com:8020/warehouse/tablespace/managed/hive/dim_postal_code/state_code=NC CREATE TABLE DIM_POSTAL_CODE_PART( POSTAL_CODE STRING, CITY STRING, LATITUDE DOUBLE, LONGITUDE DOUBLE ) PARTITIONED BY ( STATE_CODE STRING ) COMMENT
'DIM_POSTAL_CODE PARTITIONED BY STATE_CODE' STORED AS PARQUET TBLPROPERTIES('transactional'='false'); ERROR: AnalysisException: ALTER TABLE not supported on transactional (ACID) table: prod_schema.dim_postal_code_part. hdfs://lnxserver.com:8020/warehouse/tablespace/managed/hive/dim_postal_code_part INSERT INTO DIM_POSTAL_CODE_PART(POSTAL_CODE, CITY, LATITUDE, LONGITUDE, STATE_CODE) SELECT POSTAL_CODE,CITY,LATITUDE,LONGITUDE,STATE_CODE FROM DIM_POSTAL_CODE; [hdpserver.com:21000] prod_schema> SHOW PARTITIONS DIM_POSTAL_CODE_PART; +------------+-------+--------+---------+----------------------------------------+ | state_code | #Rows | #Files | Size | Location | +------------+-------+--------+---------+----------------------------------------+ | AE | -1 | 1 | 1.14KB | .../dim_postal_code_part/state_code=AE | | AK | -1 | 1 | 9.96KB | .../dim_postal_code_part/state_code=AK | ...snip... | WV | -1 | 1 | 29.84KB | .../dim_postal_code_part/state_code=WV | | WY | -1 | 1 | 7.61KB | .../dim_postal_code_part/state_code=WY | | Total | -1 | 61 | 1.31MB | | +------------+-------+--------+---------+----------------------------------------+ COMPUTE INCREMENTAL STATS DIM_POSTAL_CODE_PART; [hdpserver.com:21000] prod_schema> SHOW PARTITIONS DIM_POSTAL_CODE_PART; +------------+-------+--------+---------+----------------------------------------+ | state_code | #Rows | #Files | Size | Location | +------------+-------+--------+---------+----------------------------------------+ | AE | 1 | 1 | 1.14KB | .../dim_postal_code_part/state_code=AE | | AK | 278 | 1 | 9.96KB | .../dim_postal_code_part/state_code=AK | ...snip... | WV | 943 | 1 | 29.84KB | .../dim_postal_code_part/state_code=WV | | WY | 203 | 1 | 7.61KB | .../dim_postal_code_part/state_code=WY | | Total | 43689 | 61 | 1.31MB | | +------------+-------+--------+---------+----------------------------------------+ [hdpserver.com:21000] prod_schema> SHOW TABLE STATS DIM_POSTAL_CODE; +-------+--------+--------+--------------+------------+---------+------------+---------------------+ | #Rows | #Files | Size | Bytes Cached | Cache Repl | Format | Incr stats | Location | +-------+--------+--------+--------------+------------+---------+------------+---------------------+ | 43689 | 1 | 1.31MB | NOT CACHED | NOT CACHED | PARQUET | false | .../dim_postal_code | +-------+--------+--------+--------------+------------+---------+------------+---------------------+ [hdpserver.com:21000] prod_schema> SHOW PARTITIONS DIM_POSTAL_CODE; ERROR: AnalysisException: Table is not partitioned: prod_schema.dim_postal_code [smithbob@lnxserver ~]$ hadoop fs -ls hdfs://lnxserver.com:8020/warehouse/tablespace/managed/hive/dim_postal_code_part Found 61 items drwxrwx---+ - impala hive 0 2022-04-04 13:42 hdfs://lnxserver.com:8020/warehouse/tablespace/managed/hive/dim_postal_code_part/state_code=AE drwxrwx---+ - impala hive 0 2022-04-04 13:42 hdfs://lnxserver.com:8020/warehouse/tablespace/managed/hive/dim_postal_code_part/state_code=AK ...snip... 
drwxrwx---+ - impala hive 0 2022-04-04 13:42 hdfs://lnxserver.com:8020/warehouse/tablespace/managed/hive/dim_postal_code_part/state_code=WV drwxrwx---+ - impala hive 0 2022-04-04 13:42 hdfs://lnxserver.com:8020/warehouse/tablespace/managed/hive/dim_postal_code_part/state_code=WY EXPLAIN select-statement; [hdpserver.com:21000] prod_schema> explain select * from dim_postal_code where state_code in ('NJ','PA'); +------------------------------------------------------------+ | Explain String | +------------------------------------------------------------+ | Max Per-Host Resource Reservation: Memory=2.00MB Threads=3 | | Per-Host Resource Estimates: Memory=80MB | | Codegen disabled by planner | | | | PLAN-ROOT SINK | | | | | 01:EXCHANGE [UNPARTITIONED] | | | | | 00:SCAN HDFS [prod_schema.dim_postal_code] | | HDFS partitions=1/1 files=1 size=1.28MB | | predicates: state_code IN ('NJ', 'PA') | | row-size=68B cardinality=1.43K | +------------------------------------------------------------+ [hdpserver.com:21000] prod_schema> explain select * from dim_postal_code_part where state_code in ('NJ','PA'); +--------------------------------------------------------------+ | Explain String | +--------------------------------------------------------------+ | Max Per-Host Resource Reservation: Memory=128.00KB Threads=3 | | Per-Host Resource Estimates: Memory=64MB | | Codegen disabled by planner | | | | PLAN-ROOT SINK | | | | | 01:EXCHANGE [UNPARTITIONED] | | | | | 00:SCAN HDFS [prod_schema.dim_postal_code_part] | | partition predicates: state_code IN ('NJ', 'PA') | | HDFS partitions=2/61 files=2 size=96.41KB | | row-size=66B cardinality=3.05K | +--------------------------------------------------------------+ ALTER TABLE DIM_POSTAL_CODE_PART ADD IF NOT EXISTS PARTITION (STATE_CODE='CL'); [hdpserver.com:21000] prod_schema> SHOW PARTITIONS DIM_POSTAL_CODE_PART; +------------+-------+--------+---------+-------------------+----------------------------------------+ | state_code | #Rows | #Files | Size | Incremental Stats | Location | +------------+-------+--------+---------+-------------------+----------------------------------------+ | AE | 1 | 1 | 1.14KB | true | .../dim_postal_code_part/state_code=AE | | AK | 278 | 1 | 9.96KB | true | .../dim_postal_code_part/state_code=AK | ...snip... | CL | -1 | 0 | 0B | false | .../dim_postal_code_part/state_code=CL | ...snip... | WV | 943 | 1 | 29.84KB | true | .../dim_postal_code_part/state_code=WV | | WY | 203 | 1 | 7.61KB | true | .../dim_postal_code_part/state_code=WY | | Total | 43689 | 61 | 1.31MB | true | | +------------+-------+--------+---------+-------------------+----------------------------------------+ INSERT INTO DIM_POSTAL_CODE_PART PARTITION(STATE_CODE='CL') SELECT POSTAL_CODE,CITY,LATITUDE,LONGITUDE FROM DIM_POSTAL_CODE WHERE STATE_CODE='PA'; Modified 2287 row(s) in 0.21s [hdpserver.com:21000] prod_schema> SHOW PARTITIONS DIM_POSTAL_CODE_PART; +------------+-------+--------+---------+-------------------+----------------------------------------+ | state_code | #Rows | #Files | Size | Incremental Stats | Location | +------------+-------+--------+---------+-------------------+----------------------------------------+ | AE | 1 | 1 | 1.14KB | true | .../dim_postal_code_part/state_code=AE | | AK | 278 | 1 | 9.96KB | true | .../dim_postal_code_part/state_code=AK | ...snip... | CL | -1 | 1 | 72.20KB | false | .../dim_postal_code_part/state_code=CL | ...snip...
| WY | 203 | 1 | 7.61KB | true | .../dim_postal_code_part/state_code=WY | | Total | 43689 | 62 | 1.38MB | true | | +------------+-------+--------+---------+-------------------+----------------------------------------+ [hdpserver.com:21000] prod_schema> COMPUTE INCREMENTAL STATS DIM_POSTAL_CODE_PART; +-----------------------------------------+ | summary | +-----------------------------------------+ | Updated 1 partition(s) and 4 column(s). | +-----------------------------------------+ [hdpserver.com:21000] prod_schema> SHOW PARTITIONS DIM_POSTAL_CODE_PART; +------------+-------+--------+---------+-------------------+----------------------------------------+ | state_code | #Rows | #Files | Size | Incremental Stats | Location | +------------+-------+--------+---------+-------------------+----------------------------------------+ | AE | 1 | 1 | 1.14KB | true | .../dim_postal_code_part/state_code=AE | | AK | 278 | 1 | 9.96KB | true | .../dim_postal_code_part/state_code=AK | ...snip... | CL | 2287 | 1 | 72.20KB | true | .../dim_postal_code_part/state_code=CL | ...snip... | WV | 943 | 1 | 29.84KB | true | .../dim_postal_code_part/state_code=WV | | WY | 203 | 1 | 7.61KB | true | .../dim_postal_code_part/state_code=WY | | Total | 43689 | 62 | 1.38MB | true | | +------------+-------+--------+---------+-------------------+----------------------------------------+ abs(fnv_hash(column-name)) abs(fnv_hash(column-name)) % nbr-partitions cast(abs(fnv_hash(column-name)) % nbr-partitions as data-type) USE PROD_SCHEMA; CREATE TABLE DIM_POSTAL_CODE_HASH(POSTAL_CODE STRING, CITY STRING, STATE_CODE STRING, LATITUDE DOUBLE, LONGITUDE DOUBLE) PARTITIONED BY (PARTKEY TINYINT) STORED AS PARQUET; INSERT INTO DIM_POSTAL_CODE_HASH(POSTAL_CODE, CITY, STATE_CODE, LATITUDE, LONGITUDE, PARTKEY) SELECT A.POSTAL_CODE,A.CITY,A.STATE_CODE,A.LATITUDE,A.LONGITUDE,A.PARTKEY FROM ( SELECT POSTAL_CODE,CITY,STATE_CODE,LATITUDE,LONGITUDE, CAST(ABS(FNV_HASH(LATITUDE)) % 10 AS TINYINT) AS PARTKEY FROM DIM_POSTAL_CODE ) A; COMPUTE STATS DIM_POSTAL_CODE_HASH; [hdpserver.com:21000] default> SHOW PARTITIONS DIM_POSTAL_CODE_HASH; +---------+-------+--------+----------+--------------------------------------------------------------+ | partkey | #Rows | #Files | Size | Location | +---------+-------+--------+----------+--------------------------------------------------------------+ | 0 | 4463 | 1 | 141.15KB | hdfs://lnxserver.com:8020/.../dim_postal_code_hash/partkey=0 | | 1 | 4369 | 1 | 139.32KB | hdfs://lnxserver.com:8020/.../dim_postal_code_hash/partkey=1 | ...snip... | 8 | 4434 | 1 | 142.00KB | hdfs://lnxserver.com:8020/.../dim_postal_code_hash/partkey=8 | | 9 | 4255 | 1 | 137.69KB | hdfs://lnxserver.com:8020/.../dim_postal_code_hash/partkey=9 | | Total | 43689 | 10 | 1.37MB | | +---------+-------+--------+----------+--------------------------------------------------------------+ SELECT COUNT(*) FROM DIM_POSTAL_CODE_HASH WHERE LATITUDE=47.376884 AND PARTKEY=CAST(ABS(FNV_HASH(LATITUDE)) % 10 AS TINYINT); CREATE TABLE KUDU_TBL_01(COL1 STRING PRIMARY KEY) STORED AS KUDU; +-------------------------+ | summary | +-------------------------+ | Table has been created. | +-------------------------+ WARNINGS: Unpartitioned Kudu tables are inefficient for large data sizes.
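Before turning to Kudu partitioning, one note on querying the hash-bucketed table built above: since PARTKEY is derived, every lookup has to supply the same derivation, or Impala will scan all ten partitions. Here is a hedged shell sketch (the script name and its single argument are invented) that inlines the latitude literal on both sides of the predicate so the partition-key expression folds to a constant; the inner CAST to DOUBLE keeps FNV_HASH hashing the same type as the LATITUDE column.

#!/bin/bash
# Hypothetical lookup helper for DIM_POSTAL_CODE_HASH.
# Usage: ./lookup_lat 47.376884
LAT="$1"
impala-shell -i hdpserver -d prod_schema -B -q \
"SELECT COUNT(*) FROM DIM_POSTAL_CODE_HASH
 WHERE LATITUDE = ${LAT}
   AND PARTKEY = CAST(ABS(FNV_HASH(CAST(${LAT} AS DOUBLE))) % 10 AS TINYINT);"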
CREATE TABLE DIM_POSTAL_CODE_KUDU(POSTAL_CODE STRING PRIMARY KEY, CITY STRING, STATE_CODE STRING, LATITUDE DOUBLE, LONGITUDE DOUBLE) PARTITION BY HASH (POSTAL_CODE) PARTITIONS 50 STORED AS KUDU; CREATE TABLE DIM_POSTAL_CODE_KUDU(POSTAL_CODE STRING, STATE_CODE STRING, CITY STRING, LATITUDE DOUBLE, LONGITUDE DOUBLE, PRIMARY KEY(POSTAL_CODE,STATE_CODE)) PARTITION BY HASH (STATE_CODE) PARTITIONS 3 STORED AS KUDU; CREATE TABLE DIM_POSTAL_CODE_KUDU(POSTAL_CODE STRING, STATE_CODE STRING, CITY STRING, LATITUDE DOUBLE, LONGITUDE DOUBLE, PRIMARY KEY(POSTAL_CODE,STATE_CODE)) PARTITION BY RANGE (STATE_CODE) ( PARTITION VALUE = 'AK', PARTITION VALUE = 'AL', PARTITION VALUE = 'AR', ...skip... PARTITION VALUE = 'WI', PARTITION VALUE = 'WV', PARTITION VALUE = 'WY' ) STORED AS KUDU; CREATE TABLE DIM_POSTAL_CODE_KUDU(POSTAL_CODE STRING, STATE_CODE STRING, CITY STRING, LATITUDE DOUBLE, LONGITUDE DOUBLE, PRIMARY KEY(POSTAL_CODE,STATE_CODE)) PARTITION BY HASH (POSTAL_CODE) PARTITIONS 100, RANGE (STATE_CODE) ( PARTITION VALUE = 'AK', PARTITION VALUE = 'AL', PARTITION VALUE = 'AR', ...skip... PARTITION VALUE = 'WI', PARTITION VALUE = 'WV', PARTITION VALUE = 'WY' ) STORED AS KUDU; CREATE TABLE DIM_POSTAL_CODE_KUDU(POSTAL_CODE STRING, STATE_CODE STRING, CITY STRING, LATITUDE DOUBLE, LONGITUDE DOUBLE, PRIMARY KEY(POSTAL_CODE)) PARTITION BY RANGE (POSTAL_CODE) ( PARTITION '00000' <= VALUES < '09999', PARTITION '10000' <= VALUES < '19999', PARTITION '20000' <= VALUES < '29999', PARTITION '30000' <= VALUES < '39999', PARTITION '40000' <= VALUES < '49999', PARTITION '50000' <= VALUES < '59999', PARTITION '60000' <= VALUES < '69999', PARTITION '70000' <= VALUES < '79999', PARTITION '80000' <= VALUES < '89999', PARTITION '90000' <= VALUES < '99999' ) STORED AS KUDU; CREATE TABLE KUDU_TBL_01(COL1 STRING PRIMARY KEY) STORED AS KUDU; INSERT INTO KUDU_TBL_01 VALUES('ABC'); INSERT INTO KUDU_TBL_01 VALUES('DEF'); INSERT INTO KUDU_TBL_01 VALUES('GHI'); INSERT INTO KUDU_TBL_01 VALUES(NULL); WARNINGS: Row with null value violates nullability constraint on table 'impala::prod_schema.KUDU_TBL_01'. INSERT INTO KUDU_TBL_01 VALUES('JKL'); CREATE TABLE ADDL_ROWS(COL1 STRING) STORED AS PARQUET; INSERT INTO ADDL_ROWS VALUES('ZZZ'); INSERT INTO ADDL_ROWS VALUES('YYY'); INSERT INTO ADDL_ROWS VALUES('XXX'); INSERT INTO ADDL_ROWS VALUES(NULL); INSERT INTO ADDL_ROWS VALUES('WWW'); INSERT INTO KUDU_TBL_01 SELECT COL1 FROM ADDL_ROWS; WARNINGS: Row with null value violates nullability constraint on table 'impala::prod_schema.KUDU_TBL_01'. PART III - Working with the Linux Operating System Chapter 17 - PuTTY and the Linux Edge Node Server [smithbob@lnxserver ~]$ vi test1.sql [hdpserver:21000] prod_schema> select count(*) > alter connect delete describe exit help insert profile rerun set show src tip update use version compute create desc drop explain history load quit select shell source summary unset upsert values with > from prod_schema.dim_postal_code > > ; Chapter 18 - Introduction to the Linux Operating System [smithbob@lnxserver ~]$ pwd /home/smithbob [smithbob@lnxserver ~]$ ls bigmike_output.tsv [smithbob@lnxserver ~]$ mkdir python_programs [smithbob@lnxserver ~]$ cd python_programs [smithbob@lnxserver python_programs]$ [smithbob@lnxserver python_programs]$ cd .. [smithbob@lnxserver ~]$ [smithbob@lnxserver python_programs]$ cd ../.. 
[smithbob@lnxserver home]$ [smithbob@lnxserver ~]$ cd [smithbob@lnxserver ~]$ pwd /home/smithbob [smithbob@lnxserver ~]$ cd /home/smithbob/python_programs [smithbob@lnxserver python_programs]$ [smithbob@lnxserver ~]$ cd python_programs/ [smithbob@lnxserver python_programs]$ touch newfile [smithbob@lnxserver python_programs]$ ls newfile [smithbob@lnxserver python_programs]$ rm newfile [smithbob@lnxserver python_programs]$ cd [smithbob@lnxserver ~]$ pwd /home/smithbob [smithbob@lnxserver ~]$ rmdir python_programs [smithbob@lnxserver ~]$ [smithbob@lnxserver ~]$ cd [smithbob@lnxserver ~]$ pwd /home/smithbob [smithbob@lnxserver ~]$ mkdir python_programs [smithbob@lnxserver ~]$ pwd /home/smithbob [smithbob@lnxserver ~]$ cd python_programs [smithbob@lnxserver python_programs]$ pwd /home/smithbob/python_programs [smithbob@lnxserver python_programs]$ touch newfile [smithbob@lnxserver python_programs]$ ls newfile [smithbob@lnxserver python_programs]$ mv newfile newfile1 [smithbob@lnxserver python_programs]$ ls newfile1 [smithbob@lnxserver python_programs]$ [smithbob@lnxserver python_programs]$ pwd /home/smithbob/python_programs [smithbob@lnxserver python_programs]$ mkdir _archive [smithbob@lnxserver python_programs]$ ls _archive newfile1 [smithbob@lnxserver python_programs]$ mv newfile1 ./_archive [smithbob@lnxserver python_programs]$ ls _archive [smithbob@lnxserver python_programs]$ cd _archive [smithbob@lnxserver _archive]$ pwd /home/smithbob/python_programs/_archive [smithbob@lnxserver _archive]$ ls newfile1 [smithbob@lnxserver _archive]$ cd .. [smithbob@lnxserver python_programs]$ pwd /home/smithbob/python_programs [smithbob@lnxserver python_programs]$ ls _archive [smithbob@lnxserver python_programs]$ [smithbob@lnxserver python_programs]$ cd [smithbob@lnxserver ~]$ pwd /home/smithbob [smithbob@lnxserver ~]$ cd python_programs [smithbob@lnxserver python_programs]$ pwd /home/smithbob/python_programs [smithbob@lnxserver python_programs]$ touch newfile2 [smithbob@lnxserver python_programs]$ ls _archive newfile2 [smithbob@lnxserver python_programs]$ cp newfile2 ./_archive [smithbob@lnxserver python_programs]$ ls _archive newfile2 [smithbob@lnxserver python_programs]$ cd _archive [smithbob@lnxserver _archive]$ ls newfile1 newfile2 [smithbob@lnxserver _archive]$ cd .. [smithbob@lnxserver python_programs]$ pwd /home/smithbob/python_programs [smithbob@lnxserver python_programs]$ ls _archive newfile2 [smithbob@lnxserver python_programs]$ cp newfile2 ./_archive/newfile2A [smithbob@lnxserver python_programs]$ ls _archive newfile2 [smithbob@lnxserver python_programs]$ cd _archive [smithbob@lnxserver _archive]$ ls newfile1 newfile2 newfile2A [smithbob@lnxserver _archive]$ [smithbob@lnxserver _archive]$ history ...snip... 1077 pwd 1078 cd python_programs 1079 pwd 1080 touch newfile2 1081 ls 1082 cp newfile2 ./_archive 1083 ls 1084 cd _archive 1085 ls 1086 cd .. 
1087 pwd 1088 ls 1089 cp newfile2 ./_archive/newfile2A 1090 ls 1091 cd _archive 1092 ls 1093 history [smithbob@lnxserver _archive]$ [smithbob@lnxserver proc]$ cat /proc/cpuinfo processor : 0 vendor_id : GenuineIntel cpu family : 6 model : 63 model name : Intel(R) Xeon(R) CPU E5-2620 v3 @ 2.40GHz stepping : 2 microcode : 0x44 cpu MHz : 3172.558 cache size : 15360 KB physical id : 0 siblings : 12 core id : 0 cpu cores : 6 apicid : 0 initial apicid : 0 fpu : yes fpu_exception : yes cpuid level : 15 wp : yes flags : fpu vme de pse tsc msr pae mce cx8 apic sep bogomips : 4789.07 clflush size : 64 cache_alignment : 64 address sizes : 46 bits physical, 48 bits virtual power management: ...snip... [smithbob@lnxserver proc]$ [smithbob@lnxserver proc]$ head /proc/cpuinfo processor : 0 vendor_id : GenuineIntel cpu family : 6 model : 63 model name : Intel(R) Xeon(R) CPU E5-2620 v3 @ 2.40GHz stepping : 2 microcode : 0x44 cpu MHz : 1200.000 cache size : 15360 KB physical id : 0 [smithbob@lnxserver proc]$ tail /proc/cpuinfo fpu_exception : yes cpuid level : 15 wp : yes flags : fpu vme de pse tsc msr pae mce cx8 apic sep bogomips : 4794.10 clflush size : 64 cache_alignment : 64 address sizes : 46 bits physical, 48 bits virtual power management: [smithbob@lnxserver proc]$ head -2 /proc/cpuinfo processor : 0 vendor_id : GenuineIntel [smithbob@lnxserver proc]$ tail -2 /proc/cpuinfo power management: <-- blank line [smithbob@lnxserver proc]$ [smithbob@lnxserver proc]$ cd [smithbob@lnxserver ~]$ pwd /home/smithbob [smithbob@lnxserver ~]$ cd python_programs [smithbob@lnxserver python_programs]$ ls _archive newfile2 [smithbob@lnxserver python_programs]$ rm -i newfile2 rm: remove regular empty file ‘newfile2’? y [smithbob@lnxserver python_programs]$ ls _archive [smithbob@lnxserver python_programs]$ [smithbob@lnxserver python_programs]$ grep "model name" /proc/cpuinfo model name : Intel(R) Xeon(R) CPU E5-2620 v3 @ 2.40GHz model name : Intel(R) Xeon(R) CPU E5-2620 v3 @ 2.40GHz model name : Intel(R) Xeon(R) CPU E5-2620 v3 @ 2.40GHz ...snip... [smithbob@lnxserver python_programs]$ grep -i "intel" /proc/cpuinfo vendor_id : GenuineIntel model name : Intel(R) Xeon(R) CPU E5-2620 v3 @ 2.40GHz flags : fpu vme de pse tsc msr pae mce cx8 apic sep intel_ppin intel_stibp flush_l1d vendor_id : GenuineIntel ...snip...
grep -i "intel" /proc/cpuinfo [smithbob@lnxserver ~]$ cd [smithbob@lnxserver ~]$ pwd /home/smithbob [smithbob@lnxserver ~]$ ls -R python_programs python_programs: _archive python_programs/_archive: newfile1 newfile2 newfile2A [smithbob@lnxserver ~]$ [smithbob@lnxserver ~]$ ls -alF python_programs total 16 drwxr-xr-x 3 smithbob hdpbob_users 152 Oct 22 10:33 ./ drwx------ 51 smithbob hdpbob_users 16384 Oct 22 09:40 ../ drwxr-xr-x 2 smithbob hdpbob_users 152 Oct 22 09:57 _archive/ [smithbob@lnxserver ~]$ ls -alFR python_programs python_programs: total 16 drwxr-xr-x 3 smithbob hdpbob_users 152 Oct 22 10:33 ./ drwx------ 51 smithbob hdpbob_users 16384 Oct 22 09:40 ../ drwxr-xr-x 2 smithbob hdpbob_users 152 Oct 22 09:57 _archive/ python_programs/_archive: total 0 drwxr-xr-x 2 smithbob hdpbob_users 152 Oct 22 09:57 ./ drwxr-xr-x 3 smithbob hdpbob_users 152 Oct 22 10:33 ../ -rw-r--r-- 1 smithbob hdpbob_users 0 Oct 22 09:41 newfile1 -rw-r--r-- 1 smithbob hdpbob_users 0 Oct 22 09:56 newfile2 -rw-r--r-- 1 smithbob hdpbob_users 0 Oct 22 09:57 newfile2A [smithbob@lnxserver ~]$ alias lsf='ls -alF' [smithbob@lnxserver ~]$ lsf python_programs total 16 drwxr-xr-x 3 smithbob hdpbob_users 152 Oct 22 10:33 ./ drwx------ 51 smithbob hdpbob_users 16384 Oct 22 09:40 ../ drwxr-xr-x 2 smithbob hdpbob_users 152 Oct 22 09:57 _archive/ [smithbob@lnxserver ~]$ cd python_programs [smithbob@lnxserver python_programs]$ cd _archive [smithbob@lnxserver _archive]$ ls newfile1 newfile2 newfile2A [smithbob@lnxserver _archive]$ alias rm='rm -i' [smithbob@lnxserver _archive]$ rm newfile2A rm: remove regular empty file ‘newfile2A’? y [smithbob@lnxserver _archive]$ ls newfile1 newfile2 [smithbob@lnxserver _archive]$ rm -f newfile2 [smithbob@lnxserver _archive]$ ls newfile1 [smithbob@lnxserver _archive]$ [smithbob@lnxserver _archive]$ wc /proc/cpuinfo 624 4848 27378 /proc/cpuinfo [smithbob@lnxserver _archive]$ wc -l /proc/cpuinfo 624 /proc/cpuinfo [smithbob@lnxserver _archive]$ dos2unix newfile1 dos2unix: converting file newfile1 to Unix format ... [smithbob@lnxserver _archive]$ [smithbob@lnxserver ~]$ cd [smithbob@lnxserver ~]$ pwd /home/smithbob [smithbob@lnxserver ~]$ find /home/smithbob -name "newfile1" /home/smithbob/python_programs/_archive/newfile1 [smithbob@lnxserver ~]$ find . -name "newfile1" ./python_programs/_archive/newfile1 [smithbob@lnxserver ~]$ echo "Program complete." Program complete. [smithbob@lnxserver ~]$ [smithbob@lnxserver ~]$ date Sat Oct 23 13:49:27 EST 2021 [smithbob@lnxserver ~]$ [smithbob@lnxserver jobs]$ date +%Y 2021 [smithbob@lnxserver jobs]$ [smithbob@lnxserver jobs]$ date +%Y%m 202110 [smithbob@lnxserver jobs]$ [smithbob@lnxserver jobs]$ date +%Y%m 202110 [smithbob@lnxserver jobs]$ date -d "-1 month" +%Y%m 202109 [smithbob@lnxserver jobs]$ [smithbob@lnxserver _archive]$ man ls LS(1) User Command LS(1) NAME ls - list directory contents SYNOPSIS ls [OPTION]... [FILE]... DESCRIPTION List information about the FILEs (the current directory by default). Sort entries alphabetically if none of -cftuvSUX nor --sort is specified. Mandatory arguments to long options are mandatory for short options too. -a, --all do not ignore entries starting with . -A, --almost-all do not list implied . and .. --author with -l, print the author of each file -b, --escape print C-style escapes for nongraphic characters ...snip... Manual page ls(1) line 1 (press h for help or q to quit) [smithbob@lnxserver _archive]$ man -k "sexy knickers" sexy knickers: nothing appropriate. 
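man -k performs the same search as the standalone apropos command: it scans the one-line descriptions of every installed manual page, which makes it a quick way to discover commands you do not know the name of. A small hedged sketch (the function name whatabout is invented for illustration) that wraps it:

#!/bin/bash
# Illustrative wrapper: search man page descriptions for a keyword and
# show only the first few matches. man -k is equivalent to apropos.
whatabout() {
    man -k "$1" | head -5
}
whatabout "copy"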
[smithbob@lnxserver _archive]$ man -k "merge" envz_merge (3) - environment string support augenrules (8) - a script that merges component audit rule files intltool-merge (8) - merge translated strings into various types of file intltool-update (8) - updates PO template file and merge translations with it merge (1) - three-way file merge mergerepo (1) - Merge multiple repositories together msgmerge (1) - merge message catalog and template paste (1) - merge lines of files paste (1p) - merge corresponding or subsequent lines of files pdfunite (1) - Portable Document Format (PDF) page merger ppdmerge (1) - merge ppd files rcsmerge (1) - merge RCS revisions sdiff (1) - side-by-side merge of file differences sort (1p) - sort, merge, or sequence check text files stap-merge (1) - systemtap per-cpu binary merger strace-log-merge (1) - merge strace - ff - tt output tcpslice (8) - extract pieces of and/or merge together tcpdump files vgmerge (8) - Merge volume groups zipmerge (1) - merge zip archives [smithbob@lnxserver _archive]$ [smithbob@lnxserver ~]$ find . -name "newfile*" ./python_programs/_archive/newfile1 ./python_programs/_archive/newfile2 ./python_programs/_archive/newfile3 ./python_programs/_archive/newfile4A ./python_programs/_archive/newfile4 [smithbob@lnxserver ~]$ find . -name "newfile[1234]" ./python_programs/_archive/newfile1 ./python_programs/_archive/newfile2 ./python_programs/_archive/newfile3 ./python_programs/_archive/newfile4 [smithbob@lnxserver ~]$ cd [smithbob@lnxserver ~]$ cd python_programs [smithbob@lnxserver python_programs]$ cd _archive [smithbob@lnxserver _archive]$ lsf total 8 drwxr-xr-x 2 smithbob hdpbob_users 8192 Oct 23 13:15 ./ drwxr-xr-x 3 smithbob hdpbob_users 152 Oct 22 10:33 ../ -rw-r--r-- 1 smithbob hdpbob_users 0 Oct 23 10:24 newfile1 -rw-r--r-- 1 smithbob hdpbob_users 0 Oct 23 13:07 newfile2 -rw-r--r-- 1 smithbob hdpbob_users 0 Oct 23 13:07 newfile3 -rw-r--r-- 1 smithbob hdpbob_users 0 Oct 23 13:08 newfile4 -rw-r--r-- 1 smithbob hdpbob_users 0 Oct 23 13:08 newfile4A [smithbob@lnxserver _archive]$ lsf newfile[1234] -rw-r--r-- 1 smithbob hdpbob_users 0 Oct 23 10:24 newfile1 -rw-r--r-- 1 smithbob hdpbob_users 0 Oct 23 13:07 newfile2 -rw-r--r-- 1 smithbob hdpbob_users 0 Oct 23 13:07 newfile3 -rw-r--r-- 1 smithbob hdpbob_users 0 Oct 23 13:08 newfile4 [smithbob@lnxserver _archive]$ [smithbob@lnxserver _archive]$ cat /proc/cpuinfo | wc -l 624 [smithbob@lnxserver _archive]$ [smithbob@lnxserver _archive]$ cat /proc/cpuinfo | wc -l | wc -w 1 [smithbob@lnxserver _archive]$ [smithbob@lnxserver ~]$ echo "Program starts..." > myprogram_20211015.log [smithbob@lnxserver ~]$ cat myprogram_20211015.log Program starts... [smithbob@lnxserver ~]$ [smithbob@lnxserver ~]$ echo "Program ends." >> myprogram_20211015.log [smithbob@lnxserver ~]$ cat myprogram_20211015.log Program starts... Program ends. [smithbob@lnxserver ~]$ * STDIN - Standard Input - Indicates the input used by a command (occasionally, stdin). * STDOUT - Standard Output - Indicates the output produced by a command (occasionally, stdout). * STDERR - Standard Error - Indicates any error messages produced by a command (occasionally, stderr). * 0 - STDIN * 1 - STDOUT * 2 - STDERR [smithbob@lnxserver ~]$ python mypgm.py > mypgm.log 2>&1 [smithbob@lnxserver ~]$ python --help usage: python [option] ... [-c cmd | -m mod | file | -] [arg] ... Options and arguments (and corresponding environment variables): ...snip... 
-i : inspect interactively after running script; forces a prompt even if stdin does not appear to be a terminal; also PYTHONINSPECT=x -u : unbuffered binary stdout and stderr; also PYTHONUNBUFFERED=x - : program read from stdin (default; interactive mode if a tty) ...snip... [smithbob@lnxserver ~]$ python mypgm.py > client_output_data.txt 2>/dev/null [smithbob@lnxserver ~]$ echo "Program started at `date`" > myprogram_20211015.log [smithbob@lnxserver ~]$ cat myprogram_20211015.log Program started at Sat Oct 23 14:10:59 EDT 2021 [smithbob@lnxserver ~]$ [smithbob@lnxserver ~]$ cat /proc/cpuinfo | sed -e 's/Xeon/386/g' processor : 0 vendor_id : GenuineIntel cpu family : 6 model : 63 model name : Intel(R) 386(R) CPU E5-2620 v3 @ 2.40GHz stepping : 2 ...snip... [smithbob@lnxserver ~]$ [smithbob@lnxserver ~]$ sed -e 's/Xeon/386/g' /proc/cpuinfo [smithbob@lnxserver ~]$ [smithbob@lnxserver ~]$ sed -e 's/E5-\([0-9][0-9][0-9][0-9]\)/ZZ-\1/g' /proc/cpuinfo [smithbob@lnxserver ~]$ [smithbob@lnxserver ~]$ cat /proc/cpuinfo | sed -e '/model name/d' [smithbob@lnxserver ~]$ [smithbob@lnxserver ~]$ echo "1,2,3" > file_without_header.csv [smithbob@lnxserver ~]$ echo "4,5,6" >> file_without_header.csv [smithbob@lnxserver ~]$ echo "7,8,9" >> file_without_header.csv [smithbob@lnxserver ~]$ cat file_without_header.csv 1,2,3 4,5,6 7,8,9 [smithbob@lnxserver ~]$ [smithbob@lnxserver ~]$ cat file_without_header.csv | sed -e '1 i COL1,COL2,COL3' > file_with_header.csv [smithbob@lnxserver ~]$ cat file_with_header.csv COL1,COL2,COL3 1,2,3 4,5,6 7,8,9 [smithbob@lnxserver ~]$ [smithbob@lnxserver ~]$ cat /proc/cpuinfo | tr '[:upper:]' '[:lower:]' ...snip... vendor_id : genuineintel cpu family : 6 model : 63 model name : intel(r) xeon(r) cpu e5-2620 v3 @ 2.40ghz ...snip... [smithbob@lnxserver ~]$ tr 'from-set' 'to-set' [smithbob@lnxserver ~]$ cat /proc/cpuinfo | tr -d '[:upper:]' vendor_id : enuinentel cpu family : 6 model : 63 model name : ntel() eon() 5-2620 v3 @ 2.40z [smithbob@lnxserver ~]$ [smithbob@lnxserver ~]$ wget -q "https://api.worldbank.org/v2/en/indicator/SP.POP.TOTL?downloadformat=csv" -O /home/smithbob/poptotal.zip [smithbob@lnxserver ~]$ unzip poptotal.zip Archive: poptotal.zip inflating: Metadata_Indicator_API_SP.POP.TOTL_DS2_en_csv_v2_3052518.csv inflating: API_SP.POP.TOTL_DS2_en_csv_v2_3052518.csv inflating: Metadata_Country_API_SP.POP.TOTL_DS2_en_csv_v2_3052518.csv [smithbob@lnxserver ~]$ cat log.log | mail -s "Subject Line" smithbob@company.com [smithbob@lnxserver ~]$ cat log.log | mail -s "Subject Line" -a us_state_mapping.csv smithbob@company.com [smithbob@lnxserver ~]$ lsf *.csv -rw-r--r-- 1 smithbob hdpbob_users 182389 Oct 11 11:07 API_SP.POP.TOTL_DS2_en_csv_v2_3052518.csv [smithbob@lnxserver ~]$ gzip API_SP.POP.TOTL_DS2_en_csv_v2_3052518.csv [smithbob@lnxserver ~]$ lsf *.gz -rw-r--r-- 1 smithbob hdpbob_users 74475 Oct 11 11:07 API_SP.POP.TOTL_DS2_en_csv_v2_3052518.csv.gz [smithbob@lnxserver ~]$ [smithbob@lnxserver ~]$ gzip -9 API_SP.POP.TOTL_DS2_en_csv_v2_3052518.csv [smithbob@lnxserver ~]$ lsf *.gz -rw-r--r-- 1 smithbob hdpbob_users 73619 Oct 11 11:07 API_SP.POP.TOTL_DS2_en_csv_v2_3052518.csv.gz [smithbob@lnxserver ~]$ [smithbob@lnxserver ~]$ gzip -d API_SP.POP.TOTL_DS2_en_csv_v2_3052518.csv.gz [smithbob@lnxserver ~]$ [smithbob@lnxserver ~]$ zip all.zip *.csv adding: API_SP.POP.TOTL_DS2_en_csv_v2_3052518.csv (deflated 59%) adding: Metadata_Country_API_SP.POP.TOTL_DS2_en_csv_v2_3052518.csv (deflated 79%) adding: Metadata_Indicator_API_SP.POP.TOTL_DS2_en_csv_v2_3052518.csv (deflated 41%) adding: 
us_state_mapping.csv (deflated 45%) [smithbob@lnxserver ~]$ cd /tmp [smithbob@lnxserver tmp]$ tar -zvcf bu_pp.tgz /home/smithbob/python_programs /home/smithbob/python_programs/ /home/smithbob/python_programs/myprogram_20211015.log /home/smithbob/python_programs/_archive/ /home/smithbob/python_programs/_archive/zzz /home/smithbob/python_programs/_archive/newfile1 /home/smithbob/python_programs/_archive/newfile2 /home/smithbob/python_programs/_archive/newfile3 /home/smithbob/python_programs/_archive/newfile4A /home/smithbob/python_programs/_archive/newfile4 [smithbob@lnxserver tmp]$ lsf *.tgz -rw-r--r-- 1 smithbob hdpbob_users 375 Oct 25 09:42 bu_pp.tgz [smithbob@lnxserver tmp]$ mv bu_pp.tgz /home/smithbob [smithbob@lnxserver tmp]$ cd [smithbob@lnxserver ~]$ pwd /home/smithbob [smithbob@lnxserver ~]$ lsf *.tgz -rw-r--r-- 1 smithbob hdpbob_users 375 Oct 25 09:42 bu_pp.tgz [smithbob@lnxserver ~]$ [smithbob@lnxserver ~]$ tar -tf bu_pp.tgz home/smithbob/python_programs/ home/smithbob/python_programs/myprogram_20211015.log home/smithbob/python_programs/_archive/ home/smithbob/python_programs/_archive/zzz home/smithbob/python_programs/_archive/newfile1 home/smithbob/python_programs/_archive/newfile2 home/smithbob/python_programs/_archive/newfile3 home/smithbob/python_programs/_archive/newfile4A home/smithbob/python_programs/_archive/newfile4 [smithbob@lnxserver ~]$ [smithbob@lnxserver ~]$ cd [smithbob@lnxserver ~]$ mkdir tmp [smithbob@lnxserver ~]$ cd tmp [smithbob@lnxserver tmp]$ pwd /home/smithbob/tmp [smithbob@lnxserver tmp]$ mv ../bu_pp.tgz . [smithbob@lnxserver tmp]$ ls bu_pp.tgz [smithbob@lnxserver tmp]$ tar -xvf bu_pp.tgz home/smithbob/python_programs/ home/smithbob/python_programs/myprogram_20211015.log home/smithbob/python_programs/_archive/ home/smithbob/python_programs/_archive/zzz home/smithbob/python_programs/_archive/newfile1 home/smithbob/python_programs/_archive/newfile2 home/smithbob/python_programs/_archive/newfile3 home/smithbob/python_programs/_archive/newfile4A home/smithbob/python_programs/_archive/newfile4 [smithbob@lnxserver tmp]$ ls -alFR .: total 24 drwxr-xr-x 3 smithbob hdpbob_users 152 Oct 25 10:03 ./ drwx------ 52 smithbob hdpbob_users 16384 Oct 25 10:02 ../ -rw-r--r-- 1 smithbob hdpbob_users 375 Oct 25 09:42 bu_pp.tgz drwxr-xr-x 3 smithbob hdpbob_users 152 Oct 25 10:03 home/ ./home: total 0 drwxr-xr-x 3 smithbob hdpbob_users 152 Oct 25 10:03 ./ drwxr-xr-x 3 smithbob hdpbob_users 152 Oct 25 10:03 ../ drwxr-xr-x 3 smithbob hdpbob_users 152 Oct 25 10:03 smithbob/ ./home/smithbob: total 0 drwxr-xr-x 3 smithbob hdpbob_users 152 Oct 25 10:03 ./ drwxr-xr-x 3 smithbob hdpbob_users 152 Oct 25 10:03 ../ drwxr-xr-x 3 smithbob hdpbob_users 152 Oct 24 14:13 python_programs/ ./home/smithbob/python_programs: total 16 drwxr-xr-x 3 smithbob hdpbob_users 152 Oct 24 14:13 ./ drwxr-xr-x 3 smithbob hdpbob_users 152 Oct 25 10:03 ../ drwxr-xr-x 2 smithbob hdpbob_users 8192 Oct 23 13:37 _archive/ -rw-r--r-- 1 smithbob hdpbob_users 18 Oct 23 14:06 myprogram_20211015.log ./home/smithbob/python_programs/_archive: total 16 drwxr-xr-x 2 smithbob hdpbob_users 8192 Oct 23 13:37 ./ drwxr-xr-x 3 smithbob hdpbob_users 152 Oct 24 14:13 ../ -rw-r--r-- 1 smithbob hdpbob_users 0 Oct 23 10:24 newfile1 -rw-r--r-- 1 smithbob hdpbob_users 0 Oct 23 13:07 newfile2 -rw-r--r-- 1 smithbob hdpbob_users 0 Oct 23 13:07 newfile3 -rw-r--r-- 1 smithbob hdpbob_users 0 Oct 23 13:08 newfile4 -rw-r--r-- 1 smithbob hdpbob_users 0 Oct 23 13:08 newfile4A -rw-r--r-- 1 smithbob hdpbob_users 4 Oct 23 13:37 zzz [smithbob@lnxserver 
tmp]$ ./home/smithbob/python_programs: total 16 drwxr-xr-x 3 smithbob hdpbob_users 152 Oct 24 14:13 ./ drwxr-xr-x 3 smithbob hdpbob_users 152 Oct 25 10:03 ../ drwxr-xr-x 2 smithbob hdpbob_users 8192 Oct 23 13:37 _archive/ -rw-r--r-- 1 smithbob hdpbob_users 18 Oct 23 14:06 myprogram_20211015.log [smithbob@lnxserver ~]$ cd python_programs/ [smithbob@lnxserver python_programs]$ touch myfile [smithbob@lnxserver python_programs]$ ls -alF total 12 drwxr-xr-x 2 smithbob hdpbob_users 4096 Nov 7 13:17 ./ drwxr-xr-x 45 smithbob hdpbob_users 4096 Nov 7 13:15 ../ -rw-r--r-- 1 smithbob hdpbob_users 0 Nov 7 13:17 myfile [smithbob@lnxserver python_programs]$ [smithbob@lnxserver python_programs]$ chmod +x myfile [smithbob@lnxserver python_programs]$ chmod u=rwx,g=rx,o=r myfile [smithbob@lnxserver python_programs]$ lsf total 12 drwxr-xr-x 2 smithbob hdpbob_users 4096 Nov 7 13:17 ./ drwxr-xr-x 45 smithbob smithbob 4096 Nov 7 13:15 ../ -rwxr-xr-- 1 smithbob hdpbob_users 0 Nov 7 13:17 myfile* [smithbob@lnxserver python_programs]$ [smithbob@lnxserver python_programs]$ chmod u+rwx,g+rx,o+r myfile [smithbob@lnxserver python_programs]$ chmod 754 myfile [smithbob@lnxserver python_programs]$ chmod 400 password_file Chapter 19 - Introduction to the vi Editor select state_code,state_name from prod_schema.dim_us_state_mapping order by state_code; [smithbob@lnxserver ~]$ vi bobsmith.txt ~ ~ ~ ~ ~ ~ ...tilde snip... ~ ~ ~ ~ ~ ~ "bobsmith.txt" [New File] alias lsf="ls -alF" alias rm="rm -i" [smithbob@lnxserver ~]$ source .bash_profile [smithbob@lnxserver ~]$ alias alias lsf='ls -alF' alias rm='rm -i' [smithbob@lnxserver ~]$ set shiftwidth=1 :set --- Options --- cscopetag helplang=en scroll=18 ttyfast window=36 cscopeverbose hlsearch shiftwidth=1 ttymouse=sgr t_Sb=^[[4%dm filetype=text ruler syntax=text viminfo='20,"50 t_Sf=^[[3%dm backspace=indent,eol,start cscopeprg=/usr/bin/cscope fileencodings=ucs-bom,utf-8,latin1 guicursor=n-v-c:block,o:hor50,i-ci:hor15,r-cr:hor30,sm:block,a:blinkon0 Press ENTER or type command to continue Chapter 20 - Working with Linux Scripts [smithbob@lnxserver ~]$ chmod u+x myscript1 #!/bin/bash # Create our log file by using echo with some text and the date command. echo "Program started at `date`." > /home/smithbob/myscript1.log # Append the contents of /proc/cpuinfo to the log file. cat /proc/cpuinfo >> /home/smithbob/myscript1.log # Close out our log file by appending some final text. echo "Program ended at `date`." >> /home/smithbob/myscript1.log exit #!/usr/bin/python [smithbob@lnxserver ~]$ chmod u+x myscript1 [smithbob@lnxserver ~]$ ./myscript1 [smithbob@lnxserver ~]$ head myscript1.log Program started at Mon Nov 8 14:04:42 EST 2021. processor : 0 vendor_id : GenuineIntel cpu family : 6 model : 78 model name : Intel(R) Core(TM) i7-6500U CPU @ 2.50GHz stepping : 3 cpu MHz : 2591.786 cache size : 4096 KB physical id : 0 [smithbob@lnxserver ~]$ tail myscript1.log wp : yes flags : fpu vme de cx8 ...snip... invpcid rdseed clflushopt bugs : bogomips : 5183.57 clflush size : 64 cache_alignment : 64 address sizes : 39 bits physical, 48 bits virtual power management: Program ended at Mon Nov 8 14:04:42 EST 2021. 
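One small, optional variation on myscript1 (a sketch, not a replacement for the version above): hold the log path in a single variable and stamp it with the date formats from Chapter 18, so the log rolls daily and re-pointing it means one edit.

#!/bin/bash
# Illustrative variant: date-stamped log path set once at the top,
# e.g. /home/smithbob/myscript1_20211108.log.
LOGFILE="/home/smithbob/myscript1_$(date +%Y%m%d).log"
echo "Program started at $(date)." > "$LOGFILE"
cat /proc/cpuinfo >> "$LOGFILE"
echo "Program ended at $(date)." >> "$LOGFILE"
exit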
./myscript1 & nohup ./myscript1 & [smithbob@lnxserver ~]$ ./myscript1 & [1] 7120 [smithbob@lnxserver ~]$ [1]+ Done ./myscript1 [smithbob@lnxserver ~]$ nohup ./myscript1 & [1] 7129 [smithbob@lnxserver ~]$ nohup: ignoring input and appending output to `nohup.out' [1]+ Done nohup ./myscript1 [smithbob@lnxserver ~]$ ps PID TTY TIME CMD 5526 pts/0 00:00:00 bash 7781 pts/0 00:00:00 ps [smithbob@lnxserver ~]$ [smithbob@lnxserver ~]$ ps -ef | grep "smithbob" smithbob 5526 5522 0 13:19 pts/0 00:00:00 bash smithbob 7973 5526 0 15:06 pts/0 00:00:00 grep smithbob [smithbob@lnxserver ~]$ [smithbob@lnxserver ~]$ kill 1234 [smithbob@lnxserver ~]$ kill -9 1234 cat /proc/cpuinfo >> /home/smithbob/myscript1.log 2>&1 impala-shell -i hdpserver --database prod_schema -f /home/smith/query1.sql & impala-shell -i hdpserver --database prod_schema -f /home/smith/query2.sql & impala-shell -i hdpserver --database prod_schema -f /home/smith/query3.sql & impala-shell -i hdpserver --database prod_schema -f /home/smith/query4.sql & wait top - 14:56:27 up 3 min, 2 users, load average: 2.13, 1.59, 0.68 Tasks: 270 total, 1 running, 269 sleeping, 0 stopped, 0 zombie Cpu(s): 10.9%us, 3.2%sy, 0.0%ni, 85.6%id, 0.2%wa, 0.0%hi, 0.2%si, 0.0%st Mem: 7911572k total, 3894092k used, 4017480k free, 50620k buffers Swap: 5324796k total, 0k used, 5324796k free, 551488k cached PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND 5161 oracle 20 0 472m 24m 20m S 6.3 0.3 0:00.56 gnome-panel 3372 yarn 20 0 2699m 315m 28m S 3.6 4.1 0:14.18 java 4500 root 20 0 266m 50m 21m S 3.3 0.6 0:03.52 Xorg 5444 oracle 20 0 290m 22m 19m S 2.6 0.3 0:01.04 gnome-terminal 2813 hdfs 20 0 2782m 286m 29m S 2.0 3.7 0:12.38 java 4084 spark 20 0 3532m 307m 42m S 2.0 4.0 0:09.31 java 2695 hdfs 20 0 2750m 234m 28m S 1.3 3.0 0:10.84 java 3149 mapred 20 0 2773m 262m 29m S 1.3 3.4 0:11.97 java 3250 yarn 20 0 2573m 327m 29m S 1.3 4.2 0:11.95 java 4328 impala 20 0 5485m 167m 54m S 1.3 2.2 0:09.30 impalad 3880 hive 20 0 3930m 382m 51m S 1.0 4.9 0:16.40 java 4204 impala 20 0 359m 28m 24m S 0.7 0.4 0:00.50 statestored 4249 impala 20 0 3729m 164m 47m S 0.7 2.1 0:09.02 catalogd 5370 oracle 20 0 296m 18m 16m S 0.7 0.2 0:00.14 multiload-apple 5463 oracle 20 0 15224 2140 1704 R 0.7 0.0 0:00.18 top 7 root 20 0 0 0 0 S 0.3 0.0 0:00.42 rcu_sched 1869 dbus 20 0 30556 2940 2000 S 0.3 0.0 0:00.42 dbus-daemon 2066 root 20 0 22576 1036 912 S 0.3 0.0 0:00.26 hald-addon-inpu top - 14:58:12 up 5 min, 2 users, load average: 0.45, 1.14, 0.62 Tasks: 256 total, 2 running, 254 sleeping, 0 stopped, 0 zombie Cpu0 : 10.4%us, 2.2%sy, 0.0%ni, 87.4%id, 0.0%wa, 0.0%hi, 0.0%si, 0.0%st Cpu1 : 5.7%us, 1.8%sy, 0.0%ni, 92.2%id, 0.4%wa, 0.0%hi, 0.0%si, 0.0%st Mem: 7911572k total, 3895952k used, 4015620k free, 50924k buffers Swap: 5324796k total, 0k used, 5324796k free, 551644k cached sTacoBell="http://www.tacobell.com" sTacoBellLocCnt=7072 echo "The Taco Bell website is $sTacoBell and there are $sTacoBellLocCnt locations worldwide." echo "The Taco Bell website is ${sTacoBell} and there are ${sTacoBellLocCnt} locations worldwide." sYYYY=`date +%Y` echo $sYYYY 2021 sRowCnt=`impala-shell -B --database=default -q 'select count(*) from prod_schema.dim_us_state_mapping;'` echo $sRowCnt 65 if [ condition ] then statements fi if [ condition ] then statements else statements fi if [ condition ] then statements elif [ condition ] then statements elif [ condition ] then statements ... else statements fi iNUM=2 if [ $iNUM -eq 2 ] then echo "MATCHED IT." else echo "DIDN'T MATCH IT." fi MATCHED IT.
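Combining the backtick command substitution and the if syntax just shown, here is a hedged sketch of a row-count sanity check (the threshold of 50 and the alert address are invented; the mail -s form is the one from Chapter 18):

#!/bin/bash
# Illustrative check: capture a row count from Impala, then branch on it.
sRowCnt=`impala-shell -B --database=prod_schema -q 'select count(*) from dim_us_state_mapping;'`
if [ $sRowCnt -lt 50 ]
then
    echo "Row count $sRowCnt looks too low." | mail -s "Row count alert" smithbob@company.com
else
    echo "Row count $sRowCnt looks fine."
fi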
case $var in match1) statement-11 statement-12 ... ;; match2) statement-21 statement-22 ... ;; *) statement-n1 statement-n2 ... ;; esac iNUM=2 case $iNUM in 2) echo "MATCHED IT." ;; *) echo "DIDN'T MATCH IT." ;; esac MATCHED IT. iNUM=2 sSTATUS="GO" if [[ $iNUM -eq 2 && "$sSTATUS" == "GO" ]] then echo "MATCHED AND WE ARE A-GO." else echo "LAUNCH DELAYED." fi MATCHED AND WE ARE A-GO. for iINDX in {1..10} do echo "Index number is currently $iINDX." done Index number is currently 1. Index number is currently 2. Index number is currently 3. Index number is currently 4. Index number is currently 5. Index number is currently 6. Index number is currently 7. Index number is currently 8. Index number is currently 9. Index number is currently 10. for iINDX in {1..10..2} do echo "Index number is currently $iINDX." done Index number is currently 1. Index number is currently 3. Index number is currently 5. Index number is currently 7. Index number is currently 9. for iINDX in {10..1..2} do echo "Index number is currently $iINDX." done Index number is currently 10. Index number is currently 8. Index number is currently 6. Index number is currently 4. Index number is currently 2. iINDX=1 while [ $iINDX -le 10 ] do echo "Index number is currently $iINDX." ((iINDX++)) done iINDX=1 while [ $iINDX -le 10 ] do echo "Index number is currently $iINDX." ((iINDX+=1)) done iINDX=1 while [ $iINDX -le 10 ] do echo "Index number is currently $iINDX." ((iINDX=iINDX+1)) done iINDX=1 until [ $iINDX -gt 10 ] do echo "Index number is currently $iINDX." ((iINDX=iINDX+1)) done iINDX=1 while [ $iINDX -le 10 ] do echo "Index number is currently $iINDX." ((iINDX++)) if [ $iINDX -eq 7 ] then echo "LUCKY NUMBER SEVEN" break fi done for iINDX in {1..10} do if [ $iINDX -eq 7 ] then continue fi echo "Index number is currently $iINDX." done #!/bin/bash sALL_ARGS="$@" sSQLCode="$1" iBEGYYYY="$2" iENDYYYY="$3" echo "Querying database with SQL code $sSQLCode starting from year $iBEGYYYY and ending at year $iENDYYYY." exit [smithbob@lnxserver ~]$ ./querydb update.sql 2020 2021 Querying database with SQL code update.sql starting from year 2020 and ending at year 2021. [smithbob@lnxserver ~]$ ./querydb update.sql `date -d "-1 year" +%Y` `date +%Y` Querying database with SQL code update.sql starting from year 2020 and ending at year 2021. if [ $# -ne 3 ] then echo "SCRIPT CALLED WITH INCORRECT NUMBER OF PARAMETERS...YOU DOLT!" exit fi sYYYY="2020" sMM="06" sYYYYMM="${sYYYY}${sMM}" echo $sYYYYMM 202006 sYYYYMM=`date +%Y-%m` echo $sYYYYMM 2021-12 echo $sYYYYMM | cut -d'-' -f1 2021 echo $sYYYYMM | cut -d'-' -f2 12 sYYYY="`echo $sYYYYMM | cut -d'-' -f1`" echo $sYYYY 2021 sYYYY="$(cut -d'-' -f1 <<< $sYYYYMM)" echo $sYYYY 2021 sTB_LOWER="taco bell" sTB_UPPER="${sTB_LOWER^^}" echo $sTB_UPPER TACO BELL sTB_UPPER="TACO BELL" sTB_LOWER="${sTB_UPPER,,}" echo $sTB_LOWER taco bell #!/bin/bash # All of the quote-enclosed tables listed on the command line. TBLLIST="$1" echo $TBLLIST # Process each table in turn stored in TBLLIST. set -- $TBLLIST while [ $# -gt 0 ] do # Each table name is now known as $1 within the loop. sThisTable="$1" # Since we grabbed the current iteration's table name, we # use shift to remove it from the TBLLIST in preparation # for the next loop. shift echo $sThisTable done exit [smithbob@lnxserver ~]$ ./updateTables "TBL1 TBL2 TBL3 TBL4 TBL5" TBL1 TBL2 TBL3 TBL4 TBL5 TBL1 TBL2 TBL3 TBL4 TBL5 for sThisFile in /home/smithbob/sqlfiles/query*.sql do echo "Processing file: $sThisFile."
done Processing file: /home/smithbob/sqlfiles/query1.sql. Processing file: /home/smithbob/sqlfiles/query2.sql. Processing file: /home/smithbob/sqlfiles/query3.sql. Processing file: /home/smithbob/sqlfiles/query4.sql. Processing file: /home/smithbob/sqlfiles/query5.sql. #!/bin/bash # Create our log file. echo "Program started at `date`." > /home/smithbob/myscript1.log # Append the contents of /proc/cpuinfo to the log file. cat /proc/cpuinfo >> /home/smithbob/myscript1.log # Close out our log file. echo "Program ended at `date`." >> /home/smithbob/myscript1.log exit 0 #!/bin/bash # Create our log file. echo "Program started at `date`." > /home/smithbob/myscript1.log # Append the contents of /proc/cpuinfo to the log file. cat /proc/cpuinfo >> /home/smithbob/myscript1.log if [[ $? -ne 0 ]] then echo "Something has gone horribly wrong! IT'S SIRENHEAD! RUN AWAY!!" exit 1 fi # Close out our log file. echo "Program ended at `date`." >> /home/smithbob/myscript1.log exit 0 [smithbob@lnxserver ~]$ env USERNAME=smithbob PWD=/home/smithbob HOME=/home/smithbob SHELL=/bin/bash PATH=/usr/local/bin:/usr/bin:/bin:/usr/local/sbin:/usr/sbin:/sbin JAVA_HOME=/usr/java/latest CLASSPATH=/usr/lib/hadoop/*:/usr/lib/hadoop/*:. #!/bin/bash # Create our log file. echo "Program started at `date`." > $HOME/myscript1.log # Append the contents of /proc/cpuinfo to the log file. cat /proc/cpuinfo >> $HOME/myscript1.log if [[ $? -ne 0 ]] then echo "Something has gone wrong! RUN AWAY!!" exit 1 fi # Close out our log file. echo "Program ended at `date`." >> $HOME/myscript1.log exit 0 #!/bin/bash # Bring in environment variables. source $HOME/.bash_profile # Create our log file. echo "Program started at `date`." > $HOME/myscript1.log # Append the contents of /proc/cpuinfo to the log file. cat /proc/cpuinfo >> $HOME/myscript1.log if [[ $? -ne 0 ]] then echo "Something has gone wrong! RUN AWAY!!" exit 1 fi # Close out our log file. echo "Program ended at `date`." >> $HOME/myscript1.log exit 0 PATH=$PATH:$HOME/bin export PATH PATH=$PATH:$HOME/bin:. export PATH [smithbob@lnxserver ~]$ source .bash_profile Chapter 21 - Running ImpalaSQL from the Linux Command Line [smithbob@lnxserver ~]$ impala-shell Starting Impala Shell without Kerberos authentication Connected to hdpserver:21000 Server version: impalad version 2.10.0-cdh5.13.1 RELEASE (build 1e4b23c4eb52dac95c5be6316f49685c41783c51) ********************************************************************************** Welcome to the Impala shell. (Impala Shell v2.10.0-cdh5.13.1 (1e4b23c) built on Thu Nov 9 08:29:47 PST 2017) When pretty-printing is disabled, you can use the '--output_delimiter' flag to set the delimiter for fields in the same row. The default is ','. ********************************************************************************** [hdpserver:21000] > [smithbob@lnxserver ~]$ impala-shell -i hdpserver -d prod_schema alias isps='impala-shell -k -i hdpserver -d prod_schema' [smithbob@lnxserver ~]$ impala-shell -h Usage: impala_shell.py [options] Options: -h, --help show this help message and exit -i IMPALAD, --impalad=IMPALAD <host:port> of impalad to connect to [default: hdpserver:21000] -q QUERY, --query=QUERY Execute a query without the shell [default: none] -f QUERY_FILE, --query_file=QUERY_FILE Execute the queries in the query file, delimited by ;. If the argument to -f is "-", then queries are read from stdin and terminated with ctrl-d.
[default: none] -k, --kerberos Connect to a kerberized impalad [default: False] -o OUTPUT_FILE, --output_file=OUTPUT_FILE If set, query results are written to the given file. Results from multiple semicolon-terminated queries will be appended to the same file [default: none] -B, --delimited Output rows in delimited mode [default: False] --print_header Print column names in delimited mode when pretty- printed. [default: False] --output_delimiter=OUTPUT_DELIMITER Field delimiter to use for output in delimited mode [default: \t] -s KERBEROS_SERVICE_NAME, --kerberos_service_name=KERBEROS_SERVICE_NAME Service name of a kerberized impalad [default: impala] -V, --verbose Verbose output [default: True] -p, --show_profiles Always display query profiles after execution [default: False] --quiet Disable verbose output [default: False] -v, --version Print version information [default: False] -c, --ignore_query_failure Continue on query failure [default: False] -r, --refresh_after_connect Refresh Impala catalog after connecting [default: False] -d DEFAULT_DB, --database=DEFAULT_DB Issues a use database command on startup [default: none] -l, --ldap Use LDAP to authenticate with Impala. Impala must be configured to allow LDAP authentication. [default: False] -u USER, --user=USER User to authenticate with. [default: smithbob] --ssl Connect to Impala via SSL-secured connection [default: False] --ca_cert=CA_CERT Full path to certificate file used to authenticate Impala's SSL certificate. May either be a copy of Impala's certificate (for self-signed certs) or the certificate of a trusted third-party CA. If not set, but SSL is enabled, the shell will NOT verify Impala's server certificate [default: none] --config_file=CONFIG_FILE Specify the configuration file to load options. File must have case-sensitive '[impala]' header. Specifying this option within a config file will have no effect. Only specify this as a option in the commandline. [default: /home/oracle/.impalarc] --live_summary Print a query summary every 1s while the query is running. [default: False] --live_progress Print a query progress every 1s while the query is running. [default: False] --auth_creds_ok_in_clear If set, LDAP authentication may be used with an insecure connection to Impala. WARNING: Authentication credentials will therefore be sent unencrypted, and may be vulnerable to attack. [default: none] --ldap_password_cmd=LDAP_PASSWORD_CMD Shell command to run to retrieve the LDAP password [default: none] --var=KEYVAL Define variable(s) to be used within the Impala session. It must follow the pattern "KEY=VALUE", KEY starts with an alphabetic character and contains alphanumeric characters or underscores. [default: none] [hdpserver:21000] > select * from prod_schema.dim_us_state_mapping; Query: select * from prod_schema.dim_us_state_mapping Query submitted at: 2021-12-19 10:54:00 (Coordinator: http://company.com:25000) Query progress can be monitored at: http://hdpserver:25000/query_plan?query_id=3141a066db178d37:42ab345f00000000 +------------+----------------------------------------+ | state_code | state_name | +------------+----------------------------------------+ | MH | MARSHALL ISLANDS | | RI | RHODE ISLAND | | WI | WISCONSIN | | WY | WYOMING | | PA | PENNSYLVANIA | | CZ | PANAMA CANAL ZONE | ...snip... [smithbob@lnxserver ~]$ impala-shell -i hdpserver -d prod_schema -B --output_delimiter=';' [hdpserver:21000] > select * from prod_schema.dim_us_state_mapping; ...snip...
MH;MARSHALL ISLANDS
RI;RHODE ISLAND
WI;WISCONSIN
WY;WYOMING
PA;PENNSYLVANIA
CZ;PANAMA CANAL ZONE
...snip...
[smithbob@lnxserver ~]$ impala-shell -i hdpserver -d prod_schema -B --output_delimiter=';' --quiet --print_header
[hdpserver:21000] > select * from prod_schema.dim_us_state_mapping;
state_code;state_name
MH;MARSHALL ISLANDS
RI;RHODE ISLAND
WI;WISCONSIN
WY;WYOMING
PA;PENNSYLVANIA
CZ;PANAMA CANAL ZONE
...snip...
sRowCnt=`impala-shell -B --database=prod_schema -q 'select count(*) from dim_us_state_mapping;'`
echo $sRowCnt
65
use prod_schema;
drop table if exists states_A purge;
create table states_A stored as parquet as
select state_code,state_name from dim_us_state_mapping where state_code like 'A%';
drop table if exists states_M purge;
create table states_M stored as parquet as
select state_code,state_name from dim_us_state_mapping where state_code like 'M%';
[smithbob@lnxserver ~]$ impala-shell -i hdpserver -d prod_schema -f query1.sql
Starting Impala Shell without Kerberos authentication
Connected to hdpserver:21000
Server version: impalad version 2.10.0-cdh5.13.1 RELEASE (build 1e4b23c4eb52dac95c5be6316f49685c41783c51)
Query: use prod_schema
Query: drop table if exists states_A purge
Query: create table states_A stored as parquet as select state_code,state_name from dim_us_state_mapping where state_code like 'A%'
Query submitted at: 2021-12-19 14:09:08 (Coordinator: http://hdpserver:25000)
Query progress can be monitored at: http://hdpserver:25000/query_plan?query_id=35424326c140647d:2acdc4e600000000
+-------------------+
| summary           |
+-------------------+
| Inserted 8 row(s) |
+-------------------+
Fetched 1 row(s) in 0.52s
Query: drop table if exists states_M purge
Query: create table states_M stored as parquet as select state_code,state_name from dim_us_state_mapping where state_code like 'M%'
Query submitted at: 2021-12-19 14:09:08 (Coordinator: http://hdpserver:25000)
Query progress can be monitored at: http://hdpserver:25000/query_plan?query_id=b44dde19a8b47536:761dede000000000
+--------------------+
| summary            |
+--------------------+
| Inserted 10 row(s) |
+--------------------+
Fetched 1 row(s) in 0.51s
[smithbob@lnxserver ~]$ impala-shell -c -i hdpserver -d prod_schema -f query1.sql
[smithbob@lnxserver ~]$ impala-shell -i hdpserver -d prod_schema -f query2.sql --var "stcd=N"
use prod_schema;
drop table if exists states_${var:stcd} purge;
create table states_${var:stcd} stored as parquet as
select state_code,state_name from dim_us_state_mapping where state_code like '${var:stcd}%';
Starting Impala Shell without Kerberos authentication
Connected to hdpserver:21000
Server version: impalad version 2.10.0-cdh5.13.1 RELEASE (build 1e4b23c4eb52dac95c5be6316f49685c41783c51)
Query: use prod_schema
Query: drop table if exists states_N purge
Query: create table states_N stored as parquet as select state_code,state_name from dim_us_state_mapping where state_code like 'N%'
Query submitted at: 2021-12-19 14:35:42 (Coordinator: http://hdpserver:25000)
Query progress can be monitored at: http://hdpserver:25000/query_plan?query_id=1b4c8750e6265d3c:87d66db900000000
+-------------------+
| summary           |
+-------------------+
| Inserted 8 row(s) |
+-------------------+
Fetched 1 row(s) in 0.64s
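A count captured this way can also drive branching in a wrapper script. A minimal sketch building on the sRowCnt example above; the 65-row threshold is simply the count we happen to expect for this table:
#!/bin/bash
# Capture a scalar query result; -B strips the ASCII-art table borders.
sRowCnt=$(impala-shell -i hdpserver -B --database=prod_schema --quiet \
  -q 'select count(*) from dim_us_state_mapping;' 2>/dev/null)
if [[ $sRowCnt -lt 65 ]]; then
   echo "dim_us_state_mapping looks short ($sRowCnt rows) -- reload it!"
   exit 1
fi
echo "Row count OK: $sRowCnt"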
PART IV - Working with Hadoop
Chapter 22 - Hadoop Commands from Linux (hadoop/hdfs)
hadoop fs -<command>
hadoop fs -ls /data/prod/teams/prod_schema/
hadoop fs -ls -R /data/prod/teams/prod_schema/
hadoop fs -help
Usage: hadoop fs [generic options]
[-appendToFile <localsrc> ... <dst>]
[-cat [-ignoreCrc] <src> ...]
[-checksum <src> ...]
[-chgrp [-R] GROUP PATH...]
[-chmod [-R] <MODE[,MODE]... | OCTALMODE> PATH...]
[-chown [-R] [OWNER][:[GROUP]] PATH...]
[-copyFromLocal [-f] [-p] [-l] [-d] [-t <thread count>] <localsrc> ... <dst>]
[-copyToLocal [-f] [-p] [-ignoreCrc] [-crc] <src> ... <localdst>]
[-count [-q] [-h] [-v] [-t [<storage type>]] [-u] [-x] [-e] [-s] <path> ...]
[-cp [-f] [-p | -p[topax]] [-d] <src> ... <dst>]
[-createSnapshot <snapshotDir> [<snapshotName>]]
[-deleteSnapshot <snapshotDir> <snapshotName>]
[-df [-h] [<path> ...]]
[-du [-s] [-h] [-v] [-x] <path> ...]
[-expunge [-immediate]]
[-find <path> ... <expression> ...]
[-get [-f] [-p] [-ignoreCrc] [-crc] <src> ... <localdst>]
[-getfacl [-R] <path>]
[-getfattr [-R] {-n name | -d} [-e en] <path>]
[-getmerge [-nl] [-skip-empty-file] <src> <localdst>]
[-head <file>]
[-help [cmd ...]]
[-ls [-C] [-d] [-h] [-q] [-R] [-t] [-S] [-r] [-u] [-e] [<path> ...]]
[-mkdir [-p] <path> ...]
[-moveFromLocal [-f] [-p] [-l] [-d] <localsrc> ... <dst>]
[-moveToLocal <src> <localdst>]
[-mv <src> ... <dst>]
[-put [-f] [-p] [-l] [-d] [-t <thread count>] <localsrc> ... <dst>]
[-renameSnapshot <snapshotDir> <oldName> <newName>]
[-rm [-f] [-r|-R] [-skipTrash] [-safely] <src> ...]
[-rmdir [--ignore-fail-on-non-empty] <dir> ...]
[-setfacl [-R] [{-b|-k} {-m|-x <acl_spec>} <path>]|[--set <acl_spec> <path>]]
[-setfattr {-n name [-v value] | -x name} <path>]
[-setrep [-R] [-w] <rep> <path> ...]
[-stat [format] <path> ...]
[-tail [-f] [-s <sleep interval>] <file>]
[-test -[defsz] <path>]
[-text [-ignoreCrc] <src> ...]
[-touch [-a] [-m] [-t TIMESTAMP (yyyyMMdd:HHmmss) ] [-c] <path> ...]
[-touchz <path> ...]
[-truncate [-w] <length> <path> ...]
[-usage [cmd ...]]

-appendToFile <localsrc> ... <dst> :
  Appends the contents of all the given local files to the given dst file. The
  dst file will be created if it does not exist. If <localsrc> is -, then the
  input is read from stdin.

-cat [-ignoreCrc] <src> ... :
  Fetch all files that match the file pattern <src> and display their content
  on stdout.
...snip...
hadoop fs -help ls
-ls [-C] [-d] [-h] [-q] [-R] [-t] [-S] [-r] [-u] [-e] [<path> ...] :
  List the contents that match the specified file pattern. If path is not
  specified, the contents of /user/<currentUser> will be listed. For a
  directory a list of its direct children is returned (unless -d option is
  specified).

  Directory entries are of the form:
    permissions - userId groupId sizeOfDirectory(in bytes) modificationDate(yyyy-MM-dd HH:mm) directoryName

  and file entries are of the form:
    permissions numberOfReplicas userId groupId sizeOfFile(in bytes) modificationDate(yyyy-MM-dd HH:mm) fileName

  -C  Display the paths of files and directories only.
  -d  Directories are listed as plain files.
  -h  Formats the sizes of files in a human-readable fashion rather than a number of bytes.
  -q  Print ? instead of non-printable characters.
  -R  Recursively list the contents of directories.
  -t  Sort files by modification time (most recent first).
  -S  Sort files by size.
  -r  Reverse the order of the sort.
  -u  Use time of last access instead of modification for display and sorting.
  -e  Display the erasure coding policy of files and directories.
hadoop fs -usage ls
Usage: hadoop fs [generic options] -ls [-C] [-d] [-h] [-q] [-R] [-t] [-S] [-r] [-u] [-e] [<path> ...]
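Several of these switches are made for scripting; -test in particular prints nothing and reports only an exit status. A minimal sketch of the pattern, using the staging directory from this chapter:
#!/bin/bash
# -test -d returns 0 when the path exists and is a directory; nothing is printed.
if hadoop fs -test -d /data/prod/teams/prod_schema/tmp_postal_code; then
   echo "Directory exists -- safe to load files into it."
else
   # -p creates intermediate directories, like mkdir -p on Linux.
   hadoop fs -mkdir -p /data/prod/teams/prod_schema/tmp_postal_code
fi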
hadoop fs -mkdir /data/prod/teams/prod_schema/tmp_postal_code hadoop fs -rmdir /data/prod/teams/prod_schema/tmp_postal_code rmdir: `/data/prod/teams/prod_schema/tmp_postal_code': Directory is not empty hadoop fs -rm -R /data/prod/teams/prod_schema/tmp_postal_code 22/03/05 13:40:56 INFO fs.TrashPolicyDefault: Moved: '/data/prod/teams/prod_schema/tmp_postal_code' to trash at: /user/hdfs/.Trash/Current/warehouse/tablespace/external/tmp_postal_code hadoop fs -rm -R -skipTrash /data/prod/teams/prod_schema/tmp_postal_code hadoop fs -mkdir /data/prod/teams/prod_schema/tmp_postal_code hadoop fs -copyFromLocal /home/smithbob/dim_postal_code.tsv /data/prod/teams/prod_schema/tmp_postal_code/ hadoop fs -ls -R /data/prod/teams/prod_schema/tmp_postal_code -rw-r--r-- 3 hdfs supergroup 1784376 2022-03-05 13:54 /data/prod/teams/prod_schema/tmp_postal_code/dim_postal_code.tsv hadoop fs -copyToLocal /data/prod/teams/prod_schema/tmp_postal_code /home/smithbob/tmp_postal_code [smithbob@lnxserver tmp_postal_code]$ lsf total 1748 drwxr-xr-x. 2 smithbob smithbob 33 Mar 5 14:02 ./ drwx------. 18 smithbob smithbob 4096 Mar 5 14:02 ../ -rw-r--r--. 1 smithbob smithbob 1784376 Mar 5 14:02 dim_postal_code.tsv [smithbob@lnxserver tmp_postal_code]$ [smithbob@lnxserver tmp_postal_code]$ hadoop fs -head /data/prod/teams/prod_schema/tmp_postal_code/dim_postal_code.tsv 00623 CABO ROJO PR 18.08643 -67.15222 00633 CAYEY PR 18.194527 -66.18346699999999 00640 COAMO PR 18.077197 -66.359104 00676 MOCA PR 18.37956 -67.08423999999999 00728 PONCE PR 18.013353 -66.65218 00734 PONCE PR 17.999499 -66.643934 00735 CEIBA PR 18.258444 -65.65987 00748 FAJARDO PR 18.326732 -65.652484 00766 VILLALBA PR 18.126023 -66.48208 00771 LAS PIEDRAS PR 18.18744 -65.87088 00791 HUMACAO PR 18.147257 -65.82268999999999 00901 SAN JUAN PR 18.465426 -66.10786 00906 SAN JUAN PR 18.46454 -66.10079 00909 SAN JUAN PR 18.442282 -66.06764 00922 SAN JUAN PR 18.410462 -66.06053300000001 00924 SAN JUAN PR 18.401917 -66.01194 00961 BAYAMON PR 18.412462 -66.16033 01704 FRAMINGHAM MA 42.446396 -71.459405 01731 HANSCOM AFB MA 42.459085 -71.27556 01746 HOLLISTON MA 42.196065 -71.43797000000001 01749 HUDSON MA 42.389813 -71.55791000000001 01770 SHERBORN MA 42.231025 -71.37202000000001 01831 HAVERHILL MA 42.771095 -71.12205400000001 01856 LOWELL MA 42.641779 -71.303488 01908 NAHANT MA 42.427096 -70.92809 01951 NEWBURY MA 4[smithbob@lnxserver tmp_postal_code]$ [smithbob@lnxserver tmp_postal_code]$ hadoop fs -tail /data/prod/teams/prod_schema/tmp_postal_code/dim_postal_code.tsv 596999999999 51541 HENDERSON IA 41.137694 -95.39897000000001 67671 VICTORIA KS 38.861194 -99.15047 30436 LYONS GA 32.177508 -82.30448 30719 DALTON GA 34.801861 -84.989796 37013 ANTIOCH TN 36.055115 -86.64782 26537 KINGWOOD WV 39.472924 -79.69873 57034 HUDSON SD 43.134318 -96.51958999999999 61259 ILLINOIS CITY IL 41.369036 -90.9284 52035 COLESBURG IA 42.662381 -91.18541 52072 SAINT OLAF IA 42.927724 -91.38723 22412 FREDERICKSBURG VA 38.184716 -77.662559 25161 POWELLTON WV 38.084773 -81.31241 35748 GURLEY AL 34.710942 -86.38995 31647 SPARKS GA 31.183567 -83.43559 46374 SAN PIERRE IN 41.204744 -86.90009000000001 57212 ARLINGTON SD 44.377534 -97.13878 57236 GARDEN CITY SD 44.971494 -97.58996 57369 PLATTE SD 43.435193 -98.89387000000001 39532 BILOXI MS 30.462388 -88.93293 39730 ABERDEEN MS 33.833689 -88.55463 28206 CHARLOTTE NC 35.248292 -80.82747999999999 36033 GEORGIANA AL 31.655458 -86.76737 37167 SMYRNA TN 35.968513 -86.52231 62706 SPRINGFIELD IL 39.79885 -89.65339899999999 52352 WALKER IA 42.290421 
-91.77461
hadoop fs -cat /data/prod/teams/prod_schema/tmp_postal_code/dim_postal_code.tsv
hadoop fs -copyFromLocal /home/smithbob/dim_postal_code.tsv /data/prod/teams/prod_schema/tmp_postal_code/dim_postal_code2.tsv
hadoop fs -copyFromLocal /home/smithbob/dim_postal_code.tsv /data/prod/teams/prod_schema/tmp_postal_code/dim_postal_code3.tsv
hadoop fs -ls -R /data/prod/teams/prod_schema/tmp_postal_code
-rw-r--r-- 3 hdfs supergroup 1784376 2022-03-05 13:54 /data/prod/teams/prod_schema/tmp_postal_code/dim_postal_code.tsv
-rw-r--r-- 3 hdfs supergroup 1784376 2022-03-05 14:21 /data/prod/teams/prod_schema/tmp_postal_code/dim_postal_code2.tsv
-rw-r--r-- 3 hdfs supergroup 1784376 2022-03-05 14:22 /data/prod/teams/prod_schema/tmp_postal_code/dim_postal_code3.tsv
hadoop fs -getmerge /data/prod/teams/prod_schema/tmp_postal_code /home/smithbob/dim_postal_code_ALL.tsv
hadoop fs -count -h /data/prod/teams/prod_schema/tmp_postal_code
           1            3              5.1 M /data/prod/teams/prod_schema/tmp_postal_code
   DIR_COUNT   FILE_COUNT       CONTENT_SIZE PATHNAME
           1            3              5.1 M .../tmp_postal_code
hadoop fs -du -h -s -v /data/prod/teams/prod_schema/tmp_postal_code
SIZE   DISK_SPACE_CONSUMED_WITH_ALL_REPLICAS   FULL_PATH_NAME
5.1 M  15.3 M                                  .../tmp_postal_code
hadoop fs -df -h
Filesystem        Size    Used   Available  Use%
hdfs://lnxserver  63.0 G  1.3 G  11.6 G     2%
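Because -du -s prints one parseable line, it lends itself to crude capacity checks. A minimal sketch, where the 10 GB threshold is purely illustrative:
#!/bin/bash
# The first field of `hadoop fs -du -s` is the directory's total size in bytes.
DIR=/data/prod/teams/prod_schema/tmp_postal_code
BYTES=$(hadoop fs -du -s "$DIR" | awk '{print $1}')
LIMIT=$((10 * 1024 * 1024 * 1024))   # 10 GB, an illustrative threshold
if [[ $BYTES -gt $LIMIT ]]; then
   echo "$DIR is over $LIMIT bytes ($BYTES) -- time to clean house."
fi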
Chapter 23 - Working with Managed and External Tables
oFile = open_the_bloody_file(file=file_location + "/" + file_name, file_type="text", file_mode="read_write");
String sLine;
while(!oFile.AT_END_OF_FILE) {
   //Read in a single line from the text file
   sLine = oFile.read_a_bloody_line();
}
String sLine = '';
String sSepChar = '\t';
String sQuoteChar = '"';
String sEscapeChar = '\\';
//Create a function to blow apart the fields in sLine
function deserialize(psLine,psSepChar,psQuoteChar,psEscapeChar) {
   String[] asRowData = psLine.split_apart(psSepChar,psQuoteChar,psEscapeChar);
   ...do something useful with the row of data...
}
while(!oFile.AT_END_OF_FILE) {
   //Read in a single line from the text file
   sLine = oFile.read_a_bloody_line();
   //Process the line by separating out each field based on the delimiter.
   deserialize(sLine,sSepChar,sQuoteChar,sEscapeChar);
}
//Create a function to prep a single row of delimited data
function serialize(pasNewData,psSepChar,psQuoteChar,psEscapeChar) {
   String sPrepped_Line = pasNewData.join_together(psSepChar,psQuoteChar,psEscapeChar);
   return sPrepped_Line;
}
| SerDe Library: | org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe | NULL |
| InputFormat:   | org.apache.hadoop.mapred.TextInputFormat | NULL |
| OutputFormat:  | org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat | NULL |
CREATE EXTERNAL TABLE PROD_SCHEMA.TMP_STATE_DATA(
  STATE_CODE STRING,
  STATE_NAME STRING
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
WITH SERDEPROPERTIES (
  "separatorChar" = ",",
  "quoteChar" = "\""
)
STORED AS TEXTFILE
LOCATION '/data/prod/teams/prod_schema/tmp_state_data'
TBLPROPERTIES('skip.header.line.count'='1');
private char separatorChar;
private char quoteChar;
private char escapeChar;
public static final String SEPARATORCHAR = "separatorChar";
public static final String QUOTECHAR = "quoteChar";
public static final String ESCAPECHAR = "escapeChar";
@Override
public void initialize(final Configuration conf, final Properties tbl) throws SerDeException {
  final List<String> columnNames = Arrays.asList(tbl.getProperty(serdeConstants.LIST_COLUMNS).split(","));
  numCols = columnNames.size();
  final List<ObjectInspector> columnOIs = new ArrayList<ObjectInspector>(numCols);
  for (int i = 0; i < numCols; i++) {
    columnOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
  }
  inspector = ObjectInspectorFactory.getStandardStructObjectInspector(columnNames, columnOIs);
  outputFields = new String[numCols];
  row = new ArrayList<String>(numCols);
  for (int i = 0; i < numCols; i++) {
    row.add(null);
  }
  separatorChar = getProperty(tbl, SEPARATORCHAR, CSVWriter.DEFAULT_SEPARATOR);
  quoteChar = getProperty(tbl, QUOTECHAR, CSVWriter.DEFAULT_QUOTE_CHARACTER);
  escapeChar = getProperty(tbl, ESCAPECHAR, CSVWriter.DEFAULT_ESCAPE_CHARACTER);
}
public static final String SEPARATORCHAR = "separatorChar";
public static final String QUOTECHAR = "quoteChar";
public static final String ESCAPECHAR = "escapeChar";
WITH SERDEPROPERTIES (
  "separatorChar" = ",",
  "quoteChar" = "\"",
  "escapeChar" = "\\"
)
/** The character used for escaping quotes. */
public static final char DEFAULT_ESCAPE_CHARACTER = '"';
/** The default separator to use if none is supplied to the constructor. */
public static final char DEFAULT_SEPARATOR = ',';
/**
 * The default quote character to use if none is supplied to the
 * constructor.
 */
public static final char DEFAULT_QUOTE_CHARACTER = '"';
ROW FORMAT SERDE 'serde-format-Java-classname'
WITH SERDEPROPERTIES ( ... )
STORED AS INPUTFORMAT 'input-format-Java-classname'
OUTPUTFORMAT 'output-format-Java-classname'
ROW FORMAT SERDE 'com.company.bob.smith.is.great.PNGSerde'
WITH SERDEPROPERTIES ( 'fontFamily'='Courier', 'fontSize'='10')
STORED AS INPUTFORMAT 'com.company.bob.smith.is.great.PNGInputFormat'
OUTPUTFORMAT 'com.company.bob.smith.is.great.PNGOutputFormat'
CREATE EXTERNAL TABLE database_name.table_name (
  column_name_1 data_type_1 COMMENT 'column comment 1',
  column_name_2 data_type_2 COMMENT 'column comment 2',
  ...
  column_name_n data_type_n COMMENT 'column comment n'
)
PARTITIONED BY (
  column_name_p1 data_type_p1 COMMENT 'column comment p1',
  column_name_p2 data_type_p2 COMMENT 'column comment p2',
  ...
  column_name_pk data_type_pk COMMENT 'column comment pk'
)
SORT BY (column_name_i, column_name_j, ...)
COMMENT 'table-comment'
ROW FORMAT row-format
WITH SERDEPROPERTIES (
  'key-1'='value-1',
  'key-2'='value-2',
  ...
  'key-m'='value-m'
)
STORED AS storage-format
LOCATION 'HDFS-path-to-data-file-directory'
CACHED IN 'cache-pool-name' WITH REPLICATION = replication-value | UNCACHED
TBLPROPERTIES (
  'key-1'='value-1',
  'key-2'='value-2',
  ...
  'key-r'='value-r'
)
;
DELIMITED FIELDS TERMINATED BY 'char' ESCAPED BY 'char' LINES TERMINATED BY 'char'
SERDE 'serde-format-Java-classname'
WITH SERDEPROPERTIES ( ... )
STORED AS INPUTFORMAT 'input-format-Java-classname'
OUTPUTFORMAT 'output-format-Java-classname'
ROW FORMAT SERDE 'com.company.bob.smith.is.great.PNGSerde'
WITH SERDEPROPERTIES ( 'fontFamily'='Courier', 'fontSize'='10')
STORED AS INPUTFORMAT 'com.company.bob.smith.is.great.PNGInputFormat'
OUTPUTFORMAT 'com.company.bob.smith.is.great.PNGOutputFormat'
state_code,state_name
aa,u.s. armed forces - americas
ae,u.s. armed forces - europe
ak,alaska
al,alabama
ap,u.s. armed forces - pacific
ar,arkansas
as,american samoa
az,arizona
ca,california
co,colorado
...snip...
hadoop fs -mkdir /user/hive/warehouse/tmp_us_state_mapping
hadoop fs -copyFromLocal /home/smithbob/us_state_mapping.csv /user/hive/warehouse/tmp_us_state_mapping/tmp_us_state_mapping.csv
hadoop fs -ls -R /user/hive/warehouse/tmp_us_state_mapping
create external table prod_schema.tmp_us_state_mapping(state_code string, state_name string)
row format delimited fields terminated by ','
stored as textfile
location '/user/hive/warehouse/tmp_us_state_mapping'
tblproperties('skip.header.line.count'='1');
create external table prod_schema.tmp_us_state_mapping(state_code string, state_name string)
row format serde 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
with serdeproperties ( "separatorChar" = ",", "quoteChar" = "\"" )
stored as textfile
location '/user/hive/warehouse/tmp_us_state_mapping'
tblproperties('skip.header.line.count'='1');
[hdpserver:21000] prod_schema> select *
                             > from tmp_us_state_mapping;
+------------+----------------------------------------+
| state_code | state_name                             |
+------------+----------------------------------------+
| aa         | u.s. armed forces - americas           |
| ae         | u.s. armed forces - europe             |
...snip...
| wv         | west virginia                          |
| wy         | wyoming                                |
+------------+----------------------------------------+
TBLPROPERTIES (
  'key-1'='value-1',
  'key-2'='value-2',
  ...
  'key-r'='value-r'
)
tblproperties('skip.header.line.count'='1');
tblproperties('serialization.null.format'=' ');
tblproperties('bob.smith.deserves.a.pay.raise'='true');
public static final String INPUT_REGEX = "input.regex";
public static final String INPUT_REGEX_CASE_SENSITIVE = "input.regex.case.insensitive";
row format serde 'org.apache.hadoop.hive.serde2.RegexSerDe'
with serdeproperties (
  "input.regex" = "...",
  "input.regex.case.insensitive" = "false"
)
"BOB SMITH" 123-45-6780 822-6235 212A
"PEG SMITH" 123-45-6781 822-6236 212B
"JOE SMITH" 123-45-6782 822-6237 212C
"KAT SMITH" 123-45-6783 822-6238 212D
hadoop fs -mkdir /user/hive/warehouse/tmp_team_info
hadoop fs -copyFromLocal /home/smithbob/team_info.txt /user/hive/warehouse/tmp_team_info/tmp_team_info.txt
hadoop fs -ls -R /user/hive/warehouse/tmp_team_info
CREATE EXTERNAL TABLE tmp_team_info(team_member_name string, team_member_ss_nbr string, team_member_phone string, team_member_office_number string)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe'
WITH SERDEPROPERTIES ('input.regex'= '^["]{1}(.*)["]{1} (\\d{3}-\\d{2}-\\d{4}) (\\d{3}-\\d{4}) (\\d{3}[A-Z]{1})$')
STORED AS TEXTFILE
LOCATION '/user/hive/warehouse/tmp_team_info';
+------------------+--------------------+-------------------+---------------------------+
| team_member_name | team_member_ss_nbr | team_member_phone | team_member_office_number |
+------------------+--------------------+-------------------+---------------------------+
| BOB SMITH        | 123-45-6780        | 822-6235          | 212A                      |
| PEG SMITH        | 123-45-6781        | 822-6236          | 212B                      |
| JOE SMITH        | 123-45-6782        | 822-6237          | 212C                      |
| KAT SMITH        | 123-45-6783        | 822-6238          | 212D                      |
+------------------+--------------------+-------------------+---------------------------+
{"team_member_name":"BOB SMITH","team_member_ss_nbr":"123-45-6780","team_member_phone":"822-6235","team_member_office_number":"212A"}
{"team_member_name":"PEG SMITH","team_member_ss_nbr":"123-45-6781","team_member_phone":"822-6236","team_member_office_number":"212B"}
{"team_member_name":"JOE SMITH","team_member_ss_nbr":"123-45-6782","team_member_phone":"822-6237","team_member_office_number":"212C"}
{"team_member_name":"KAT SMITH","team_member_ss_nbr":"123-45-6783","team_member_phone":"822-6238","team_member_office_number":"212D"}
CREATE EXTERNAL TABLE tmp_team_info_json(team_member_name string, team_member_ss_nbr string, team_member_phone string, team_member_office_number string)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.JsonSerDe'
STORED AS TEXTFILE
LOCATION '/user/hive/warehouse/tmp_team_info_json';
+------------------+--------------------+-------------------+---------------------------+
| team_member_name | team_member_ss_nbr | team_member_phone | team_member_office_number |
+------------------+--------------------+-------------------+---------------------------+
| BOB SMITH        | 123-45-6780        | 822-6235          | 212A                      |
| PEG SMITH        | 123-45-6781        | 822-6236          | 212B                      |
| JOE SMITH        | 123-45-6782        | 822-6237          | 212C                      |
| KAT SMITH        | 123-45-6783        | 822-6238          | 212D                      |
+------------------+--------------------+-------------------+---------------------------+
CREATE EXTERNAL TABLE prod_schema.output_textfile(col1 string,...)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS TEXTFILE
TBLPROPERTIES('serialization.null.format'='', 'external.table.purge'='true');
INSERT INTO prod_schema.output_textfile
SELECT COL1,
       CAST(COL2 AS STRING) AS COL2,
       CAST(COL3 AS STRING FORMAT 'yyyy-mm-dd') AS COL3
FROM FINAL_DATA_FOR_CLIENT;
desc formatted prod_schema.output_textfile;
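desc formatted is also the quickest way to confirm from a script which SerDe and file formats a table actually ended up with. A minimal sketch; the grep pattern simply picks the storage rows out of the verbose output:
#!/bin/bash
# Pull only the SerDe/format rows out of the verbose desc formatted output.
impala-shell -i hdpserver -d prod_schema -B --quiet \
  -q 'desc formatted output_textfile;' 2>/dev/null |
  grep -Ei 'SerDe Library|InputFormat|OutputFormat'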
hadoop fs -getmerge /data/prod/teams/prod_schema/output_textfile /home/smithbob/output_textfile.tsv
#!/bin/bash -v
#*----------------------------------------------------------------------------*
#* Program:      tableExporter                                                *
#* Author(s):    Bob Smith                                                    *
#* Date:         July 1, 2022                                                 *
#*                                                                            *
#* Application:  Table exporter.                                              *
#*                                                                            *
#* Abstract:     This script exports one or more tables located in the       *
#*               schema prod_schema and creates corresponding tables stored  *
#*               as TEXTFILE in the schema prod_schema_export. Delimited     *
#*               text files will be located in /tmp/prod_schema_export.      *
#*                                                                            *
#* Assumptions:  1. Requested tables are located in prod_schema.             *
#*               2. Exported tables are located in prod_schema_export.       *
#*               3. Exported delimited file(s) will be stored in             *
#*                  /tmp/prod_schema_export.                                 *
#*               4. Log files are located in /tmp/prod_schema_export/logs.   *
#*                                                                            *
#* Parameters:   1. Delimiter                                                *
#*               2. E-Mail Address                                           *
#*               3. Tables to export                                         *
#*                                                                            *
#* Input(s):     Hadoop table(s) in prod_schema.                             *
#*                                                                            *
#* Output(s):    Delimited text files.                                       *
#*                                                                            *
#* Example:      ./tableExporter ";"                                         *
#*                               "mikethesalesguy@company.com"               *
#*                               "dim_postal_code dim_us_state_mapping"      *
#*                                                                            *
#* Notes:        1. Must use the external.table.purge property on the        *
#*                  CREATE EXTERNAL TABLE SQL code below:                    *
#*                                                                            *
#*                  TBLPROPERTIES('external.table.purge'='true')             *
#*                                                                            *
#*                  This will allow the DROP TABLE PURGE to completely       *
#*                  remove the files as well as the directory from HDFS.     *
#*               2. Ensure EXPORT_DIR exists beforehand!                     *
#*               3. Always boil water in a time of war.                      *
#*                                                                            *
#* Modification History:                                                     *
#*  Date      Prog  Mod #  Reason                                            *
#*  --------- ----  -----  --------------------------------------------      *
#*                                                                            *
#*----------------------------------------------------------------------------*

#*----------------------------------------------------------------------------*
#* Check that the number of incoming arguments is correct.                    *
#*----------------------------------------------------------------------------*
if [ "$#" != "3" ]
then
   echo "tableExporter: Wrong number of parameters provided on the command line."
   echo ""
   echo "Syntax:"
   echo "   tableExporter delimiter-in-quotes email-address-in-quotes space-delimited-list-of-tables-in-quotes"
   echo ""
   echo "Note: Exported files are located in the directory /tmp/prod_schema_export."
   echo "      Log files are located in the directory /tmp/prod_schema_export/logs."
   exit 1
fi

#*----------------------------------------------------------------------------*
#* Initialize variables used throughout the script.                           *
#*----------------------------------------------------------------------------*
#* Directory where the delimited files will be stored *
EXPORTDIR="/tmp/prod_schema_export"
echo "EXPORT DIRECTORY: $EXPORTDIR"
#* Directory where the log files will be placed *
LOGDIR="/tmp/prod_schema_export/logs"
echo "LOG DIRECTORY: $LOGDIR"
#* Argument count *
ARGCNT=$#
echo "NUMBER OF ARGUMENTS: $ARGCNT"
#* Requested delimiter *
DLM=$1
echo "DELIMITER: $DLM"
#* E-Mail address *
EMAIL=$2
echo "E-MAIL: $EMAIL"
#* List of tables to export *
TBLLIST=$3
echo "TABLES TO EXPORT: $TBLLIST"
#* Create input schema *
INSCHEMA="prod_schema"
echo "INPUT SCHEMA: $INSCHEMA"
#* Create output schema *
OUTSCHEMA="prod_schema_export"
TARGETDB=$OUTSCHEMA
echo "OUTPUT SCHEMA: $OUTSCHEMA"
echo "TARGET DB: $TARGETDB.db"
#* Create the log file name (LOGDIR was already set above) *
DT="`date +%Y%m%d%H%M`"
LOGFILE="$LOGDIR/$DT.log"
echo "LOG FILE: $LOGFILE"

#*----------------------------------------------------------------------------*
#* Produce some nice looking output to justify our enormous salary.           *
#*----------------------------------------------------------------------------*
echo "Table Exporter" > $LOGFILE
echo "   Run Date/Time:     `date`" >> $LOGFILE
echo "   Delimiter:         $DLM" >> $LOGFILE
echo "   E-Mail:            $EMAIL" >> $LOGFILE
echo "   Source DB:         $INSCHEMA" >> $LOGFILE
echo "   Target DB:         $TARGETDB" >> $LOGFILE
echo "   Export Table List: $TBLLIST" >> $LOGFILE

#*----------------------------------------------------------------------------*
#* Create upper- and lowercased versions of the INSCHEMA.                     *
#*----------------------------------------------------------------------------*
DB_LC=${INSCHEMA,,}
DB_UC=${INSCHEMA^^}

#*----------------------------------------------------------------------------*
#* Create a subdirectory which will hold the delimited files.                 *
#* Note: You could supplement the code with a unique project number or other  *
#*       identifier in order to keep the files separated.                     *
#* Since both /tmp/prod_schema_export/logs and /tmp/prod_schema_export have   *
#* been created up-front, no need to run the code below.                      *
#*----------------------------------------------------------------------------*
#CREATESUBDIR="mkdir $EXPORTDIR"
#echo "   Output Subdirectory: $CREATESUBDIR" >> $LOGFILE
#eval $CREATESUBDIR

#*----------------------------------------------------------------------------*
#* Loop through the list of requested tables processing each one at a time.   *
#*----------------------------------------------------------------------------*
set -- $TBLLIST
while [ $# -gt 0 ]
do
   #*-------------------------------------------------------------------------*
   #* Pull in the first table as well as create upper- and lowercased         *
   #* versions.                                                               *
   #*-------------------------------------------------------------------------*
   TBL="$1"
   TBL_LC=${TBL,,}
   TBL_UC=${TBL^^}
   echo "------------------------------------------------------------------------------" >> $LOGFILE
   echo "   Exporting the following table: $TBL" >> $LOGFILE
   shift

   #*-------------------------------------------------------------------------*
   #* Create SQL to pull in the column names/data types for the current       *
   #* table. Execute this code using impala-shell and the -q switch.          *
   #*-------------------------------------------------------------------------*
   SQL_TBLDEFN="
   SELECT LOWER(ALL_COL_INFO) AS ALL_COLL_INFO
   FROM (
     SELECT TABLE_NAME,GROUP_CONCAT(COL_INFO,',') AS ALL_COL_INFO
     FROM (
       SELECT TABLE_NAME,CONCAT_WS(' ',TRIM(COLUMN_NAME),TRIM(DATA_TYPE)) AS COL_INFO
       FROM (
         SELECT TABLE_NAME,COLUMN_NAME,DATA_TYPE
         FROM ALL_TAB_COLUMNS
         WHERE UPPER(DATABASE_NAME)='$DB_UC' AND UPPER(TABLE_NAME)='$TBL_UC'
         ORDER BY TABLE_NAME,COLUMN_ID
         LIMIT 1000000
       ) A
     ) B
     GROUP BY TABLE_NAME
   ) C;
   "
   echo "   SQL_TBLDEFN: $SQL_TBLDEFN" >> $LOGFILE
   COLDEFN=`impala-shell -i lnxserver --database=$DB_UC -B --quiet -q "$SQL_TBLDEFN" 2>/dev/null`
   echo "   COLDEFN: $COLDEFN" >> $LOGFILE

   #*-------------------------------------------------------------------------*
   #* Create similar SQL to the above, but for use with the INSERT statement. *
   #* Execute this code using impala-shell and the -q switch.                 *
   #*-------------------------------------------------------------------------*
   SQL_TBLDEFN_INSERT="
   SELECT LOWER(ALL_COL_INFO) AS ALL_COLL_INFO
   FROM (
     SELECT TABLE_NAME,GROUP_CONCAT(COL_INFO,',') AS ALL_COL_INFO
     FROM (
       SELECT TABLE_NAME,TRIM(COLUMN_NAME) AS COL_INFO
       FROM (
         SELECT TABLE_NAME,COLUMN_NAME
         FROM ALL_TAB_COLUMNS
         WHERE UPPER(DATABASE_NAME)='$DB_UC' AND UPPER(TABLE_NAME)='$TBL_UC'
         ORDER BY TABLE_NAME,COLUMN_ID
         LIMIT 1000000
       ) A
     ) B
     GROUP BY TABLE_NAME
   ) C;
   "
   echo "   SQL_TBLDEFN_INSERT: $SQL_TBLDEFN_INSERT" >> $LOGFILE
   COLDEFN_INSERT=`impala-shell -i lnxserver --database=$DB_UC -B --quiet -q "$SQL_TBLDEFN_INSERT" 2>/dev/null`
   echo "   COLDEFN_INSERT: $COLDEFN_INSERT" >> $LOGFILE

   #*-------------------------------------------------------------------------*
   #* Drop the external table if it already exists.                           *
   #*-------------------------------------------------------------------------*
   SQL_DROP="DROP TABLE IF EXISTS $TARGETDB.$TBL_UC PURGE;"
   echo "   SQL_DROP: $SQL_DROP" >> $LOGFILE
   impala-shell -i lnxserver --database=$DB_UC -B --quiet -q "$SQL_DROP" >> $LOGFILE 2>&1

   #*-------------------------------------------------------------------------*
   #* Create the external table for this iteration's table.                   *
   #*-------------------------------------------------------------------------*
   SQL_CREATE_EXTERNAL_TABLE="
   CREATE EXTERNAL TABLE $TARGETDB.$TBL_LC($COLDEFN)
   ROW FORMAT DELIMITED FIELDS TERMINATED BY '$DLM'
   STORED AS TEXTFILE
   TBLPROPERTIES('serialization.null.format'=' ', 'external.table.purge'='true');
   "
   echo "   SQL_CREATE_EXTERNAL_TABLE: $SQL_CREATE_EXTERNAL_TABLE" >> $LOGFILE
   impala-shell -i lnxserver --database=$DB_UC -B --quiet -q "$SQL_CREATE_EXTERNAL_TABLE" >> $LOGFILE 2>&1

   #*-------------------------------------------------------------------------*
   #* Insert data into the external table.                                    *
   #*-------------------------------------------------------------------------*
   SQL_INSERT="
   INSERT INTO $TARGETDB.$TBL_UC
   SELECT $COLDEFN_INSERT
   FROM $DB_UC.$TBL_UC;
   "
   echo "   SQL_INSERT: $SQL_INSERT" >> $LOGFILE
   impala-shell -i lnxserver --database=$DB_UC -B --quiet -q "$SQL_INSERT" >> $LOGFILE 2>&1

   #*-------------------------------------------------------------------------*
   #* Create the delimited text file from the external table files in HDFS.   *
   #*-------------------------------------------------------------------------*
   CREATE_TEXT_FILE="hadoop fs -getmerge hdfs://lnxserver.com:8020/warehouse/tablespace/external/hive/$TARGETDB.db/$TBL_LC $EXPORTDIR/$TBL_LC.txt"
   eval $CREATE_TEXT_FILE

   #*-------------------------------------------------------------------------*
   #* Generate the headers for the text file.                                 *
   #*-------------------------------------------------------------------------*
   SQL_HEADER="
   SELECT LOWER(ALL_COL_HEADER) AS ALL_COL_HEADER
   FROM (
     SELECT TABLE_NAME,GROUP_CONCAT(COL_INFO,'$DLM') AS ALL_COL_HEADER
     FROM (
       SELECT TABLE_NAME,CONCAT_WS(' ',TRIM(COLUMN_NAME)) AS COL_INFO
       FROM (
         SELECT TABLE_NAME,COLUMN_NAME
         FROM ALL_TAB_COLUMNS
         WHERE UPPER(DATABASE_NAME)='$DB_UC' AND UPPER(TABLE_NAME)='$TBL_UC'
         ORDER BY TABLE_NAME,COLUMN_ID
         LIMIT 1000000
       ) A
     ) B
     GROUP BY TABLE_NAME
   ) C;
   "
   COLHDR=`impala-shell -i lnxserver --database=$DB_UC -B --quiet -q "$SQL_HEADER" 2>/dev/null`
   echo "   Column Headers: $COLHDR" >> $LOGFILE

   #*-------------------------------------------------------------------------*
   #* Create the final text file with the column headers on line one using    *
   #* sed.                                                                    *
   #*-------------------------------------------------------------------------*
   TBL_FINAL_0="sed -e '1i\\"
   TBL_FINAL="$TBL_FINAL_0$COLHDR' $EXPORTDIR/$TBL_LC.txt > $EXPORTDIR/$TBL_LC.dlm"
   echo "   TBL_FINAL: $TBL_FINAL" >> $LOGFILE
   eval $TBL_FINAL

   #*-------------------------------------------------------------------------*
   #* Remove unneeded files.                                                  *
   #*-------------------------------------------------------------------------*
   rm -f $EXPORTDIR/$TBL_LC.txt
   rm -f $EXPORTDIR/.$TBL_LC.txt.crc
done

#*----------------------------------------------------------------------------*
#* Count the rows in each exported table as a sanity check for the user.      *
#*----------------------------------------------------------------------------*
echo "   Row Counts: " >> $LOGFILE
wc -l $EXPORTDIR/*.dlm >> $LOGFILE
echo "" >> $LOGFILE

#*----------------------------------------------------------------------------*
#* Produce a useful message for the user.                                     *
#*----------------------------------------------------------------------------*
echo "   Your table(s) have been exported and are located in $EXPORTDIR." >> $LOGFILE
echo "   After FTP'ing your files over, please delete them from the server tout de suite!" >> $LOGFILE

#*----------------------------------------------------------------------------*
#* E-Mail the user as well as the programmer of this script.                  *
#*----------------------------------------------------------------------------*
cat $LOGFILE | mail -s "Export Complete" $EMAIL
cat $LOGFILE | mail -s "Export Complete" smithbob@company.com
exit
[smithbob@lnxserver ~]$ ./tableExporter ";" "mikethesalesguy@company.com" "dim_postal_code dim_us_state_mapping"
[smithbob@lnxserver test]$ cd /tmp/prod_schema_export/
[smithbob@lnxserver prod_schema_export]$ lsf
total 1828
drwxrwxr-x.   3 smithbob smithbob      77 Apr 24 10:46 ./
drwxrwxrwt. 600 root     root       61440 Apr 24 10:48 ../
-rw-rw-r--.   1 smithbob smithbob 1784423 Apr 24 10:46 dim_postal_code.dlm
-rw-rw-r--.   1 smithbob smithbob    1011 Apr 24 10:46 dim_us_state_mapping.dlm
drwxrwxr-x.   2 smithbob smithbob      30 Apr 24 10:46 logs/
[smithbob@lnxserver prod_schema_export]$
Row Counts:
  43690 /tmp/prod_schema_export/dim_postal_code.dlm
     66 /tmp/prod_schema_export/dim_us_state_mapping.dlm
  43756 total
Chapter 24 - The Impala Queries Webpages
[hdpserver.com:21000] prod_schema> create table state_code_jamboree stored as parquet as
select A.state_code as state_code_A,B.state_code as state_code_B
from dim_us_state_mapping A cross join dim_us_state_mapping B;
Query progress can be monitored at: http://lnxserver:25000/query_plan?query_id=2c47fffa33cb35e3:4083c43700000000
+----------------------+
| summary              |
+----------------------+
| Inserted 4225 row(s) |
+----------------------+
Fetched 1 row(s) in 0.52s
PART V - HPL/SQL Procedural Language
Chapter 25 - Introduction to HPL/SQL
hplsql -f hplsql_program_file.hplsql --define parm1='value1' --define parm2='value2' ... --define parmN='valueN'
[smithbob@lnxserver ~]$ hplsql -f integrate_x2.hplsql --define pX0=0 --define pX1=10 --define pN=1000000
hplsql -e '...some SQL code...' --define parm1='value1'
[smithbob@lnxserver ~]$ hplsql -e "SUBSTR('ABC',x0,x1)" --define x0=1 --define x1=2
SLF4J: Class path contains multiple SLF4J bindings.
SLF4J: Found binding in [jar:file:/opt/cloudera/parcels/CDH-7.1.7-1.cdh7.1.7.p0.15945976/jars/log4j-slf4j-impl-2.13.3.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/opt/cloudera/parcels/CDH-7.1.7-1.cdh7.1.7.p0.15945976/jars/slf4j-log4j12-1.7.30.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation.
SLF4J: Actual binding is of type [org.apache.logging.slf4j.Log4jLoggerFactory]
WARNING: Use "yarn jar" to launch YARN applications.
SLF4J: Class path contains multiple SLF4J bindings.
SLF4J: Found binding in [jar:file:/opt/cloudera/parcels/CDH-7.1.7-1.cdh7.1.7.p0.15945976/jars/log4j-slf4j-impl-2.13.3.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/opt/cloudera/parcels/CDH-7.1.7-1.cdh7.1.7.p0.15945976/jars/slf4j-log4j12-1.7.30.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation.
SLF4J: Actual binding is of type [org.apache.logging.slf4j.Log4jLoggerFactory]
AB <-- RESULTS ARE HERE!!
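All of that SLF4J and YARN chatter arrives on stderr, so a wrapper script can discard it and keep only the answer on stdout. A minimal sketch of the pattern:
#!/bin/bash
# Keep stdout (the result), discard the logging noise on stderr.
result=$(hplsql -e "SUBSTR('ABC',x0,x1)" --define x0=1 --define x1=2 2>/dev/null)
echo "Got back: $result"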
hplsql -f hplsql_program_file.hplsql --define parm1=''
[smithbob@lnxserver ~]$ hplsql -f pgm1.hplsql --define beg_yyyymm=`date -d "-11 month" +%Y%m` --define end_yyyymm=`date +%Y%m`
hplsql.conn.impala com.cloudera.impala.jdbc4.Driver;jdbc:impala://hdpserver:21050/default;smithbob;PaSsWoRd123; Impala JDBC Connection
set hplsql.conn.default=impala;
Open connection: jdbc:impala://hdpserver:21050/default (178 ms)
hplsql.conn.hive2conn org.apache.hive.jdbc.HiveDriver;jdbc:hive2://ip-address:10000;user;password; HiveServer2 JDBC connection
hplsql.conn.mysqlconn com.mysql.jdbc.Driver;jdbc:mysql://ip-address/schema?serverTimezone=UTC;user;password; MySQL connection
hplsql.conn.impala com.cloudera.impala.jdbc4.Driver;jdbc:impala://hdpserver:21050/prod_schema;AuthMech=1;KrbRealm=COMPANY.COM;KrbHostFQDN=hdpserver.company.com;KrbServiceName=impala; Impala JDBC Connection
Chapter 26 - HPL/SQL Syntax
* BOOL - Same as BOOLEAN
* INT8 - Same as BIGINT
* INT4/INTEGER/PLS_INTEGER/SIMPLE_INTEGER/BINARY_INTEGER - Same as INT
* INT2 - Same as SMALLINT
* BINARY_DOUBLE/DOUBLE PRECISION/SIMPLE_DOUBLE - Same as DOUBLE
* BINARY_FLOAT/SIMPLE_FLOAT - Same as FLOAT/REAL
* DATETIME - Same as TIMESTAMP
* NUMBER(p,s)/NUMERIC(p,s) - Same as DECIMAL(p,s)
* CHARACTER(n) - Same as CHAR(n)
* NCHAR(n)/NVARCHAR(n)/VARCHAR(n)/VARCHAR2(n)/VARCHAR(max) - Same as STRING
...crap goes here...
DECLARE
...more crap goes here...
BEGIN
...even more crap goes here...
END;
...don't usually crap here...
set hplsql.conn.default=impala;
DECLARE
  iCNT int;
  sSQL string := 'SELECT COUNT(*) AS CNT FROM PROD_SCHEMA.DIM_POSTAL_CODE';
BEGIN
  EXECUTE(sSQL) INTO iCNT;
  DBMS_OUTPUT.PUT_LINE(iCNT);
END;
CREATE OR REPLACE FUNCTION function_name(IN pARG1 datatype1,...)
RETURNS datatype
AS
...variable declarations...
BEGIN
...your code here...
RETURN variable;
END;
PRINT MYFTN('TESTING 1, 2, 3!');
CREATE OR REPLACE PROCEDURE procedure_name(direction1 pARG1 datatype1,...)
AS
...variable declarations...
BEGIN
...your code here...
END;
CALL MYPROC('TESTING 1, 2, 3!');
set hplsql.conn.default=impala;
include /directory/location/hplsql/files/myproc.hplsql
declare
begin
  call myproc('TESTING 1, 2, 3!');
end;
IF boolean-expression THEN
   ...statements...
END IF;
IF boolean-expression THEN
   ...statements...
ELSEIF boolean-expression-1 THEN
   ...statements...
ELSEIF boolean-expression-2 THEN
   ...statements...
END IF;
IF boolean-expression THEN
   ...statements...
ELSEIF boolean-expression-1 THEN
   ...statements...
ELSEIF boolean-expression-2 THEN
   ...statements...
ELSE
   ...statements...
END IF;
FOR index-variable IN starting-value..ending-value LOOP
   ...statements...
END LOOP;
FOR index-variable IN starting-value..ending-value STEP step-value LOOP
   ...statements...
END LOOP;
FOR index-variable IN REVERSE starting-value..ending-value LOOP
   ...statements...
END LOOP;
WHILE boolean-expression LOOP
   ...statements...
END LOOP;
LOOP
   ...statements...
END LOOP;
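Each of these loop forms can be smoke-tested straight from the shell with -e before it is buried in a program file. A minimal sketch, with stderr redirected to hide the logging noise:
#!/bin/bash
# PRINT writes to stdout, so the loop output survives the redirect.
hplsql -e 'FOR i IN 1..3 LOOP PRINT i; END LOOP;' 2>/dev/null
# Expected output: 1, 2 and 3, one value per line.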
SET variable-name-1 = value-1;
variable-name-1 := value-1;
SET variable-name-1 = value-1, variable-name-2 = value-2, ..., variable-name-n = value-n;
SET (variable-name-1,variable-name-2,...) = (value-1,value-2,...);
IF (iCNT1 < iCNT2) AND (iCNT1 < iCNT2) THEN PRINT 'HOORAY!'; END IF;
IF (iCNT1 < iCNT2) OR (iCNT1 < iCNT2) THEN PRINT 'HOORAY!'; END IF;
IF NOT(iCNT1 > iCNT2) THEN PRINT 'HOORAY!'; END IF;
--This is an inline comment.
iCNT INT := 0; --This is an inline comment.
/* The following code will not run because it's part of a multiline comment:
iCNT1 INT := 1;
iCNT2 INT := 2;
iCNT3 INT := 3;
*/
DATE 'YYYY-MM-DD'
TIMESTAMP 'YYYY-MM-DD HH:MI:SS.sss'
INTERVAL shift-amount DAYS
INTERVAL shift-amount MICROSECONDS
PRINT DATE '1962-03-21' + INTERVAL 1 DAY;
1962-03-22
sSTR1 := 'HELLO ';
sSTR2 := 'WORLD!';
sSTR := sSTR1 + sSTR2;
sSTR := sSTR1 || sSTR2;
sSQL := "
SELECT *
FROM PROD_SCHEMA.DIM_POSTAL_CODE
WHERE STATE_CODE='PA'
ORDER BY POSTAL_CODE
";
sSQL := "
SELECT *
FROM PROD_SCHEMA.DIM_POSTAL_CODE
WHERE STATE_CODE='" || sSTATE_CODE || "'
ORDER BY POSTAL_CODE
";
sSTR := CONCAT(sSTR1,sSTR2);
HPL/SQL Example #1 - The First of the Examples
DECLARE
  X0 DOUBLE := 0;
  X1 DOUBLE := 10;
  H DOUBLE;
  N INT;
  TOT_AREA DOUBLE := 0;
  X0_INCR DOUBLE := X0;
BEGIN
  --DEFINE N, THE NUMBER OF RECTANGLES.
  N := 1000000;
  --COMPUTE H BASED ON X0, X1 AND N. THIS IS THE INCREMENT AMOUNT
  -- TO BE APPLIED TO X0_INCR AS WELL AS THE WIDTH OF EACH RECTANGLE.
  H := (X1 - X0)/N;
  --LOOP AROUND EACH OF THE N RECTANGLES SUMMING UP THE AREA
  -- BASED ON THE RECTANGULAR RULE.
  FOR i IN 1..N LOOP
    --COMPUTE THIS ITERATION'S TOTAL AREA BY MULTIPLYING THE
    -- WIDTH (H) OF EACH RECTANGLE BY THE HEIGHT.
    TOT_AREA := TOT_AREA + H*(X0_INCR*X0_INCR);
    --SLIDE X0_INCR TO THE RIGHT BY H. PROBABLY SHOULD ADD AN EXTRA HALF
    --TO HIT THE CENTER OF THE RECTANGLE, BUT I DON'T HAVE THE STRENGTH.
    X0_INCR := X0_INCR + H;
  END LOOP;
  PRINT TOT_AREA;
END;
hplsql -f integrate_x2.hplsql
333.3328333242384
[smithbob@lnxserver ~]$ hplsql -f integrate_x2.hplsql --define pX0=0 --define pX1=10 --define pN=1000000
X0 DOUBLE := CAST(pX0 AS DOUBLE);
X1 DOUBLE := CAST(pX1 AS DOUBLE);
N INT := CAST(pN AS INT);
--DEFINE N, THE NUMBER OF RECTANGLES.
N := 1000000;
HPL/SQL Example #2 - The Second of the Examples
CREATE OR REPLACE FUNCTION MYFTN(pX IN DOUBLE)
RETURN DOUBLE
AS
BEGIN
  -- COMPUTE x^2 as x*x
  RETURN pX * pX;
END;
CREATE OR REPLACE PROCEDURE COMPUTE_AREA(pX0 IN DOUBLE, pX1 IN DOUBLE, pN IN DOUBLE)
AS
  -- DEFINE VARIABLES HERE‼ THIS IS EFFECTIVELY THE DECLARE SECTION!
  X0 DOUBLE;
  X1 DOUBLE;
  H DOUBLE;
  N INT;
  TOT_AREA DOUBLE;
  X0_INCR DOUBLE;
BEGIN
  -- ASSIGN TO THE VARIABLES HERE‼
  X0 := CAST(pX0 AS DOUBLE);
  X1 := CAST(pX1 AS DOUBLE);
  N := CAST(pN AS INT);
  TOT_AREA := 0;
  X0_INCR := X0;
  --COMPUTE H BASED ON X0, X1 AND N. THIS IS THE INCREMENT AMOUNT
  -- TO BE APPLIED TO X0_INCR AS WELL AS THE WIDTH.
  H := (X1 - X0)/N;
  --LOOP AROUND EACH OF THE N RECTANGLES SUMMING UP THE AREA
  -- BASED ON THE RECTANGULAR RULE.
  FOR i IN 1..N LOOP
    --COMPUTE THIS ITERATION'S TOTAL AREA BY MULTIPLYING THE
    -- WIDTH (H) OF EACH RECTANGLE BY THE HEIGHT.
    TOT_AREA := TOT_AREA + H*MYFTN(X0_INCR);
    --SLIDE X0_INCR TO THE RIGHT BY H.
    X0_INCR := X0_INCR + H;
  END LOOP;
  PRINT TOT_AREA;
END;
--CALL THE PROCEDURE
CALL COMPUTE_AREA(0,10,1000000);
CREATE OR REPLACE PROCEDURE COMPUTE_AREA(pX0 IN DOUBLE, pX1 IN DOUBLE, pN IN DOUBLE, pOUT_TOTAREA OUT DOUBLE)
AS
pOUT_TOTAREA := TOT_AREA;
--CALL THE PROCEDURE
DECLARE dTOTAREA DOUBLE;
CALL COMPUTE_AREA(0,10,1000000,dTOTAREA);
PRINT dTOTAREA;
INCLUDE /directory/location/hplsql/files/integrate_x2.hplsql
DECLARE
  X0 DOUBLE := CAST(pX0 AS DOUBLE);
  X1 DOUBLE := CAST(pX1 AS DOUBLE);
  N INT := CAST(pN AS INT);
  dTOTAREA DOUBLE;
BEGIN
  CALL COMPUTE_AREA(X0,X1,N,dTOTAREA);
  PRINT dTOTAREA;
END;
hplsql -f stubby.hplsql --define pX0=0 --define pX1=10 --define pN=1000000
CREATE OR REPLACE PACKAGE package_name
AS
...variable declarations...
...function declarations...
...procedure declarations...
END;
CREATE OR REPLACE PACKAGE BODY package_name
AS
...variable declarations...
...function code...
...procedure code...
END;
HPL/SQL Example #3 - The Third of the Examples
/* PACKAGE SPECIFICATION */
CREATE OR REPLACE PACKAGE PKG_COMPUTE_AREA
AS
  /* VARIABLE DECLARATION */
  X0 INT;          --INITIAL X-VALUE
  X1 DOUBLE;       --ENDING X-VALUE
  H DOUBLE;        --RECTANGLE WIDTH
  N INT;           --NUMBER OF RECTANGLES
  TOT_AREA DOUBLE; --TOTAL AREA
  /* FUNCTION DECLARATION(S) */
  --FUNCTION TO RETURN THE SQUARE OF THE ARGUMENT
  FUNCTION MYFTN(pX IN DOUBLE) RETURN DOUBLE;
  --FUNCTION TO RETURN THE COMPUTED AREA
  FUNCTION GET_AREA() RETURN DOUBLE;
  /* PROCEDURE DECLARATION(S) */
  --PROCEDURE TO COMPUTE THE AREA UNDER THE CURVE
  PROCEDURE COMPUTE_AREA(pX0 IN DOUBLE,pX1 IN DOUBLE,pN IN DOUBLE);
END;
/* PACKAGE BODY */
CREATE OR REPLACE PACKAGE BODY PKG_COMPUTE_AREA
AS
  /* VARIABLE DECLARATION(S) */
  X0_INCR DOUBLE;
  /* FUNCTION(S) */
  --FUNCTION TO RETURN THE SQUARE OF THE ARGUMENT
  CREATE OR REPLACE FUNCTION MYFTN(pX IN DOUBLE)
  RETURN DOUBLE
  AS
  BEGIN
    RETURN pX * pX;
  END;
  --FUNCTION TO RETURN THE COMPUTED AREA
  CREATE OR REPLACE FUNCTION GET_AREA()
  RETURN DOUBLE
  AS
  BEGIN
    RETURN TOT_AREA;
  END;
  /* PROCEDURE(S) */
  CREATE OR REPLACE PROCEDURE COMPUTE_AREA(pX0 IN DOUBLE, pX1 IN DOUBLE, pN IN DOUBLE)
  AS
  BEGIN
    -- ASSIGN TO THE VARIABLES HERE.
    X0 := CAST(pX0 AS DOUBLE);
    X1 := CAST(pX1 AS DOUBLE);
    N := CAST(pN AS INT);
    TOT_AREA := 0;
    X0_INCR := X0;
    --COMPUTE H BASED ON X0, X1 AND N. THIS IS THE INCREMENT AMOUNT
    -- TO BE APPLIED TO X0_INCR AS WELL AS THE WIDTH.
    H := (X1 - X0)/N;
    --LOOP AROUND EACH OF THE N RECTANGLES SUMMING UP THE AREA
    -- BASED ON THE RECTANGULAR RULE.
    FOR i IN 1..N LOOP
      --COMPUTE THIS ITERATION'S TOTAL AREA BY MULTIPLYING THE
      -- WIDTH (H) OF EACH RECTANGLE BY THE HEIGHT.
      TOT_AREA := TOT_AREA + H*MYFTN(X0_INCR);
      --SLIDE X0_INCR TO THE RIGHT BY H.
      X0_INCR := X0_INCR + H;
    END LOOP;
  END;
END;
INCLUDE /directory/location/hplsql/files/integrate_x2.hplsql
DECLARE
  dTOTAREA double;
BEGIN
  --COMPUTE THE AREA UNDER X^2 FROM 0 TO 10 USING 1,000,000 RECTANGLES.
  PKG_COMPUTE_AREA.COMPUTE_AREA(0,10,1000000);
  --RETRIEVE THE COMPUTED AREA USING THE GET_AREA() FUNCTION.
  dTOTAREA := PKG_COMPUTE_AREA.GET_AREA();
  PRINT dTOTAREA;
END;
PRINT PKG_COMPUTE_AREA.X0;
PRINT PKG_COMPUTE_AREA.X1;
PRINT PKG_COMPUTE_AREA.N;
PRINT PKG_COMPUTE_AREA.H;
PRINT PKG_COMPUTE_AREA.X0_INCR;
Chapter 27 - HPL/SQL and Chatting with a Database
set hplsql.conn.default=impala;
hplsql.conn.impala com.cloudera.impala.jdbc4.Driver;jdbc:impala://hdpserver:21050/default;smithbob;PaSsWoRd123; Impala JDBC Connection
hplsql.conn.impala com.cloudera.impala.jdbc4.Driver;jdbc:impala://hdpserver:21050/prod_schema;smithbob;PaSsWoRd123; Impala JDBC Connection
set hplsql.conn.default=impala;
use prod_schema;
sSQL := "use prod_schema";
EXECUTE sSQL;
EXECUTE ddl-sql-string;
sSQL := "
CREATE TABLE DIM_POSTAL_CODE_BACKUP STORED AS PARQUET
AS SELECT * FROM DIM_POSTAL_CODE
";
EXECUTE sSQL;
sSQL_TMPL := "
CREATE TABLE DIM_POSTAL_CODE_BACKUP_stcd STORED AS PARQUET
AS SELECT * FROM DIM_POSTAL_CODE WHERE STATE_CODE='stcd'
";
--The selected state code is coming in from our fab dept website!
sSQL := REPLACE(sSQL_TMPL,"stcd","PA");
EXECUTE sSQL;
EXECUTE dml-sql-string INTO hplsql-var1, hplsql-var2,...;
set hplsql.conn.default=impala;
DECLARE
  iROWCNT int;
  iSTCDCNT int;
  sSQL string;
BEGIN
  sSQL := "use prod_schema";
  EXECUTE sSQL;
  sSQL := "
  SELECT COUNT(*) AS ROWCNT,
         COUNT(DISTINCT STATE_CODE) AS DISTSTCD
  FROM DIM_POSTAL_CODE
  ";
  EXECUTE sSQL INTO iROWCNT,iSTCDCNT;
  PRINT "DIM_POSTAL_CODE has " || TO_CHAR(iROWCNT) || " rows.";
  PRINT "DIM_POSTAL_CODE has " || TO_CHAR(iSTCDCNT) || " distinct state codes.";
END;
[smithbob@lnxserver ~]$ hplsql -f counts.hplsql
SLF4J: Failed to load class "org.slf4j.impl.StaticLoggerBinder".
SLF4J: Defaulting to no-operation (NOP) logger implementation
SLF4J: See http://www.slf4j.org/codes.html#StaticLoggerBinder for further details.
Open connection: jdbc:impala://hdpserver:21050/default (140 ms)
Starting SQL statement
SQL statement executed successfully (76 ms)
Starting SQL statement
SQL statement executed successfully (245 ms)
DIM_POSTAL_CODE has 43689 rows.
DIM_POSTAL_CODE has 61 distinct state codes.
set hplsql.conn.default=impala;
DECLARE
  sSQL string;
BEGIN
  sSQL := "use prod_schema";
  EXECUTE sSQL;
  sSQL := "
  SELECT *
  FROM DIM_POSTAL_CODE
  WHERE STATE_CODE='NJ'
  ORDER BY POSTAL_CODE
  LIMIT 10
  ";
  EXECUTE sSQL;
END;
SLF4J: Failed to load class "org.slf4j.impl.StaticLoggerBinder".
SLF4J: Defaulting to no-operation (NOP) logger implementation
SLF4J: See http://www.slf4j.org/codes.html#StaticLoggerBinder for further details.
Open connection: jdbc:impala://hdpserver:21050/default (141 ms)
Starting SQL statement
SQL statement executed successfully (76 ms)
Starting SQL statement
SQL statement executed successfully (256 ms)
07001 AVENEL NJ 40.57899 -74.27987
07002 BAYONNE NJ 40.66655 -74.11768
07003 BLOOMFIELD NJ 40.80300 -74.18895
07004 FAIRFIELD NJ 40.87904 -74.29378
07005 BOONTON NJ 40.91279 -74.41516
07006 CALDWELL NJ 40.84899 -74.27917
07007 CALDWELL NJ 40.79185 -74.24524
07008 CARTERET NJ 40.58250 -74.22997
07009 CEDAR GROVE NJ 40.85585 -74.22898
07010 CLIFFSIDE PARK NJ 40.82154 -73.98949
HPL/SQL Example #4 - The Fourth of the Examples
set hplsql.conn.default=impala;
DECLARE
  sSQL STRING; -- GENERIC SQL STRING
  --SQL INSERT TEMPLATE
  sSQL_INSERT_TMPL STRING := "
  INSERT INTO DIM_CALENDAR VALUES(
    DATE '{sDATEID}', {sDAY}, {sMONTH}, {sYEAR}, {sQUARTER}, '{sYYYYDDD}', '{sDDD}',
    DATE '{sFIRSTDAYOFMONTH}', DATE '{sFIRSTDAYOFQUARTER}', DATE '{sFIRSTDAYOFYEAR}',
    '{sMONTHNAME}', '{sDAYNAME}', '{sYYYYQQ}', '{sYYYYMM}', '{sYYYYMMDD}',
    '{sDAYLONG}', '{sDAYSHORT}'
  )
  ";
  sSQL_INSERT STRING;                 --FINAL INSERT STRING FOR EACH DAY
  dBEGDATE DATE := DATE '2021-01-01'; --BEGINNING DATE
  dCURDATE DATE := dBEGDATE;
  dENDDATE DATE := DATE '2021-12-31'; --ENDING DATE
  bSTATE BOOLEAN := TRUE;             --WHILE LOOP STATE
  sDATEID STRING;
  sYEAR STRING;
  sMONTH STRING;
  sDAY STRING;
  sYYYYQQ STRING;
  sYYYYMM STRING;
  sYYYYMMDD STRING;
  sQUARTER STRING;
  sYYYYDDD STRING;
  sDDD STRING;
  sFIRSTDAYOFMONTH STRING;
  sFIRSTDAYOFQUARTER STRING;
  sFIRSTDAYOFYEAR STRING;
  sMONTHNAME STRING;
  sDAYNAME STRING;
  sDAYLONG STRING;
  sDAYSHORT STRING;
BEGIN
  --CHANGE TO PROD_SCHEMA
  sSQL := "USE PROD_SCHEMA";
  EXECUTE sSQL;
  --DROP THE TABLE DIM_CALENDAR
  sSQL := "DROP TABLE IF EXISTS DIM_CALENDAR PURGE";
  EXECUTE sSQL;
  --CREATE THE TABLE DIM_CALENDAR
  sSQL := "
  CREATE TABLE DIM_CALENDAR(
    DATE_ID DATE,
    DAY TINYINT,
    MONTH TINYINT,
    YEAR INT,
    QUARTER TINYINT,
    YYYYDDD STRING,
    DDD STRING,
    FIRST_DAY_OF_MONTH DATE,
    FIRST_DAY_OF_QUARTER DATE,
    FIRST_DAY_OF_YEAR DATE,
    MONTH_NAME STRING,
    WEEKDAY_NAME STRING,
    YYYYQQ STRING,
    YYYYMM STRING,
    YYYYMMDD STRING,
    DATE_LONG STRING,
    DATE_SHORT STRING
  )
  STORED AS PARQUET
  ";
  EXECUTE sSQL;
  --LOOP AROUND FROM dBEGDATE TO dENDDATE
  WHILE bSTATE LOOP
    --IF THE CURRENT DATE IS THE SAME AS THE END DATE, CHANGE bSTATE TO FALSE.
    IF TO_CHAR(dCURDATE) = TO_CHAR(dENDDATE) THEN
      bSTATE := FALSE;
    END IF;
    /* GATHER THE COLUMNS FOR THE TABLE. */
    --DATE_ID IS FORMATTED AS YYYY-MM-DD AUTOMATICALLY BY TO_CHAR().
    sDATEID := TO_CHAR(dCURDATE); --YYYY-MM-DD
    sDAY := SUBSTR(sDATEID,9,2);
    sMONTH := SUBSTR(sDATEID,6,2);
    sYEAR := SUBSTR(sDATEID,1,4);
    --DETERMINE THE QUARTER FROM sMONTH.
    IF sMONTH='01' OR sMONTH='02' OR sMONTH='03' THEN
      sQUARTER := '1';
    ELSEIF sMONTH='04' OR sMONTH='05' OR sMONTH='06' THEN
      sQUARTER := '2';
    ELSEIF sMONTH='07' OR sMONTH='08' OR sMONTH='09' THEN
      sQUARTER := '3';
    ELSEIF sMONTH='10' OR sMONTH='11' OR sMONTH='12' THEN
      sQUARTER := '4';
    END IF;
    --CREATE sYYYYMM, sYYYYMMDD AND sYYYYQQ FROM THE OTHER VARIABLES.
    sYYYYMM := sYEAR || sMONTH;
    sYYYYMMDD := sYEAR || sMONTH || sDAY;
    sYYYYQQ := sYEAR || "0" || sQUARTER;
    --CREATE THE JULIAN DAY FROM IMPALA DIRECTLY.
    --DAYOFYEAR() DOES NOT RETURN LEADING ZEROES, SO WE PUT THEM IN BELOW.
    sSQL := "SELECT DAYOFYEAR(DATE '" || sDATEID || "')";
    EXECUTE sSQL INTO sDDD;
    IF LENGTH(sDDD)=1 THEN
      sDDD := "00" || sDDD;
    ELSEIF LENGTH(sDDD)=2 THEN
      sDDD := "0" || sDDD;
    END IF;
    --CREATE THE JULIAN DAY WITH THE YEAR PREPENDED.
    sYYYYDDD := sYEAR || sDDD;
    --CREATE THE FIRST DAY OF MONTH, QUARTER AND YEAR.
    sSQL := "
    SELECT TRUNC(DT,'MONTH') AS FIRST_DAY_OF_MTH,
           TRUNC(DT,'Q') AS FIRST_DAY_OF_QTR,
           TRUNC(DT,'YEAR') AS FIRST_DAY_OF_YR,
           MONTHNAME(DT) AS MONTH_NAME,
           DAYNAME(DT) AS DAY_NAME
    FROM ( SELECT DATE '" || sDATEID || "' AS DT ) A
    ";
    EXECUTE sSQL INTO sFIRSTDAYOFMONTH, sFIRSTDAYOFQUARTER, sFIRSTDAYOFYEAR, sMONTHNAME, sDAYNAME;
    --CREATE LONG FORMAT (Month dd, yyyy) AND SHORT FORMAT (ddMONYYYY) STRINGS.
    sDAYLONG := sMONTHNAME || " " || sDAY || ", " || sYEAR;
    sDAYSHORT := sDAY || UPPER(SUBSTR(sMONTHNAME,1,3)) || sYEAR;
    --CREATE THIS DAY'S INSERT STRING BY UPDATING THE TEMPLATE.
    sSQL_INSERT := REPLACE(sSQL_INSERT_TMPL,'{sDATEID}',sDATEID);
    sSQL_INSERT := REPLACE(sSQL_INSERT,'{sDAY}',sDAY);
    sSQL_INSERT := REPLACE(sSQL_INSERT,'{sMONTH}',sMONTH);
    sSQL_INSERT := REPLACE(sSQL_INSERT,'{sYEAR}',sYEAR);
    sSQL_INSERT := REPLACE(sSQL_INSERT,'{sQUARTER}',sQUARTER);
    sSQL_INSERT := REPLACE(sSQL_INSERT,'{sYYYYDDD}',sYYYYDDD);
    sSQL_INSERT := REPLACE(sSQL_INSERT,'{sDDD}',sDDD);
    sSQL_INSERT := REPLACE(sSQL_INSERT,'{sFIRSTDAYOFMONTH}',sFIRSTDAYOFMONTH);
    sSQL_INSERT := REPLACE(sSQL_INSERT,'{sFIRSTDAYOFQUARTER}',sFIRSTDAYOFQUARTER);
    sSQL_INSERT := REPLACE(sSQL_INSERT,'{sFIRSTDAYOFYEAR}',sFIRSTDAYOFYEAR);
    sSQL_INSERT := REPLACE(sSQL_INSERT,'{sMONTHNAME}',sMONTHNAME);
    sSQL_INSERT := REPLACE(sSQL_INSERT,'{sDAYNAME}',sDAYNAME);
    sSQL_INSERT := REPLACE(sSQL_INSERT,'{sYYYYQQ}',sYYYYQQ);
    sSQL_INSERT := REPLACE(sSQL_INSERT,'{sYYYYMM}',sYYYYMM);
    sSQL_INSERT := REPLACE(sSQL_INSERT,'{sYYYYMMDD}',sYYYYMMDD);
    sSQL_INSERT := REPLACE(sSQL_INSERT,'{sDAYLONG}',sDAYLONG);
    sSQL_INSERT := REPLACE(sSQL_INSERT,'{sDAYSHORT}',sDAYSHORT);
    --INSERT TODAY'S DATA INTO THE DATABASE.
    EXECUTE sSQL_INSERT;
    --INCREMENT THE DATE BY ONE DAY.
    dCURDATE := dCURDATE + INTERVAL 1 DAY;
  END WHILE;
  --COMPUTE STATS ON THE TABLE.
  sSQL := 'COMPUTE STATS DIM_CALENDAR';
  EXECUTE sSQL;
END;
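Before moving to the streamlined version below, a quick spot-check of the boundary dates will catch an off-by-one in the WHILE loop. A minimal sketch using impala-shell:
#!/bin/bash
# Spot-check the calendar boundaries after a build.
impala-shell -i hdpserver -B --quiet \
  -q 'select min(date_id), max(date_id) from prod_schema.dim_calendar;' 2>/dev/null
# Expect: 2021-01-01 and 2021-12-31.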
set hplsql.conn.default=impala;
DECLARE
  sSQL STRING;                        --GENERIC SQL STRING
  dBEGDATE DATE := DATE '2021-01-01'; --BEGINNING DATE
  dCURDATE DATE := dBEGDATE;
  dENDDATE DATE := DATE '2021-12-31'; --ENDING DATE
  bSTATE BOOLEAN := TRUE;             --WHILE LOOP STATE
  sDATEID STRING;
  sYEAR STRING;
  iYEAR INT;
  sMONTH STRING;
  iMONTH TINYINT;
  sDAY STRING;
  iDAY TINYINT;
  sYYYYQQ STRING;
  sYYYYMM STRING;
  sYYYYMMDD STRING;
  sQUARTER STRING;
  iQUARTER TINYINT;
  sYYYYDDD STRING;
  sDDD STRING;
  sFIRSTDAYOFMONTH STRING;
  dFIRSTDAYOFMONTH DATE;
  sFIRSTDAYOFQUARTER STRING;
  sFIRSTDAYOFYEAR STRING;
  sMONTHNAME STRING;
  sDAYNAME STRING;
  sDAYLONG STRING;
  sDAYSHORT STRING;
BEGIN
  --CHANGE TO PROD_SCHEMA
  USE PROD_SCHEMA;
  --DROP THE TABLE DIM_CALENDAR
  DROP TABLE IF EXISTS DIM_CALENDAR PURGE;
  --CREATE THE TABLE DIM_CALENDAR
  CREATE TABLE DIM_CALENDAR(
    DATE_ID DATE, DAY TINYINT, MONTH TINYINT, YEAR INT, QUARTER TINYINT,
    YYYYDDD STRING, DDD STRING, FIRST_DAY_OF_MONTH DATE, FIRST_DAY_OF_QUARTER DATE,
    FIRST_DAY_OF_YEAR DATE, MONTH_NAME STRING, WEEKDAY_NAME STRING, YYYYQQ STRING,
    YYYYMM STRING, YYYYMMDD STRING, DATE_LONG STRING, DATE_SHORT STRING
  ) STORED AS PARQUET;
  --LOOP AROUND FROM dBEGDATE TO dENDDATE
  WHILE bSTATE LOOP
    --IF THE CURRENT DATE IS THE SAME AS THE END DATE, CHANGE bSTATE TO FALSE.
    IF TO_CHAR(dCURDATE) = TO_CHAR(dENDDATE) THEN
      bSTATE := FALSE;
    END IF;
    /* GATHER THE COLUMNS FOR THE TABLE. */
    --DATE_ID IS FORMATTED AS YYYY-MM-DD AUTOMATICALLY BY TO_CHAR().
    sDATEID := TO_CHAR(dCURDATE); --YYYY-MM-DD
    sDAY := SUBSTR(sDATEID,9,2);
    iDAY := CAST(SUBSTR(sDATEID,9,2) AS TINYINT);
    sMONTH := SUBSTR(sDATEID,6,2);
    iMONTH := CAST(SUBSTR(sDATEID,6,2) AS TINYINT);
    sYEAR := SUBSTR(sDATEID,1,4);
    iYEAR := CAST(SUBSTR(sDATEID,1,4) AS INT);
    --DETERMINE THE QUARTER FROM sMONTH.
    IF sMONTH='01' OR sMONTH='02' OR sMONTH='03' THEN
      sQUARTER := '1';
    ELSEIF sMONTH='04' OR sMONTH='05' OR sMONTH='06' THEN
      sQUARTER := '2';
    ELSEIF sMONTH='07' OR sMONTH='08' OR sMONTH='09' THEN
      sQUARTER := '3';
    ELSEIF sMONTH='10' OR sMONTH='11' OR sMONTH='12' THEN
      sQUARTER := '4';
    END IF;
    iQUARTER := CAST(sQUARTER AS TINYINT);
    --CREATE sYYYYMM, sYYYYMMDD AND sYYYYQQ FROM THE OTHER VARIABLES.
    sYYYYMM := sYEAR || sMONTH;
    sYYYYMMDD := sYEAR || sMONTH || sDAY;
    sYYYYQQ := sYEAR || "0" || sQUARTER;
    --CREATE THE JULIAN DAY FROM IMPALA DIRECTLY.
    --DAYOFYEAR() DOES NOT RETURN LEADING ZEROES, SO WE PUT THEM IN BELOW.
    SELECT DAYOFYEAR(sDATEID) INTO sDDD;
    IF LENGTH(sDDD)=1 THEN
      sDDD := "00" || sDDD;
    ELSEIF LENGTH(sDDD)=2 THEN
      sDDD := "0" || sDDD;
    END IF;
    --CREATE THE JULIAN DAY WITH THE YEAR PREPENDED.
    sYYYYDDD := sYEAR || sDDD;
    --CREATE THE FIRST DAY OF MONTH, QUARTER AND YEAR.
    SELECT CAST(TRUNC(DT,'MONTH') AS STRING),
           CAST(TRUNC(DT,'Q') AS STRING),
           CAST(TRUNC(DT,'YEAR') AS STRING),
           MONTHNAME(DT),
           DAYNAME(DT)
    INTO sFIRSTDAYOFMONTH,sFIRSTDAYOFQUARTER,sFIRSTDAYOFYEAR,sMONTHNAME,sDAYNAME
    FROM ( SELECT sDATEID AS DT ) A;
    --CREATE LONG FORMAT (Month dd, yyyy) AND SHORT FORMAT (ddMONYYYY) STRINGS.
    sDAYLONG := sMONTHNAME || " " || sDAY || ", " || sYEAR;
    sDAYSHORT := sDAY || UPPER(SUBSTR(sMONTHNAME,1,3)) || sYEAR;
    --INSERT THIS DAY'S INFO INTO THE TABLE.
    INSERT INTO DIM_CALENDAR
    SELECT sDATEID, iDAY, iMONTH, iYEAR, iQUARTER, sYYYYDDD, sDDD,
           sFIRSTDAYOFMONTH, sFIRSTDAYOFQUARTER, sFIRSTDAYOFYEAR,
           sMONTHNAME, sDAYNAME, sYYYYQQ, sYYYYMM, sYYYYMMDD,
           sDAYLONG, sDAYSHORT;
    --INCREMENT THE DATE BY ONE DAY.
    dCURDATE := dCURDATE + INTERVAL 1 DAY;
  END WHILE;
  --COMPUTE STATS ON THE TABLE.
  sSQL := 'COMPUTE STATS DIM_CALENDAR';
  EXECUTE sSQL;
END;
SELECT DAYOFYEAR(sDATEID) INTO sDDD;
SELECT CAST(TRUNC(DT,'MONTH') AS STRING),
       CAST(TRUNC(DT,'Q') AS STRING),
       CAST(TRUNC(DT,'YEAR') AS STRING),
       MONTHNAME(DT),
       DAYNAME(DT)
INTO sFIRSTDAYOFMONTH,sFIRSTDAYOFQUARTER,sFIRSTDAYOFYEAR,sMONTHNAME,sDAYNAME
FROM ( SELECT sDATEID AS DT ) A;
SELECT CAST(TRUNC(sDATEID,'MONTH') AS STRING),
       CAST(TRUNC(sDATEID,'Q') AS STRING),
       CAST(TRUNC(sDATEID,'YEAR') AS STRING),
       MONTHNAME(sDATEID),
       DAYNAME(sDATEID)
INTO sFIRSTDAYOFMONTH,sFIRSTDAYOFQUARTER,sFIRSTDAYOFYEAR,sMONTHNAME,sDAYNAME;
java.sql.SQLException: [Cloudera][ImpalaJDBCDriver](500051) ERROR processing query/statement. Error Code: 0, SQL state: TStatus(statusCode:ERROR_STATUS, sqlState:HY000, errorMessage:AnalysisException: Could not resolve column/field reference: 'sdateid' ), Query: SELECT CAST(TRUNC(sDATEID,'MONTH') AS STRING), CAST(TRUNC(sDATEID,'Q') AS STRING), CAST(TRUNC(sDATEID,'YEAR') AS STRING), MONTHNAME('2021-01-01'), DAYNAME('2021-01-01').
[smithbob@lnxserver ~]$ hadoop fs -ls -R hdfs://lnxserver.com:8020/warehouse/tablespace/managed/hive/dim_calendar
-rw-rw----+ 3 impala hive 4685 2022-03-24 10:59 hdfs://lnxserver.com:8020/warehouse/tablespace/managed/hive/dim_calendar/delta_10_10/b44f1b8a02701836-af00f19300000000_1527263800_data.0.parq
-rw-rw----+ 3 impala hive 4685 2022-03-24 10:59 hdfs://lnxserver.com:8020/warehouse/tablespace/managed/hive/dim_calendar/delta_11_11/ba46de5e33074e11-4031dafb00000000_1033013878_data.0.parq
-rw-rw----+ 3 impala hive 4692 2022-03-24 10:59 hdfs://lnxserver.com:8020/warehouse/tablespace/managed/hive/dim_calendar/delta_12_12/dd45f1ce44adf38d-8a1ce48700000000_1619420159_data.0.parq
-rw-rw----+ 3 impala hive 4706 2022-03-24 10:59 hdfs://lnxserver.com:8020/warehouse/tablespace/managed/hive/dim_calendar/delta_13_13/554f1cfd17af1762-ef007dbf00000000_450424858_data.0.parq
--A STRING VARIABLE TO HOLD AN ENTIRE LINE OF DATA.
sLINE STRING;
--CREATE THIS DAY'S INSERT STRING.
sLINE := sDATEID || ";" || sDAY || ";" || sMONTH || ";" || sYEAR || ";" ||
         sQUARTER || ";" || sYYYYDDD || ";" || sDDD || ";" ||
         SUBSTR(sFIRSTDAYOFMONTH,1,10) || ";" || SUBSTR(sFIRSTDAYOFQUARTER,1,10) || ";" ||
         SUBSTR(sFIRSTDAYOFYEAR,1,10) || ";" || sMONTHNAME || ";" || sDAYNAME || ";" ||
         sYYYYQQ || ";" || sYYYYMM || ";" || sYYYYMMDD || ";" || sDAYLONG || ";" || sDAYSHORT;
--WRITE THE LINE TO THE FILE
DBMS_OUTPUT.PUT_LINE(sLINE);
[smithbob@lnxserver ~]$ hplsql -f dimcalendar.hplsql > tmp_calendar.dlm
2021-01-01;01;01;2021;1;2021001;001;2021-01-01 00:00:00;2021-01-01 00:00:00;2021-01-01 00:00:00;January;...snip...
2021-01-02;02;01;2021;1;2021002;002;2021-01-01 00:00:00;2021-01-01 00:00:00;2021-01-01 00:00:00;January;...snip...
2021-01-03;03;01;2021;1;2021003;003;2021-01-01 00:00:00;2021-01-01 00:00:00;2021-01-01 00:00:00;January;...snip...
2021-01-04;04;01;2021;1;2021004;004;2021-01-01 00:00:00;2021-01-01 00:00:00;2021-01-01 00:00:00;January;...snip...
2021-01-05;05;01;2021;1;2021005;005;2021-01-01 00:00:00;2021-01-01 00:00:00;2021-01-01 00:00:00;January;...snip...
...snip...
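Since the redirect captures everything PRINT and DBMS_OUTPUT write to stdout, a quick line count confirms the dump before it goes anywhere near HDFS. A small sketch; 365 is the expected count for 2021:
#!/bin/bash
# The SLF4J noise went to stderr, so the .dlm file should hold only data rows.
hplsql -f dimcalendar.hplsql 2>/dev/null > tmp_calendar.dlm
nLines=$(wc -l < tmp_calendar.dlm)
echo "tmp_calendar.dlm has $nLines lines (expected 365 for 2021)."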
[smithbob@lnxserver ~]$ hadoop fs -mkdir hdfs://lnxserver.com:8020/warehouse/tablespace/external/tmp_calendar [smithbob@lnxserver ~]$ hadoop fs -put tmp_calendar.dlm hdfs://lnxserver.com:8020/warehouse/tablespace/external/tmp_calendar drop table dim_calendar purge; create table dim_calendar( date_id date, day tinyint, month tinyint, year int, quarter tinyint, yyyyddd string, ddd string, first_day_of_month date, first_day_of_quarter date, first_day_of_year date, month_name string, weekday_name string, yyyyqq string, yyyymm string, yyyymmdd string, date_long string, date_short string ) stored as parquet; insert into dim_calendar select cast(date_id as date format 'YYYY-MM-DD'), cast(day as tinyint), cast(month as tinyint), cast(year as int), cast(quarter as tinyint), yyyyddd, ddd, cast(first_day_of_month as date format 'YYYY-MM-DD'), cast(first_day_of_quarter as date format 'YYYY-MM-DD'), cast(first_day_of_year as date format 'YYYY-MM-DD'), month_name, weekday_name, yyyyqq, yyyymm, yyyymmdd, date_long, date_short from tmp_calendar; compute stats dim_calendar; drop table tmp_calendar purge; --A STRING VARIABLE TO HOLD AN ENTIRE LINE OF DATA. sLINE STRING; --FILE HANDLE FOR EXTERNAL FILE. oFILE UTL_FILE.FILE_TYPE; --OPEN THE EXTERNAL HDFS FILE. oFILE := UTL_FILE.FOPEN('hdfs://lnxserver.com:8020/warehouse/tablespace/managed/hive/tmp_calendar','tmp_calendar.dlm','w'); --CREATE THIS DAY'S INSERT STRING. sLINE := sDATEID || ";" || sDAY || ";" || sMONTH || ";" || sYEAR || ";" || sQUARTER || ";" || sYYYYDDD || ";" || sDDD || ";" || sFIRSTDAYOFMONTH || ";" || sFIRSTDAYOFQUARTER || ";" || sFIRSTDAYOFYEAR || ";" || sMONTHNAME || ";" || sDAYNAME || ";" || sYYYYQQ || ";" || sYYYYMM || ";" || sYYYYMMDD || ";" || sDAYLONG || ";" || sDAYSHORT; --WRITE THE LINE TO THE FILE UTL_FILE.PUT_LINE(oFILE,sLINE); --CLOSE THE FILE UTL_FILE.FCLOSE(oFILE); drop table tmp_calendar; create external table tmp_calendar( date_id string, day string, month string, year string, quarter string, yyyyddd string, ddd string, first_day_of_month string, first_day_of_quarter string, first_day_of_year string, month_name string, weekday_name string, yyyyqq string, yyyymm string, yyyymmdd string, date_long string, date_short string ) row format delimited fields terminated by ';' stored as textfile location 'hdfs://lnxserver.com:8020/warehouse/tablespace/external/tmp_calendar' tblproperties('skip.header.line.count'='0'); drop table dim_calendar purge; create table dim_calendar( date_id date, day tinyint, month tinyint, year int, quarter tinyint, yyyyddd string, ddd string, first_day_of_month date, first_day_of_quarter date, first_day_of_year date, month_name string, weekday_name string, yyyyqq string, yyyymm string, yyyymmdd string, date_long string, date_short string ) stored as parquet; REGEXP_REPLACE(DATE_ID,'\\x00','') insert into dim_calendar select cast(regexp_replace(date_id,'\\x00','') as date format 'YYYY-MM-DD'), cast(regexp_replace(day,'\\x00','') as tinyint), cast(regexp_replace(month,'\\x00','') as tinyint), cast(regexp_replace(year,'\\x00','') as int), cast(regexp_replace(quarter,'\\x00','') as tinyint), regexp_replace(yyyyddd,'\\x00',''), regexp_replace(ddd,'\\x00',''), cast(substr(regexp_replace(first_day_of_month,'\\x00',''),1,10) as date format 'YYYY-MM-DD'), cast(substr(regexp_replace(first_day_of_quarter,'\\x00',''),1,10) as date format 'YYYY-MM-DD'), cast(substr(regexp_replace(first_day_of_year,'\\x00',''),1,10) as date format 'YYYY-MM-DD'), regexp_replace(month_name,'\\x00',''), regexp_replace(weekday_name,'\\x00',''), regexp_replace(yyyyqq,'\\x00',''), regexp_replace(yyyymm,'\\x00',''), regexp_replace(yyyymmdd,'\\x00',''), regexp_replace(date_long,'\\x00',''), regexp_replace(date_short,'\\x00','') from tmp_calendar; compute stats dim_calendar; drop table tmp_calendar purge;
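If wrapping every column in regexp_replace() feels tedious, the NUL bytes can also be scrubbed once at the file level before the external table ever reads them. A sketch of that variation (not from the chapter; tr and the stdin form of hadoop fs -put do the work):

# Stream the file out of HDFS, strip the NUL (\x00) bytes, and write a clean copy back.
hadoop fs -cat hdfs://lnxserver.com:8020/warehouse/tablespace/external/tmp_calendar/tmp_calendar.dlm | tr -d '\000' | hadoop fs -put -f - hdfs://lnxserver.com:8020/warehouse/tablespace/external/tmp_calendar/tmp_calendar_clean.dlm
# Remove the original so the external table only sees the clean copy.
hadoop fs -rm hdfs://lnxserver.com:8020/warehouse/tablespace/external/tmp_calendar/tmp_calendar.dlm

Either way works; the regexp_replace() version keeps everything inside one INSERT, while this one keeps the SQL readable.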
FOR cursor-name IN (select-statement) LOOP ...statements... END LOOP; cursor-name.column-name set hplsql.conn.default=impala; DECLARE sSQL string; BEGIN --Change to PROD_SCHEMA USE PROD_SCHEMA; FOR csrNJONLY IN ( SELECT POSTAL_CODE,CITY FROM DIM_POSTAL_CODE WHERE STATE_CODE='NJ' ORDER BY POSTAL_CODE ) LOOP PRINT csrNJONLY.POSTAL_CODE + "/" + csrNJONLY.CITY; END LOOP; END; 07001/AVENEL 07002/BAYONNE 07003/BLOOMFIELD 07004/FAIRFIELD 07005/BOONTON 07006/CALDWELL 07007/CALDWELL 07008/CARTERET 07009/CEDAR GROVE 07010/CLIFFSIDE PARK 07011/CLIFTON ...snip... --Declare the cursor DECLARE cursor-name CURSOR FOR select-statement; --Open the cursor OPEN cursor-name; --Fetch the first row's data into the HPL/SQL variable(s) FETCH cursor-name INTO hplsql-var1,hplsql-var2,... --Process the remaining rows one row at a time WHILE SQLCODE=0 LOOP ...statements... --Fetch the next row FETCH cursor-name INTO hplsql-var1,hplsql-var2,... END WHILE; --All done? Be a good citizen and close the cursor! CLOSE cursor-name; set hplsql.conn.default=impala; DECLARE sSQL string; sPOSTALCODE string; sCITY string; BEGIN --Change to PROD_SCHEMA USE PROD_SCHEMA; --SQL Query to pull only NJ from the DIM_POSTAL_CODE table. sSQL := " SELECT POSTAL_CODE,CITY FROM DIM_POSTAL_CODE WHERE STATE_CODE='NJ' ORDER BY POSTAL_CODE "; --Declare the cursor. DECLARE csrNJONLY CURSOR FOR sSQL; --Open the cursor. OPEN csrNJONLY; --Fetch the first row's data. FETCH csrNJONLY INTO sPOSTALCODE,sCITY; --Loop around for each of the remaining rows of data. WHILE SQLCODE=0 LOOP PRINT sPOSTALCODE + "/" + sCITY; --Don't forget the fetch inside the loop!! FETCH csrNJONLY INTO sPOSTALCODE,sCITY; END WHILE; --Close the cursor. CLOSE csrNJONLY; END; HOST 'command-string'; DECLARE sCMD string; BEGIN sCMD := "impala-shell -q 'use prod_schema;compute stats dim_postal_code;'"; HOST sCMD; END; #!/bin/bash # Schema name echo $1 # Table name echo $2 # Form the command to run...your command may be more complicated. sComputeStats="impala-shell --query='use $1;compute stats $2;'" echo $sComputeStats # Run the command. eval $sComputeStats exit CREATE OR REPLACE PROCEDURE IMPALA_STATS(psSCHEMA IN STRING, psTABLENAME IN STRING) AS sSQL STRING; BEGIN sSQL := '/directory/impalaTableStats ' || psSCHEMA || ' ' || psTABLENAME; PRINT sSQL; HOST sSQL; END; set hplsql.conn.default=impala; INCLUDE impala_stats.hplsql DECLARE sSCHEMA STRING; sTABLE STRING; BEGIN sSCHEMA := "PROD_SCHEMA"; sTABLE := "DIM_POSTAL_CODE"; CALL IMPALA_STATS(sSCHEMA,sTABLE); END; CREATE OR REPLACE PROCEDURE IMPALA_STATS(psSCHEMA IN STRING, psTABLENAME IN STRING) AS sSQL STRING; BEGIN sSQL := '/directory/impalaTableStats ' || psSCHEMA || ' ' || psTABLENAME; PRINT sSQL; HOST sSQL; /* CAPTURE THE RETURN CODE FROM THE HOST STATEMENT */ IF HOSTCODE <> 0 THEN PRINT "ERROR: HOST COMMAND >>> " + sSQL + " <<< FAILED WITH RETURN CODE " + TO_CHAR(HOSTCODE) + "! PLEASE HELP! I'M LOST!"; END IF; END; ERROR: HOST COMMAND >>> /directory/impalaTableStats PROD_SCHEMA DIM_POSTAL_CODE <<< FAILED WITH RETURN CODE 1! PLEASE HELP! I'M LOST!
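Before wiring a wrapper script into HOST, run it once by hand. HOSTCODE is simply the exit status the shell reports, so whatever echo $? shows is what HPL/SQL will see (a sketch, using the same placeholder /directory/impalaTableStats path as above):

[smithbob@lnxserver ~]$ /directory/impalaTableStats PROD_SCHEMA DIM_POSTAL_CODE
[smithbob@lnxserver ~]$ echo $?   # 0 on success; anything else is what HOSTCODE will carry back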
Chapter 28 - Handling HPL/SQL Exceptions 1. 00000 - Successful execution of your SQL code. Note that the built-in variable SQLCODE will be set to 0 as well. 2. 01000 - No more data available; corresponds to SQLCODE being set to 100. 3. 02000 - A SQL error occurred; corresponds to SQLCODE being set to -1. ...statements... DECLARE ...statements... BEGIN ...statements... EXCEPTION WHEN condition-1 THEN ...statements... WHEN condition-2 THEN ...statements... ... WHEN condition-n THEN ...statements... WHEN OTHERS THEN ...statements... END; ...statements... DECLARE ...statements... BEGIN ...statements... EXCEPTION WHEN OTHERS THEN ...statements... END; DECLARE dVAL double := -999.0; dNUM int := 5; dDEN int := 0; BEGIN --DIVIDE dNUM BY dDEN...WHAT COULD POSSIBLY GO WRONG? dVAL := dNUM / dDEN; PRINT dVAL; EXCEPTION WHEN OTHERS THEN PRINT ">>>>> A BIG HONKIN' ERROR OCCURRED!! <<<<<"; END; >>>>> A BIG HONKIN' ERROR OCCURRED!! <<<<< SET HPLSQL.ONERROR = configuration-option; * EXCEPTION - If an error occurs, an exception is raised. How this is handled depends on whether you code an EXCEPTION WHEN OTHERS condition or you create your own condition handler. We discuss condition handlers later on in this chapter. (This option is the default behavior.) * SETERROR - If an error occurs, the built-in variables SQLCODE, ERRORCODE and HOSTCODE (if applicable) are set and the execution continues with the next statement. You can capture the codes from these built-in variables using an IF Statement. * STOP - If an error occurs, HPL/SQL stops executing the program and exits. set hplsql.onerror=seterror; DECLARE dVAL double := -999.0; dNUM int := 5; dDEN int := 0; BEGIN --DIVIDE dNUM BY dDEN...WHAT COULD POSSIBLY GO WRONG? dVAL := dNUM / dDEN; PRINT dVAL; EXCEPTION WHEN OTHERS THEN PRINT ">>>>> A BIG HONKIN' ERROR OCCURRED!! <<<<<"; END; -999.0 DECLARE user-defined-condition-name CONDITION; DECLARE control-option HANDLER FOR handler-option BEGIN ...statements... END; DECLARE EXIT HANDLER FOR user-defined-condition-name BEGIN ...statements... END; set hplsql.onerror=exception; DECLARE dVAL double := -999.0; dNUM int := 5; dDEN int := 0; BEGIN --DECLARE A USER-DEFINED CONDITION BELOW. DECLARE zero_divide CONDITION; DECLARE EXIT HANDLER FOR zero_divide BEGIN PRINT ">>>>> DIVISION BY ZERO IS A NO-NO!! <<<<<"; END; --DIVIDE dNUM BY dDEN...WHAT COULD POSSIBLY GO WRONG? IF dDEN = 0 THEN SIGNAL zero_divide; ELSE dVAL := dNUM / dDEN; END IF; END; >>>>> DIVISION BY ZERO IS A NO-NO!! <<<<< DECLARE EXIT HANDLER FOR SQLEXCEPTION BEGIN ...statements... END; set hplsql.conn.default=impala; set hplsql.onerror=exception; DECLARE iCNT int := -1; sSQL string; BEGIN DECLARE EXIT HANDLER FOR SQLEXCEPTION BEGIN PRINT ">>>>> UH-OH! THERE'S BEEN A SQL ERROR!! <<<<<"; END; sSQL := " SELECT COUNT(DISTINCT STATE_CODE) AS DIST_STATE FROM DIM_POSTAL_CODA "; EXECUTE sSQL INTO iCNT; PRINT iCNT; END; >>>>> UH-OH! THERE'S BEEN A SQL ERROR!! <<<<< GET DIAGNOSTICS EXCEPTION 1 hplsql-variable-name = MESSAGE_TEXT; DECLARE EXIT HANDLER FOR SQLEXCEPTION BEGIN GET DIAGNOSTICS EXCEPTION 1 sSQLERR = MESSAGE_TEXT; PRINT ">>>>> UH-OH! THERE'S BEEN A SQL ERROR!! <<<<<"; PRINT "THE ACTUAL ERROR MESSAGE IS => " + sSQLERR; END; >>>>> UH-OH! THERE'S BEEN A SQL ERROR!! <<<<< THE ACTUAL ERROR MESSAGE IS => [Cloudera][ImpalaJDBCDriver](500051) ERROR processing query/statement.
Error Code: 0, SQL state: TStatus(statusCode:ERROR_STATUS, sqlState:HY000, errorMessage:AuthorizationException: User 'smithbob' does not have privileges to execute 'SELECT' on: prod_schema.dim_postal_coda ), Query: SELECT COUNT(DISTINCT STATE_CODE) AS DIST_STATE FROM DIM_POSTAL_CODA. set hplsql.conn.default=impala; set hplsql.onerror=exception; DECLARE iCNT int := -1; sSQL string; BEGIN DECLARE EXIT HANDLER FOR NOT FOUND BEGIN PRINT ">>>>> NOT FOUND! <<<<<"; END; OPEN csrSTATECODES FOR 'SELECT DISTINCT STATE_CODE FROM DIM_POSTAL_CODA'; END; >>>>> NOT FOUND! <<<<< PART VI - Updating Your Database Chapter 29 - Database Import/Export Using sqoop sqoop list-tables --connect "jdbc:mysql://remotehost:3306/retail_db?serverTimezone=UTC" --username username --password password INFO sqoop.Sqoop: Running Sqoop version: 1.4.7-cdh6.3.2 WARN tool.BaseSqoopTool: Setting your password on the command-line is insecure. Consider using -P instead. INFO manager.MySQLManager: Preparing to use a MySQL streaming resultset. categories customers departments order_items orders products [smithbob@lnxserver ~]$ sqoop list-databases --connect "jdbc:mysql://remotehost:3306/retail_db?serverTimezone=UTC" --username username --password password information_schema amon metastore mysql oozie performance_schema retail_db rman scm [smithbob@lnxserver ~]$ sqoop import --query "select purchase_id, to_char(purchase_date,'YYYYMMDD') as purchase_date, purchase_item from purchase where \$CONDITIONS" sqoop import --table purchase sqoop import --table purchase --map-column-java unnecessarily_verbose_description=String CAST(INPUT_DATE/1000 AS TIMESTAMP) AS FINAL_DATE sqoop import -Dmapreduce.map.memory.mb=32768 -Dmapreduce.map.java.opts=-Xmx16g --table purchase select category_id,category_department_id,category_name from retail_db.categories where \$CONDITIONS sqoop import --connect "jdbc:mysql://remotehost:3306/retail_db?serverTimezone=UTC" --username username --password password --query "select category_id,category_department_id,category_name from retail_db.categories WHERE \$CONDITIONS" --target-dir /user/hive/warehouse/tmp_categories INFO mapreduce.Job: map 0% reduce 0% INFO mapreduce.Job: map 100% reduce 0% INFO mapreduce.Job: Job job_1652113797137_0001 completed successfully INFO mapreduce.Job: Counters: 33 File System Counters FILE: Number of bytes read=0 FILE: Number of bytes written=242714 FILE: Number of read operations=0 FILE: Number of large read operations=0 FILE: Number of write operations=0 HDFS: Number of bytes read=85 HDFS: Number of bytes written=1029 HDFS: Number of read operations=6 HDFS: Number of large read operations=0 HDFS: Number of write operations=2 HDFS: Number of bytes read erasure-coded=0 ...snip... File Input Format Counters Bytes Read=0 File Output Format Counters Bytes Written=1029 INFO mapreduce.ImportJobBase: Transferred 1.0049 KB in 37.9923 seconds (27.0844 bytes/sec) INFO mapreduce.ImportJobBase: Retrieved 58 records. [smithbob@lnxserver ~]$ hadoop fs -ls -R /user/hive/warehouse/tmp_categories -rw-r--r-- 3 osboxes hive 1029 2022-05-09 23:04 /user/hive/warehouse/tmp_categories/part-m-00000 [smithbob@lnxserver ~]$ [smithbob@lnxserver ~]$ hadoop fs -cat /user/hive/warehouse/tmp_categories/part-m-00000 1,2,Football 2,2,Soccer ...snip... 57,8,MLB Players 58,8,NFL Players
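The job counters report Retrieved 58 records; trust, but verify. Each mapper writes one line per row, so counting the lines in the part files should agree (a quick check of my own, using the target directory from the run above):

[smithbob@lnxserver ~]$ hadoop fs -cat /user/hive/warehouse/tmp_categories/part-m-* | wc -l
58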
sqoop import --connect "jdbc:mysql://remotehost:3306/retail_db?serverTimezone=UTC" --username username --password password --table categories --warehouse-dir /user/hive/warehouse [smithbob@lnxserver ~]$ hadoop fs -ls -R /user/hive/warehouse/categories -rw-r--r-- 3 osboxes hive 271 2022-05-10 19:33 /user/hive/warehouse/categories/part-m-00000 -rw-r--r-- 3 osboxes hive 263 2022-05-10 19:33 /user/hive/warehouse/categories/part-m-00001 -rw-r--r-- 3 osboxes hive 266 2022-05-10 19:33 /user/hive/warehouse/categories/part-m-00002 -rw-r--r-- 3 osboxes hive 229 2022-05-10 19:33 /user/hive/warehouse/categories/part-m-00003 [smithbob@lnxserver ~]$ sqoop import --connect "jdbc:mysql://remotehost:3306/retail_db?serverTimezone=UTC" --username username --password password --table categories --columns category_id,category_department_id,category_name --warehouse-dir /user/hive/warehouse sqoop import --connect "jdbc:mysql://remotehost:3306/retail_db?serverTimezone=UTC" --username username --password password --table categories --columns category_id,category_department_id,category_name --warehouse-dir /user/hive/warehouse --delete-target-dir sqoop import --connect "jdbc:mysql://remotehost:3306/retail_db?serverTimezone=UTC" --username username --password password --table categories --columns category_id,category_department_id,category_name --warehouse-dir /user/hive/warehouse --append [smithbob@lnxserver ~]$ hadoop fs -ls -R /user/hive/warehouse/categories -rw-r--r-- 3 osboxes hive 271 2022-05-10 19:33 /user/hive/warehouse/categories/part-m-00000 -rw-r--r-- 3 osboxes hive 263 2022-05-10 19:33 /user/hive/warehouse/categories/part-m-00001 -rw-r--r-- 3 osboxes hive 266 2022-05-10 19:33 /user/hive/warehouse/categories/part-m-00002 -rw-r--r-- 3 osboxes hive 229 2022-05-10 19:33 /user/hive/warehouse/categories/part-m-00003 -rw-r--r-- 3 osboxes supergroup 271 2022-05-10 19:53 /user/hive/warehouse/categories/part-m-00004 -rw-r--r-- 3 osboxes supergroup 263 2022-05-10 19:53 /user/hive/warehouse/categories/part-m-00005 -rw-r--r-- 3 osboxes supergroup 266 2022-05-10 19:53 /user/hive/warehouse/categories/part-m-00006 -rw-r--r-- 3 osboxes supergroup 229 2022-05-10 19:53 /user/hive/warehouse/categories/part-m-00007 sqoop import --hive-database prod_schema --hive-table categories --hive-import --hive-overwrite --connect "jdbc:mysql://remotehost:3306/retail_db?serverTimezone=UTC" --username username --password password --table categories --columns category_id,category_department_id,category_name --warehouse-dir /user/hive/warehouse --delete-target-dir [smithbob@lnxserver ~]$ impala-shell [hdpserver:21000] prod_schema> invalidate metadata categories; [hdpserver:21000] prod_schema> select * from categories; +-------------+------------------------+----------------------+ | category_id | category_department_id | category_name | +-------------+------------------------+----------------------+ | 1 | 2 | Football | | 2 | 2 | Soccer | ...snip...
| 28 | 5 | Top Brands | | 29 | 5 | Shop By Sport | +-------------+------------------------+----------------------+ [hdpserver:21000] prod_schema> sqoop import --hive-database prod_schema --hive-table categories --hive-import --hive-overwrite --connect "jdbc:mysql://remotehost:3306/retail_db?serverTimezone=UTC" --username username --password password --table categories --columns category_id,category_department_id,category_name --null-string '' --null-non-string '' --warehouse-dir /user/hive/warehouse --delete-target-dir mysql> desc categories; +------------------------+-------------+------+-----+---------+----------------+ | Field | Type | Null | Key | Default | Extra | +------------------------+-------------+------+-----+---------+----------------+ | category_id | int(11) | NO | PRI | NULL | auto_increment | | category_department_id | int(11) | NO | | NULL | | | category_name | varchar(45) | NO | | NULL | | +------------------------+-------------+------+-----+---------+----------------+ INFO db.DataDrivenDBInputFormat: BoundingValsQuery: SELECT MIN(`category_id`), MAX(`category_id`) FROM `categories` INFO db.IntegerSplitter: Split size: 14; Num splits: 4 from: 1 to: 58 INFO mapreduce.JobSubmitter: number of splits:4 Job Counters Launched map tasks=4 Other local map tasks=4 Total time spent by all maps in occupied slots (ms)=12806656 Total time spent by all reduces in occupied slots (ms)=0 Total time spent by all map tasks (ms)=25013 Total vcore-milliseconds taken by all map tasks=25013 Total megabyte-milliseconds taken by all map tasks=12806656 sqoop import --connect "jdbc:mysql://remotehost:3306/retail_db?serverTimezone=UTC" --username username --password password --query "select category_id,category_department_id,category_name from categories where \$CONDITIONS" --split-by category_department_id --delete-target-dir --target-dir /user/osboxes/categories --hive-database default --hive-table categories --hive-import --hive-overwrite INFO db.DataDrivenDBInputFormat: BoundingValsQuery: SELECT MIN(category_department_id), MAX(category_department_id) FROM ( select category_id,category_department_id,category_name from categories where (1 = 1) ) AS t1 INFO db.IntegerSplitter: Split size: 1; Num splits: 4 from: 2 to: 8 INFO mapreduce.JobSubmitter: number of splits:4 sqoop import --connect "jdbc:mysql://remotehost:3306/retail_db?serverTimezone=UTC" --username username --password password --query "select category_id,category_department_id,category_name from categories where \$CONDITIONS" --split-by category_department_id --num-mappers 2 --delete-target-dir --target-dir /user/osboxes/categories --hive-database default --hive-table categories --hive-import --hive-overwrite INFO db.DataDrivenDBInputFormat: BoundingValsQuery: SELECT MIN(category_department_id), MAX(category_department_id) FROM ( select category_id,category_department_id,category_name from categories where (1 = 1) ) AS t1 INFO db.IntegerSplitter: Split size: 3; Num splits: 2 from: 2 to: 8 INFO mapreduce.JobSubmitter: number of splits:2 [smithbob@lnxserver ~]$ hadoop fs -ls -R hdfs://quickstart-bigdata:8020/user/hive/warehouse/categories -rwxrwxrwt 3 osboxes supergroup 403 2022-05-11 00:32 hdfs://quickstart-bigdata:8020/user/hive/warehouse/categories/part-m-00000 -rwxrwxrwt 3 osboxes supergroup 626 2022-05-11 00:32 hdfs://quickstart-bigdata:8020/user/hive/warehouse/categories/part-m-00001 [smithbob@lnxserver ~]$ sqoop import --connect "jdbc:mysql://remotehost:3306/retail_db?serverTimezone=UTC" --username username --password password 
--query "select category_id,category_department_id,category_name from categories where \$CONDITIONS" --split-by category_department_id --num-mappers 2 --fetch-size 10000 --delete-target-dir --target-dir /user/osboxes/categories --hive-database default --hive-table categories --hive-import --hive-overwrite SELECT * FROM ( SELECT /*+ NO_PARALLEL */ A.*, MOD(A.YOUR_COLUMN,MAPPERS) AS SPLITCOL FROM YOUR_TABLE_NAME A ) create table candybar_consumption_data( consumer_id tinyint, candybar_name varchar(20), survey_year smallint, gender varchar(1), overall_rating tinyint, number_bars_consumed smallint ); sqoop export --connect "jdbc:mysql://remotehost:3306/retail_db?serverTimezone=UTC" --username username --password password --table candybar_consumption_data --export-dir /user/hive/warehouse/candybar_consumption_data --fields-terminated-by '\001' [smithbob@lnxserver ~]$ hadoop fs -cat /user/hive/warehouse/candybar_consumption_data/blah_blah_data.0. INFO mapreduce.ExportJobBase: Transferred 3.2119 KB in 33.7468 seconds (97.4612 bytes/sec) INFO mapreduce.ExportJobBase: Exported 36 records. mysql> select * from candybar_consumption_data; +-------------+---------------+-------------+--------+----------------+----------------------+ | consumer_id | candybar_name | survey_year | gender | overall_rating | number_bars_consumed | +-------------+---------------+-------------+--------+----------------+----------------------+ | 5 | HERSHEY BAR | 2010 | M | 8 | 15 | | 5 | HERSHEY BAR | 2011 | M | 6 | 5 | | 5 | SNICKERS BAR | 2009 | M | 8 | 55 | | 5 | SNICKERS BAR | 2010 | M | 8 | 65 | | 5 | SNICKERS BAR | 2011 | M | 8 | 75 | ...snip... | 4 | MARS BAR | 2011 | F | 7 | 15 | | 4 | TWIX BAR | 2009 | F | 7 | 20 | | 4 | TWIX BAR | 2010 | F | 7 | 30 | | 4 | TWIX BAR | 2011 | F | 7 | 10 | | 5 | HERSHEY BAR | 2009 | M | 8 | 15 | +-------------+---------------+-------------+--------+----------------+----------------------+ 36 rows in set (0.00 sec) https://www.oracle.com/database/technologies/datawarehouse-bigdata/big-data-connectors.html. https://docs.microsoft.com/en-us/sql/big-data-cluster/big-data-options?view=sql-server-ver15 https://docs.microsoft.com/en-us/sql/relational-databases/polybase/polybase-guide?view=sql-server-ver15 https://docs.teradata.com/r/Teradata-Parallel-Transporter-Reference/July-2017/DataConnector-Operator https://docs.teradata.com/r/Teradata-Parallel-Transporter-Reference/July-2017/DataConnector-Operator/Usage-Notes/Processing-Hadoop-Files-and-Tables. 
https://documentation.sas.com/doc/en/hadoopov/9.4/p1d3oooypq5aemn1e3t2cbkvxm6p.htm Chapter 30 - Using LOAD DATA to Load Data [smithbob@lnxserver ~]$ cp dim_postal_code.tsv new_postal_code.tsv [smithbob@lnxserver ~]$ hadoop fs -mkdir hdfs://lnxserver.com:8020/warehouse/tablespace/external/hive/new_postal_code [smithbob@lnxserver ~]$ hadoop fs -copyFromLocal /home/smithbob/new_postal_code.tsv hdfs://lnxserver.com:8020/warehouse/tablespace/external/hive/new_postal_code/new_postal_code.tsv [smithbob@lnxserver ~]$ hadoop fs -ls -R hdfs://lnxserver.com:8020/warehouse/tablespace/external/hive/new_postal_code -rw-r--r-- 3 hdfs supergroup 1784376 2022-03-30 10:22 hdfs://lnxserver.com:8020/warehouse/tablespace/external/hive/new_postal_code/new_postal_code.tsv [smithbob@lnxserver ~]$ hadoop fs -chmod 666 hdfs://lnxserver.com:8020/warehouse/tablespace/external/hive/new_postal_code CREATE TABLE NEW_POSTAL_CODE( POSTAL_CODE STRING, CITY STRING, STATE_CODE STRING, LATITUDE DOUBLE, LONGITUDE DOUBLE ) STORED AS TEXTFILE TBLPROPERTIES('transactional'='false'); ALTER TABLE NEW_POSTAL_CODE SET SERDEPROPERTIES('field.delim'='\t'); LOAD DATA INPATH '/warehouse/tablespace/external/hive/new_postal_code' INTO TABLE NEW_POSTAL_CODE; SELECT * FROM NEW_POSTAL_CODE LIMIT 10; +-------------+-------------+------------+-----------+--------------------+ | postal_code | city | state_code | latitude | longitude | +-------------+-------------+------------+-----------+--------------------+ | 00623 | CABO ROJO | PR | 18.08643 | -67.15222 | | 00633 | CAYEY | PR | 18.194527 | -66.18346699999999 | | 00640 | COAMO | PR | 18.077197 | -66.359104 | | 00676 | MOCA | PR | 18.37956 | -67.08423999999999 | | 00728 | PONCE | PR | 18.013353 | -66.65218 | | 00734 | PONCE | PR | 17.999499 | -66.643934 | | 00735 | CEIBA | PR | 18.258444 | -65.65987 | | 00748 | FAJARDO | PR | 18.326732 | -65.652484 | | 00766 | VILLALBA | PR | 18.126023 | -66.48208 | | 00771 | LAS PIEDRAS | PR | 18.18744 | -65.87088 | +-------------+-------------+------------+-----------+--------------------+ DESC FORMATTED NEW_POSTAL_CODE; Location: hdfs://lnxserver.com:8020/warehouse/tablespace/external/hive/new_postal_code Table Type: EXTERNAL_TABLE ALTER TABLE NEW_POSTAL_CODE SET TBLPROPERTIES('EXTERNAL'='FALSE'); LOAD DATA INPATH '/warehouse/tablespace/external/hive/new_postal_code' OVERWRITE INTO TABLE NEW_POSTAL_CODE; LOAD DATA INPATH '/directory/...' OVERWRITE INTO TABLE partitioned-table-name PARTITION (partition-column-1 = value-1, partition-column-2 = value-2, ... partition-column-n = value-n); LOAD DATA INPATH '/directory/new_jersey/...' INTO TABLE USA_WEATHER_DATA PARTITION (state_code = 'NJ');
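A detail that bites people the first time: LOAD DATA INPATH moves the files into the table's directory rather than copying them, so when you load from a separate staging directory, that directory is empty afterward. A sketch with a hypothetical staging path:

hadoop fs -ls -R /data/prod/teams/prod_schema/staging_dir   # files are here before the LOAD DATA
# ...run the LOAD DATA INPATH '/data/prod/teams/prod_schema/staging_dir' statement...
hadoop fs -ls -R /data/prod/teams/prod_schema/staging_dir   # empty afterward; the files were moved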
Chapter 31 - Scheduling Jobs Using crontab [smithbob@lnxserver ~]$ crontab -e ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ "/tmp/crontab.zEaA9w" 0L, 0C 0 0 1 JAN * /home/smithbob/dbupdate crontab: installing new crontab [smithbob@lnxserver ~]$ */5 * * * * /home/smithbob/dbupdate 30 11 * * * /home/smithbob/dbupdate 30 23 * * * /home/smithbob/dbupdate 15 02 3 * * /home/smithbob/dbupdate 45 06 15 OCT * /home/smithbob/dbupdate 00 01 * * THU /home/smithbob/dbupdate 00 03 28 2 * /home/smithbob/dbupdate 00 03 30 4,6,9,11 * /home/smithbob/dbupdate 00 03 31 1,3,5,7,8,10,12 * /home/smithbob/dbupdate 00 01 * * FRI /home/smithbob/dbupdate 00 01 * * FRI [ $(date +\%d) -le 07 ] && (/home/smithbob/dbupdate) klist Ticket cache: FILE:/tmp/krb5cc_123083789_0DLaGV Default principal: SmithBob@COMPANY.COM Valid starting Expires Service principal 01/29/2022 07:54:34 01/29/2022 17:54:34 krbtgt/COMPANY.COM@COMPANY.COM renew until 02/05/2022 07:54:34 ktutil ktutil: addent -password -p smithbob@lnxserver.company.com -k 1 -e rc4-hmac Password for smithbob@lnxserver.company.com: ktutil: addent -password -p smithbob@lnxserver.company.com -k 1 -e aes256-cts Password for smithbob@lnxserver.company.com: ktutil: addent -password -p smithbob@lnxserver.company.com -k 1 -e arcfour-hmac-md5 Password for smithbob@lnxserver.company.com: ktutil: addent -password -p smithbob@lnxserver -k 1 -e rc4-hmac Password for smithbob@lnxserver: ktutil: addent -password -p smithbob@lnxserver -k 1 -e aes256-cts Password for smithbob@lnxserver: ktutil: addent -password -p smithbob@lnxserver -k 1 -e arcfour-hmac-md5 Password for smithbob@lnxserver: ktutil: addent -password -p smithbob -k 1 -e rc4-hmac Password for smithbob@COMPANY.COM: ktutil: addent -password -p smithbob -k 1 -e aes256-cts Password for smithbob@COMPANY.COM: ktutil: addent -password -p smithbob -k 1 -e arcfour-hmac-md5 Password for smithbob@COMPANY.COM: ktutil: wkt /home/smithbob/smithbob.keytab ktutil: quit kinit: Preauthentication failed while getting initial credentials chmod 644 /home/smithbob/smithbob.keytab kinit SmithBob@COMPANY.COM -k -t /home/smithbob/smithbob.keytab 0 0 * * TUE kinit SmithBob@COMPANY.COM -k -t /home/smithbob/smithbob.keytab; /home/smithbob/dbupdate
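One refinement worth adding to any of these entries (my suggestion, not from the chapter): cron mails or silently discards whatever your job prints, so redirect stdout and stderr into a log you can actually read the next morning:

00 01 * * FRI kinit SmithBob@COMPANY.COM -k -t /home/smithbob/smithbob.keytab; /home/smithbob/dbupdate >> /home/smithbob/dbupdate.log 2>&1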
Chapter 32 - Updating Your Hadoop Tables with make if [[ $1 == "DIM_US_STATE_MAPPING" ]] … case $1 in "DIM_US_STATE_MAPPING") ...;; esac DIM_US_STATE_MAPPING: DIM_US_STATE_MAPPING: echo "Target: $@" sqoop import --hive-database prod_schema ... --target-dir /data/prod/teams/prod_schema/TMP_$@ impala-shell -k -i hdpserver --database prod_schema -f $@.sql #------------------------------------------# # Global variables # #------------------------------------------# # Target Impala database TGTDB = prod_schema DIM_US_STATE_MAPPING: echo "Target: $@" sqoop import --hive-database ${TGTDB} ... --target-dir /data/prod/teams/prod_schema/TMP_$@ impala-shell -k -i hdpserver --database ${TGTDB} -f $@.sql make -f FabDeptDBUpdateFile DIM_US_STATE_MAPPING make --dry-run -f FabDeptDBUpdateFile DIM_US_STATE_MAPPING echo "Target: DIM_US_STATE_MAPPING" sqoop import --hive-database prod_schema ... --target-dir /data/prod/teams/prod_schema/TMP_DIM_US_STATE_MAPPING impala-shell -k -i hdpserver --database prod_schema -f DIM_US_STATE_MAPPING.sql make -f FabDeptDBUpdateFile DIM_US_STATE_MAPPING DIM_US_CENSUS_REGION usstates: DIM_US_STATE_MAPPING DIM_US_POSTAL_CODE DIM_US_CENSUS_REGION #------------------------------------------# # Global variables # #------------------------------------------# # Target Impala database TGTDB = prod_schema # Target Groups usstates: DIM_US_STATE_MAPPING DIM_US_POSTAL_CODE DIM_US_CENSUS_REGION # Targets DIM_US_STATE_MAPPING: echo "Target: $@" sqoop import --hive-database ${TGTDB} ... --target-dir /data/prod/teams/prod_schema/TMP_$@ impala-shell -k -i hdpserver --database ${TGTDB} -f $@.sql DIM_US_POSTAL_CODE: echo "Target: $@" sqoop import --hive-database ${TGTDB} ... --target-dir /data/prod/teams/prod_schema/TMP_$@ impala-shell -k -i hdpserver --database ${TGTDB} -f $@.sql DIM_US_CENSUS_REGION: echo "Target: $@" sqoop import --hive-database ${TGTDB} ... --target-dir /data/prod/teams/prod_schema/TMP_$@ impala-shell -k -i hdpserver --database ${TGTDB} -f $@.sql make -f FabDeptDBUpdateFile usstates PART VII - Advanced Topics I Chapter 33 - Accessing the Hive MetaStore * DBS - This table contains the database names and other information: - DB_ID (BIGINT) - The database identifier - NAME (VARCHAR(128)) - The name of the database - DESC (VARCHAR(4000)) - The description of the database * TBLS - This table contains the table names and other information: - DB_ID (BIGINT) - The database identifier - TBL_ID (BIGINT) - The table identifier - SD_ID (BIGINT) - The table-column link identifier - TBL_NAME (VARCHAR(128)) - The name of the table - TBL_TYPE (VARCHAR(128)) - The type of the table - OWNER (VARCHAR(767)) - The name of the owner/creator of the table (e.g., smithbob) * COLUMNS_V2 - This table contains the column names and other information - CD_ID (BIGINT) - The column identifier - COLUMN_NAME (VARCHAR(128)) - The name of the column - TYPE_NAME (VARCHAR(4000)) - The name of the column's data type - INTEGER_IDX (INT) - The column's location number in the CREATE TABLE statement (starts from 0) * SDS - This table is used to link the tables to their associated columns - SD_ID (BIGINT) - The table-column link identifier (or storage information ID) - CD_ID (BIGINT) - The column identifier * PARTITION_KEYS - Although the table COLUMNS_V2 contains the column names, data types, and so on, if the table is partitioned, the columns used to partition the table don't appear in COLUMNS_V2, but rather they appear in this table. - TBL_ID (BIGINT) - The table identifier - PKEY_NAME (VARCHAR(128)) - The partition key column name (akin to COLUMNS_V2.COLUMN_NAME) - PKEY_TYPE (VARCHAR(767)) - The partition key column data type (akin to COLUMNS_V2.TYPE_NAME) - INTEGER_IDX (INT) - The column's location number in the CREATE TABLE statement (starts from 0) SELECT UPPER(D.NAME) AS DATABASE_NAME, UPPER(T.TBL_NAME) AS TABLE_NAME, T.OWNER AS TABLE_OWNER FROM DBS D INNER JOIN TBLS T ON D.DB_ID=T.DB_ID ORDER BY 1,2; SELECT UPPER(D.NAME) AS DATABASE_NAME, UPPER(T.TBL_NAME) AS TABLE_NAME, C.INTEGER_IDX AS COLUMN_ID, UPPER(C.COLUMN_NAME) AS COLUMN_NAME, UPPER(T.OWNER) AS TABLE_OWNER, UPPER(C.TYPE_NAME) AS DATA_TYPE, CAST(1 AS TINYINT) AS RESULT_ORDER FROM DBS D INNER JOIN TBLS T ON D.DB_ID=T.DB_ID INNER JOIN SDS S ON T.SD_ID=S.SD_ID INNER JOIN COLUMNS_V2 C ON S.CD_ID=C.CD_ID ORDER BY 1,2; ...snip...
SELECT 1 AS RESULT_ORDER, INTEGER_IDX, COLUMN_NAME, TYPE_NAME FROM COLUMNS_V2 UNION ALL SELECT 2 AS RESULT_ORDER, INTEGER_IDX, PKEY_NAME AS COLUMN_NAME, PKEY_TYPE AS TYPE_NAME FROM PARTITION_KEYS ORDER BY 1,2 ...snip... CREATE EXTERNAL TABLE PROD_SCHEMA.ALL_TABLES( DATABASE_NAME STRING, TABLE_NAME STRING, TABLE_OWNER STRING ) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' STORED AS TEXTFILE LOCATION '/hdfs-directory/all_tables'; CREATE EXTERNAL TABLE PROD_SCHEMA.ALL_TAB_COLUMNS( DATABASE_NAME STRING, TABLE_NAME STRING, COLUMN_ID SMALLINT, COLUMN_NAME STRING, DATA_TYPE STRING, RESULT_ORDER TINYINT ) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' STORED AS TEXTFILE LOCATION '/hdfs-directory/all_tab_columns'; #!/bin/bash -v # Bring in the .bash_profile to capture the PATH. source $HOME/.bash_profile # Remove the temporary tables. rm -f $HOME/all_tables.tsv rm -f $HOME/all_tab_columns.tsv # Unload the MetaStore for the tables temporarily to $HOME mysql -hhostname -uusername -ppassword --database=dbname --skip-column-names -e "SELECT UPPER(D.NAME) AS DATABASE_NAME,UPPER(T.TBL_NAME) AS TABLE_NAME, T.OWNER AS TABLE_OWNER FROM DBS D INNER JOIN TBLS T ON D.DB_ID=T.DB_ID" > $HOME/all_tables.tsv # Unload the MetaStore for the tables/columns temporarily to $HOME mysql -hhostname -uusername -ppassword --database=dbname --skip-column-names -e "SELECT UPPER(D.NAME) AS DATABASE_NAME,UPPER(T.TBL_NAME) AS TABLE_NAME,C.INTEGER_IDX AS COLUMN_ID,UPPER(C.COLUMN_NAME) AS COLUMN_NAME,UPPER(T.OWNER) AS TABLE_OWNER,UPPER(C.TYPE_NAME) AS DATA_TYPE,CAST(1 AS TINYINT) AS RESULT_ORDER FROM DBS D INNER JOIN TBLS T ON D.DB_ID=T.DB_ID INNER JOIN SDS S ON T.SD_ID=S.SD_ID INNER JOIN COLUMNS_V2 C ON S.CD_ID=C.CD_ID UNION ALL SELECT UPPER(D.NAME) AS DATABASE_NAME,UPPER(T.TBL_NAME) AS TABLE_NAME,P.INTEGER_IDX AS COLUMN_ID,UPPER(P.PKEY_NAME) AS COLUMN_NAME,UPPER(T.OWNER) AS TABLE_OWNER,UPPER(P.PKEY_TYPE) AS DATA_TYPE,CAST(2 AS TINYINT) AS RESULT_ORDER FROM DBS D INNER JOIN TBLS T ON D.DB_ID=T.DB_ID INNER JOIN SDS S ON T.SD_ID=S.SD_ID INNER JOIN PARTITION_KEYS P ON T.TBL_ID=P.TBL_ID" > $HOME/all_tab_columns.tsv # Copy the local all_tables.tsv over to the HDFS all_tables directory. # Note: The -f switch forces replacement if the file exists. hadoop fs -copyFromLocal -f $HOME/all_tables.tsv .../all_tables # Copy the local all_tab_columns.tsv over to the HDFS all_tab_columns directory # Note: The -f switch forces replacement if the file exists. hadoop fs -copyFromLocal -f $HOME/all_tab_columns.tsv .../all_tab_columns # Now that the files are located in their respective directories, we have # to tell Hadoop to recognize that the files have changed. 
impala-shell --quiet -i hdpserver --database=prod_schema --query "invalidate metadata all_tables;invalidate metadata all_tab_columns;refresh all_tables;refresh all_tab_columns;compute stats all_tables; compute stats all_tab_columns;" exit */10 * * * * updateMetadata hplsql.conn.mysqlconn com.mysql.jdbc.Driver;jdbc:mysql://hostname/dbname;username;password MySQL connection to the Hive MySQL Metastore create or replace function table_exists(psTBLNAME in string) return int as iCNT int; sSQL string; sDBNAME string; begin /* Check if the passed-in table name is null or not */ if (psTBLNAME is null) then return(null); end if; /* Connect to the MySQL MetaStore */ set hplsql.conn.default=mysqlconn; /* Set the default database: prod_schema */ sDBNAME := 'prod_schema'; /* Prepare the SQL code to query the MySQL database */ sSQL := "select count(*) from dbs d inner join tbls t on d.db_id=t.db_id where upper(t.tbl_name)='" || upper(psTBLNAME) || "' and upper(d.name)='" || upper(sDBNAME) || "'"; /* Execute the SQL query placing the results of the count into iCNT */ execute(sSQL) into iCNT; /* Reconnect to Impala since we are done with MySQL */ set hplsql.conn.default=impala; /* Return the appropriate return code */ if iCNT > 0 then return(1); else return(0); end if; end; create or replace function column_exists(psTBLNAME in string,psCOLNAME in string) return int as iCNT int; sSQL string; sDBNAME string; begin /* Check if the passed-in table name is null or not */ if (psTBLNAME is null) then return(null); end if; /* Check if the passed-in column name is null or not */ if (psCOLNAME is null) then return(null); end if; /* Connect to the MySQL MetaStore */ set hplsql.conn.default=mysqlconn; /* Set the default database: prod_schema */ sDBNAME := 'prod_schema'; /* Prepare the SQL code to query the MySQL database */ sSQL := "select count(*) from dbs d inner join tbls t on d.db_id=t.db_id inner join sds s on t.sd_id=s.sd_id inner join columns_v2 c on s.cd_id=c.cd_id where upper(t.tbl_name)='" || upper(psTBLNAME) || "' and upper(d.name)='" || upper(sDBNAME) || "' and upper(c.column_name)='" || upper(psCOLNAME) || "'"; /* Execute the SQL query placing the results of the count into iCNT */ execute(sSQL) into iCNT; /* Reconnect to Impala since we are done with MySQL */ set hplsql.conn.default=impala; /* Return the appropriate return code */ if iCNT > 0 then return(1); else return(0); end if; end; include table_exists.hplsql include column_exists.hplsql ...snip... iTblExists int; iColExists int; ...snip... iTblExists := table_exists(sTableName); iColExists := column_exists(sTableName,sColumnName); ...snip... Chapter 34 - Working with Impala Request Pools set request_pool=hdpserver_small_pool; execute("set request_pool=hdpserver_small_pool;"); Driver=Cloudera ODBC Driver for Impala;...;ssp_request_pool=hdpserver_small_pool; Chapter 35 - Making a Backup Copy of a Linux Directory cd /tmp tar --warning=no-file-changed -zvcf lnxserver_smithbob.tgz /home/smithbob mv lnxserver_smithbob.tgz $HOME
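Restoring is the half of a backup people forget to rehearse. The mirror-image commands (a sketch; GNU tar stored the archive with a relative path, so it unpacks under whatever directory you run it from):

cd /tmp
tar -zxvf $HOME/lnxserver_smithbob.tgz   # recreates /tmp/home/smithbob/...
# Then copy back only the files you actually need.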
Chapter 36 - Using ssh and scp from Linux and Windows ssh username@remote_machine ssh smithbob@lnxserver The authenticity of host 'lnxserver (10.20.30.40)' can't be established. ECDSA key fingerprint is SHA256:a7VrQZuy0jjufLNfpYtwixbarWiYBous36ARKOMtBsF. Are you sure you want to continue connecting (yes/no/[fingerprint])? Warning: Permanently added 'lnxserver,10.20.30.40' (ECDSA) to the list of known hosts. smithbob@lnxserver's password: usage: ssh [-46AaCfGgKkMNnqsTtVvXxYy] [-B bind_interface] [-b bind_address] [-c cipher_spec] [-D [bind_address:]port] [-E log_file] [-e escape_char] [-F configfile] [-I pkcs11] [-i identity_file] [-J [user@]host[:port]] [-L address] [-l login_name] [-m mac_spec] [-O ctl_cmd] [-o option] [-p port] [-Q query_option] [-R address] [-S ctl_path] [-W host:port] [-w local_tun[:remote_tun]] username@remote_server [command] ssh smithbob@lnxserver ls -alF total 25 drwxr-xr-x+ 1 smithbob None 0 Feb 28 13:02 ./ drwxrwxrwt+ 1 smithbob None 0 Feb 27 17:12 ../ -rw------- 1 smithbob None 218 Feb 28 14:06 .bash_history -rwxr-xr-x 1 smithbob None 1494 Feb 27 17:09 .bash_profile* -rwxr-xr-x 1 smithbob None 5645 Feb 27 17:09 .bashrc* -rwxr-xr-x 1 smithbob None 1919 Feb 27 17:09 .inputrc* -rwxr-xr-x 1 smithbob None 1236 Feb 27 17:09 .profile* drwx------+ 1 smithbob None 0 Feb 28 13:37 .ssh/ scp smithbob@lnxserver:/home/smithbob/pull_this_file.txt . usage: scp [-346BCpqrTv] [-c cipher] [-F ssh_config] [-i identity_file] [-J destination] [-l limit] [-o ssh_option] [-P port] [-S program] source target scp C:\borscht_recipe.txt smithbob@lnxserver:/home/smithbob/borscht_recipe.txt borscht_recipe.txt 92% 45KB 2.9MB/s 00:05 Using ssh and scp without a Password Step #1: Generate a Public and Private Key Pair ssh-keygen -t rsa Generating public/private rsa key pair. Enter file in which to save the key (/home/smithbob/.ssh/id_rsa): Enter passphrase (empty for no passphrase): Enter same passphrase again: Your identification has been saved in /home/smithbob/.ssh/id_rsa. Your public key has been saved in /home/smithbob/.ssh/id_rsa.pub. The key fingerprint is: A9:54:f1:21:fa:98:41:da:ba:05:8d:51:2d:10:e5:8f smithbob@lnxserver * Your private key is in a file named ~/.ssh/id_rsa * Your public key is in a file named ~/.ssh/id_rsa.pub Step #2: Copy your Public Key to your Remote Server mkdir .ssh Step #3: Using ssh/scp without a Password
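The middle of this recipe is elided above, so here is the usual sequence under stock OpenSSH (my sketch; remotehost stands in for whatever server you are targeting). After the mkdir on the remote side, append your public key to ~/.ssh/authorized_keys and tighten the permissions; Step #3 is then simply that ssh and scp stop asking:

# Push the public key over; you will type the password one last time.
cat ~/.ssh/id_rsa.pub | ssh smithbob@remotehost 'mkdir -p ~/.ssh && cat >> ~/.ssh/authorized_keys && chmod 700 ~/.ssh && chmod 600 ~/.ssh/authorized_keys'
# From now on, ssh smithbob@remotehost and scp against it proceed without a password prompt.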
Chapter 37 - The Linux /etc/skel Directory [smithbob@lnxserver ~]$ lsf /etc/skel total 28 drwxr-xr-x. 3 root root 78 Dec 31 15:10 ./ drwxr-xr-x. 175 root root 12288 Feb 28 08:53 ../ -rw-r--r--. 1 root root 18 Jul 27 2021 .bash_logout -rw-r--r--. 1 root root 141 Jul 27 2021 .bash_profile -rw-r--r--. 1 root root 376 Jul 27 2021 .bashrc drwxr-xr-x. 4 root root 39 Dec 31 15:10 .mozilla/ [smithbob@lnxserver ~]$ Chapter 38 - The parquet-tools and parquet-cli Utilities [smithbob@lnxserver ~]$ parquet-tools cat dim_calendar.parq [smithbob@lnxserver ~]$ hadoop jar /opt/cloudera/.../parquet-tools-1.10.99.7.1.7.0-551.jar cat hdfs://lnxserver.com:8020/.../7d4ef2af478388ef-5c3a50e700000000_2110153450_data.0.parq date_id = 18628 day = 1 month = 1 year = 2021 quarter = 1 yyyyddd = MjAyMTAwMQ== ddd = MDAx first_day_of_month = 18628 first_day_of_quarter = 18628 first_day_of_year = 18628 month_name = SmFudWFyeQ== weekday_name = RnJpZGF5 yyyyqq = MjAyMTAx yyyymm = MjAyMTAx yyyymmdd = MjAyMTAxMDE= date_long = SmFudWFyeSAwMSwgMjAyMQ== date_short = MDFKQU4yMDIx [smithbob@lnxserver ~]$ parq dim_calendar.parq --head date_id day month year quarter yyyyddd ddd first_day_of_month \ 0 2021-01-01 1 1 2021 1 2021001 001 2021-01-01 1 2021-01-02 2 1 2021 1 2021002 002 2021-01-01 2 2021-01-03 3 1 2021 1 2021003 003 2021-01-01 3 2021-01-04 4 1 2021 1 2021004 004 2021-01-01 4 2021-01-05 5 1 2021 1 2021005 005 2021-01-01 5 2021-01-06 6 1 2021 1 2021006 006 2021-01-01 6 2021-01-07 7 1 2021 1 2021007 007 2021-01-01 7 2021-01-08 8 1 2021 1 2021008 008 2021-01-01 8 2021-01-09 9 1 2021 1 2021009 009 2021-01-01 9 2021-01-10 10 1 2021 1 2021010 010 2021-01-01 first_day_of_quarter first_day_of_year month_name weekday_name yyyyqq \ 0 2021-01-01 2021-01-01 January Friday 202101 1 2021-01-01 2021-01-01 January Saturday 202101 2 2021-01-01 2021-01-01 January Sunday 202101 3 2021-01-01 2021-01-01 January Monday 202101 4 2021-01-01 2021-01-01 January Tuesday 202101 5 2021-01-01 2021-01-01 January Wednesday 202101 6 2021-01-01 2021-01-01 January Thursday 202101 7 2021-01-01 2021-01-01 January Friday 202101 8 2021-01-01 2021-01-01 January Saturday 202101 9 2021-01-01 2021-01-01 January Sunday 202101 yyyymm yyyymmdd date_long date_short 0 202101 20210101 January 01, 2021 01JAN2021 1 202101 20210102 January 02, 2021 02JAN2021 2 202101 20210103 January 03, 2021 03JAN2021 3 202101 20210104 January 04, 2021 04JAN2021 4 202101 20210105 January 05, 2021 05JAN2021 5 202101 20210106 January 06, 2021 06JAN2021 6 202101 20210107 January 07, 2021 07JAN2021 7 202101 20210108 January 08, 2021 08JAN2021 8 202101 20210109 January 09, 2021 09JAN2021 9 202101 20210110 January 10, 2021 10JAN2021 PART VIII - Advanced Topics II Chapter 39 - Quick Start Guide to Java Programming Please see the book for the code fragments presented there. Chapter 40 - Creating User-Defined Functions (UDFs) for ImpalaSQL import org.apache.hadoop.hive.ql.exec.UDF; public class your-class-name extends UDF { public int evaluate(Integer iMonthid,Integer iShiftValue) { import org.apache.hadoop.hive.ql.exec.UDF; import java.util.*; import java.text.SimpleDateFormat; import org.apache.commons.lang3.StringUtils; // Check for nulls. if (iMonthid == null || iShiftValue == null) { return -1; } import org.apache.hadoop.hive.ql.exec.UDF; import java.util.*; import java.text.SimpleDateFormat; import org.apache.commons.lang3.StringUtils; public class MonthIdShifter extends UDF { public MonthIdShifter() { } public int evaluate(Integer iMonthid,Integer iShiftValue) { try{ // Check for nulls. if (iMonthid == null || iShiftValue == null) { return -1; } // Convert iMonthId and iShiftValue to String String sMonthId = iMonthid.toString(); String sShiftValue = iShiftValue.toString(); // Compute the length of the string...must be length of 6 (YYYYMM).
if (sMonthId.length() != 6) { return -1; } // If iShiftValue is zero, just return the iMonthid if (iShiftValue == 0) { return iMonthid; } // Break up with YYYYMM into YYYY and MM. Set DD to 1. // Must subtract 1900 from year. Must subtract 1 from month. int iYYYY = Integer.parseInt(sMonthId.substring(0,4)) - 1900; // YYYY - 1900 int iMM = Integer.parseInt(sMonthId.substring(4)) - 1; // MM - 1 int iDD = 1; // DD // iMM should be in the range of 0 to 11. if (iMM < 0 || iMM > 11) { return -1; } // Create a Date object. Date dMonthId = new Date(iYYYY,iMM,iDD); // Add iShiftValue to dMonthId. Calendar cal = Calendar.getInstance(); cal.setTime(dMonthId); cal.add(Calendar.MONTH,iShiftValue); Date dMonthIdShifted = cal.getTime(); // Get back YYYYMM from dMonthIdShifted. SimpleDateFormat DATE_FORMAT = new SimpleDateFormat("yyyyMM"); int iYYYYMMShifted = Integer.parseInt(DATE_FORMAT.format(dMonthIdShifted)); return iYYYYMMShifted; } catch(IllegalArgumentException exception) { return -1; } catch(Exception exception) { return -1; } } } [smithbob@lnxserver ~]$ javac -cp $CLASSPATH:. MonthIdShifter.java [smithbob@lnxserver ~]$ jar -cfv MonthIdShifter.jar MonthIdShifter.class [smithbob@lnxserver ~]$ hadoop fs -copyFromLocal -f /home/smithbob/UDFDevel/MonthIdShifter.jar /data/prod/teams/prod_schema/UDF/MonthIdShifter.jar CREATE FUNCTION IF NOT EXISTS MONTHIDSHIFTER(INT,INT) RETURNS INT LOCATION '/data/prod/teams/prod_schema/UDF/MonthIdShifter.jar' SYMBOL='MonthIdShifter'; [hdpserver:21000] prod_schema> select monthidshifter(201712,12); +------------------------------------+ | monthidshifter(201712, 12) | +------------------------------------+ | 201812 | +------------------------------------+ [hdpserver:21000] prod_schema> show functions; +--------------+-------------------------------+-------------+---------------+ | return type | signature | binary type | is persistent | +--------------+-------------------------------+-------------+---------------+ | INT | monthidshifter(INT, INT) | JAVA | true | +--------------+-------------------------------+-------------+---------------+ CREATE FUNCTION IF NOT EXISTS MONTHIDSHIFTER LOCATION '/data/prod/teams/prod_schema/UDF/MonthIdShifter.jar' SYMBOL='MonthIdShifter'; PART IX - Appendages Appendage #1 - Hadoop Administrator E-Mail See above for the Hadoop Administrator E-Mail Appendage #2 - Linux on Windows https://www.cygwin.com/ Appendage #3 - When HPL/SQL Causes You Pain HPL/SQL Pre-Installed cd /home/smithbob mkdir tmp cd tmp cp /opt/cloudera/parcels/CDH-7.1.7-1.cdh7.1.7.p0.15945976/jars/hive-hplsql-3.1.3000.7.1.7.0-551.jar hive-hplsql-3.1.3000.7.1.7.0-551.jar jar -xvf hive-hplsql-3.1.3000.7.1.7.0-551.jar jar -cvf ../hive-hplsql-3.1.3000.7.1.7.0-551.jar ./* 1. Rename /opt/cloudera/parcels/CDH-7.1.7-1.cdh7.1.7.p0.15945976/jars/hive-hplsql-3.1.3000.7.1.7.0-551.jar to /opt/cloudera/parcels/CDH-7.1.7-1.cdh7.1.7.p0.15945976/jars/hive-hplsql-3.1.3000.7.1.7.0-551.jar_ORIG 2.
Copy hive-hplsql-3.1.3000.7.1.7.0-551.jar from /home/smithbob/ to /opt/cloudera/parcels/CDH-7.1.7-1.cdh7.1.7.p0.15945976/jars HPL/SQL Downloaded #!/bin/bash export "HADOOP_CLASSPATH=$HADOOP_CLASSPATH:/usr/lib/hadoop/*" export "HADOOP_CLASSPATH=$HADOOP_CLASSPATH:/usr/lib/hadoop/lib/*" export "HADOOP_CLASSPATH=$HADOOP_CLASSPATH:/etc/hadoop/conf" export "HADOOP_CLASSPATH=$HADOOP_CLASSPATH:/usr/lib/hadoop-mapreduce/*" export "HADOOP_CLASSPATH=$HADOOP_CLASSPATH:/usr/lib/hadoop-mapreduce/lib/*" export "HADOOP_CLASSPATH=$HADOOP_CLASSPATH:/usr/lib/hadoop-hdfs/*" export "HADOOP_CLASSPATH=$HADOOP_CLASSPATH:/usr/lib/hadoop-hdfs/lib/*" export "HADOOP_CLASSPATH=$HADOOP_CLASSPATH:/usr/lib/hadoop-yarn/*" export "HADOOP_CLASSPATH=$HADOOP_CLASSPATH:/usr/lib/hadoop-yarn/lib/*" export "HADOOP_CLASSPATH=$HADOOP_CLASSPATH:/usr/lib/hive/lib/*" export "HADOOP_CLASSPATH=$HADOOP_CLASSPATH:/usr/lib/hive/conf" export HADOOP_OPTS="$HADOOP_OPTS -Djava.library.path=/usr/lib/hadoop/lib/native" SCRIPTPATH=${0%/*} java -cp $SCRIPTPATH:$HADOOP_CLASSPATH:$SCRIPTPATH/hplsql-0.3.31.jar:$SCRIPTPATH/antlr-runtime-4.5.jar $HADOOP_OPTS org.apache.hive.hplsql.Hplsql "$@" * hive-jdbc-3.1.3000.7.1.7.0-551-standalone.jar * hive-exec.jar * hive-jdbc.jar * libthrift-0.9.3-1.jar * httpcore-4.4.10.jar * httpclient-4.5.6.jar * hadoop-common.jar * hadoop-hdfs.jar * hadoop-auth.jar * commons-cli-1.4.jar * commons-io-2.4.jar * hadoop-core-2.6.0-mr1-cdh5.16.2.jar * commons-logging-1.1.1.jar * hadoop-hdfs-client-3.1.1.7.1.7.0-551.jar * commons-collections-3.2.2.jar #!/bin/bash export "HADOOP_CLASSPATH=$HADOOP_CLASSPATH:/usr/lib/hadoop/*" export "HADOOP_CLASSPATH=$HADOOP_CLASSPATH:/usr/lib/hadoop/lib/*" export "HADOOP_CLASSPATH=$HADOOP_CLASSPATH:/etc/hadoop/conf" export "HADOOP_CLASSPATH=$HADOOP_CLASSPATH:/usr/lib/hadoop-mapreduce/*" export "HADOOP_CLASSPATH=$HADOOP_CLASSPATH:/usr/lib/hadoop-mapreduce/lib/*" export "HADOOP_CLASSPATH=$HADOOP_CLASSPATH:/usr/lib/hadoop-hdfs/*" export "HADOOP_CLASSPATH=$HADOOP_CLASSPATH:/usr/lib/hadoop-hdfs/lib/*" export "HADOOP_CLASSPATH=$HADOOP_CLASSPATH:/usr/lib/hadoop-yarn/*" export "HADOOP_CLASSPATH=$HADOOP_CLASSPATH:/usr/lib/hadoop-yarn/lib/*" export "HADOOP_CLASSPATH=$HADOOP_CLASSPATH:/usr/lib/hive/lib/*" export "HADOOP_CLASSPATH=$HADOOP_CLASSPATH:/usr/lib/hive/conf" export HADOOP_OPTS="$HADOOP_OPTS -Djava.library.path=/usr/lib/hadoop/lib/native" SCRIPTPATH=${0%/*} java -cp
.:$SCRIPTPATH:$HADOOP_CLASSPATH:$SCRIPTPATH/hplsql-0.3.31.jar:$SCRIPTPATH/antlr-runtime-4.5.jar:/opt/cloudera/parcels/CDH-7.1.7-1.cdh7.1.7.p0.15945976/jars/hive-jdbc-3.1.3000.7.1.7.0-551-standalone.jar:/opt/cloudera/parcels/CDH-7.1.7-1.cdh7.1.7.p0.15945976/lib/hive/lib/hive-exec.jar:/opt/cloudera/parcels/CDH-7.1.7-1.cdh7.1.7.p0.15945976/lib/hive/lib/hive-jdbc.jar:/opt/cloudera/parcels/CDH-7.1.7-1.cdh7.1.7.p0.15945976/lib/hive/lib/libthrift-0.9.3-1.jar:/opt/cloudera/parcels/CDH-7.1.7-1.cdh7.1.7.p0.15945976/lib/hive/lib/httpcore-4.4.10.jar:/opt/cloudera/parcels/CDH-7.1.7-1.cdh7.1.7.p0.15945976/lib/hive/lib/httpclient-4.5.6.jar:/opt/cloudera/parcels/CDH-7.1.7-1.cdh7.1.7.p0.15945976/lib/hadoop/hadoop-common.jar:/opt/cloudera/cm/lib/cdh5/mr1/hadoop-core-2.6.0-mr1-cdh5.16.2.jar:/opt/cloudera/parcels/CDH-7.1.7-1.cdh7.1.7.p0.15945976/lib/hadoop-hdfs/hadoop-hdfs.jar:/opt/cloudera/parcels/CDH-7.1.7-1.cdh7.1.7.p0.15945976/lib/hadoop/client/hadoop-auth.jar:/opt/cloudera/parcels/CDH-7.1.7-1.cdh7.1.7.p0.15945976/jars/commons-cli-1.4.jar:/opt/cloudera/parcels/CDH/jars/commons-io-2.4.jar:/opt/cloudera/cm/lib/commons-logging-1.1.1.jar:/opt/cloudera/parcels/CDH-7.1.7-1.cdh7.1.7.p0.15945976/jars/hadoop-hdfs-client-3.1.1.7.1.7.0-551.jar:/opt/cloudera/parcels/CDH-7.1.7-1.cdh7.1.7.p0.15945976/jars/commons-collections-3.2.2.jar:/home/smithbob/jars/ImpalaJDBC4.jar $HADOOP_OPTS org.apache.hive.hplsql.Hplsql "$@" SLF4J: Failed to load class "org.slf4j.impl.StaticLoggerBinder". SLF4J: Defaulting to no-operation (NOP) logger implementation SLF4J: See http://www.slf4j.org/codes.html#StaticLoggerBinder for further details. Appendage #4 - When Bad Errors Happen to Good Programmers Please see the book for the error messages as well as possible solutions to them. Appendage #5 - Where Do I Go from Here? * Websites - 7-Zip: https://www.7-zip.org/ - Antlr Website: https://www.antlr.org/ - Apache Hadoop: https://hadoop.apache.org/ - Apache Hadoop Commands: https://hadoop.apache.org/docs/stable/hadoop-project-dist/hadoop-common/CommandsManual.html - Apache Hive: https://hive.apache.org/ - Apache HiveQL Reference: https://cwiki.apache.org/confluence/display/Hive//LanguageManual - Apache Impala: https://impala.apache.org/ - Apache ImpalaSQL Reference: https://impala.apache.org/docs/build/html/topics/impala_langref.html - Apache Parquet: https://parquet.apache.org/ - Apache Kudu: https://kudu.apache.org/ - Apache Sqoop: https://sqoop.apache.org/ - Apache Sqoop Reference Manual: https://sqoop.apache.org/docs/1.4.7/SqoopUserGuide.html - CloneZilla: https://clonezilla.org/ - Cloudera: https://www.cloudera.com/ - Crontab Guru: https://crontab.guru/ - Cygwin: https://www.cygwin.com/ - Docker: https://www.docker.com/ - HPL/SQL: http://www.hplsql.org/home - HPL/SQL Reference Manual: http://www.hplsql.org/doc - Hortonworks: https://www.cloudera.com/products/hdp.html - Java: https://www.java.com - Macrium Reflect: https://www.macrium.com/reflectfree - MySQL Connectors: https://www.mysql.com/products/connector/ - Parquet Viewer: https://github.com/mukunku/ParquetViewer - PeaZip: https://www.peazip.org/ - Taco Bell World Domination Website: https://www.tacobell.com/ - UpGuard VMWare vs.
Docker: https://www.upguard.com/blog/docker-vs-vmware-how-do-they-stack-up - VirtualBox: https://www.virtualbox.org - VMWare: https://www.vmware.com/ * Books - Hadoop: The Definitive Guide by Tom White - Learning the vi and Vim Editors by Arnold Robbins and Elbert Hannah - Getting Started with Impala: Interactive SQL for Apache Hadoop by John Russell - Programming Hive: Data Warehouse and Query Language for Hadoop by Edward Capriolo, et al. - Mastering Regular Expressions by Jeffrey E.F. Friedl - Learning the Bash Shell by Cameron Newham - Managing Projects with GNU Make by Robert Mecklenburg - Java: How to Program by Paul Deitel and Harvey Deitel - The Definitive ANTLR 4 Reference by Terence Parr - Why I Loathe Linux Administrators by Linus Torvalds * Additional Items - Call your Mother.