$ head -n1 sample-data/customer_accounts_0_65000.dat 1000000000|1000000000|2013-06-09 $ head -n1 sample-data/transactions_0_65000.dat 1000000000|2017-05-12|2017-05-12|2017-05-12|loan-install|payment|303312.72|4.77|p908|2017-05-12|Fri 12 May 2017 01:02:25 PM |62267|OR|5046869|Gunnar O'Conner Jr.|Labore quidem vel officiis similique esse. Cumque quam distinctio quae dignissimos quaerat. Non corrupti natus possimus rerum.|122538.173913|LAK|GIP|1000000000 hdfs dfs -put /home/smayani.admin/sample-data/customer_accounts_0_65000.dat hdfs dfs -put /home/smayani.admin/sample-data/transactions_0_65000.dat CREATE TABLE IF NOT EXISTS customer_accounts_raw ( customer_id bigint ,account_id bigint ,effective_date string ) ROW FORMAT DELIMITED FIELDS TERMINATED by '|' STORED as TEXTFILE ; CREATE TABLE IF NOT EXISTS transactions_raw ( transaction_id bigint ,entry_date STRING ,post_date STRING ,effective_date string ,transaction_code string ,transaction_type string ,amount float ,quantity double ,product_category string ,test_date STRING ,test_datetime STRING ,transaction_location string ,transaction_state string ,sales_person_id bigint ,sales_person_name string ,random_string string ,random_number float ,original_ccy string ,reporting_ccy string ,account_id bigint ) ROW FORMAT DELIMITED FIELDS TERMINATED by '|' STORED as TEXTFILE ; LOAD DATA INPATH '/user/smayani.admin/customer_accounts_0_65000.dat' OVERWRITE INTO TABLE customer_accounts_raw; LOAD DATA INPATH '/user/smayani.admin/transactions_0_65000.dat' OVERWRITE INTO TABLE transactions_raw; sed -i -e 's/2017-05/2016-01/g' /home/smayani.admin/sample-data/transactions_0_65000.dat hdfs dfs -put /home/smayani.admin/sample-data/transactions_0_65000.dat hive -e "LOAD DATA INPATH '/user/smayani.admin/transactions_0_65000.dat' INTO TABLE db.transactions_raw;" sed -i -e 's/2016-01/2016-02/g' /home/smayani.admin/sample-data/transactions_0_65000.dat hdfs dfs -put /home/smayani.admin/sample-data/transactions_0_65000.dat hive -e "LOAD DATA INPATH '/user/smayani.admin/transactions_0_65000.dat' INTO TABLE db.transactions_raw;" sed -i -e 's/2016-02/2016-03/g' /home/smayani.admin/sample-data/transactions_0_65000.dat hdfs dfs -put /home/smayani.admin/sample-data/transactions_0_65000.dat hive -e "LOAD DATA INPATH '/user/smayani.admin/transactions_0_65000.dat' INTO TABLE db.transactions_raw;" sed -i -e 's/2016-03/2016-04/g' /home/smayani.admin/sample-data/transactions_0_65000.dat hdfs dfs -put /home/smayani.admin/sample-data/transactions_0_65000.dat hive -e "LOAD DATA INPATH '/user/smayani.admin/transactions_0_65000.dat' INTO TABLE db.transactions_raw;" sed -i -e 's/2016-04/2016-05/g' /home/smayani.admin/sample-data/transactions_0_65000.dat hdfs dfs -put /home/smayani.admin/sample-data/transactions_0_65000.dat hive -e "LOAD DATA INPATH '/user/smayani.admin/transactions_0_65000.dat' INTO TABLE db.transactions_raw;" sed -i -e 's/2016-05/2016-06/g' /home/smayani.admin/sample-data/transactions_0_65000.dat hdfs dfs -put /home/smayani.admin/sample-data/transactions_0_65000.dat hive -e "LOAD DATA INPATH '/user/smayani.admin/transactions_0_65000.dat' INTO TABLE db.transactions_raw;" sed -i -e 's/2016-06/2016-07/g' /home/smayani.admin/sample-data/transactions_0_65000.dat hdfs dfs -put /home/smayani.admin/sample-data/transactions_0_65000.dat hive -e "LOAD DATA INPATH '/user/smayani.admin/transactions_0_65000.dat' INTO TABLE db.transactions_raw;" sed -i -e 's/2016-07/2016-08/g' /home/smayani.admin/sample-data/transactions_0_65000.dat hdfs dfs -put /home/smayani.admin/sample-data/transactions_0_65000.dat hive -e "LOAD DATA INPATH '/user/smayani.admin/transactions_0_65000.dat' INTO TABLE db.transactions_raw;" CREATE TABLE IF NOT EXISTS customer_accounts_orc_200 ( customer_id bigint ,account_id bigint ,effective_date string ) CLUSTERED BY ( account_id) SORTED BY ( account_id ASC) INTO 200 BUCKETS STORED AS ORC; CREATE TABLE IF NOT EXISTS transactions_raw_orc_200 ( transaction_id bigint ,entry_date STRING ,post_date STRING ,effective_date string ,transaction_code string ,transaction_type string ,amount float ,quantity double ,product_category string ,test_date STRING ,test_datetime STRING ,transaction_location string ,transaction_state string ,sales_person_id bigint ,sales_person_name string ,random_string string ,random_number float ,original_ccy string ,reporting_ccy string ,account_id bigint ) PARTITIONED BY ( `year` int, `quarter` int) CLUSTERED BY ( account_id) SORTED BY ( account_id ASC) INTO 200 BUCKETS STORED AS ORC ; INSERT OVERWRITE TABLE customer_accounts_orc_200 SELECT * FROM customer_accounts_raw; INSERT OVERWRITE TABLE transactions_raw_orc_200 PARTITION (year, quarter) SELECT *, Year(transactions_raw.entry_date), (INT((MONTH(transactions_raw.entry_date)-1)/3)+1) FROM transactions_raw; hive -e "INSERT OVERWRITE TABLE db.transactions_raw_orc_200 PARTITION (year, quarter) SELECT *, Year(transactions_raw.entry_date), (INT((MONTH(transactions_raw.entry_date)-1)/3)+1) FROM db.transactions_raw;" explain select year,quarter,count(*) from db.transactions_raw_orc_200 a join db.customer_accounts_orc_200 b on a.account_id=b.account_id group by year,quarter; select year,quarter,count(*) from db.transactions_raw_orc_200 a join db.customer_accounts_orc_200 b on a.account_id=b.account_id group by year,quarter;