diff --git hbase-handler/src/java/org/apache/hadoop/hive/hbase/HiveHBaseInputFormatUtil.java hbase-handler/src/java/org/apache/hadoop/hive/hbase/HiveHBaseInputFormatUtil.java
index 0524572..c002070 100644
--- hbase-handler/src/java/org/apache/hadoop/hive/hbase/HiveHBaseInputFormatUtil.java
+++ hbase-handler/src/java/org/apache/hadoop/hive/hbase/HiveHBaseInputFormatUtil.java
@@ -18,9 +18,18 @@
 
 package org.apache.hadoop.hive.hbase;
 
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
 import org.apache.hadoop.hbase.HBaseConfiguration;
 import org.apache.hadoop.hbase.client.HTable;
 import org.apache.hadoop.hbase.client.Scan;
+import org.apache.hadoop.hbase.filter.FilterList;
+import org.apache.hadoop.hbase.filter.FirstKeyOnlyFilter;
+import org.apache.hadoop.hbase.filter.KeyOnlyFilter;
 import org.apache.hadoop.hbase.util.Bytes;
 import org.apache.hadoop.hive.hbase.ColumnMappings.ColumnMapping;
 import org.apache.hadoop.hive.ql.index.IndexSearchCondition;
@@ -28,12 +37,6 @@
 import org.apache.hadoop.hive.serde2.SerDeException;
 import org.apache.hadoop.mapred.JobConf;
 
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-
 /**
  * Util code common between HiveHBaseTableInputFormat and HiveHBaseTableSnapshotInputFormat.
  */
@@ -95,26 +98,27 @@ public static Scan getScan(JobConf jobConf) throws IOException {
       }
     }
 
-    // The HBase table's row key maps to a Hive table column. In the corner case when only the
-    // row key column is selected in Hive, the HBase Scan will be empty i.e. no column family/
-    // column qualifier will have been added to the scan. We arbitrarily add at least one column
-    // to the HBase scan so that we can retrieve all of the row keys and return them as the Hive
-    // tables column projection.
+    // For queries like count(key) or count(*), readColIDs is either empty (for count(*)) or
+    // contains only the key column. In either case nothing has been added to the scan yet.
+    // If readAllColumns is true, add every mapped column to the scan. Otherwise, add a
+    // key-only filter so the scan returns just the row keys, which is all that is needed
+    // to compute the count.
     if (empty) {
-      for (ColumnMapping colMap: columnMappings) {
-        if (colMap.hbaseRowKey || colMap.hbaseTimestamp) {
-          continue;
-        }
-
-        if (colMap.qualifierName == null) {
-          scan.addFamily(colMap.familyNameBytes);
-        } else {
-          scan.addColumn(colMap.familyNameBytes, colMap.qualifierNameBytes);
-        }
+      if (readAllColumns) {
+        for (ColumnMapping colMap: columnMappings) {
+          if (colMap.hbaseRowKey || colMap.hbaseTimestamp) {
+            continue;
+          }
 
-        if (!readAllColumns) {
-          break;
+          if (colMap.qualifierName == null) {
+            scan.addFamily(colMap.familyNameBytes);
+          } else {
+            scan.addColumn(colMap.familyNameBytes, colMap.qualifierNameBytes);
+          }
         }
+      } else {
+        // Add a filter so the scan returns only row keys but still visits every row
+        scan.setFilter(new FilterList(new FirstKeyOnlyFilter(), new KeyOnlyFilter()));
       }
     }
 
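Not part of the patch: a minimal standalone sketch of the key-only scan the new else branch configures. It assumes an HBase 0.98/1.x-era client (matching the HTable API the patch imports), a reachable cluster, and a hypothetical table "t1". FirstKeyOnlyFilter stops at the first cell of each row and KeyOnlyFilter strips cell values, so every row comes back as a single empty-valued cell: the cheapest scan that still visits every row key, which is exactly what count(key) and count(*) need.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.ResultScanner;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.filter.FilterList;
import org.apache.hadoop.hbase.filter.FirstKeyOnlyFilter;
import org.apache.hadoop.hbase.filter.KeyOnlyFilter;

public class KeyOnlyScanSketch {
  public static void main(String[] args) throws IOException {
    Configuration conf = HBaseConfiguration.create();
    // Hypothetical table name; any table with at least one column family works.
    HTable table = new HTable(conf, TableName.valueOf("t1"));
    try {
      Scan scan = new Scan();
      // Same FilterList as the patch sets in getScan(): each row surfaces as
      // exactly one cell with an empty value, regardless of how many columns
      // (or nulls in the first mapped column) the row actually has.
      scan.setFilter(new FilterList(new FirstKeyOnlyFilter(), new KeyOnlyFilter()));
      long rows = 0;
      ResultScanner scanner = table.getScanner(scan);
      try {
        for (Result r : scanner) {
          rows++; // one Result per row key, independent of column count
        }
      } finally {
        scanner.close();
      }
      System.out.println("row count: " + rows);
    } finally {
      table.close();
    }
  }
}

The qfile test below exercises this path end to end: it inserts rows whose first mapped column (cf1:c1) can be NULL, then checks that count(key) and count(*) still return the full row count.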
diff --git hbase-handler/src/test/queries/positive/hbase_null_first_col.q hbase-handler/src/test/queries/positive/hbase_null_first_col.q
new file mode 100644
index 0000000..0d9ff56
--- /dev/null
+++ hbase-handler/src/test/queries/positive/hbase_null_first_col.q
@@ -0,0 +1,22 @@
+DROP TABLE src_null;
+DROP TABLE hbase_null;
+
+CREATE TABLE src_null(a STRING, b STRING, c STRING, d STRING) STORED AS TEXTFILE;
+LOAD DATA LOCAL INPATH '../../data/files/null.txt' INTO TABLE src_null;
+
+CREATE TABLE hbase_null(key string, col1 string, col2 string)
+STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
+WITH SERDEPROPERTIES (
+"hbase.columns.mapping" = ":key,cf1:c1,cf1:c2"
+);
+
+SELECT d, a, c FROM src_null;
+
+INSERT INTO TABLE hbase_null SELECT d, a, c FROM src_null;
+
+SELECT COUNT(d) FROM src_null;
+SELECT COUNT(key) FROM hbase_null;
+SELECT COUNT(*) FROM hbase_null;
+
+DROP TABLE src_null;
+DROP TABLE hbase_null;
diff --git hbase-handler/src/test/results/positive/hbase_null_first_col.q.out hbase-handler/src/test/results/positive/hbase_null_first_col.q.out
new file mode 100644
index 0000000..bb4491b
--- /dev/null
+++ hbase-handler/src/test/results/positive/hbase_null_first_col.q.out
@@ -0,0 +1,109 @@
+PREHOOK: query: DROP TABLE src_null
+PREHOOK: type: DROPTABLE
+POSTHOOK: query: DROP TABLE src_null
+POSTHOOK: type: DROPTABLE
+PREHOOK: query: DROP TABLE hbase_null
+PREHOOK: type: DROPTABLE
+POSTHOOK: query: DROP TABLE hbase_null
+POSTHOOK: type: DROPTABLE
+PREHOOK: query: CREATE TABLE src_null(a STRING, b STRING, c STRING, d STRING) STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@src_null
+POSTHOOK: query: CREATE TABLE src_null(a STRING, b STRING, c STRING, d STRING) STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@src_null
+PREHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/null.txt' INTO TABLE src_null
+PREHOOK: type: LOAD
+#### A masked pattern was here ####
+PREHOOK: Output: default@src_null
+POSTHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/null.txt' INTO TABLE src_null
+POSTHOOK: type: LOAD
+#### A masked pattern was here ####
+POSTHOOK: Output: default@src_null
+PREHOOK: query: CREATE TABLE hbase_null(key string, col1 string, col2 string)
+STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
+WITH SERDEPROPERTIES (
+"hbase.columns.mapping" = ":key,cf1:c1,cf1:c2"
+)
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@hbase_null
+POSTHOOK: query: CREATE TABLE hbase_null(key string, col1 string, col2 string)
+STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
+WITH SERDEPROPERTIES (
+"hbase.columns.mapping" = ":key,cf1:c1,cf1:c2"
+)
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@hbase_null
+PREHOOK: query: SELECT d, a, c FROM src_null
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src_null
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT d, a, c FROM src_null
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src_null
+#### A masked pattern was here ####
+0	1.0	same
+1	1.0	same
+2	1.0	same
+3	1.0	same
+4	1.0	same
+5	NULL	same
+6	NULL	same
+7	1.0	same
+8	1.0	same
+9	1.0	same
+PREHOOK: query: INSERT INTO TABLE hbase_null SELECT d, a, c FROM src_null
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src_null
+PREHOOK: Output: default@hbase_null
+POSTHOOK: query: INSERT INTO TABLE hbase_null SELECT d, a, c FROM src_null
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src_null
+POSTHOOK: Output: default@hbase_null
+PREHOOK: query: SELECT COUNT(d) FROM src_null
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src_null
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT COUNT(d) FROM src_null
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src_null
+#### A masked pattern was here ####
+10
+PREHOOK: query: SELECT COUNT(key) FROM hbase_null
+PREHOOK: type: QUERY
+PREHOOK: Input: default@hbase_null
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT COUNT(key) FROM hbase_null
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@hbase_null
+#### A masked pattern was here ####
+10
+PREHOOK: query: SELECT COUNT(*) FROM hbase_null
+PREHOOK: type: QUERY
+PREHOOK: Input: default@hbase_null
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT COUNT(*) FROM hbase_null
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@hbase_null
+#### A masked pattern was here ####
+10
+PREHOOK: query: DROP TABLE src_null
+PREHOOK: type: DROPTABLE
+PREHOOK: Input: default@src_null
+PREHOOK: Output: default@src_null
+POSTHOOK: query: DROP TABLE src_null
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Input: default@src_null
+POSTHOOK: Output: default@src_null
+PREHOOK: query: DROP TABLE hbase_null
+PREHOOK: type: DROPTABLE
+PREHOOK: Input: default@hbase_null
+PREHOOK: Output: default@hbase_null
+POSTHOOK: query: DROP TABLE hbase_null
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Input: default@hbase_null
+POSTHOOK: Output: default@hbase_null