diff --git hbase-handler/src/java/org/apache/hadoop/hive/hbase/HiveHBaseTableInputFormat.java hbase-handler/src/java/org/apache/hadoop/hive/hbase/HiveHBaseTableInputFormat.java index 87bcab2..663ebd9 100644 --- hbase-handler/src/java/org/apache/hadoop/hive/hbase/HiveHBaseTableInputFormat.java +++ hbase-handler/src/java/org/apache/hadoop/hive/hbase/HiveHBaseTableInputFormat.java @@ -30,6 +30,9 @@ import org.apache.hadoop.hbase.client.HTable; import org.apache.hadoop.hbase.client.Result; import org.apache.hadoop.hbase.client.Scan; +import org.apache.hadoop.hbase.filter.FilterList; +import org.apache.hadoop.hbase.filter.FirstKeyOnlyFilter; +import org.apache.hadoop.hbase.filter.KeyOnlyFilter; import org.apache.hadoop.hbase.io.ImmutableBytesWritable; import org.apache.hadoop.hbase.mapred.TableMapReduceUtil; import org.apache.hadoop.hbase.mapreduce.TableInputFormatBase; @@ -133,27 +136,26 @@ } } - // The HBase table's row key maps to a Hive table column. In the corner case when only the - // row key column is selected in Hive, the HBase Scan will be empty i.e. no column family/ - // column qualifier will have been added to the scan. We arbitrarily add at least one column - // to the HBase scan so that we can retrieve all of the row keys and return them as the Hive - // tables column projection. + // HIVE-5277 HBase handler skips rows with null valued first cells when only row key is selected + // When the row key is the only selected column, the normal process will skip rows. + // http://hbase.apache.org/book/perf.reading.html 12.9.6. Optimal Loading of Row Keys describes how to load row + // keys only. if (empty) { - for (int i = 0; i < columnsMapping.size(); i++) { - ColumnMapping colMap = columnsMapping.get(i); - if (colMap.hbaseRowKey) { - continue; - } - - if (colMap.qualifierName == null) { - scan.addFamily(colMap.familyNameBytes); - } else { - scan.addColumn(colMap.familyNameBytes, colMap.qualifierNameBytes); - } + if (readAllColumns) { + for (int i = 0; i < columnsMapping.size(); i++) { + ColumnMapping colMap = columnsMapping.get(i); + if (colMap.hbaseRowKey) { + continue; + } - if (!readAllColumns) { - break; + if (colMap.qualifierName == null) { + scan.addFamily(colMap.familyNameBytes); + } else { + scan.addColumn(colMap.familyNameBytes, colMap.qualifierNameBytes); + } } + } else { + scan.setFilter(new FilterList(new FirstKeyOnlyFilter(), new KeyOnlyFilter())); } } diff --git hbase-handler/src/test/queries/positive/hbase_null_cell.q hbase-handler/src/test/queries/positive/hbase_null_cell.q new file mode 100644 index 0000000..ebe4472 --- /dev/null +++ hbase-handler/src/test/queries/positive/hbase_null_cell.q @@ -0,0 +1,24 @@ +DROP TABLE src_null; +DROP TABLE hbase_null; + + +CREATE TABLE src_null(a STRING, b STRING, c STRING, d STRING) STORED AS TEXTFILE; +LOAD DATA LOCAL INPATH '../data/files/null.txt' INTO TABLE src_null; + +CREATE TABLE hbase_null(key string, col1 string, col2 string) +STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler' +WITH SERDEPROPERTIES ( +"hbase.columns.mapping" = ":key,cf1:c1,cf1:c2" +); + +SELECT d, a, c FROM src_null; + +INSERT INTO TABLE hbase_null SELECT d, a, c FROM src_null; + + +SELECT COUNT(d) FROM src_null; +SELECT COUNT(key) FROM hbase_null; + + +DROP TABLE src_null; +DROP TABLE hbase_null; diff --git hbase-handler/src/test/results/positive/hbase_null_cell.q.out hbase-handler/src/test/results/positive/hbase_null_cell.q.out new file mode 100644 index 0000000..925c879 --- /dev/null +++ hbase-handler/src/test/results/positive/hbase_null_cell.q.out @@ -0,0 +1,92 @@ +PREHOOK: query: DROP TABLE src_null +PREHOOK: type: DROPTABLE +POSTHOOK: query: DROP TABLE src_null +POSTHOOK: type: DROPTABLE +PREHOOK: query: DROP TABLE hbase_null +PREHOOK: type: DROPTABLE +POSTHOOK: query: DROP TABLE hbase_null +POSTHOOK: type: DROPTABLE +PREHOOK: query: CREATE TABLE src_null(a STRING, b STRING, c STRING, d STRING) STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE src_null(a STRING, b STRING, c STRING, d STRING) STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@src_null +PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/null.txt' INTO TABLE src_null +PREHOOK: type: LOAD +PREHOOK: Output: default@src_null +POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/null.txt' INTO TABLE src_null +POSTHOOK: type: LOAD +POSTHOOK: Output: default@src_null +PREHOOK: query: CREATE TABLE hbase_null(key string, col1 string, col2 string) +STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler' +WITH SERDEPROPERTIES ( +"hbase.columns.mapping" = ":key,cf1:c1,cf1:c2" +) +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE hbase_null(key string, col1 string, col2 string) +STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler' +WITH SERDEPROPERTIES ( +"hbase.columns.mapping" = ":key,cf1:c1,cf1:c2" +) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@hbase_null +PREHOOK: query: SELECT d, a, c FROM src_null +PREHOOK: type: QUERY +PREHOOK: Input: default@src_null +#### A masked pattern was here #### +POSTHOOK: query: SELECT d, a, c FROM src_null +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src_null +#### A masked pattern was here #### +0 1.0 same +1 1.0 same +2 1.0 same +3 1.0 same +4 1.0 same +5 NULL same +6 NULL same +7 1.0 same +8 1.0 same +9 1.0 same +PREHOOK: query: INSERT INTO TABLE hbase_null SELECT d, a, c FROM src_null +PREHOOK: type: QUERY +PREHOOK: Input: default@src_null +PREHOOK: Output: default@hbase_null +POSTHOOK: query: INSERT INTO TABLE hbase_null SELECT d, a, c FROM src_null +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src_null +POSTHOOK: Output: default@hbase_null +PREHOOK: query: SELECT COUNT(d) FROM src_null +PREHOOK: type: QUERY +PREHOOK: Input: default@src_null +#### A masked pattern was here #### +POSTHOOK: query: SELECT COUNT(d) FROM src_null +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src_null +#### A masked pattern was here #### +10 +PREHOOK: query: SELECT COUNT(key) FROM hbase_null +PREHOOK: type: QUERY +PREHOOK: Input: default@hbase_null +#### A masked pattern was here #### +POSTHOOK: query: SELECT COUNT(key) FROM hbase_null +POSTHOOK: type: QUERY +POSTHOOK: Input: default@hbase_null +#### A masked pattern was here #### +10 +PREHOOK: query: DROP TABLE src_null +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@src_null +PREHOOK: Output: default@src_null +POSTHOOK: query: DROP TABLE src_null +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@src_null +POSTHOOK: Output: default@src_null +PREHOOK: query: DROP TABLE hbase_null +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@hbase_null +PREHOOK: Output: default@hbase_null +POSTHOOK: query: DROP TABLE hbase_null +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@hbase_null +POSTHOOK: Output: default@hbase_null