diff --git a/data/files/bf_parquet.txt b/data/files/bf_parquet.txt index e69de29..2a0bb2f 100644 --- a/data/files/bf_parquet.txt +++ b/data/files/bf_parquet.txt @@ -0,0 +1,24 @@ +41 +42 +48 +44 +45 +47 +31 +32 +34 +35 +37 +38 +141 +142 +148 +144 +145 +147 +131 +132 +134 +135 +137 +138 \ No newline at end of file diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/write/ParquetRecordWriterWrapper.java b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/write/ParquetRecordWriterWrapper.java index 9e2a9e1..a262198 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/write/ParquetRecordWriterWrapper.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/write/ParquetRecordWriterWrapper.java @@ -51,11 +51,16 @@ public ParquetRecordWriterWrapper( final Progressable progress, Properties tableProperties) throws IOException { try { + // Only default the bloom filter settings; values already in the job configuration win. + jobConf.setIfUnset(ParquetOutputFormat.ENABLE_BLOOM_FILTER, "true"); + jobConf.setIfUnset(ParquetOutputFormat.EXPACTED_ENTRIES, "10000"); + // create a TaskInputOutputContext TaskAttemptID taskAttemptID = TaskAttemptID.forName(jobConf.get("mapred.task.id")); if (taskAttemptID == null) { taskAttemptID = new TaskAttemptID(); } + taskContext = ContextUtil.newTaskAttemptContext(jobConf, taskAttemptID); LOG.info("initialize serde with table properties."); diff --git a/ql/src/test/queries/clientpositive/parquet_bloom_filter.q b/ql/src/test/queries/clientpositive/parquet_bloom_filter.q index e69de29..094c20c 100644 --- a/ql/src/test/queries/clientpositive/parquet_bloom_filter.q +++ b/ql/src/test/queries/clientpositive/parquet_bloom_filter.q @@ -0,0 +1,20 @@ +SET hive.optimize.ppd=true; +SET hive.optimize.index.filter=true; +SET parquet.enable.bloom.filter=true; +SET parquet.expected.entries=10000; + +CREATE TABLE bfTbl_staging(id int +) ROW FORMAT DELIMITED +FIELDS TERMINATED BY '|' +COLLECTION ITEMS TERMINATED BY ',' +MAP KEYS TERMINATED BY ':' +NULL DEFINED AS ''; + +CREATE TABLE bfTbl(id int +) STORED AS PARQUET; + +LOAD DATA LOCAL 
INPATH '../../data/files/bf_parquet.txt' OVERWRITE INTO TABLE bfTbl_staging; + +INSERT OVERWRITE TABLE bfTbl SELECT * FROM bfTbl_staging; + +SELECT * FROM bfTbl WHERE id=36; \ No newline at end of file diff --git a/ql/src/test/results/clientpositive/parquet_bloom_filter.q.out b/ql/src/test/results/clientpositive/parquet_bloom_filter.q.out index e69de29..07a62e6 100644 --- a/ql/src/test/results/clientpositive/parquet_bloom_filter.q.out +++ b/ql/src/test/results/clientpositive/parquet_bloom_filter.q.out @@ -0,0 +1,53 @@ +PREHOOK: query: CREATE TABLE bfTbl_staging(id int +) ROW FORMAT DELIMITED +FIELDS TERMINATED BY '|' +COLLECTION ITEMS TERMINATED BY ',' +MAP KEYS TERMINATED BY ':' +NULL DEFINED AS '' +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@bfTbl_staging +POSTHOOK: query: CREATE TABLE bfTbl_staging(id int +) ROW FORMAT DELIMITED +FIELDS TERMINATED BY '|' +COLLECTION ITEMS TERMINATED BY ',' +MAP KEYS TERMINATED BY ':' +NULL DEFINED AS '' +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@bfTbl_staging +PREHOOK: query: CREATE TABLE bfTbl(id int +) STORED AS PARQUET +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@bfTbl +POSTHOOK: query: CREATE TABLE bfTbl(id int +) STORED AS PARQUET +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@bfTbl +PREHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/bf_parquet.txt' OVERWRITE INTO TABLE bfTbl_staging +PREHOOK: type: LOAD +#### A masked pattern was here #### +PREHOOK: Output: default@bftbl_staging +POSTHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/bf_parquet.txt' OVERWRITE INTO TABLE bfTbl_staging +POSTHOOK: type: LOAD +#### A masked pattern was here #### +POSTHOOK: Output: default@bftbl_staging +PREHOOK: query: INSERT OVERWRITE TABLE bfTbl SELECT * FROM bfTbl_staging +PREHOOK: type: QUERY +PREHOOK: Input: default@bftbl_staging +PREHOOK: Output: 
default@bftbl +POSTHOOK: query: INSERT OVERWRITE TABLE bfTbl SELECT * FROM bfTbl_staging +POSTHOOK: type: QUERY +POSTHOOK: Input: default@bftbl_staging +POSTHOOK: Output: default@bftbl +POSTHOOK: Lineage: bftbl.id SIMPLE [(bftbl_staging)bftbl_staging.FieldSchema(name:id, type:int, comment:null), ] +PREHOOK: query: SELECT * FROM bfTbl WHERE id=36 +PREHOOK: type: QUERY +PREHOOK: Input: default@bftbl +#### A masked pattern was here #### +POSTHOOK: query: SELECT * FROM bfTbl WHERE id=36 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@bftbl +#### A masked pattern was here ####