files, Configuration conf,
buf.append("no stats at ");
} else {
ColumnStatistics cs = ColumnStatisticsImpl.deserialize(colStats);
- Object min = RecordReaderImpl.getMin(cs), max = RecordReaderImpl.getMax(cs);
- buf.append(" count: ").append(cs.getNumberOfValues());
- buf.append(" min: ").append(min);
- buf.append(" max: ").append(max);
+ buf.append(cs.toString());
}
buf.append(" positions: ");
for (int posIx = 0; posIx < entry.getPositionsCount(); ++posIx) {
diff --git ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcFile.java ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcFile.java
index b46937c..0281c86 100644
--- ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcFile.java
+++ ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcFile.java
@@ -18,7 +18,14 @@
package org.apache.hadoop.hive.ql.io.orc;
-import java.io.IOException;
+import static org.apache.hadoop.hive.conf.HiveConf.ConfVars.HIVE_ORC_BLOOM_FILTER_COLUMNS;
+import static org.apache.hadoop.hive.conf.HiveConf.ConfVars.HIVE_ORC_DEFAULT_BLOCK_PADDING;
+import static org.apache.hadoop.hive.conf.HiveConf.ConfVars.HIVE_ORC_DEFAULT_BLOCK_SIZE;
+import static org.apache.hadoop.hive.conf.HiveConf.ConfVars.HIVE_ORC_DEFAULT_BUFFER_SIZE;
+import static org.apache.hadoop.hive.conf.HiveConf.ConfVars.HIVE_ORC_DEFAULT_COMPRESS;
+import static org.apache.hadoop.hive.conf.HiveConf.ConfVars.HIVE_ORC_DEFAULT_ROW_INDEX_STRIDE;
+import static org.apache.hadoop.hive.conf.HiveConf.ConfVars.HIVE_ORC_DEFAULT_STRIPE_SIZE;
+import static org.apache.hadoop.hive.conf.HiveConf.ConfVars.HIVE_ORC_WRITE_FORMAT;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
@@ -26,7 +33,7 @@
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
-import static org.apache.hadoop.hive.conf.HiveConf.ConfVars.*;
+import java.io.IOException;
/**
* Contains factory methods to read or write ORC files.
@@ -148,7 +155,8 @@ private WriterVersion(int id) {
ROW_INDEX_STRIDE("orc.row.index.stride"),
ENABLE_INDEXES("orc.create.index"),
BLOCK_PADDING("orc.block.padding"),
- ENCODING_STRATEGY("orc.encoding.strategy");
+ ENCODING_STRATEGY("orc.encoding.strategy"),
+ BLOOM_FILTER_COLUMNS("orc.bloom.filter.columns");
private final String propName;
@@ -256,6 +264,8 @@ public static Reader createReader(Path path,
private EncodingStrategy encodingStrategy;
private CompressionStrategy compressionStrategy;
private float paddingTolerance;
+ private String bloomFilterColumns;
+ private double bloomFilterFpp;
WriterOptions(Configuration conf) {
configuration = conf;
@@ -288,9 +298,12 @@ public static Reader createReader(Path path,
compressionStrategy = CompressionStrategy.valueOf(compString);
}
- paddingTolerance =
- conf.getFloat(HiveConf.ConfVars.HIVE_ORC_BLOCK_PADDING_TOLERANCE.varname,
- HiveConf.ConfVars.HIVE_ORC_BLOCK_PADDING_TOLERANCE.defaultFloatVal);
+ paddingTolerance = conf.getFloat(HiveConf.ConfVars.HIVE_ORC_BLOCK_PADDING_TOLERANCE.varname,
+ HiveConf.ConfVars.HIVE_ORC_BLOCK_PADDING_TOLERANCE.defaultFloatVal);
+
+ bloomFilterColumns = HiveConf.getVar(conf, HIVE_ORC_BLOOM_FILTER_COLUMNS);
+ bloomFilterFpp = conf.getFloat(HiveConf.ConfVars.HIVE_ORC_BLOOM_FILTER_FPP.varname,
+ HiveConf.ConfVars.HIVE_ORC_BLOOM_FILTER_FPP.defaultFloatVal);
}
/**
@@ -367,6 +380,14 @@ public WriterOptions paddingTolerance(float value) {
}
/**
+   * Comma-separated list of column names for which a bloom filter is to be created.
+ */
+ public WriterOptions bloomFilterColumns(String columns) {
+ bloomFilterColumns = columns;
+ return this;
+ }
+
+ /**
* Sets the generic compression that is used to compress the data.
*/
public WriterOptions compress(CompressionKind value) {
@@ -438,8 +459,8 @@ public static Writer createWriter(Path path,
opts.memoryManagerValue, opts.blockPaddingValue,
opts.versionValue, opts.callback,
opts.encodingStrategy, opts.compressionStrategy,
- opts.paddingTolerance,
- opts.blockSizeValue);
+ opts.paddingTolerance, opts.blockSizeValue,
+ opts.bloomFilterColumns, opts.bloomFilterFpp);
}
/**
diff --git ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java
index 5be2b4f..b7841d3 100644
--- ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java
+++ ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java
@@ -18,18 +18,9 @@
package org.apache.hadoop.hive.ql.io.orc;
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-import java.util.TreeMap;
-import java.util.NavigableMap;
-import java.util.concurrent.ExecutorService;
-import java.util.concurrent.Executors;
-import java.util.concurrent.TimeUnit;
-import java.util.concurrent.atomic.AtomicInteger;
+import com.google.common.cache.Cache;
+import com.google.common.cache.CacheBuilder;
+import com.google.common.util.concurrent.ThreadFactoryBuilder;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
@@ -54,9 +45,9 @@
import org.apache.hadoop.hive.ql.io.StatsProvidingRecordReader;
import org.apache.hadoop.hive.ql.io.sarg.PredicateLeaf;
import org.apache.hadoop.hive.ql.io.sarg.SearchArgument;
+import org.apache.hadoop.hive.ql.io.sarg.SearchArgument.TruthValue;
import org.apache.hadoop.hive.ql.io.sarg.SearchArgumentFactory;
import org.apache.hadoop.hive.ql.log.PerfLogger;
-import org.apache.hadoop.hive.ql.io.sarg.SearchArgument.TruthValue;
import org.apache.hadoop.hive.ql.plan.TableScanDesc;
import org.apache.hadoop.hive.serde2.ColumnProjectionUtils;
import org.apache.hadoop.hive.serde2.SerDeStats;
@@ -74,9 +65,18 @@
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.util.StringUtils;
-import com.google.common.cache.Cache;
-import com.google.common.cache.CacheBuilder;
-import com.google.common.util.concurrent.ThreadFactoryBuilder;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.NavigableMap;
+import java.util.TreeMap;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicInteger;
/**
* A MapReduce/Hive input format for ORC files.
*
@@ -923,9 +923,8 @@ private boolean isStripeSatisfyPredicate(StripeStatistics stripeStatistics,
stripeStatistics.getColumnStatistics()[filterColumns[pred]];
Object minValue = RecordReaderImpl.getMin(stats);
Object maxValue = RecordReaderImpl.getMax(stats);
- truthValues[pred] =
- RecordReaderImpl.evaluatePredicateRange(predLeaves.get(pred),
- minValue, maxValue);
+ truthValues[pred] = RecordReaderImpl.evaluatePredicateRange(predLeaves.get(pred),
+ minValue, maxValue, stats.getBloomFilter());
} else {
// parition column case.
diff --git ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcOutputFormat.java ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcOutputFormat.java
index 5bd3f0c..d76e25f 100644
--- ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcOutputFormat.java
+++ ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcOutputFormat.java
@@ -170,6 +170,11 @@ private String getSettingFromPropsFallingBackToConf(String key, Properties props
options.encodingStrategy(EncodingStrategy.valueOf(propVal));
}
+ if ((propVal = getSettingFromPropsFallingBackToConf(
+ OrcFile.OrcTableProperties.BLOOM_FILTER_COLUMNS.getPropName(),props,conf)) != null){
+ options.bloomFilterColumns(propVal);
+ }
+
return options;
}
diff --git ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReaderImpl.java ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReaderImpl.java
index f7fce3f..0f68b5f 100644
--- ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReaderImpl.java
+++ ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReaderImpl.java
@@ -19,19 +19,7 @@
import static org.apache.hadoop.hive.conf.HiveConf.ConfVars.HIVE_ORC_ZEROCOPY;
-import java.io.EOFException;
-import java.io.IOException;
-import java.math.BigDecimal;
-import java.math.BigInteger;
-import java.nio.ByteBuffer;
-import java.sql.Timestamp;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.HashMap;
-import java.util.LinkedHashMap;
-import java.util.List;
-import java.util.Map;
-import java.util.TreeMap;
+import com.google.common.collect.ComparisonChain;
import org.apache.commons.lang.StringUtils;
import org.apache.commons.lang.builder.HashCodeBuilder;
@@ -51,6 +39,7 @@
import org.apache.hadoop.hive.ql.exec.vector.TimestampUtils;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.hadoop.hive.ql.exec.vector.expressions.StringExpr;
+import org.apache.hadoop.hive.ql.io.filters.BloomFilter;
import org.apache.hadoop.hive.ql.io.sarg.PredicateLeaf;
import org.apache.hadoop.hive.ql.io.sarg.SearchArgument;
import org.apache.hadoop.hive.ql.io.sarg.SearchArgument.TruthValue;
@@ -74,7 +63,20 @@
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
-import com.google.common.collect.ComparisonChain;
+import java.io.EOFException;
+import java.io.IOException;
+import java.math.BigDecimal;
+import java.math.BigInteger;
+import java.nio.ByteBuffer;
+import java.sql.Date;
+import java.sql.Timestamp;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.TreeMap;
class RecordReaderImpl implements RecordReader {
@@ -2366,11 +2368,11 @@ static TruthValue evaluatePredicate(OrcProto.ColumnStatistics index,
ColumnStatistics cs = ColumnStatisticsImpl.deserialize(index);
Object minValue = getMin(cs);
Object maxValue = getMax(cs);
- return evaluatePredicateRange(predicate, minValue, maxValue);
+ return evaluatePredicateRange(predicate, minValue, maxValue, cs.getBloomFilter());
}
static TruthValue evaluatePredicateRange(PredicateLeaf predicate, Object min,
- Object max) {
+ Object max, BloomFilter bloomFilter) {
// if we didn't have any values, everything must have been null
if (min == null) {
if (predicate.getOperator() == PredicateLeaf.Operator.IS_NULL) {
@@ -2395,89 +2397,107 @@ static TruthValue evaluatePredicateRange(PredicateLeaf predicate, Object min,
Object predObj = getBaseObjectForComparison(baseObj, minValue);
switch (predicate.getOperator()) {
- case NULL_SAFE_EQUALS:
- loc = compareToRange((Comparable) predObj, minValue, maxValue);
- if (loc == Location.BEFORE || loc == Location.AFTER) {
- return TruthValue.NO;
- } else {
- return TruthValue.YES_NO;
- }
- case EQUALS:
- loc = compareToRange((Comparable) predObj, minValue, maxValue);
- if (minValue.equals(maxValue) && loc == Location.MIN) {
- return TruthValue.YES_NULL;
- } else if (loc == Location.BEFORE || loc == Location.AFTER) {
- return TruthValue.NO_NULL;
- } else {
- return TruthValue.YES_NO_NULL;
- }
- case LESS_THAN:
- loc = compareToRange((Comparable) predObj, minValue, maxValue);
- if (loc == Location.AFTER) {
- return TruthValue.YES_NULL;
- } else if (loc == Location.BEFORE || loc == Location.MIN) {
- return TruthValue.NO_NULL;
- } else {
- return TruthValue.YES_NO_NULL;
- }
- case LESS_THAN_EQUALS:
- loc = compareToRange((Comparable) predObj, minValue, maxValue);
- if (loc == Location.AFTER || loc == Location.MAX) {
- return TruthValue.YES_NULL;
- } else if (loc == Location.BEFORE) {
- return TruthValue.NO_NULL;
- } else {
- return TruthValue.YES_NO_NULL;
- }
- case IN:
- if (minValue.equals(maxValue)) {
- // for a single value, look through to see if that value is in the
- // set
- for (Object arg : predicate.getLiteralList(PredicateLeaf.FileFormat.ORC)) {
- predObj = getBaseObjectForComparison(arg, minValue);
+ case NULL_SAFE_EQUALS:
+ loc = compareToRange((Comparable) predObj, minValue, maxValue);
+ if (loc == Location.BEFORE || loc == Location.AFTER) {
+ return TruthValue.NO;
+ } else {
+ return TruthValue.YES_NO;
+ }
+ case EQUALS:
+ if (bloomFilter != null) {
+ return checkInBloomFilter(bloomFilter, predObj);
+ } else {
loc = compareToRange((Comparable) predObj, minValue, maxValue);
- if (loc == Location.MIN) {
+ if (minValue.equals(maxValue) && loc == Location.MIN) {
return TruthValue.YES_NULL;
+ } else if (loc == Location.BEFORE || loc == Location.AFTER) {
+ return TruthValue.NO_NULL;
+ } else {
+ return TruthValue.YES_NO_NULL;
}
}
- return TruthValue.NO_NULL;
- } else {
- // are all of the values outside of the range?
- for (Object arg : predicate.getLiteralList(PredicateLeaf.FileFormat.ORC)) {
- predObj = getBaseObjectForComparison(arg, minValue);
- loc = compareToRange((Comparable) predObj, minValue, maxValue);
- if (loc == Location.MIN || loc == Location.MIDDLE ||
- loc == Location.MAX) {
- return TruthValue.YES_NO_NULL;
+ case LESS_THAN:
+ loc = compareToRange((Comparable) predObj, minValue, maxValue);
+ if (loc == Location.AFTER) {
+ return TruthValue.YES_NULL;
+ } else if (loc == Location.BEFORE || loc == Location.MIN) {
+ return TruthValue.NO_NULL;
+ } else {
+ return TruthValue.YES_NO_NULL;
+ }
+ case LESS_THAN_EQUALS:
+ loc = compareToRange((Comparable) predObj, minValue, maxValue);
+ if (loc == Location.AFTER || loc == Location.MAX) {
+ return TruthValue.YES_NULL;
+ } else if (loc == Location.BEFORE) {
+ return TruthValue.NO_NULL;
+ } else {
+ return TruthValue.YES_NO_NULL;
+ }
+ case IN:
+ if (minValue.equals(maxValue)) {
+ // for a single value, look through to see if that value is in the
+ // set
+ for (Object arg : predicate.getLiteralList(PredicateLeaf.FileFormat.ORC)) {
+ predObj = getBaseObjectForComparison(arg, minValue);
+
+ if (bloomFilter != null) {
+ if (checkInBloomFilter(bloomFilter, predObj) != TruthValue.NO_NULL) {
+ return TruthValue.YES_NO_NULL;
+ }
+ } else {
+ loc = compareToRange((Comparable) predObj, minValue, maxValue);
+ if (loc == Location.MIN) {
+ return TruthValue.YES_NULL;
+ }
+ }
}
+ return TruthValue.NO_NULL;
+ } else {
+ // are all of the values outside of the range?
+ for (Object arg : predicate.getLiteralList(PredicateLeaf.FileFormat.ORC)) {
+ predObj = getBaseObjectForComparison(arg, minValue);
+
+ if (bloomFilter != null) {
+ if (checkInBloomFilter(bloomFilter, predObj) != TruthValue.NO_NULL) {
+ return TruthValue.YES_NO_NULL;
+ }
+ } else {
+ loc = compareToRange((Comparable) predObj, minValue, maxValue);
+ if (loc == Location.MIN || loc == Location.MIDDLE ||
+ loc == Location.MAX) {
+ return TruthValue.YES_NO_NULL;
+ }
+ }
+ }
+ return TruthValue.NO_NULL;
}
- return TruthValue.NO_NULL;
- }
- case BETWEEN:
- List