diff --git ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java
index 48f02ea..66b0cac 100644
--- ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java
+++ ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java
@@ -41,8 +41,11 @@
 import org.apache.hadoop.hive.ql.exec.vector.VectorizedInputFormatInterface;
 import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
 import org.apache.hadoop.hive.ql.io.InputFormatChecker;
+import org.apache.hadoop.hive.ql.io.sarg.PredicateLeaf;
 import org.apache.hadoop.hive.ql.io.sarg.SearchArgument;
+import org.apache.hadoop.hive.ql.io.sarg.SearchArgument.TruthValue;
 import org.apache.hadoop.hive.ql.plan.TableScanDesc;
+import org.apache.hadoop.hive.serde.serdeConstants;
 import org.apache.hadoop.hive.serde2.ColumnProjectionUtils;
 import org.apache.hadoop.hive.shims.HadoopShims;
 import org.apache.hadoop.hive.shims.ShimLoader;
@@ -165,7 +168,8 @@ static void includeColumnRecursive(List<OrcProto.Type> types,
   public static SearchArgument createSarg(List<OrcProto.Type> types, Configuration conf) {
     String serializedPushdown = conf.get(TableScanDesc.FILTER_EXPR_CONF_STR);
     if (serializedPushdown == null
-        || conf.get(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR) == null) {
+        || (conf.get(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR) == null
+            && conf.get(serdeConstants.LIST_COLUMNS) == null)) {
       LOG.info("No ORC pushdown predicate");
       return null;
     }
@@ -531,9 +535,48 @@ void createSplit(long offset, long length) throws IOException {
     public void run() {
       try {
         Reader orcReader = OrcFile.createReader(fs, file.getPath());
+        Configuration conf = context.conf;
+        List<OrcProto.Type> types = orcReader.getTypes();
+        SearchArgument sarg = createSarg(types, conf);
+        List<StripeStatistics> stripeStats = null;
+        int[] filterColumns = null;
+        if (sarg != null) {
+          List<PredicateLeaf> sargLeaves = null;
+          String[] columnNames = conf.get(serdeConstants.LIST_COLUMNS).split(",");
+          if (sarg != null) {
+            sargLeaves = sarg.getLeaves();
+            filterColumns = new int[sargLeaves.size()];
+            for (int i = 0; i < filterColumns.length; ++i) {
+              String colName = sargLeaves.get(i).getColumnName();
+              filterColumns[i] = RecordReaderImpl.findColumns(columnNames, colName);
+            }
+          }
+
+          Metadata metadata = orcReader.getMetadata();
+          stripeStats = metadata.getStripeStatisticsList();
+        }
+
         long currentOffset = -1;
         long currentLength = 0;
+        int idx = -1;
         for(StripeInformation stripe: orcReader.getStripes()) {
+          idx++;
+
+          // eliminate stripes that don't satisfy the predicate condition
+          if (sarg != null && !isStripeSatisfyPredicate(stripeStats.get(idx), sarg, filterColumns)) {
+            // if a stripe doesn't satisfy the predicate condition then skip it
+            if (LOG.isDebugEnabled()) {
+              LOG.debug("Eliminating ORC stripe-" + idx + " of file '" + file.getPath() +
+                  "' as it did not satisfy predicate condition.");
+            }
+            // create split for the previous unfinished stripe
+            if (currentOffset != -1) {
+              createSplit(currentOffset, currentLength);
+              currentOffset = -1;
+            }
+            continue;
+          }
+
           // if we are working on a stripe, over the min stripe size, and
           // crossed a block boundary, cut the input split here.
          if (currentOffset != -1 && currentLength > context.minSize &&
@@ -562,6 +605,55 @@ public void run() {
         }
       }
     }
+
+    private boolean isStripeSatisfyPredicate(StripeStatistics stripeStatistics,
+        SearchArgument sarg, int[] filterColumns) {
+      if (sarg != null && filterColumns != null) {
+        List<PredicateLeaf> predLeaves = sarg.getLeaves();
+        TruthValue[] truthValues = new TruthValue[predLeaves.size()];
+        for (int pred = 0; pred < truthValues.length; pred++) {
+          if (filterColumns[pred] != -1) {
+            // column statistics at index 0 contains only the number of rows
+            ColumnStatistics stats = stripeStatistics.getColumnStatistics()[filterColumns[pred] + 1];
+            Object minValue = getMin(stats);
+            Object maxValue = getMax(stats);
+            truthValues[pred] = RecordReaderImpl.evaluatePredicateRange(predLeaves.get(pred),
+                minValue, maxValue);
+          }
+        }
+        return sarg.evaluate(truthValues).isNeeded();
+      }
+      return true;
+    }
+
+    private Object getMax(ColumnStatistics index) {
+      if (index instanceof IntegerColumnStatistics) {
+        return ((IntegerColumnStatistics) index).getMaximum();
+      } else if (index instanceof DoubleColumnStatistics) {
+        return ((DoubleColumnStatistics) index).getMaximum();
+      } else if (index instanceof StringColumnStatistics) {
+        return ((StringColumnStatistics) index).getMaximum();
+      } else if (index instanceof DateColumnStatistics) {
+        return ((DateColumnStatistics) index).getMaximum();
+      } else {
+        return null;
+      }
+    }
+
+    private Object getMin(ColumnStatistics index) {
+      if (index instanceof IntegerColumnStatistics) {
+        return ((IntegerColumnStatistics) index).getMinimum();
+      } else if (index instanceof DoubleColumnStatistics) {
+        return ((DoubleColumnStatistics) index).getMinimum();
+      } else if (index instanceof StringColumnStatistics) {
+        return ((StringColumnStatistics) index).getMinimum();
+      } else if (index instanceof DateColumnStatistics) {
+        return ((DateColumnStatistics) index).getMinimum();
+      } else {
+        return null;
+      }
+    }
+
   }
 
   @Override
diff --git ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReaderImpl.java ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReaderImpl.java
index 71484a3..bd6b2be 100644
--- ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReaderImpl.java
+++ ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReaderImpl.java
@@ -132,7 +132,7 @@
     advanceToNextRow(0L);
   }
 
-  private static int findColumns(String[] columnNames,
+  static int findColumns(String[] columnNames,
                                  String columnName) {
     for(int i=0; i < columnNames.length; ++i) {
       if (columnName.equals(columnNames[i])) {
@@ -2025,6 +2025,11 @@ static TruthValue evaluatePredicate(OrcProto.ColumnStatistics index,
       }
     }
     Object maxValue = getMax(index);
+    return evaluatePredicateRange(predicate, minValue, maxValue);
+  }
+
+  static TruthValue evaluatePredicateRange(PredicateLeaf predicate, Object minValue,
+      Object maxValue) {
     Location loc;
     switch (predicate.getOperator()) {
       case NULL_SAFE_EQUALS:
diff --git ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestOrcSplitElimination.java ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestOrcSplitElimination.java
new file mode 100644
index 0000000..9494a18
--- /dev/null
+++ ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestOrcSplitElimination.java
@@ -0,0 +1,385 @@
+package org.apache.hadoop.hive.ql.io.orc;
+
+import static org.junit.Assert.assertEquals;
+
+import java.io.File;
+import java.io.IOException;
+import java.sql.Timestamp;
+import java.util.List;
+
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.common.type.HiveDecimal;
+import org.apache.hadoop.hive.ql.exec.Utilities;
+import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc;
+import org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc;
+import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
+import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPAnd;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPEqual;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPEqualOrLessThan;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapred.FileInputFormat;
+import org.apache.hadoop.mapred.InputFormat;
+import org.apache.hadoop.mapred.InputSplit;
+import org.apache.hadoop.mapred.JobConf;
+import org.junit.Before;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TestName;
+
+import com.google.common.collect.Lists;
+
+public class TestOrcSplitElimination {
+
+  public static class AllTypesRow {
+    Long userid;
+    Text string1;
+    Double subtype;
+    HiveDecimal decimal1;
+    Timestamp ts;
+
+    AllTypesRow(Long uid, String s1, Double d1, HiveDecimal decimal, Timestamp ts) {
+      this.userid = uid;
+      this.string1 = new Text(s1);
+      this.subtype = d1;
+      this.decimal1 = decimal;
+      this.ts = ts;
+    }
+  }
+
+  Path workDir = new Path(System.getProperty("test.tmp.dir",
+      "target" + File.separator + "test" + File.separator + "tmp"));
+
+  JobConf conf;
+  FileSystem fs;
+  Path testFilePath;
+
+  @Rule
+  public TestName testCaseName = new TestName();
+
+  @Before
+  public void openFileSystem() throws Exception {
+    conf = new JobConf();
+    conf.set("columns", "userid,string1,subtype,decimal1,ts");
+    conf.set("columns.types", "bigint,string,double,decimal,timestamp");
+    fs = FileSystem.getLocal(conf);
+    testFilePath = new Path(workDir, "TestOrcFile."
+        + testCaseName.getMethodName() + ".orc");
+    fs.delete(testFilePath, false);
+  }
+
+  @Test
+  public void testSplitEliminationSmallMaxSplit() throws Exception {
+    ObjectInspector inspector;
+    synchronized (TestOrcFile.class) {
+      inspector = ObjectInspectorFactory
+          .getReflectionObjectInspector(AllTypesRow.class,
+              ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
+    }
+    Writer writer = OrcFile.createWriter(fs, testFilePath, conf, inspector,
+        100000, CompressionKind.NONE, 10000, 10000);
+    writeData(writer);
+    writer.close();
+    conf.set("mapred.min.split.size", "1000");
+    conf.set("mapred.max.split.size", "5000");
+    InputFormat<?, ?> in = new OrcInputFormat();
+    FileInputFormat.setInputPaths(conf, testFilePath.toString());
+
+    GenericUDF udf = new GenericUDFOPEqualOrLessThan();
+    List<ExprNodeDesc> childExpr = Lists.newArrayList();
+    ExprNodeColumnDesc col = new ExprNodeColumnDesc(Long.class, "userid", "T", false);
+    ExprNodeConstantDesc con = new ExprNodeConstantDesc(100);
+    childExpr.add(col);
+    childExpr.add(con);
+    ExprNodeGenericFuncDesc en = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);
+    String sargStr = Utilities.serializeExpression(en);
+    conf.set("hive.io.filter.expr.serialized", sargStr);
+    InputSplit[] splits = in.getSplits(conf, 1);
+    assertEquals(5, splits.length);
+
+    con = new ExprNodeConstantDesc(1);
+    childExpr.set(1, con);
+    en = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);
+    sargStr = Utilities.serializeExpression(en);
+    conf.set("hive.io.filter.expr.serialized", sargStr);
+    splits = in.getSplits(conf, 1);
+    assertEquals(0, splits.length);
+
+    con = new ExprNodeConstantDesc(2);
+    childExpr.set(1, con);
+    en = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);
+    sargStr = Utilities.serializeExpression(en);
+    conf.set("hive.io.filter.expr.serialized", sargStr);
+    splits = in.getSplits(conf, 1);
+    assertEquals(1, splits.length);
+
+    con = new ExprNodeConstantDesc(5);
+    childExpr.set(1, con);
+    en = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);
+    sargStr = Utilities.serializeExpression(en);
+    conf.set("hive.io.filter.expr.serialized", sargStr);
+    splits = in.getSplits(conf, 1);
+    assertEquals(2, splits.length);
+
+    con = new ExprNodeConstantDesc(13);
+    childExpr.set(1, con);
+    en = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);
+    sargStr = Utilities.serializeExpression(en);
+    conf.set("hive.io.filter.expr.serialized", sargStr);
+    splits = in.getSplits(conf, 1);
+    assertEquals(3, splits.length);
+
+    con = new ExprNodeConstantDesc(29);
+    childExpr.set(1, con);
+    en = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);
+    sargStr = Utilities.serializeExpression(en);
+    conf.set("hive.io.filter.expr.serialized", sargStr);
+    splits = in.getSplits(conf, 1);
+    assertEquals(4, splits.length);
+
+    con = new ExprNodeConstantDesc(70);
+    childExpr.set(1, con);
+    en = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);
+    sargStr = Utilities.serializeExpression(en);
+    conf.set("hive.io.filter.expr.serialized", sargStr);
+    splits = in.getSplits(conf, 1);
+    assertEquals(5, splits.length);
+  }
+
+  @Test
+  public void testSplitEliminationLargeMaxSplit() throws Exception {
+    ObjectInspector inspector;
+    synchronized (TestOrcFile.class) {
+      inspector = ObjectInspectorFactory
+          .getReflectionObjectInspector(AllTypesRow.class,
+              ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
+    }
+    Writer writer = OrcFile.createWriter(fs, testFilePath, conf, inspector,
+        100000, CompressionKind.NONE, 10000, 10000);
+    writeData(writer);
+    writer.close();
conf.set("mapred.min.split.size", "1000"); + conf.set("mapred.max.split.size", "150000"); + InputFormat in = new OrcInputFormat(); + FileInputFormat.setInputPaths(conf, testFilePath.toString()); + + GenericUDF udf = new GenericUDFOPEqualOrLessThan(); + List childExpr = Lists.newArrayList(); + ExprNodeColumnDesc col = new ExprNodeColumnDesc(Long.class, "userid", "T", false); + ExprNodeConstantDesc con = new ExprNodeConstantDesc(100); + childExpr.add(col); + childExpr.add(con); + ExprNodeGenericFuncDesc en = new ExprNodeGenericFuncDesc(inspector, udf, childExpr); + String sargStr = Utilities.serializeExpression(en); + conf.set("hive.io.filter.expr.serialized", sargStr); + InputSplit[] splits = in.getSplits(conf, 1); + assertEquals(2, splits.length); + + con = new ExprNodeConstantDesc(0); + childExpr.set(1, con); + en = new ExprNodeGenericFuncDesc(inspector, udf, childExpr); + sargStr = Utilities.serializeExpression(en); + conf.set("hive.io.filter.expr.serialized", sargStr); + splits = in.getSplits(conf, 1); + // no stripes satisfies the condition + assertEquals(0, splits.length); + + con = new ExprNodeConstantDesc(2); + childExpr.set(1, con); + en = new ExprNodeGenericFuncDesc(inspector, udf, childExpr); + sargStr = Utilities.serializeExpression(en); + conf.set("hive.io.filter.expr.serialized", sargStr); + splits = in.getSplits(conf, 1); + // only first stripe will satisfy condition and hence single split + assertEquals(1, splits.length); + + con = new ExprNodeConstantDesc(5); + childExpr.set(1, con); + en = new ExprNodeGenericFuncDesc(inspector, udf, childExpr); + sargStr = Utilities.serializeExpression(en); + conf.set("hive.io.filter.expr.serialized", sargStr); + splits = in.getSplits(conf, 1); + // first stripe will satisfy the predicate and will be a single split, last stripe will be a + // separate split + assertEquals(2, splits.length); + + con = new ExprNodeConstantDesc(13); + childExpr.set(1, con); + en = new ExprNodeGenericFuncDesc(inspector, udf, childExpr); + sargStr = Utilities.serializeExpression(en); + conf.set("hive.io.filter.expr.serialized", sargStr); + splits = in.getSplits(conf, 1); + // first 2 stripes will satisfy the predicate and merged to single split, last stripe will be a + // separate split + assertEquals(2, splits.length); + + con = new ExprNodeConstantDesc(29); + childExpr.set(1, con); + en = new ExprNodeGenericFuncDesc(inspector, udf, childExpr); + sargStr = Utilities.serializeExpression(en); + conf.set("hive.io.filter.expr.serialized", sargStr); + splits = in.getSplits(conf, 1); + // first 3 stripes will satisfy the predicate and merged to single split, last stripe will be a + // separate split + assertEquals(2, splits.length); + + con = new ExprNodeConstantDesc(70); + childExpr.set(1, con); + en = new ExprNodeGenericFuncDesc(inspector, udf, childExpr); + sargStr = Utilities.serializeExpression(en); + conf.set("hive.io.filter.expr.serialized", sargStr); + splits = in.getSplits(conf, 1); + // first 2 stripes will satisfy the predicate and merged to single split, last two stripe will + // be a separate split + assertEquals(2, splits.length); + } + + + @Test + public void testSplitEliminationComplexExpr() throws Exception { + ObjectInspector inspector; + synchronized (TestOrcFile.class) { + inspector = ObjectInspectorFactory + .getReflectionObjectInspector(AllTypesRow.class, + ObjectInspectorFactory.ObjectInspectorOptions.JAVA); + } + Writer writer = OrcFile.createWriter(fs, testFilePath, conf, inspector, + 100000, CompressionKind.NONE, 10000, 10000); + 
+    writeData(writer);
+    writer.close();
+    conf.set("mapred.min.split.size", "1000");
+    conf.set("mapred.max.split.size", "150000");
+    InputFormat<?, ?> in = new OrcInputFormat();
+    FileInputFormat.setInputPaths(conf, testFilePath.toString());
+
+    // predicate expression: userid <= 100 and subtype <= 1000.0
+    GenericUDF udf = new GenericUDFOPEqualOrLessThan();
+    List<ExprNodeDesc> childExpr = Lists.newArrayList();
+    ExprNodeColumnDesc col = new ExprNodeColumnDesc(Long.class, "userid", "T", false);
+    ExprNodeConstantDesc con = new ExprNodeConstantDesc(100);
+    childExpr.add(col);
+    childExpr.add(con);
+    ExprNodeGenericFuncDesc en = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);
+
+    GenericUDF udf1 = new GenericUDFOPEqualOrLessThan();
+    List<ExprNodeDesc> childExpr1 = Lists.newArrayList();
+    ExprNodeColumnDesc col1 = new ExprNodeColumnDesc(Double.class, "subtype", "T", false);
+    ExprNodeConstantDesc con1 = new ExprNodeConstantDesc(1000.0);
+    childExpr1.add(col1);
+    childExpr1.add(con1);
+    ExprNodeGenericFuncDesc en1 = new ExprNodeGenericFuncDesc(inspector, udf1, childExpr1);
+
+    GenericUDF udf2 = new GenericUDFOPAnd();
+    List<ExprNodeDesc> childExpr2 = Lists.newArrayList();
+    childExpr2.add(en);
+    childExpr2.add(en1);
+    ExprNodeGenericFuncDesc en2 = new ExprNodeGenericFuncDesc(inspector, udf2, childExpr2);
+
+    String sargStr = Utilities.serializeExpression(en2);
+    conf.set("hive.io.filter.expr.serialized", sargStr);
+    InputSplit[] splits = in.getSplits(conf, 1);
+    assertEquals(2, splits.length);
+
+    con = new ExprNodeConstantDesc(2);
+    childExpr.set(1, con);
+    en = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);
+
+    con1 = new ExprNodeConstantDesc(0.0);
+    childExpr1.set(1, con1);
+    en1 = new ExprNodeGenericFuncDesc(inspector, udf1, childExpr1);
+
+    childExpr2.set(0, en);
+    childExpr2.set(1, en1);
+    en2 = new ExprNodeGenericFuncDesc(inspector, udf2, childExpr2);
+
+    sargStr = Utilities.serializeExpression(en2);
+    conf.set("hive.io.filter.expr.serialized", sargStr);
+    splits = in.getSplits(conf, 1);
+    // no stripe will satisfy the predicate
+    assertEquals(0, splits.length);
+
+    con = new ExprNodeConstantDesc(2);
+    childExpr.set(1, con);
+    en = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);
+
+    con1 = new ExprNodeConstantDesc(1.0);
+    childExpr1.set(1, con1);
+    en1 = new ExprNodeGenericFuncDesc(inspector, udf1, childExpr1);
+
+    childExpr2.set(0, en);
+    childExpr2.set(1, en1);
+    en2 = new ExprNodeGenericFuncDesc(inspector, udf2, childExpr2);
+
+    sargStr = Utilities.serializeExpression(en2);
+    conf.set("hive.io.filter.expr.serialized", sargStr);
+    splits = in.getSplits(conf, 1);
+    // only the first stripe will satisfy the condition and hence a single split
+    assertEquals(1, splits.length);
+
+    udf = new GenericUDFOPEqual();
+    con = new ExprNodeConstantDesc(13);
+    childExpr.set(1, con);
+    en = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);
+
+    con1 = new ExprNodeConstantDesc(80.0);
+    childExpr1.set(1, con1);
+    en1 = new ExprNodeGenericFuncDesc(inspector, udf1, childExpr1);
+
+    childExpr2.set(0, en);
+    childExpr2.set(1, en1);
+    en2 = new ExprNodeGenericFuncDesc(inspector, udf2, childExpr2);
+
+    sargStr = Utilities.serializeExpression(en2);
+    conf.set("hive.io.filter.expr.serialized", sargStr);
+    splits = in.getSplits(conf, 1);
+    // first two stripes will satisfy condition and hence single split
+    assertEquals(2, splits.length);
+
+    udf = new GenericUDFOPEqual();
+    con = new ExprNodeConstantDesc(13);
+    childExpr.set(1, con);
+    en = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);
+
+    udf1 = new GenericUDFOPEqual();
+    con1 = new ExprNodeConstantDesc(80.0);
+    childExpr1.set(1, con1);
+    en1 = new ExprNodeGenericFuncDesc(inspector, udf1, childExpr1);
+
+    childExpr2.set(0, en);
+    childExpr2.set(1, en1);
+    en2 = new ExprNodeGenericFuncDesc(inspector, udf2, childExpr2);
+
+    sargStr = Utilities.serializeExpression(en2);
+    conf.set("hive.io.filter.expr.serialized", sargStr);
+    splits = in.getSplits(conf, 1);
+    // only the second stripe will satisfy the condition and hence a single split
+    assertEquals(1, splits.length);
+  }
+
+  private void writeData(Writer writer) throws IOException {
+    for (int i = 0; i < 25000; i++) {
+      if (i == 0) {
+        writer.addRow(new AllTypesRow(2L, "foo", 0.8, HiveDecimal.create("1.2"), new Timestamp(0)));
+      } else if (i == 5000) {
+        writer.addRow(new AllTypesRow(13L, "bar", 80.0, HiveDecimal.create("2.2"), new Timestamp(
+            5000)));
+      } else if (i == 10000) {
+        writer.addRow(new AllTypesRow(29L, "cat", 8.0, HiveDecimal.create("3.3"), new Timestamp(
+            10000)));
+      } else if (i == 15000) {
+        writer.addRow(new AllTypesRow(70L, "dog", 1.8, HiveDecimal.create("4.4"), new Timestamp(
+            15000)));
+      } else if (i == 20000) {
+        writer.addRow(new AllTypesRow(5L, "eat", 0.8, HiveDecimal.create("5.5"), new Timestamp(
+            20000)));
+      } else {
+        writer.addRow(new AllTypesRow(100L, "zebra", 8.0, HiveDecimal.create("0.0"), new Timestamp(
+            250000)));
+      }
+    }
+  }
+}
diff --git ql/src/test/queries/clientpositive/orc_split_elimination.q ql/src/test/queries/clientpositive/orc_split_elimination.q
new file mode 100644
index 0000000..85f3954
--- /dev/null
+++ ql/src/test/queries/clientpositive/orc_split_elimination.q
@@ -0,0 +1,93 @@
+create table orc_split_elim (userid bigint, string1 string, subtype double, decimal1 decimal, ts timestamp) stored as orc;
+
+load data local inpath '../data/files/orc_split_elim.orc' into table orc_split_elim;
+
+SET hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
+SET mapred.min.split.size=1000;
+SET mapred.max.split.size=5000;
+SET hive.optimize.index.filter=false;
+
+-- The above table will have 5 splits with the following stats
+-- Stripe 1:
+-- Column 0: count: 5000
+-- Column 1: count: 5000 min: 2 max: 100 sum: 499902
+-- Column 2: count: 5000 min: foo max: zebra sum: 24998
+-- Column 3: count: 5000 min: 0.8 max: 8.0 sum: 39992.8
+-- Column 4: count: 5000 min: 0 max: 1.2 sum: 1.2
+-- Column 5: count: 5000
+-- Stripe 2:
+-- Column 0: count: 5000
+-- Column 1: count: 5000 min: 13 max: 100 sum: 499913
+-- Column 2: count: 5000 min: bar max: zebra sum: 24998
+-- Column 3: count: 5000 min: 8.0 max: 80.0 sum: 40072.0
+-- Column 4: count: 5000 min: 0 max: 2.2 sum: 2.2
+-- Column 5: count: 5000
+-- Stripe 3:
+-- Column 0: count: 5000
+-- Column 1: count: 5000 min: 29 max: 100 sum: 499929
+-- Column 2: count: 5000 min: cat max: zebra sum: 24998
+-- Column 3: count: 5000 min: 8.0 max: 8.0 sum: 40000.0
+-- Column 4: count: 5000 min: 0 max: 3.3 sum: 3.3
+-- Column 5: count: 5000
+-- Stripe 4:
+-- Column 0: count: 5000
+-- Column 1: count: 5000 min: 70 max: 100 sum: 499970
+-- Column 2: count: 5000 min: dog max: zebra sum: 24998
+-- Column 3: count: 5000 min: 1.8 max: 8.0 sum: 39993.8
+-- Column 4: count: 5000 min: 0 max: 4.4 sum: 4.4
+-- Column 5: count: 5000
+-- Stripe 5:
+-- Column 0: count: 5000
+-- Column 1: count: 5000 min: 5 max: 100 sum: 499905
+-- Column 2: count: 5000 min: eat max: zebra sum: 24998
+-- Column 3: count: 5000 min: 0.8 max: 8.0 sum: 39992.8
+-- Column 4: count: 5000 min: 0 max: 5.5 sum: 5.5
+-- Column 5: count: 5000
+
+-- 5 mappers
+select userid,string1,subtype,decimal1,ts from orc_split_elim where userid<=0;
+
+SET hive.optimize.index.filter=true;
+-- 0 mapper
+select userid,string1,subtype,decimal1,ts from orc_split_elim where userid<=0;
+SET hive.optimize.index.filter=false;
+
+-- 5 mappers
+select userid,string1,subtype,decimal1,ts from orc_split_elim where userid<=2 order by userid;
+
+SET hive.optimize.index.filter=true;
+-- 1 mapper
+select userid,string1,subtype,decimal1,ts from orc_split_elim where userid<=2 order by userid;
+SET hive.optimize.index.filter=false;
+
+-- 5 mappers
+select userid,string1,subtype,decimal1,ts from orc_split_elim where userid<=5 order by userid;
+
+SET hive.optimize.index.filter=true;
+-- 2 mappers
+select userid,string1,subtype,decimal1,ts from orc_split_elim where userid<=5 order by userid;
+SET hive.optimize.index.filter=false;
+
+-- 5 mappers
+select userid,string1,subtype,decimal1,ts from orc_split_elim where userid<=13 order by userid;
+
+SET hive.optimize.index.filter=true;
+-- 3 mappers
+select userid,string1,subtype,decimal1,ts from orc_split_elim where userid<=13 order by userid;
+SET hive.optimize.index.filter=false;
+
+-- 5 mappers
+select userid,string1,subtype,decimal1,ts from orc_split_elim where userid<=29 order by userid;
+
+SET hive.optimize.index.filter=true;
+-- 4 mappers
+select userid,string1,subtype,decimal1,ts from orc_split_elim where userid<=29 order by userid;
+SET hive.optimize.index.filter=false;
+
+-- 5 mappers
+select userid,string1,subtype,decimal1,ts from orc_split_elim where userid<=70 order by userid;
+
+SET hive.optimize.index.filter=true;
+-- 5 mappers
+select userid,string1,subtype,decimal1,ts from orc_split_elim where userid<=70 order by userid;
+SET hive.optimize.index.filter=false;
diff --git ql/src/test/results/clientpositive/orc_split_elimination.q.out ql/src/test/results/clientpositive/orc_split_elimination.q.out
new file mode 100644
index 0000000..dde15b1
--- /dev/null
+++ ql/src/test/results/clientpositive/orc_split_elimination.q.out
@@ -0,0 +1,235 @@
+PREHOOK: query: create table orc_split_elim (userid bigint, string1 string, subtype double, decimal1 decimal, ts timestamp) stored as orc
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: create table orc_split_elim (userid bigint, string1 string, subtype double, decimal1 decimal, ts timestamp) stored as orc
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@orc_split_elim
+PREHOOK: query: load data local inpath '../data/files/orc_split_elim.orc' into table orc_split_elim
+PREHOOK: type: LOAD
+PREHOOK: Output: default@orc_split_elim
+POSTHOOK: query: load data local inpath '../data/files/orc_split_elim.orc' into table orc_split_elim
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@orc_split_elim
+PREHOOK: query: -- The above table will have 5 splits with the following stats
+-- Stripe 1:
+-- Column 0: count: 5000
+-- Column 1: count: 5000 min: 2 max: 100 sum: 499902
+-- Column 2: count: 5000 min: foo max: zebra sum: 24998
+-- Column 3: count: 5000 min: 0.8 max: 8.0 sum: 39992.8
+-- Column 4: count: 5000 min: 0 max: 1.2 sum: 1.2
+-- Column 5: count: 5000
+-- Stripe 2:
+-- Column 0: count: 5000
+-- Column 1: count: 5000 min: 13 max: 100 sum: 499913
+-- Column 2: count: 5000 min: bar max: zebra sum: 24998
+-- Column 3: count: 5000 min: 8.0 max: 80.0 sum: 40072.0
+-- Column 4: count: 5000 min: 0 max: 2.2 sum: 2.2
+-- Column 5: count: 5000
+-- Stripe 3:
+-- Column 0: count: 5000
+-- Column 1: count: 5000 min: 29 max: 100 sum: 499929
+-- Column 2: count: 5000 min: cat max: zebra sum: 24998
+-- Column 3: count: 5000 min: 8.0 max: 8.0 sum: 40000.0
+-- Column 4: count: 5000 min: 0 max: 3.3 sum: 3.3
+-- Column 5: count: 5000
+-- Stripe 4:
+-- Column 0: count: 5000
+-- Column 1: count: 5000 min: 70 max: 100 sum: 499970
+-- Column 2: count: 5000 min: dog max: zebra sum: 24998
+-- Column 3: count: 5000 min: 1.8 max: 8.0 sum: 39993.8
+-- Column 4: count: 5000 min: 0 max: 4.4 sum: 4.4
+-- Column 5: count: 5000
+-- Stripe 5:
+-- Column 0: count: 5000
+-- Column 1: count: 5000 min: 5 max: 100 sum: 499905
+-- Column 2: count: 5000 min: eat max: zebra sum: 24998
+-- Column 3: count: 5000 min: 0.8 max: 8.0 sum: 39992.8
+-- Column 4: count: 5000 min: 0 max: 5.5 sum: 5.5
+-- Column 5: count: 5000
+
+-- 5 mappers
+select userid,string1,subtype,decimal1,ts from orc_split_elim where userid<=0
+PREHOOK: type: QUERY
+PREHOOK: Input: default@orc_split_elim
+#### A masked pattern was here ####
+POSTHOOK: query: -- The above table will have 5 splits with the following stats
+-- Stripe 1:
+-- Column 0: count: 5000
+-- Column 1: count: 5000 min: 2 max: 100 sum: 499902
+-- Column 2: count: 5000 min: foo max: zebra sum: 24998
+-- Column 3: count: 5000 min: 0.8 max: 8.0 sum: 39992.8
+-- Column 4: count: 5000 min: 0 max: 1.2 sum: 1.2
+-- Column 5: count: 5000
+-- Stripe 2:
+-- Column 0: count: 5000
+-- Column 1: count: 5000 min: 13 max: 100 sum: 499913
+-- Column 2: count: 5000 min: bar max: zebra sum: 24998
+-- Column 3: count: 5000 min: 8.0 max: 80.0 sum: 40072.0
+-- Column 4: count: 5000 min: 0 max: 2.2 sum: 2.2
+-- Column 5: count: 5000
+-- Stripe 3:
+-- Column 0: count: 5000
+-- Column 1: count: 5000 min: 29 max: 100 sum: 499929
+-- Column 2: count: 5000 min: cat max: zebra sum: 24998
+-- Column 3: count: 5000 min: 8.0 max: 8.0 sum: 40000.0
+-- Column 4: count: 5000 min: 0 max: 3.3 sum: 3.3
+-- Column 5: count: 5000
+-- Stripe 4:
+-- Column 0: count: 5000
+-- Column 1: count: 5000 min: 70 max: 100 sum: 499970
+-- Column 2: count: 5000 min: dog max: zebra sum: 24998
+-- Column 3: count: 5000 min: 1.8 max: 8.0 sum: 39993.8
+-- Column 4: count: 5000 min: 0 max: 4.4 sum: 4.4
+-- Column 5: count: 5000
+-- Stripe 5:
+-- Column 0: count: 5000
+-- Column 1: count: 5000 min: 5 max: 100 sum: 499905
+-- Column 2: count: 5000 min: eat max: zebra sum: 24998
+-- Column 3: count: 5000 min: 0.8 max: 8.0 sum: 39992.8
+-- Column 4: count: 5000 min: 0 max: 5.5 sum: 5.5
+-- Column 5: count: 5000
+
+-- 5 mappers
+select userid,string1,subtype,decimal1,ts from orc_split_elim where userid<=0
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@orc_split_elim
+#### A masked pattern was here ####
+PREHOOK: query: -- 0 mapper
+select userid,string1,subtype,decimal1,ts from orc_split_elim where userid<=0
+PREHOOK: type: QUERY
+PREHOOK: Input: default@orc_split_elim
+#### A masked pattern was here ####
+POSTHOOK: query: -- 0 mapper
+select userid,string1,subtype,decimal1,ts from orc_split_elim where userid<=0
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@orc_split_elim
+#### A masked pattern was here ####
+PREHOOK: query: -- 5 mappers
+select userid,string1,subtype,decimal1,ts from orc_split_elim where userid<=2 order by userid
+PREHOOK: type: QUERY
+PREHOOK: Input: default@orc_split_elim
+#### A masked pattern was here ####
+POSTHOOK: query: -- 5 mappers
+select userid,string1,subtype,decimal1,ts from orc_split_elim where userid<=2 order by userid
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@orc_split_elim
+#### A masked pattern was here ####
+2 foo 0.8 1.2 1969-12-31 16:00:00
+PREHOOK: query: -- 1 mapper
+select userid,string1,subtype,decimal1,ts from orc_split_elim where userid<=2 order by userid
+PREHOOK: type: QUERY
+PREHOOK: Input: default@orc_split_elim
+#### A masked pattern was here ####
+POSTHOOK: query: -- 1 mapper
+select userid,string1,subtype,decimal1,ts from orc_split_elim where userid<=2 order by userid
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@orc_split_elim
+#### A masked pattern was here ####
+2 foo 0.8 1.2 1969-12-31 16:00:00
+PREHOOK: query: -- 5 mappers
+select userid,string1,subtype,decimal1,ts from orc_split_elim where userid<=5 order by userid
+PREHOOK: type: QUERY
+PREHOOK: Input: default@orc_split_elim
+#### A masked pattern was here ####
+POSTHOOK: query: -- 5 mappers
+select userid,string1,subtype,decimal1,ts from orc_split_elim where userid<=5 order by userid
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@orc_split_elim
+#### A masked pattern was here ####
+2 foo 0.8 1.2 1969-12-31 16:00:00
+5 eat 0.8 5.5 1969-12-31 16:00:20
+PREHOOK: query: -- 2 mappers
+select userid,string1,subtype,decimal1,ts from orc_split_elim where userid<=5 order by userid
+PREHOOK: type: QUERY
+PREHOOK: Input: default@orc_split_elim
+#### A masked pattern was here ####
+POSTHOOK: query: -- 2 mappers
+select userid,string1,subtype,decimal1,ts from orc_split_elim where userid<=5 order by userid
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@orc_split_elim
+#### A masked pattern was here ####
+2 foo 0.8 1.2 1969-12-31 16:00:00
+5 eat 0.8 5.5 1969-12-31 16:00:20
+PREHOOK: query: -- 5 mappers
+select userid,string1,subtype,decimal1,ts from orc_split_elim where userid<=13 order by userid
+PREHOOK: type: QUERY
+PREHOOK: Input: default@orc_split_elim
+#### A masked pattern was here ####
+POSTHOOK: query: -- 5 mappers
+select userid,string1,subtype,decimal1,ts from orc_split_elim where userid<=13 order by userid
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@orc_split_elim
+#### A masked pattern was here ####
+2 foo 0.8 1.2 1969-12-31 16:00:00
+5 eat 0.8 5.5 1969-12-31 16:00:20
+13 bar 80.0 2.2 1969-12-31 16:00:05
+PREHOOK: query: -- 3 mappers
+select userid,string1,subtype,decimal1,ts from orc_split_elim where userid<=13 order by userid
+PREHOOK: type: QUERY
+PREHOOK: Input: default@orc_split_elim
+#### A masked pattern was here ####
+POSTHOOK: query: -- 3 mappers
+select userid,string1,subtype,decimal1,ts from orc_split_elim where userid<=13 order by userid
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@orc_split_elim
+#### A masked pattern was here ####
+2 foo 0.8 1.2 1969-12-31 16:00:00
+5 eat 0.8 5.5 1969-12-31 16:00:20
+13 bar 80.0 2.2 1969-12-31 16:00:05
+PREHOOK: query: -- 5 mappers
+select userid,string1,subtype,decimal1,ts from orc_split_elim where userid<=29 order by userid
+PREHOOK: type: QUERY
+PREHOOK: Input: default@orc_split_elim
+#### A masked pattern was here ####
+POSTHOOK: query: -- 5 mappers
+select userid,string1,subtype,decimal1,ts from orc_split_elim where userid<=29 order by userid
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@orc_split_elim
+#### A masked pattern was here ####
+2 foo 0.8 1.2 1969-12-31 16:00:00
+5 eat 0.8 5.5 1969-12-31 16:00:20
+13 bar 80.0 2.2 1969-12-31 16:00:05
+29 cat 8.0 3.3 1969-12-31 16:00:10
+PREHOOK: query: -- 4 mappers
+select userid,string1,subtype,decimal1,ts from orc_split_elim where userid<=29 order by userid
+PREHOOK: type: QUERY
+PREHOOK: Input: default@orc_split_elim
+#### A masked pattern was here ####
+POSTHOOK: query: -- 4 mappers
+select userid,string1,subtype,decimal1,ts from orc_split_elim where userid<=29 order by userid
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@orc_split_elim
+#### A masked pattern was here ####
+2 foo 0.8 1.2 1969-12-31 16:00:00
+5 eat 0.8 5.5 1969-12-31 16:00:20
+13 bar 80.0 2.2 1969-12-31 16:00:05
+29 cat 8.0 3.3 1969-12-31 16:00:10
+PREHOOK: query: -- 5 mappers
+select userid,string1,subtype,decimal1,ts from orc_split_elim where userid<=70 order by userid
+PREHOOK: type: QUERY
+PREHOOK: Input: default@orc_split_elim
+#### A masked pattern was here ####
+POSTHOOK: query: -- 5 mappers
+select userid,string1,subtype,decimal1,ts from orc_split_elim where userid<=70 order by userid
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@orc_split_elim
+#### A masked pattern was here ####
+2 foo 0.8 1.2 1969-12-31 16:00:00
+5 eat 0.8 5.5 1969-12-31 16:00:20
+13 bar 80.0 2.2 1969-12-31 16:00:05
+29 cat 8.0 3.3 1969-12-31 16:00:10
+70 dog 1.8 4.4 1969-12-31 16:00:15
+PREHOOK: query: -- 5 mappers
+select userid,string1,subtype,decimal1,ts from orc_split_elim where userid<=70 order by userid
+PREHOOK: type: QUERY
+PREHOOK: Input: default@orc_split_elim
+#### A masked pattern was here ####
+POSTHOOK: query: -- 5 mappers
+select userid,string1,subtype,decimal1,ts from orc_split_elim where userid<=70 order by userid
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@orc_split_elim
+#### A masked pattern was here ####
+2 foo 0.8 1.2 1969-12-31 16:00:00
+5 eat 0.8 5.5 1969-12-31 16:00:20
+13 bar 80.0 2.2 1969-12-31 16:00:05
+29 cat 8.0 3.3 1969-12-31 16:00:10
+70 dog 1.8 4.4 1969-12-31 16:00:15
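
Reviewer note (not part of the patch): the split counts asserted in TestOrcSplitElimination follow directly from the per-stripe min/max of the userid column and the range check done by isStripeSatisfyPredicate / evaluatePredicateRange. The sketch below is a minimal standalone illustration of that pruning rule for a "userid <= K" predicate, using the stripe statistics listed in orc_split_elimination.q; the class and method names are invented for the example and are not Hive APIs.

import java.util.Arrays;

// Illustrative only: mimics the stripe-level min/max pruning this patch adds for "userid <= K".
public class StripePruningSketch {

  // {min, max} of the userid column per stripe, in file order, copied from orc_split_elimination.q.
  private static final long[][] USERID_STATS = {
      {2, 100}, {13, 100}, {29, 100}, {70, 100}, {5, 100}};

  // A stripe can be eliminated when its whole [min, max] range lies above K,
  // i.e. even the smallest userid in the stripe fails "userid <= K".
  static boolean canEliminate(long min, long max, long k) {
    return min > k;
  }

  public static void main(String[] args) {
    for (long k : new long[] {0, 2, 5, 13, 29, 70}) {
      long kept = Arrays.stream(USERID_STATS)
          .filter(s -> !canEliminate(s[0], s[1], k))
          .count();
      // Prints 0, 1, 2, 3, 4, 5 stripes kept, matching testSplitEliminationSmallMaxSplit.
      System.out.println("userid <= " + k + " -> " + kept + " stripe(s) kept");
    }
  }
}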