commit f5eb77d6d17d14878d794c9ec54ed2628a7ac2f9 Author: Owen O'Malley Date: Sat Mar 29 00:20:50 2014 -0700 HIVE-6818. Fix array out of bounds with ORC and predicate pushdown. diff --git ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java index 0ccc3ad..62a5bcd 100644 --- ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java +++ ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java @@ -106,6 +106,7 @@ SHIMS.getHadoopConfNames().get("MAPREDMINSPLITSIZE"); static final String MAX_SPLIT_SIZE = SHIMS.getHadoopConfNames().get("MAPREDMAXSPLITSIZE"); + static final String SARG_PUSHDOWN = "sarg.pushdown"; private static final long DEFAULT_MIN_SPLIT_SIZE = 16 * 1024 * 1024; private static final long DEFAULT_MAX_SPLIT_SIZE = 256 * 1024 * 1024; @@ -268,21 +269,28 @@ static void setSearchArgument(Reader.Options options, boolean isOriginal) { int rootColumn = getRootColumn(isOriginal); String serializedPushdown = conf.get(TableScanDesc.FILTER_EXPR_CONF_STR); + String sargPushdown = conf.get(SARG_PUSHDOWN); String columnNamesString = conf.get(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR); - if (serializedPushdown == null || columnNamesString == null) { + if ((sargPushdown == null && serializedPushdown == null) + || columnNamesString == null) { LOG.debug("No ORC pushdown predicate"); options.searchArgument(null, null); } else { - SearchArgument sarg = SearchArgument.FACTORY.create - (Utilities.deserializeExpression(serializedPushdown)); + SearchArgument sarg; + if (serializedPushdown != null) { + sarg = SearchArgument.FACTORY.create + (Utilities.deserializeExpression(serializedPushdown)); + } else { + sarg = SearchArgument.FACTORY.create(sargPushdown); + } LOG.info("ORC pushdown predicate: " + sarg); String[] neededColumnNames = columnNamesString.split(","); String[] columnNames = new String[types.size() - rootColumn]; boolean[] includedColumns = options.getInclude(); int i = 0; 
for(int columnId: types.get(rootColumn).getSubtypesList()) { - if (includedColumns == null || includedColumns[columnId]) { + if (includedColumns == null || includedColumns[columnId - rootColumn]) { // this is guaranteed to be positive because types only have children // ids greater than their own id. columnNames[columnId - rootColumn] = neededColumnNames[i++]; diff --git ql/src/java/org/apache/hadoop/hive/ql/io/sarg/SearchArgument.java ql/src/java/org/apache/hadoop/hive/ql/io/sarg/SearchArgument.java index 0c47f9f..5a73611 100644 --- ql/src/java/org/apache/hadoop/hive/ql/io/sarg/SearchArgument.java +++ ql/src/java/org/apache/hadoop/hive/ql/io/sarg/SearchArgument.java @@ -166,6 +166,12 @@ public boolean isNeeded() { public TruthValue evaluate(TruthValue[] leaves); /** + * Serialize the SARG as a Kryo object and return the base64 string. + * @return the serialized SARG + */ + public String toKryo(); + + /** * A factory for creating SearchArguments. Java doesn't allow static methods * in interfaces. 
*DOH* */ @@ -177,6 +183,10 @@ public SearchArgument create(ExprNodeGenericFuncDesc expression) { public Builder newBuilder() { return SearchArgumentImpl.newBuilder(); } + + public SearchArgument create(String kryo) { + return SearchArgumentImpl.fromKryo(kryo); + } } /** diff --git ql/src/java/org/apache/hadoop/hive/ql/io/sarg/SearchArgumentImpl.java ql/src/java/org/apache/hadoop/hive/ql/io/sarg/SearchArgumentImpl.java index 3bdb059..2c53f65 100644 --- ql/src/java/org/apache/hadoop/hive/ql/io/sarg/SearchArgumentImpl.java +++ ql/src/java/org/apache/hadoop/hive/ql/io/sarg/SearchArgumentImpl.java @@ -26,6 +26,10 @@ import java.util.List; import java.util.Map; +import com.esotericsoftware.kryo.Kryo; +import com.esotericsoftware.kryo.io.Input; +import com.esotericsoftware.kryo.io.Output; +import org.apache.commons.codec.binary.Base64; import org.apache.commons.lang.StringUtils; import org.apache.hadoop.hive.common.type.HiveChar; import org.apache.hadoop.hive.common.type.HiveDecimal; @@ -65,6 +69,14 @@ private final Object literal; private final List literalList; + PredicateLeafImpl() { + operator = null; + type = null; + columnName = null; + literal = null; + literalList = null; + } + PredicateLeafImpl(Operator operator, Type type, String columnName, @@ -166,6 +178,13 @@ public int hashCode() { private final int leaf; private final TruthValue constant; + ExpressionTree() { + operator = null; + children = null; + leaf = 0; + constant = null; + } + ExpressionTree(Operator op, ExpressionTree... 
kids) { operator = op; children = new ArrayList(); @@ -818,6 +837,11 @@ ExpressionTree expression(ExpressionTree expr, } } + SearchArgumentImpl() { + leaves = null; + expression = null; + } + SearchArgumentImpl(ExpressionTree expression, List leaves) { this.expression = expression; this.leaves = leaves; @@ -852,6 +876,18 @@ public String toString() { return buffer.toString(); } + public String toKryo() { + Output out = new Output(4 * 1024, 10 * 1024 * 1024); + new Kryo().writeObject(out, this); + out.close(); + return Base64.encodeBase64String(out.toBytes()); + } + + static SearchArgument fromKryo(String value) { + Input input = new Input(Base64.decodeBase64(value)); + return new Kryo().readObject(input, SearchArgumentImpl.class); + } + private static class BuilderImpl implements Builder { private final Deque currentTree = new ArrayDeque(); diff --git ql/src/java/org/apache/hadoop/hive/ql/plan/ExprNodeGenericFuncDesc.java ql/src/java/org/apache/hadoop/hive/ql/plan/ExprNodeGenericFuncDesc.java index 76ddb1e..f3f3619 100644 --- ql/src/java/org/apache/hadoop/hive/ql/plan/ExprNodeGenericFuncDesc.java +++ ql/src/java/org/apache/hadoop/hive/ql/plan/ExprNodeGenericFuncDesc.java @@ -71,7 +71,7 @@ //Is this an expression that should perform a comparison for sorted searches private boolean isSortedExpr; - public ExprNodeGenericFuncDesc() { + public ExprNodeGenericFuncDesc() {; } /* If the function has an explicit name like func(args) then call a diff --git ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java index f0e1e39..0440af7 100644 --- ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java +++ ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java @@ -58,6 +58,8 @@ import org.apache.hadoop.hive.ql.io.HiveInputFormat; import org.apache.hadoop.hive.ql.io.HiveOutputFormat; import org.apache.hadoop.hive.ql.io.InputFormatChecker; +import 
org.apache.hadoop.hive.ql.io.sarg.PredicateLeaf; +import org.apache.hadoop.hive.ql.io.sarg.SearchArgument; import org.apache.hadoop.hive.ql.plan.MapWork; import org.apache.hadoop.hive.ql.plan.PartitionDesc; import org.apache.hadoop.hive.ql.plan.TableDesc; @@ -1238,4 +1240,49 @@ public void testCombinationInputFormatWithAcid() throws Exception { ioe.getMessage()); } } + + @Test + public void testSetSearchArgument() throws Exception { + Reader.Options options = new Reader.Options(); + List types = new ArrayList(); + OrcProto.Type.Builder builder = OrcProto.Type.newBuilder(); + builder.setKind(OrcProto.Type.Kind.STRUCT) + .addAllFieldNames(Arrays.asList("op", "otid", "bucket", "rowid", "ctid", + "row")) + .addAllSubtypes(Arrays.asList(1,2,3,4,5,6)); + types.add(builder.build()); + builder.clear().setKind(OrcProto.Type.Kind.INT); + types.add(builder.build()); + types.add(builder.build()); + types.add(builder.build()); + types.add(builder.build()); + types.add(builder.build()); + builder.clear().setKind(OrcProto.Type.Kind.STRUCT) + .addAllFieldNames(Arrays.asList("url", "purchase", "cost", "store")) + .addAllSubtypes(Arrays.asList(7, 8, 9, 10)); + types.add(builder.build()); + builder.clear().setKind(OrcProto.Type.Kind.STRING); + types.add(builder.build()); + builder.clear().setKind(OrcProto.Type.Kind.INT); + types.add(builder.build()); + types.add(builder.build()); + types.add(builder.build()); + SearchArgument isNull = SearchArgument.FACTORY.newBuilder() + .startAnd().isNull("cost").end().build(); + conf.set(OrcInputFormat.SARG_PUSHDOWN, isNull.toKryo()); + conf.set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, + "url,cost"); + options.include(new boolean[]{true, true, false, true, false}); + OrcInputFormat.setSearchArgument(options, types, conf, false); + String[] colNames = options.getColumnNames(); + assertEquals(null, colNames[0]); + assertEquals("url", colNames[1]); + assertEquals(null, colNames[2]); + assertEquals("cost", colNames[3]); + assertEquals(null, 
colNames[4]); + SearchArgument arg = options.getSearchArgument(); + List leaves = arg.getLeaves(); + assertEquals("cost", leaves.get(0).getColumnName()); + assertEquals(PredicateLeaf.Operator.IS_NULL, leaves.get(0).getOperator()); + } }