diff --git common/src/java/org/apache/hadoop/hive/conf/HiveConf.java common/src/java/org/apache/hadoop/hive/conf/HiveConf.java index 9cc7987..8fae69c 100644 --- common/src/java/org/apache/hadoop/hive/conf/HiveConf.java +++ common/src/java/org/apache/hadoop/hive/conf/HiveConf.java @@ -1524,7 +1524,7 @@ public void setSparkConfigUpdated(boolean isSparkConfigUpdated) { HIVEOUTERJOINSUPPORTSFILTERS("hive.outerjoin.supports.filters", true, ""), - HIVEFETCHTASKCONVERSION("hive.fetch.task.conversion", "more", new StringSet("none", "minimal", "more"), + HIVEFETCHTASKCONVERSION("hive.fetch.task.conversion", "none", new StringSet("none", "minimal", "more"), "Some select queries can be converted to single FETCH task minimizing latency.\n" + "Currently the query should be single sourced not having any subquery and should not have\n" + "any aggregations or distincts (which incurs RS), lateral views and joins.\n" + @@ -2057,7 +2057,7 @@ public void setSparkConfigUpdated(boolean isSparkConfigUpdated) { "and use it to run queries."), // Vectorization enabled - HIVE_VECTORIZATION_ENABLED("hive.vectorized.execution.enabled", false, + HIVE_VECTORIZATION_ENABLED("hive.vectorized.execution.enabled", true, "This flag should be set to true to enable vectorized mode of query execution.\n" + "The default value is false."), HIVE_VECTORIZATION_REDUCE_ENABLED("hive.vectorized.execution.reduce.enabled", true, @@ -2094,6 +2094,16 @@ public void setSparkConfigUpdated(boolean isSparkConfigUpdated) { HIVE_VECTORIZATION_GROUPBY_FLUSH_PERCENT("hive.vectorized.groupby.flush.percent", (float) 0.1, "Percent of entries in the group by aggregation hash flushed when the memory threshold is exceeded."), + HIVE_VECTORIZATION_USE_VECTORIZED_INPUT_FILE_FORMAT("hive.vectorized.use.vectorized.input.format", true, + "This flag should be set to true to enable vectorizing with vectorized input file format capable SerDe.\n" + + "The default value is true."), + HIVE_VECTORIZATION_USE_VECTOR_DESERIALIZE("hive.vectorized.use.vector.serde.deserialize", true, + "This flag should be set to true to enable vectorizing rows using vector deserialize.\n" + + "The default value is true."), + HIVE_VECTORIZATION_USE_ROW_DESERIALIZE("hive.vectorized.use.row.serde.deserialize", true, + "This flag should be set to true to enable vectorizing using row deserialize.\n" + + "The default value is true."), + HIVE_TYPE_CHECK_ON_INSERT("hive.typecheck.on.insert", true, "This property has been extended to control " + "whether to check, convert, and normalize partition value to conform to its column type in " + "partition operations including but not limited to insert, such as alter, describe etc."), diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/AbstractMapOperator.java ql/src/java/org/apache/hadoop/hive/ql/exec/AbstractMapOperator.java new file mode 100644 index 0000000..1724419 --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/exec/AbstractMapOperator.java @@ -0,0 +1,150 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
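// Note (illustrative, not part of the patch): the HiveConf hunk above adds three flags that
// gate the vectorized read strategies used by the new VectorMapOperator further below
// (vectorized input file format, vector deserialize, row deserialize), and flips
// hive.vectorized.execution.enabled on by default. A minimal sketch of consulting them
// through the ConfVars entries defined above (variable names here are invented):
Configuration conf = new HiveConf();
boolean vectorize =
    HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_VECTORIZATION_ENABLED);
boolean useVectorizedInputFormat =
    HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_VECTORIZATION_USE_VECTORIZED_INPUT_FILE_FORMAT);
boolean useVectorDeserialize =
    HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_VECTORIZATION_USE_VECTOR_DESERIALIZE);
boolean useRowDeserialize =
    HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_VECTORIZATION_USE_ROW_DESERIALIZE);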
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec; + +import java.io.Serializable; +import java.util.HashMap; +import java.util.Map; +import java.util.Map.Entry; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.ql.exec.Operator.State; +import org.apache.hadoop.hive.ql.exec.mr.ExecMapperContext; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.plan.MapWork; +import org.apache.hadoop.hive.serde2.Deserializer; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; +import org.apache.hadoop.io.LongWritable; +import org.apache.hadoop.io.Writable; +import org.apache.hadoop.mapred.Reporter; + + +/** + * Map operator. This triggers overall map side processing. This is a little + * different from regular operators in that it starts off by processing a + * Writable data structure from a Table (instead of a Hive Object). + **/ +@SuppressWarnings("deprecation") +public abstract class AbstractMapOperator extends Operator implements Serializable, Cloneable { + + private static final long serialVersionUID = 1L; + + /** + * Initialization call sequence: + * + * (Operator) setConf(MapWork conf); + * (Operator) initialize(Configuration hconf, ObjectInspector[] inputOIs); + * + * (AbstractMapOperator) setChildren(Configuration hconf) + * + * (Operator) passExecContext(ExecMapperContext execContext) + * (Operator) initializeLocalWork(Configuration hconf) + * + * (AbstractMapOperator) initializeMapOperator(Configuration hconf) + * + * [ (AbstractMapOperator) initializeContexts() ] // exec.tez.MapRecordProcessor only. + * + * (Operator) setReporter(Reporter rep) + * + */ + /** + * Counter. 
+ * + */ + public static enum Counter { + DESERIALIZE_ERRORS, + RECORDS_IN + } + + protected final transient LongWritable deserialize_error_count = new LongWritable(); + protected final transient LongWritable recordCounter = new LongWritable(); + protected transient long numRows = 0; + + private transient final Map normalizedPaths = new HashMap(); + + private Path normalizePath(String onefile, boolean schemaless) { + //creating Path is expensive, so cache the corresponding + //Path object in normalizedPaths + Path path = normalizedPaths.get(onefile); + if (path == null) { + path = new Path(onefile); + if (schemaless && path.toUri().getScheme() != null) { + path = new Path(path.toUri().getPath()); + } + normalizedPaths.put(onefile, path); + } + return path; + } + + public String getNominalPath(Path fpath) { + String nominal = null; + boolean schemaless = fpath.toUri().getScheme() == null; + for (String onefile : conf.getPathToAliases().keySet()) { + Path onepath = normalizePath(onefile, schemaless); + Path curfpath = fpath; + if(!schemaless && onepath.toUri().getScheme() == null) { + curfpath = new Path(fpath.toUri().getPath()); + } + // check for the operators who will process rows coming to this Map Operator + if (onepath.toUri().relativize(curfpath.toUri()).equals(curfpath.toUri())) { + // not from this + continue; + } + if (nominal != null) { + throw new IllegalStateException("Ambiguous input path " + fpath); + } + nominal = onefile; + } + if (nominal == null) { + throw new IllegalStateException("Invalid input path " + fpath); + } + return nominal; + } + + public abstract void setChildren(Configuration hconf) throws Exception; + + public void initializeMapOperator(Configuration hconf) throws HiveException { + // set that parent initialization is done and call initialize on children + state = State.INIT; + + statsMap.put(Counter.DESERIALIZE_ERRORS.toString(), deserialize_error_count); + + numRows = 0; + + String context = hconf.get(Operator.CONTEXT_NAME_KEY, ""); + if (context != null && !context.isEmpty()) { + context = "_" + context.replace(" ","_"); + } + statsMap.put(Counter.RECORDS_IN + context, recordCounter); + } + + public abstract void initializeContexts() throws HiveException; + + public abstract Deserializer getCurrentDeserializer(); + + public abstract void process(Writable value) throws HiveException; + + @Override + public void closeOp(boolean abort) throws HiveException { + recordCounter.set(numRows); + super.closeOp(abort); + } +} \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/MapOperator.java ql/src/java/org/apache/hadoop/hive/ql/exec/MapOperator.java index d5ea96a..c3cbf55 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/MapOperator.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/MapOperator.java @@ -72,22 +72,10 @@ * Writable data structure from a Table (instead of a Hive Object). **/ @SuppressWarnings("deprecation") -public class MapOperator extends Operator implements Serializable, Cloneable { +public class MapOperator extends AbstractMapOperator { private static final long serialVersionUID = 1L; - /** - * Counter. 
- * - */ - public static enum Counter { - DESERIALIZE_ERRORS, - RECORDS_IN - } - - private final transient LongWritable deserialize_error_count = new LongWritable(); - private final transient LongWritable recordCounter = new LongWritable(); - protected transient long numRows = 0; protected transient long cntr = 1; protected transient long logEveryNRows = 0; @@ -100,7 +88,6 @@ // context for current input file protected transient MapOpCtx[] currentCtxs; - private transient final Map normalizedPaths = new HashMap(); protected static class MapOpCtx { @@ -392,63 +379,23 @@ private void initOperatorContext(List> children } } - private String getNominalPath(Path fpath) { - String nominal = null; - boolean schemaless = fpath.toUri().getScheme() == null; - for (String onefile : conf.getPathToAliases().keySet()) { - Path onepath = normalizePath(onefile, schemaless); - Path curfpath = fpath; - if(!schemaless && onepath.toUri().getScheme() == null) { - curfpath = new Path(fpath.toUri().getPath()); - } - // check for the operators who will process rows coming to this Map Operator - if (onepath.toUri().relativize(curfpath.toUri()).equals(curfpath.toUri())) { - // not from this - continue; - } - if (nominal != null) { - throw new IllegalStateException("Ambiguous input path " + fpath); - } - nominal = onefile; - } - if (nominal == null) { - throw new IllegalStateException("Invalid input path " + fpath); - } - return nominal; - } - @Override public Collection> initializeOp(Configuration hconf) throws HiveException { return super.initializeOp(hconf); } public void initializeMapOperator(Configuration hconf) throws HiveException { - // set that parent initialization is done and call initialize on children - state = State.INIT; - statsMap.put(Counter.DESERIALIZE_ERRORS.toString(), deserialize_error_count); + super.initializeMapOperator(hconf); - numRows = 0; cntr = 1; logEveryNRows = HiveConf.getLongVar(hconf, HiveConf.ConfVars.HIVE_LOG_N_RECORDS); - String context = hconf.get(Operator.CONTEXT_NAME_KEY, ""); - if (context != null && !context.isEmpty()) { - context = "_" + context.replace(" ","_"); - } - statsMap.put(Counter.RECORDS_IN + context, recordCounter); - for (Entry, StructObjectInspector> entry : childrenOpToOI.entrySet()) { Operator child = entry.getKey(); child.initialize(hconf, new ObjectInspector[] {entry.getValue()}); } } - @Override - public void closeOp(boolean abort) throws HiveException { - recordCounter.set(numRows); - super.closeOp(abort); - } - // Find context for current input file @Override public void cleanUpInputFileChangedOp() throws HiveException { @@ -478,20 +425,6 @@ public void cleanUpInputFileChangedOp() throws HiveException { currentCtxs = contexts.values().toArray(new MapOpCtx[contexts.size()]); } - private Path normalizePath(String onefile, boolean schemaless) { - //creating Path is expensive, so cache the corresponding - //Path object in normalizedPaths - Path path = normalizedPaths.get(onefile); - if (path == null) { - path = new Path(onefile); - if (schemaless && path.toUri().getScheme() != null) { - path = new Path(path.toUri().getPath()); - } - normalizedPaths.put(onefile, path); - } - return path; - } - public void process(Writable value) throws HiveException { // A mapper can span multiple files/partitions. 
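// Note (illustrative, not part of the patch): getNominalPath()/normalizePath() and the Counter
// enum deleted here were moved verbatim into AbstractMapOperator so that MapOperator and the
// new VectorMapOperator share them. The relativize() test in getNominalPath() reads inverted:
// URI.relativize() returns the child URI unchanged when the candidate alias path is NOT a
// prefix of it, so "result equals child" means "this split is not under that path". Sketch
// with invented paths:
Path onepath = new Path("hdfs://nn/warehouse/t/part=1");
Path curfpath = new Path("hdfs://nn/warehouse/t/part=1/000000_0");
// relativize() yields the relative URI "000000_0" here, which is not equal to curfpath's URI,
// so this split is attributed to that alias path rather than skipped.
boolean fromThisPath =
    !onepath.toUri().relativize(curfpath.toUri()).equals(curfpath.toUri());   // true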
// The serializers need to be reset if the input file changed diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java index ca86301..db0ac60 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java @@ -473,9 +473,9 @@ private static BaseWork getBaseWork(Configuration conf, String name) { } } - public static Map getMapWorkVectorScratchColumnTypeMap(Configuration hiveConf) { + public static String[] getMapWorkVectorScratchColumnTypeNames(Configuration hiveConf) { MapWork mapWork = getMapWork(hiveConf); - return mapWork.getVectorScratchColumnTypeMap(); + return mapWork.getVectorScratchColumnTypeNames(); } public static void setWorkflowAdjacencies(Configuration conf, QueryPlan plan) { @@ -3737,13 +3737,13 @@ private static void resetUmaskInConf(Configuration conf, boolean unsetUmask, Str /** * Returns true if a plan is both configured for vectorized execution - * and vectorization is allowed. The plan may be configured for vectorization - * but vectorization disallowed eg. for FetchOperator execution. + * and reading using the VectorizedInputFileFormat is allowed. The plan may be configured + * for vectorization but using that format is not allowed. */ - public static boolean isVectorMode(Configuration conf) { + public static boolean getUseVectorizedInputFileFormat(Configuration conf) { if (HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_VECTORIZATION_ENABLED) && Utilities.getPlanPath(conf) != null && Utilities - .getMapWork(conf).getVectorMode()) { + .getMapWork(conf).getUseVectorizedInputFileFormat()) { return true; } return false; diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/mr/ExecMapper.java ql/src/java/org/apache/hadoop/hive/ql/exec/mr/ExecMapper.java index 5999265..a2e0108 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/mr/ExecMapper.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/mr/ExecMapper.java @@ -28,6 +28,7 @@ import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.ql.exec.AbstractMapOperator; import org.apache.hadoop.hive.ql.exec.MapOperator; import org.apache.hadoop.hive.ql.exec.MapredContext; import org.apache.hadoop.hive.ql.exec.Operator; @@ -59,7 +60,7 @@ public class ExecMapper extends MapReduceBase implements Mapper { private static final String PLAN_KEY = "__MAP_PLAN__"; - private MapOperator mo; + private AbstractMapOperator mo; private OutputCollector oc; private JobConf jc; private boolean abort = false; diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/spark/SparkMapRecordHandler.java ql/src/java/org/apache/hadoop/hive/ql/exec/spark/SparkMapRecordHandler.java index d6e2853..9af081b 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/spark/SparkMapRecordHandler.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/spark/SparkMapRecordHandler.java @@ -24,6 +24,7 @@ import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.hive.ql.exec.AbstractMapOperator; import org.apache.hadoop.hive.ql.exec.MapOperator; import org.apache.hadoop.hive.ql.exec.MapredContext; import org.apache.hadoop.hive.ql.exec.Operator; @@ -55,7 +56,7 @@ public class SparkMapRecordHandler extends SparkRecordHandler { private static final Log LOG = LogFactory.getLog(SparkMapRecordHandler.class); private static final String PLAN_KEY = "__MAP_PLAN__"; - private MapOperator mo; + 
private AbstractMapOperator mo; private MapredLocalWork localWork = null; private boolean isLogInfoEnabled = false; private ExecMapperContext execContext; diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/spark/SparkReduceRecordHandler.java ql/src/java/org/apache/hadoop/hive/ql/exec/spark/SparkReduceRecordHandler.java index bedccc3..4165cca 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/spark/SparkReduceRecordHandler.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/spark/SparkReduceRecordHandler.java @@ -155,7 +155,7 @@ public void init(JobConf job, OutputCollector output, Reporter reporter) throws ObjectPair pair = VectorizedBatchUtil. constructVectorizedRowBatch(keyStructInspector, - valueStructInspectors[tag], gWork.getVectorScratchColumnTypeMap()); + valueStructInspectors[tag], gWork.getVectorScratchColumnTypeNames()); batches[tag] = pair.getFirst(); final int totalColumns = keysColumnOffset + valueStructInspectors[tag].getAllStructFieldRefs().size(); diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/tez/MapRecordProcessor.java ql/src/java/org/apache/hadoop/hive/ql/exec/tez/MapRecordProcessor.java index f606ec0..7ad6f22 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/tez/MapRecordProcessor.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/tez/MapRecordProcessor.java @@ -34,6 +34,7 @@ import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.ql.exec.AbstractMapOperator; import org.apache.hadoop.hive.ql.exec.DummyStoreOperator; import org.apache.hadoop.hive.ql.exec.HashTableDummyOperator; import org.apache.hadoop.hive.ql.exec.MapOperator; @@ -71,8 +72,8 @@ public class MapRecordProcessor extends RecordProcessor { - private MapOperator mapOp; - private final List mergeMapOpList = new ArrayList(); + private AbstractMapOperator mapOp; + private final List mergeMapOpList = new ArrayList(); public static final Log l4j = LogFactory.getLog(MapRecordProcessor.class); private MapRecordSource[] sources; private final Map multiMRInputMap = new HashMap(); @@ -154,7 +155,7 @@ void init(MRTaskReporter mrReporter, connectOps.clear(); if (mergeWorkList != null) { - MapOperator mergeMapOp = null; + AbstractMapOperator mergeMapOp = null; for (BaseWork mergeWork : mergeWorkList) { MapWork mergeMapWork = (MapWork) mergeWork; if (mergeMapWork.getVectorMode()) { @@ -197,7 +198,7 @@ void init(MRTaskReporter mrReporter, initializeMapRecordSources(); mapOp.initializeMapOperator(jconf); if ((mergeMapOpList != null) && mergeMapOpList.isEmpty() == false) { - for (MapOperator mergeMapOp : mergeMapOpList) { + for (AbstractMapOperator mergeMapOp : mergeMapOpList) { jconf.set(Utilities.INPUT_NAME, mergeMapOp.getConf().getName()); mergeMapOp.initializeMapOperator(jconf); } @@ -245,7 +246,7 @@ private void initializeMapRecordSources() throws Exception { reader = legacyMRInput.getReader(); } sources[position].init(jconf, mapOp, reader); - for (MapOperator mapOp : mergeMapOpList) { + for (AbstractMapOperator mapOp : mergeMapOpList) { int tag = mapOp.getConf().getTag(); sources[tag] = new MapRecordSource(); String inputName = mapOp.getConf().getName(); @@ -260,7 +261,7 @@ private void initializeMapRecordSources() throws Exception { @SuppressWarnings("deprecation") private KeyValueReader getKeyValueReader(Collection keyValueReaders, - MapOperator mapOp) + AbstractMapOperator mapOp) throws Exception { List kvReaderList = new ArrayList(keyValueReaders); // this sets up the map operator contexts 
correctly @@ -310,7 +311,7 @@ void close(){ } mapOp.close(abort); if (mergeMapOpList.isEmpty() == false) { - for (MapOperator mergeMapOp : mergeMapOpList) { + for (AbstractMapOperator mergeMapOp : mergeMapOpList) { mergeMapOp.close(abort); } } diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/tez/MapRecordSource.java ql/src/java/org/apache/hadoop/hive/ql/exec/tez/MapRecordSource.java index f7d2661..9a66e9e 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/tez/MapRecordSource.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/tez/MapRecordSource.java @@ -21,7 +21,7 @@ import java.io.IOException; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; -import org.apache.hadoop.hive.ql.exec.MapOperator; +import org.apache.hadoop.hive.ql.exec.AbstractMapOperator; import org.apache.hadoop.hive.ql.exec.mr.ExecMapperContext; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.io.Writable; @@ -38,11 +38,11 @@ public static final Log LOG = LogFactory.getLog(MapRecordSource.class); private ExecMapperContext execContext = null; - private MapOperator mapOp = null; + private AbstractMapOperator mapOp = null; private KeyValueReader reader = null; private final boolean grouped = false; - void init(JobConf jconf, MapOperator mapOp, KeyValueReader reader) throws IOException { + void init(JobConf jconf, AbstractMapOperator mapOp, KeyValueReader reader) throws IOException { execContext = mapOp.getExecContext(); this.mapOp = mapOp; this.reader = reader; diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/tez/ReduceRecordProcessor.java ql/src/java/org/apache/hadoop/hive/ql/exec/tez/ReduceRecordProcessor.java index d649672..0c96d59 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/tez/ReduceRecordProcessor.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/tez/ReduceRecordProcessor.java @@ -237,7 +237,7 @@ private void initializeSourceForTag(ReduceWork redWork, int tag, ObjectInspector boolean vectorizedRecordSource = (tag == bigTablePosition) && redWork.getVectorMode(); sources[tag].init(jconf, redWork.getReducer(), vectorizedRecordSource, keyTableDesc, valueTableDesc, reader, tag == bigTablePosition, (byte) tag, - redWork.getVectorScratchColumnTypeMap()); + redWork.getVectorScratchColumnTypeNames()); ois[tag] = sources[tag].getObjectInspector(); } diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/tez/ReduceRecordSource.java ql/src/java/org/apache/hadoop/hive/ql/exec/tez/ReduceRecordSource.java index 89f7572..5e7f86a 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/tez/ReduceRecordSource.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/tez/ReduceRecordSource.java @@ -123,7 +123,7 @@ void init(JobConf jconf, Operator reducer, boolean vectorized, TableDesc keyTableDesc, TableDesc valueTableDesc, Reader reader, boolean handleGroupKey, byte tag, - Map vectorScratchColumnTypeMap) + String[] vectorScratchColumnTypeNames) throws Exception { ObjectInspector keyObjectInspector; @@ -175,7 +175,8 @@ void init(JobConf jconf, Operator reducer, boolean vectorized, TableDesc keyT .genVectorStructExpressionWritables(valueStructInspectors))); ObjectPair pair = - VectorizedBatchUtil.constructVectorizedRowBatch(keyStructInspector, valueStructInspectors, vectorScratchColumnTypeMap); + VectorizedBatchUtil.constructVectorizedRowBatch(keyStructInspector, + valueStructInspectors, vectorScratchColumnTypeNames); rowObjectInspector = pair.getSecond(); batch = pair.getFirst(); diff --git 
ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorAssignRow.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorAssignRow.java index 809d7d4..5d6d12d 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorAssignRow.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorAssignRow.java @@ -19,8 +19,11 @@ package org.apache.hadoop.hive.ql.exec.vector; import java.sql.Timestamp; +import java.util.ArrayList; +import java.util.HashMap; import java.util.List; +import org.apache.commons.lang3.tuple.ImmutablePair; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.hive.common.type.HiveChar; @@ -40,10 +43,13 @@ import org.apache.hadoop.hive.serde2.io.ShortWritable; import org.apache.hadoop.hive.serde2.io.TimestampWritable; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils.ObjectInspectorCopyOption; import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils; import org.apache.hadoop.hive.serde2.objectinspector.StructField; import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils; import org.apache.hadoop.io.BooleanWritable; import org.apache.hadoop.io.BytesWritable; @@ -473,6 +479,90 @@ void assign(int batchIndex, Object object) { } } + //------------------------------------------------------------------------------------------------ + + private class IntToDoubleAssigner extends AbstractDoubleAssigner { + + IntToDoubleAssigner(int columnIndex) { + super(columnIndex); + } + + @Override + void assign(int batchIndex, Object object) { + if (object == null) { + VectorizedBatchUtil.setNullColIsNullValue(colVector, batchIndex); + } else { + IntWritable iw = (IntWritable) object; + vector[batchIndex] = iw.get(); + } + } + } + + // Currently, we only support these no-precision-loss or promotion data type conversions: + // + // UNDONE: And, all of them stay within the vector column type (Long, Double, Bytes, Decimal) + // UNDONE: for now. 
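// Note (illustrative, not part of the patch): the conversions listed below are cheap under
// vectorization because source and target types share a ColumnVector class
// (SMALLINT/INT/BIGINT -> LongColumnVector, FLOAT/DOUBLE -> DoubleColumnVector,
// CHAR/VARCHAR/STRING -> BytesColumnVector), which is why the assigner map below reuses
// ShortAssigner, IntAssigner, FloatAssigner, etc. for the widened target. A minimal sketch of
// the SMALLINT -> BIGINT case (method and parameter names invented):
static void assignSmallintIntoBigintColumn(LongColumnVector col, int batchIndex, short value) {
  col.isNull[batchIndex] = false;
  col.vector[batchIndex] = value;   // implicit short -> long widening, no precision loss
}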
+ // + // Short -> Int IMPLICIT WITH VECTORIZATION + // Short -> BigInt IMPLICIT WITH VECTORIZATION + // Float -> Double IMPLICIT WITH VECTORIZATION + // Int --> BigInt IMPLICIT WITH VECTORIZATION + // (Char | VarChar) -> String IMPLICIT WITH VECTORIZATION + // + + private static HashMap, + Class> primitiveCategoryPairToConversionAssignerMap = + new HashMap, + Class>(); + static { + primitiveCategoryPairToConversionAssignerMap.put( + new ImmutablePair(PrimitiveCategory.SHORT, PrimitiveCategory.INT), + ShortAssigner.class); + primitiveCategoryPairToConversionAssignerMap.put( + new ImmutablePair(PrimitiveCategory.SHORT, PrimitiveCategory.LONG), + ShortAssigner.class); + primitiveCategoryPairToConversionAssignerMap.put( + new ImmutablePair(PrimitiveCategory.INT, PrimitiveCategory.LONG), + IntAssigner.class); + primitiveCategoryPairToConversionAssignerMap.put( + new ImmutablePair(PrimitiveCategory.FLOAT, PrimitiveCategory.DOUBLE), + FloatAssigner.class); + primitiveCategoryPairToConversionAssignerMap.put( + new ImmutablePair(PrimitiveCategory.CHAR, PrimitiveCategory.STRING), + CharAssigner.class); + primitiveCategoryPairToConversionAssignerMap.put( + new ImmutablePair(PrimitiveCategory.VARCHAR, PrimitiveCategory.STRING), + VarCharAssigner.class); + } + + private Assigner createConversionAssigner(PrimitiveTypeInfo sourcePrimitiveTypeInfo, + PrimitiveTypeInfo targetPrimitiveTypeInfo, int columnIndex) throws HiveException { + + PrimitiveCategory sourcePrimitiveCategory = sourcePrimitiveTypeInfo.getPrimitiveCategory(); + PrimitiveCategory targetPrimitiveCategory = targetPrimitiveTypeInfo.getPrimitiveCategory(); + + Class assignerClass = + primitiveCategoryPairToConversionAssignerMap.get( + new ImmutablePair(sourcePrimitiveCategory, targetPrimitiveCategory)); + if (assignerClass == null) { + + throw new HiveException("No conversion from primitive type category " + + sourcePrimitiveCategory.name() + " to " + targetPrimitiveCategory.name()); + } + + Assigner assigner; + try { + assigner = (Assigner) assignerClass.getDeclaredConstructor(int.class).newInstance( + columnIndex); + } catch (Exception e) { + throw new HiveException(e); + } + + return assigner; + } + + //------------------------------------------------------------------------------------------------ + private Assigner createAssigner(PrimitiveTypeInfo primitiveTypeInfo, int columnIndex) throws HiveException { PrimitiveCategory primitiveCategory = primitiveTypeInfo.getPrimitiveCategory(); Assigner assigner; @@ -553,6 +643,53 @@ public void init(StructObjectInspector structObjectInspector, List proj } } + public void init(StructObjectInspector structObjectInspector) throws HiveException { + + List fields = structObjectInspector.getAllStructFieldRefs(); + assigners = new Assigner[fields.size()]; + + int i = 0; + for (StructField field : fields) { + ObjectInspector fieldInspector = field.getFieldObjectInspector(); + PrimitiveTypeInfo primitiveTypeInfo = + (PrimitiveTypeInfo) TypeInfoUtils.getTypeInfoFromTypeString( + fieldInspector.getTypeName()); + assigners[i] = createAssigner(primitiveTypeInfo, i); + i++; + } + } + + public void init(TypeInfo[] sourceTypeInfos, TypeInfo[] targetTypeInfos, + boolean[] conversionFlags, boolean[] columnsToIncludeTruncated) throws HiveException { + + int columnCount = columnsToIncludeTruncated == null ? 
+ sourceTypeInfos.length : columnsToIncludeTruncated.length; + + assigners = new Assigner[columnCount]; + + for (int i = 0; i < columnCount; i++) { + + Assigner assigner; + + if (columnsToIncludeTruncated != null && !columnsToIncludeTruncated[i]) { + + // Field not included in query. + assigner = null; + + } else { + PrimitiveTypeInfo targetPrimitiveTypeInfo = (PrimitiveTypeInfo) targetTypeInfos[i]; + + if (conversionFlags != null && conversionFlags[i]) { + assigner = createConversionAssigner((PrimitiveTypeInfo) sourceTypeInfos[i], + targetPrimitiveTypeInfo, i); + } else { + assigner = createAssigner(targetPrimitiveTypeInfo, i); + } + } + + assigners[i] = assigner; + } + } public void init(List typeNames) throws HiveException { assigners = new Assigner[typeNames.size()]; @@ -568,17 +705,21 @@ public void init(List typeNames) throws HiveException { protected void setBatch(VectorizedRowBatch batch) throws HiveException { for (int i = 0; i < assigners.length; i++) { Assigner assigner = assigners[i]; - int columnIndex = assigner.getColumnIndex(); - if (batch.cols[columnIndex] == null) { - throw new HiveException("Unexpected null vector column " + columnIndex); + if (assigner != null) { + int columnIndex = assigner.getColumnIndex(); + if (batch.cols[columnIndex] == null) { + throw new HiveException("Unexpected null vector column " + columnIndex); + } + assigner.setColumnVector(batch); } - assigner.setColumnVector(batch); } } protected void forgetBatch() { for (Assigner assigner : assigners) { - assigner.forgetColumnVector(); + if (assigner != null) { + assigner.forgetColumnVector(); + } } } @@ -589,8 +730,26 @@ public void assignRowColumn(int batchIndex, int logicalColumnIndex, Object objec public void assignRow(int batchIndex, Object[] objects) { int i = 0; for (Assigner assigner : assigners) { - assigner.assign(batchIndex, objects[i++]); + if (assigner != null) { + assigner.assign(batchIndex, objects[i]); + } + i++; } } + public void assignRow(int batchIndex, Object object, + StructObjectInspector structObjectInspector) { + + /* Convert input row to standard objects. 
*/ + List standardObjects = new ArrayList(); + ObjectInspectorUtils.copyToStandardObject(standardObjects, object, + structObjectInspector, ObjectInspectorCopyOption.WRITABLE); + + for (int i = 0; i < standardObjects.size(); i++) { + Assigner assigner = assigners[i]; + if (assigner != null) { + assigner.assign(batchIndex, standardObjects.get(i)); + } + } + } } \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorDeserializeRow.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorDeserializeRow.java index 8452abd..0a32178 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorDeserializeRow.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorDeserializeRow.java @@ -21,15 +21,20 @@ import java.io.EOFException; import java.io.IOException; import java.sql.Timestamp; +import java.util.Arrays; +import java.util.HashMap; import java.util.List; +import org.apache.commons.lang3.tuple.ImmutablePair; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.hive.common.type.HiveDecimal; import org.apache.hadoop.hive.common.type.HiveIntervalDayTime; import org.apache.hadoop.hive.common.type.HiveIntervalYearMonth; +import org.apache.hadoop.hive.ql.exec.Operator; import org.apache.hadoop.hive.ql.exec.vector.expressions.StringExpr; import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.plan.OperatorDesc; import org.apache.hadoop.hive.serde2.fast.DeserializeRead; import org.apache.hadoop.hive.serde2.objectinspector.StructField; import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; @@ -38,7 +43,7 @@ import org.apache.hadoop.hive.serde2.typeinfo.CharTypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.DecimalTypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; -import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils; import org.apache.hadoop.hive.serde2.typeinfo.VarcharTypeInfo; import org.apache.hive.common.util.DateUtils; @@ -77,7 +82,7 @@ private VectorDeserializeRow() { private abstract class Reader { protected int columnIndex; - Reader(int columnIndex) { + Reader(PrimitiveTypeInfo primitiveTypeInfo, int columnIndex) { this.columnIndex = columnIndex; } @@ -86,15 +91,15 @@ private VectorDeserializeRow() { private abstract class AbstractLongReader extends Reader { - AbstractLongReader(int columnIndex) { - super(columnIndex); + AbstractLongReader(PrimitiveTypeInfo primitiveTypeInfo, int columnIndex) { + super(primitiveTypeInfo, columnIndex); } } private class BooleanReader extends AbstractLongReader { - BooleanReader(int columnIndex) { - super(columnIndex); + BooleanReader(PrimitiveTypeInfo primitiveTypeInfo, int columnIndex) { + super(primitiveTypeInfo, columnIndex); } @Override @@ -112,8 +117,8 @@ void apply(VectorizedRowBatch batch, int batchIndex) throws IOException { private class ByteReader extends AbstractLongReader { - ByteReader(int columnIndex) { - super(columnIndex); + ByteReader(PrimitiveTypeInfo primitiveTypeInfo, int columnIndex) { + super(primitiveTypeInfo, columnIndex); } @Override @@ -131,8 +136,8 @@ void apply(VectorizedRowBatch batch, int batchIndex) throws IOException { private class ShortReader extends AbstractLongReader { - ShortReader(int columnIndex) { - super(columnIndex); + ShortReader(PrimitiveTypeInfo primitiveTypeInfo, int columnIndex) { + 
super(primitiveTypeInfo, columnIndex); } @Override @@ -150,8 +155,8 @@ void apply(VectorizedRowBatch batch, int batchIndex) throws IOException { private class IntReader extends AbstractLongReader { - IntReader(int columnIndex) { - super(columnIndex); + IntReader(PrimitiveTypeInfo primitiveTypeInfo, int columnIndex) { + super(primitiveTypeInfo, columnIndex); } @Override @@ -169,8 +174,8 @@ void apply(VectorizedRowBatch batch, int batchIndex) throws IOException { private class LongReader extends AbstractLongReader { - LongReader(int columnIndex) { - super(columnIndex); + LongReader(PrimitiveTypeInfo primitiveTypeInfo, int columnIndex) { + super(primitiveTypeInfo, columnIndex); } @Override @@ -190,8 +195,8 @@ void apply(VectorizedRowBatch batch, int batchIndex) throws IOException { DeserializeRead.ReadDateResults readDateResults; - DateReader(int columnIndex) { - super(columnIndex); + DateReader(PrimitiveTypeInfo primitiveTypeInfo, int columnIndex) { + super(primitiveTypeInfo, columnIndex); readDateResults = deserializeRead.createReadDateResults(); } @@ -212,8 +217,8 @@ void apply(VectorizedRowBatch batch, int batchIndex) throws IOException { DeserializeRead.ReadTimestampResults readTimestampResults; - TimestampReader(int columnIndex) { - super(columnIndex); + TimestampReader(PrimitiveTypeInfo primitiveTypeInfo, int columnIndex) { + super(primitiveTypeInfo, columnIndex); readTimestampResults = deserializeRead.createReadTimestampResults(); } @@ -235,8 +240,8 @@ void apply(VectorizedRowBatch batch, int batchIndex) throws IOException { DeserializeRead.ReadIntervalYearMonthResults readIntervalYearMonthResults; - IntervalYearMonthReader(int columnIndex) { - super(columnIndex); + IntervalYearMonthReader(PrimitiveTypeInfo primitiveTypeInfo, int columnIndex) { + super(primitiveTypeInfo, columnIndex); readIntervalYearMonthResults = deserializeRead.createReadIntervalYearMonthResults(); } @@ -258,8 +263,8 @@ void apply(VectorizedRowBatch batch, int batchIndex) throws IOException { DeserializeRead.ReadIntervalDayTimeResults readIntervalDayTimeResults; - IntervalDayTimeReader(int columnIndex) { - super(columnIndex); + IntervalDayTimeReader(PrimitiveTypeInfo primitiveTypeInfo, int columnIndex) { + super(primitiveTypeInfo, columnIndex); readIntervalDayTimeResults = deserializeRead.createReadIntervalDayTimeResults(); } @@ -279,15 +284,15 @@ void apply(VectorizedRowBatch batch, int batchIndex) throws IOException { private abstract class AbstractDoubleReader extends Reader { - AbstractDoubleReader(int columnIndex) { - super(columnIndex); + AbstractDoubleReader(PrimitiveTypeInfo primitiveTypeInfo, int columnIndex) { + super(primitiveTypeInfo, columnIndex); } } private class FloatReader extends AbstractDoubleReader { - FloatReader(int columnIndex) { - super(columnIndex); + FloatReader(PrimitiveTypeInfo primitiveTypeInfo, int columnIndex) { + super(primitiveTypeInfo, columnIndex); } @Override @@ -305,8 +310,8 @@ void apply(VectorizedRowBatch batch, int batchIndex) throws IOException { private class DoubleReader extends AbstractDoubleReader { - DoubleReader(int columnIndex) { - super(columnIndex); + DoubleReader(PrimitiveTypeInfo primitiveTypeInfo, int columnIndex) { + super(primitiveTypeInfo, columnIndex); } @Override @@ -324,8 +329,8 @@ void apply(VectorizedRowBatch batch, int batchIndex) throws IOException { private abstract class AbstractBytesReader extends Reader { - AbstractBytesReader(int columnIndex) { - super(columnIndex); + AbstractBytesReader(PrimitiveTypeInfo primitiveTypeInfo, int columnIndex) { + 
super(primitiveTypeInfo, columnIndex); } } @@ -333,8 +338,8 @@ void apply(VectorizedRowBatch batch, int batchIndex) throws IOException { private DeserializeRead.ReadStringResults readStringResults; - StringReaderByValue(int columnIndex) { - super(columnIndex); + StringReaderByValue(PrimitiveTypeInfo primitiveTypeInfo, int columnIndex) { + super(primitiveTypeInfo, columnIndex); readStringResults = deserializeRead.createReadStringResults(); } @@ -356,8 +361,8 @@ void apply(VectorizedRowBatch batch, int batchIndex) throws IOException { private DeserializeRead.ReadStringResults readStringResults; - StringReaderByReference(int columnIndex) { - super(columnIndex); + StringReaderByReference(PrimitiveTypeInfo primitiveTypeInfo, int columnIndex) { + super(primitiveTypeInfo, columnIndex); readStringResults = deserializeRead.createReadStringResults(); } @@ -381,10 +386,10 @@ void apply(VectorizedRowBatch batch, int batchIndex) throws IOException { private CharTypeInfo charTypeInfo; - CharReaderByValue(CharTypeInfo charTypeInfo, int columnIndex) { - super(columnIndex); + CharReaderByValue(PrimitiveTypeInfo primitiveTypeInfo, int columnIndex) { + super(primitiveTypeInfo, columnIndex); readStringResults = deserializeRead.createReadStringResults(); - this.charTypeInfo = charTypeInfo; + this.charTypeInfo = (CharTypeInfo) primitiveTypeInfo; } @Override @@ -410,10 +415,10 @@ void apply(VectorizedRowBatch batch, int batchIndex) throws IOException { private CharTypeInfo charTypeInfo; - CharReaderByReference(CharTypeInfo charTypeInfo, int columnIndex) { - super(columnIndex); + CharReaderByReference(PrimitiveTypeInfo primitiveTypeInfo, int columnIndex) { + super(primitiveTypeInfo, columnIndex); readStringResults = deserializeRead.createReadStringResults(); - this.charTypeInfo = charTypeInfo; + this.charTypeInfo = (CharTypeInfo) primitiveTypeInfo; } @Override @@ -439,10 +444,10 @@ void apply(VectorizedRowBatch batch, int batchIndex) throws IOException { private VarcharTypeInfo varcharTypeInfo; - VarcharReaderByValue(VarcharTypeInfo varcharTypeInfo, int columnIndex) { - super(columnIndex); + VarcharReaderByValue(PrimitiveTypeInfo primitiveTypeInfo, int columnIndex) { + super(primitiveTypeInfo, columnIndex); readStringResults = deserializeRead.createReadStringResults(); - this.varcharTypeInfo = varcharTypeInfo; + this.varcharTypeInfo = (VarcharTypeInfo) primitiveTypeInfo; } @Override @@ -468,10 +473,10 @@ void apply(VectorizedRowBatch batch, int batchIndex) throws IOException { private VarcharTypeInfo varcharTypeInfo; - VarcharReaderByReference(VarcharTypeInfo varcharTypeInfo, int columnIndex) { - super(columnIndex); + VarcharReaderByReference(PrimitiveTypeInfo primitiveTypeInfo, int columnIndex) { + super(primitiveTypeInfo, columnIndex); readStringResults = deserializeRead.createReadStringResults(); - this.varcharTypeInfo = varcharTypeInfo; + this.varcharTypeInfo = (VarcharTypeInfo) primitiveTypeInfo; } @Override @@ -495,8 +500,8 @@ void apply(VectorizedRowBatch batch, int batchIndex) throws IOException { private DeserializeRead.ReadBinaryResults readBinaryResults; - BinaryReaderByValue(int columnIndex) { - super(columnIndex); + BinaryReaderByValue(PrimitiveTypeInfo primitiveTypeInfo, int columnIndex) { + super(primitiveTypeInfo, columnIndex); readBinaryResults = deserializeRead.createReadBinaryResults(); } @@ -518,8 +523,8 @@ void apply(VectorizedRowBatch batch, int batchIndex) throws IOException { private DeserializeRead.ReadBinaryResults readBinaryResults; - BinaryReaderByReference(int columnIndex) { - 
super(columnIndex); + BinaryReaderByReference(PrimitiveTypeInfo primitiveTypeInfo, int columnIndex) { + super(primitiveTypeInfo, columnIndex); readBinaryResults = deserializeRead.createReadBinaryResults(); } @@ -541,8 +546,8 @@ void apply(VectorizedRowBatch batch, int batchIndex) throws IOException { private DeserializeRead.ReadDecimalResults readDecimalResults; - HiveDecimalReader(int columnIndex) { - super(columnIndex); + HiveDecimalReader(PrimitiveTypeInfo primitiveTypeInfo, int columnIndex) { + super(primitiveTypeInfo, columnIndex); readDecimalResults = deserializeRead.createReadDecimalResults(); } @@ -560,6 +565,115 @@ void apply(VectorizedRowBatch batch, int batchIndex) throws IOException { } } + private class NotIncludedColumnReader extends Reader { + + NotIncludedColumnReader(PrimitiveTypeInfo primitiveTypeInfo, int columnIndex) { + super(primitiveTypeInfo, columnIndex); + } + + @Override + void apply(VectorizedRowBatch batch, int batchIndex) throws IOException { + + if (deserializeRead.readCheckNull()) { + // Ignore not included column. + } else { + throw new RuntimeException("Expected a NULL for not included column"); + } + } + } + + + //------------------------------------------------------------------------------------------------ + + private class IntToDoubleReader extends AbstractDoubleReader { + + IntToDoubleReader(PrimitiveTypeInfo primitiveTypeInfo, int columnIndex) { + super(primitiveTypeInfo, columnIndex); + } + + @Override + void apply(VectorizedRowBatch batch, int batchIndex) throws IOException { + DoubleColumnVector colVector = (DoubleColumnVector) batch.cols[columnIndex]; + + if (deserializeRead.readCheckNull()) { + VectorizedBatchUtil.setNullColIsNullValue(colVector, batchIndex); + } else { + int value = deserializeRead.readInt(); + colVector.vector[batchIndex] = (double) value; + } + } + } + + // Currently, we only support these no-precision-loss or promotion data type conversions: + // + // UNDONE: And, all of them stay within the vector column type (Long, Double, Bytes, Decimal) + // UNDONE: for now. 
+ // + // Short -> Int IMPLICIT WITH VECTORIZATION + // Short -> BigInt IMPLICIT WITH VECTORIZATION + // Float -> Double IMPLICIT WITH VECTORIZATION + // Int --> BigInt IMPLICIT WITH VECTORIZATION + // (Char | VarChar) -> String IMPLICIT WITH VECTORIZATION + // + + private static HashMap, + Class> primitiveCategoryPairToConversionReaderMap = + new HashMap, + Class>(); + static { + primitiveCategoryPairToConversionReaderMap.put( + new ImmutablePair(PrimitiveCategory.SHORT, PrimitiveCategory.INT), + ShortReader.class); + primitiveCategoryPairToConversionReaderMap.put( + new ImmutablePair(PrimitiveCategory.SHORT, PrimitiveCategory.LONG), + ShortReader.class); + primitiveCategoryPairToConversionReaderMap.put( + new ImmutablePair(PrimitiveCategory.INT, PrimitiveCategory.LONG), + IntReader.class); + primitiveCategoryPairToConversionReaderMap.put( + new ImmutablePair(PrimitiveCategory.FLOAT, PrimitiveCategory.DOUBLE), + FloatReader.class); + primitiveCategoryPairToConversionReaderMap.put( + new ImmutablePair(PrimitiveCategory.CHAR, PrimitiveCategory.STRING), + CharReaderByValue.class); + primitiveCategoryPairToConversionReaderMap.put( + new ImmutablePair(PrimitiveCategory.VARCHAR, PrimitiveCategory.STRING), + VarcharReaderByValue.class); + } + + private void addConversionReader(PrimitiveTypeInfo targetPrimitiveTypeInfo, int index, + int outputColumn) throws HiveException { + + + PrimitiveTypeInfo primitiveTypeInfo = primitiveTypeInfos[index]; + PrimitiveCategory primitiveCategory = primitiveTypeInfo.getPrimitiveCategory(); + + PrimitiveCategory targetPrimitiveCategory = targetPrimitiveTypeInfo.getPrimitiveCategory(); + + Class readerClass = + primitiveCategoryPairToConversionReaderMap.get( + new ImmutablePair(primitiveCategory, targetPrimitiveCategory)); + if (readerClass == null) { + + throw new HiveException("No conversion from primitive type category " + + primitiveCategory.name() + " to " + targetPrimitiveCategory.name()); + } + + Reader reader; + try { + reader = (Reader) readerClass.getDeclaredConstructor( + PrimitiveTypeInfo.class, int.class).newInstance( + primitiveTypeInfo, outputColumn); + } catch (Exception e) { + throw new HiveException(e); + } + + readersByValue[index] = reader; + readersByReference[index] = reader; + } + + //------------------------------------------------------------------------------------------------ + private void addReader(int index, int outputColumn) throws HiveException { Reader readerByValue = null; Reader readerByReference = null; @@ -571,62 +685,56 @@ private void addReader(int index, int outputColumn) throws HiveException { // UNDONE: // break; case BOOLEAN: - readerByValue = new BooleanReader(outputColumn); + readerByValue = new BooleanReader(primitiveTypeInfo, outputColumn); break; case BYTE: - readerByValue = new ByteReader(outputColumn); + readerByValue = new ByteReader(primitiveTypeInfo, outputColumn); break; case SHORT: - readerByValue = new ShortReader(outputColumn); + readerByValue = new ShortReader(primitiveTypeInfo, outputColumn); break; case INT: - readerByValue = new IntReader(outputColumn); + readerByValue = new IntReader(primitiveTypeInfo, outputColumn); break; case LONG: - readerByValue = new LongReader(outputColumn); + readerByValue = new LongReader(primitiveTypeInfo, outputColumn); break; case DATE: - readerByValue = new DateReader(outputColumn); + readerByValue = new DateReader(primitiveTypeInfo, outputColumn); break; case TIMESTAMP: - readerByValue = new TimestampReader(outputColumn); + readerByValue = new 
TimestampReader(primitiveTypeInfo, outputColumn); break; case FLOAT: - readerByValue = new FloatReader(outputColumn); + readerByValue = new FloatReader(primitiveTypeInfo, outputColumn); break; case DOUBLE: - readerByValue = new DoubleReader(outputColumn); + readerByValue = new DoubleReader(primitiveTypeInfo, outputColumn); break; case STRING: - readerByValue = new StringReaderByValue(outputColumn); - readerByReference = new StringReaderByReference(outputColumn); + readerByValue = new StringReaderByValue(primitiveTypeInfo, outputColumn); + readerByReference = new StringReaderByReference(primitiveTypeInfo, outputColumn); break; case CHAR: - { - CharTypeInfo charTypeInfo = (CharTypeInfo) primitiveTypeInfo; - readerByValue = new CharReaderByValue(charTypeInfo, outputColumn); - readerByReference = new CharReaderByReference(charTypeInfo, outputColumn); - } + readerByValue = new CharReaderByValue(primitiveTypeInfo, outputColumn); + readerByReference = new CharReaderByReference(primitiveTypeInfo, outputColumn); break; case VARCHAR: - { - VarcharTypeInfo varcharTypeInfo = (VarcharTypeInfo) primitiveTypeInfo; - readerByValue = new VarcharReaderByValue(varcharTypeInfo, outputColumn); - readerByReference = new VarcharReaderByReference(varcharTypeInfo, outputColumn); - } + readerByValue = new VarcharReaderByValue(primitiveTypeInfo, outputColumn); + readerByReference = new VarcharReaderByReference(primitiveTypeInfo, outputColumn); break; case BINARY: - readerByValue = new BinaryReaderByValue(outputColumn); - readerByReference = new BinaryReaderByReference(outputColumn); + readerByValue = new BinaryReaderByValue(primitiveTypeInfo, outputColumn); + readerByReference = new BinaryReaderByReference(primitiveTypeInfo, outputColumn); break; case DECIMAL: - readerByValue = new HiveDecimalReader(outputColumn); + readerByValue = new HiveDecimalReader(primitiveTypeInfo, outputColumn); break; case INTERVAL_YEAR_MONTH: - readerByValue = new IntervalYearMonthReader(outputColumn); + readerByValue = new IntervalYearMonthReader(primitiveTypeInfo, outputColumn); break; case INTERVAL_DAY_TIME: - readerByValue = new IntervalDayTimeReader(outputColumn); + readerByValue = new IntervalDayTimeReader(primitiveTypeInfo, outputColumn); break; default: throw new HiveException("Unexpected primitive type category " + primitiveCategory); @@ -673,6 +781,74 @@ public void init(int startColumn) throws HiveException { } } + public void init(boolean[] columnsToIncludeTruncated) throws HiveException { + + if (columnsToIncludeTruncated != null) { + deserializeRead.setColumnsToInclude(columnsToIncludeTruncated); + } + + final int columnCount = (columnsToIncludeTruncated == null ? + primitiveTypeInfos.length : columnsToIncludeTruncated.length); + + readersByValue = new Reader[columnCount]; + readersByReference = new Reader[columnCount]; + + for (int i = 0; i < columnCount; i++) { + + if (columnsToIncludeTruncated != null && !columnsToIncludeTruncated[i]) { + + // Field not included in query. + + Reader notIncludedColumnReader = new NotIncludedColumnReader(null, i); + readersByValue[i] = notIncludedColumnReader; + readersByReference[i] = notIncludedColumnReader; + + } else { + + addReader(i, i); + + } + } + } + + public void init(PrimitiveTypeInfo[] targetPrimitiveTypeInfos, boolean[] conversionFlags, + boolean[] columnsToIncludeTruncated) throws HiveException { + + if (columnsToIncludeTruncated != null) { + deserializeRead.setColumnsToInclude(columnsToIncludeTruncated); + } + + final int columnCount = (columnsToIncludeTruncated == null ? 
+ primitiveTypeInfos.length : columnsToIncludeTruncated.length); + + readersByValue = new Reader[columnCount]; + readersByReference = new Reader[columnCount]; + + for (int i = 0; i < columnCount; i++) { + + if (columnsToIncludeTruncated != null && !columnsToIncludeTruncated[i]) { + + // Field not included in query. + + Reader notIncludedColumnReader = new NotIncludedColumnReader(null, i); + readersByValue[i] = notIncludedColumnReader; + readersByReference[i] = notIncludedColumnReader; + + } else { + + if (conversionFlags != null && conversionFlags[i]) { + + addConversionReader(targetPrimitiveTypeInfos[i], i, i); + + } else { + + addReader(i, i); + + } + } + } + } + public void init() throws HiveException { init(0); } diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorGroupByOperator.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorGroupByOperator.java index 917f406..ca1915f 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorGroupByOperator.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorGroupByOperator.java @@ -815,7 +815,7 @@ public VectorGroupByOperator() { outputFieldNames, objectInspectors); if (isVectorOutput) { vrbCtx = new VectorizedRowBatchCtx(); - vrbCtx.init(vOutContext.getScratchColumnTypeMap(), (StructObjectInspector) outputObjInspector); + vrbCtx.init((StructObjectInspector) outputObjInspector, vOutContext.getScratchColumnTypeNames()); outputBatch = vrbCtx.createVectorizedRowBatch(); vectorAssignRowSameBatch = new VectorAssignRowSameBatch(); vectorAssignRowSameBatch.init((StructObjectInspector) outputObjInspector, vOutContext.getProjectedColumns()); diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorMapJoinBaseOperator.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorMapJoinBaseOperator.java index 0baec2c..d9f5d2f 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorMapJoinBaseOperator.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorMapJoinBaseOperator.java @@ -91,7 +91,7 @@ public VectorMapJoinBaseOperator (VectorizationContext vContext, OperatorDesc co Collection> result = super.initializeOp(hconf); vrbCtx = new VectorizedRowBatchCtx(); - vrbCtx.init(vOutContext.getScratchColumnTypeMap(), (StructObjectInspector) this.outputObjInspector); + vrbCtx.init((StructObjectInspector) this.outputObjInspector, vOutContext.getScratchColumnTypeNames()); outputBatch = vrbCtx.createVectorizedRowBatch(); diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorMapJoinOperator.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorMapJoinOperator.java index 9bd811c..69f7340 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorMapJoinOperator.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorMapJoinOperator.java @@ -122,6 +122,11 @@ public void assign(VectorExpressionWriter[] writers, List keyDesc = conf.getKeys().get(posBigTable); keyOutputWriters = VectorExpressionWriterFactory.getExpressionWriters(keyDesc); + vrbCtx = new VectorizedRowBatchCtx(); + vrbCtx.init((StructObjectInspector) this.outputObjInspector, vOutContext.getScratchColumnTypeNames()); + + outputBatch = vrbCtx.createVectorizedRowBatch(); + keyWrapperBatch = VectorHashKeyWrapperBatch.compileKeyWrapperBatch(keyExpressions); Map> valueExpressions = conf.getExprs(); diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorMapOperator.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorMapOperator.java index aa0d5a5..3040040 100644 --- 
ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorMapOperator.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorMapOperator.java @@ -18,38 +18,622 @@ package org.apache.hadoop.hive.ql.exec.vector; -import org.apache.hadoop.hive.ql.exec.MapOperator; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Properties; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.ql.exec.AbstractMapOperator; +import org.apache.hadoop.hive.ql.exec.Operator; import org.apache.hadoop.hive.ql.exec.mr.ExecMapperContext; import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.plan.OperatorDesc; +import org.apache.hadoop.hive.ql.plan.PartitionDesc; +import org.apache.hadoop.hive.ql.plan.TableDesc; +import org.apache.hadoop.hive.ql.plan.VectorPartitionDesc; +import org.apache.hadoop.hive.ql.plan.VectorPartitionDesc.VectorMapOperatorReadType; +import org.apache.hadoop.hive.ql.plan.api.OperatorType; +import org.apache.hadoop.hive.serde2.ColumnProjectionUtils; +import org.apache.hadoop.hive.serde2.Deserializer; +import org.apache.hadoop.hive.serde2.SerDeException; +import org.apache.hadoop.hive.serde2.SerDeUtils; +import org.apache.hadoop.hive.serde2.StructObject; +import org.apache.hadoop.hive.serde2.fast.DeserializeRead; +import org.apache.hadoop.hive.serde2.lazy.LazySerDeParameters; +import org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe; +import org.apache.hadoop.hive.serde2.lazy.fast.LazySimpleDeserializeRead; +import org.apache.hadoop.hive.serde2.lazybinary.fast.LazyBinaryDeserializeRead; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils; +import org.apache.hadoop.hive.serde2.objectinspector.StructField; +import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils.ObjectInspectorCopyOption; +import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils; +import org.apache.hadoop.io.BinaryComparable; import org.apache.hadoop.io.Writable; -public class VectorMapOperator extends MapOperator { +public class VectorMapOperator extends AbstractMapOperator { private static final long serialVersionUID = 1L; + private transient HashMap fileToPartitionContextMap; + + private transient Operator oneRootOperator; + + private transient StructObjectInspector tableStructObjectInspector; + + private transient TypeInfo[] tableRowTypeInfos; + + private transient VectorMapOperatorReadType currentReadType; + private transient PartitionContext currentPartContext; + + private transient int currentNonPartColumnCount; + + private transient DeserializeRead currentDeserializeRead; + private transient VectorDeserializeRow currentVectorDeserializeRow; + + private Deserializer currentPartDeserializer; + private StructObjectInspector currentPartRawRowObjectInspector; + private VectorAssignRowSameBatch currentVectorAssign; + + private transient VectorizedRowBatchCtx batchContext; + private transient VectorizedRowBatch batch; + + private transient int nonPartitionColumnCount; + private transient int partitionColumnCount; + + private transient boolean[] 
columnsToIncludeTruncated; + + protected abstract class PartitionContext { + + protected final PartitionDesc partDesc; + + String tableName; + String partName; + + private PartitionContext(PartitionDesc partDesc) { + this.partDesc = partDesc; + + TableDesc td = partDesc.getTableDesc(); + + // Use table properties in case of unpartitioned tables, + // and the union of table properties and partition properties, with partition + // taking precedence, in the case of partitioned tables + Properties overlayedProps = + SerDeUtils.createOverlayedProperties(td.getProperties(), partDesc.getProperties()); + + Map partSpec = partDesc.getPartSpec(); + + tableName = String.valueOf(overlayedProps.getProperty("name")); + partName = String.valueOf(partSpec); + + } + + public PartitionDesc getPartDesc() { + return partDesc; + } + + public abstract void init(Configuration hconf, PrimitiveTypeInfo[] tableRowPrimitiveTypeInfos) + throws SerDeException, Exception; + } + + protected class VectorizedInputFileFormatPartitionContext extends PartitionContext { + + private VectorizedInputFileFormatPartitionContext(PartitionDesc partDesc) { + super(partDesc); + } + + public void init(Configuration hconf, PrimitiveTypeInfo[] tableRowPrimitiveTypeInfos) { + } + } + + protected class VectorDeserializePartitionContext extends PartitionContext { + + // This helper object deserializes known deserialization / input file format combination into + // columns of a row in a vectorized row batch. + private VectorDeserializeRow vectorDeserializeRow; + + private DeserializeRead deserializeRead; + + private VectorDeserializePartitionContext(PartitionDesc partDesc) { + super(partDesc); + } + + public VectorDeserializeRow getVectorDeserializeRow() { + return vectorDeserializeRow; + } + + DeserializeRead getDeserializeRead() { + return deserializeRead; + } + + public void init(Configuration hconf, PrimitiveTypeInfo[] tableRowPrimitiveTypeInfos) + throws SerDeException, HiveException { + VectorPartitionDesc vectorPartDesc = partDesc.getVectorPartitionDesc(); + + switch (vectorPartDesc.getVectorDeserializeType()) { + case LAZY_SIMPLE: + { + LazySerDeParameters simpleSerdeParams = + new LazySerDeParameters(hconf, partDesc.getTableDesc().getProperties(), + LazySimpleSerDe.class.getName()); + + // The LazySimple deserialization will fill in the type we want for the table. + // Hence, we pass tableRowPrimitiveTypeInfos. + deserializeRead = + new LazySimpleDeserializeRead(tableRowPrimitiveTypeInfos, simpleSerdeParams); + + vectorDeserializeRow = new VectorDeserializeRow(deserializeRead); + vectorDeserializeRow.init(columnsToIncludeTruncated); + } + break; + + case LAZY_BINARY: + { + // This type information is what the partition actually has and is necessary + // for LazyBinary to correctly read the data. + TypeInfo[] nonPartTypeInfos = vectorPartDesc.getTypeInfos(); + PrimitiveTypeInfo[] nonPartTypePrimitiveTypeInfos = + new PrimitiveTypeInfo[nonPartTypeInfos.length]; + for (int i = 0; i < nonPartTypeInfos.length; i++) { + nonPartTypePrimitiveTypeInfos[i] = (PrimitiveTypeInfo) nonPartTypeInfos[i]; + } + + deserializeRead = + new LazyBinaryDeserializeRead(nonPartTypePrimitiveTypeInfos); + + boolean[] conversionFlags = vectorPartDesc.getConversionFlags(); + + vectorDeserializeRow = new VectorDeserializeRow(deserializeRead); + + // Initialize with data type conversion parameters. 
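
The conversion flags consumed here come from the VectorPartitionDesc and are not computed in this hunk. As a rough illustration only, assuming the flags simply mark columns whose physical partition type differs from the current table type (for example after an ALTER TABLE that widened a column), they could be derived along these lines; the method name and the idea that equality of TypeInfo is the test are assumptions, not part of the patch:

    // Hypothetical sketch: derive per-column conversion flags.
    // TypeInfo is org.apache.hadoop.hive.serde2.typeinfo.TypeInfo.
    static boolean[] deriveConversionFlags(TypeInfo[] partitionTypeInfos, TypeInfo[] tableTypeInfos) {
      boolean[] conversionFlags = new boolean[partitionTypeInfos.length];
      for (int i = 0; i < partitionTypeInfos.length; i++) {
        // true: read the column with the partition's physical type, then convert it to the table type.
        conversionFlags[i] = !partitionTypeInfos[i].equals(tableTypeInfos[i]);
      }
      return conversionFlags;
    }
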
+ vectorDeserializeRow.init(tableRowPrimitiveTypeInfos, conversionFlags, + columnsToIncludeTruncated); + } + break; + + default: + throw new RuntimeException( + "Unexpected vector deserialize row type " + vectorPartDesc.getVectorDeserializeType().name()); + } + } + } + + protected class RowDeserializePartitionContext extends PartitionContext { + + private Deserializer partDeserializer; + private StructObjectInspector partRawRowObjectInspector; + private VectorAssignRowSameBatch vectorAssign; + + private RowDeserializePartitionContext(PartitionDesc partDesc) { + super(partDesc); + } + + public Deserializer getPartDeserializer() { + return partDeserializer; + } + + public StructObjectInspector getPartRawRowObjectInspector() { + return partRawRowObjectInspector; + } + + public VectorAssignRowSameBatch getVectorAssign() { + return vectorAssign; + } + + public void init(Configuration hconf, PrimitiveTypeInfo[] tableRowPrimitiveTypeInfos) + throws Exception { + VectorPartitionDesc vectorPartDesc = partDesc.getVectorPartitionDesc(); + + partDeserializer = partDesc.getDeserializer(hconf); + + partRawRowObjectInspector = + (StructObjectInspector) partDeserializer.getObjectInspector(); + + boolean[] conversionFlags= vectorPartDesc.getConversionFlags(); + + TypeInfo[] nonPartTypeInfos = vectorPartDesc.getTypeInfos(); + + vectorAssign = new VectorAssignRowSameBatch(); + + // Initialize with data type conversion parameters. + vectorAssign.init(nonPartTypeInfos, tableRowTypeInfos, conversionFlags, + columnsToIncludeTruncated); + + vectorAssign.setOneBatch(batch); + } + } + + public PartitionContext CreateAndInitPartitionContext(PartitionDesc partDesc, + Configuration hconf, PrimitiveTypeInfo[] tableRowPrimitiveTypeInfos) + throws SerDeException, Exception { + + VectorPartitionDesc vectorPartDesc = partDesc.getVectorPartitionDesc(); + if (vectorPartDesc == null) { + LOG.info("debug"); + } + PartitionContext partitionContext; + VectorMapOperatorReadType vectorMapOperatorReadType = + vectorPartDesc.getVectorMapOperatorReadType(); + switch (vectorMapOperatorReadType) { + case VECTORIZED_INPUT_FILE_FORMAT: + partitionContext = new VectorizedInputFileFormatPartitionContext(partDesc); + break; + + case VECTOR_DESERIALIZE: + partitionContext = new VectorDeserializePartitionContext(partDesc); + break; + + case ROW_DESERIALIZE: + partitionContext = new RowDeserializePartitionContext(partDesc); + break; + + default: + throw new RuntimeException("Unexpected vector MapOperator read type " + + vectorMapOperatorReadType.name()); + } + + partitionContext.init(hconf, tableRowPrimitiveTypeInfos); + + return partitionContext; + } + + private void determineColumnsToInclude(Configuration hconf) { + + columnsToIncludeTruncated = null; + + List columnsToIncludeTruncatedList = ColumnProjectionUtils.getReadColumnIDs(hconf); + if (columnsToIncludeTruncatedList != null && + columnsToIncludeTruncatedList.size() > 0 && columnsToIncludeTruncatedList.size() < nonPartitionColumnCount ) { + + // Partitioned columns will not be in the include list. + + boolean[] columnsToInclude = new boolean[nonPartitionColumnCount]; + Arrays.fill(columnsToInclude, false); + for (int columnNum : columnsToIncludeTruncatedList) { + columnsToInclude[columnNum] = true; + } + + // Work backwards to find the highest wanted column. 
+ + int highestWantedColumnNum = -1; + for (int i = nonPartitionColumnCount - 1; i >= 0; i--) { + if (columnsToInclude[i]) { + highestWantedColumnNum = i; + break; + } + } + if (highestWantedColumnNum == -1) { + throw new RuntimeException("No columns to include?"); + } + int newColumnCount = highestWantedColumnNum + 1; + if (newColumnCount == nonPartitionColumnCount) { + columnsToIncludeTruncated = columnsToInclude; + } else { + columnsToIncludeTruncated = Arrays.copyOf(columnsToInclude, newColumnCount); + } + } + } + + // Create a file to VectorFileContext map. + // Where VectorFileContext describes how to process "rows" (could be VRBs). + // + @Override + public void setChildren(Configuration hconf) throws Exception { + + // Get the one TableScanOperator. + oneRootOperator = conf.getAliasToWork().values().iterator().next(); + + // UNDONE: Put this in the MapWork to make it available to Pass-Thru VectorizedInputFileFormat + // UNDONE: readers. + // UNDONE: + batchContext = new VectorizedRowBatchCtx(); + batchContext.init(conf); + + batch = batchContext.createVectorizedRowBatch(); + + nonPartitionColumnCount = batchContext.getNonPartitionColumnCount(); + partitionColumnCount = batchContext.getPartitionColumnCount(); + + determineColumnsToInclude(hconf); + + // Create table related objects + TypeInfo tableStructTypeInfo = TypeInfoFactory.getStructTypeInfo( + Arrays.asList(batchContext.getRowColumnNames()), + Arrays.asList(batchContext.getRowColumnTypeInfos())); + tableStructObjectInspector = + (StructObjectInspector) TypeInfoUtils.getStandardWritableObjectInspectorFromTypeInfo( + tableStructTypeInfo); + + tableRowTypeInfos = conf.getVectorColumnTypeInfos(); + + // For now, vectorization only handles primitive types. Get an array of PrimitiveTypeInfo. + PrimitiveTypeInfo[] tableRowPrimitiveTypeInfos = + new PrimitiveTypeInfo[batchContext.getNonPartitionColumnCount()]; + for (int i = 0; i < nonPartitionColumnCount; i++) { + tableRowPrimitiveTypeInfos[i] = (PrimitiveTypeInfo) tableRowTypeInfos[i]; + } + + // The Vectorizer class enforces that there is only one TableScanOperator, so + // we don't need the more complicated multiple root operator mapping that MapOperator has. + + fileToPartitionContextMap = new HashMap(); + + // Temporary map so we only create one partition context entry. + HashMap partitionContextMap = + new HashMap(); + + for (Map.Entry> entry : conf.getPathToAliases().entrySet()) { + String path = entry.getKey(); + PartitionDesc partDesc = conf.getPathToPartitionInfo().get(path); + ArrayList aliases = entry.getValue(); + + VectorPartitionDesc vectorPartDesc = partDesc.getVectorPartitionDesc(); + if (vectorPartDesc == null) { + LOG.info("Ignoring path " + path + " with aliases " + aliases + " since it does not have vector partition descriptor? Did not get examined by the Vectorizer class"); + continue; // UNDONE: Need to understand what the new PartitionDesc is about... 
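
To make the shape of columnsToIncludeTruncated concrete: for a hypothetical table with five non-partition columns where the query reads only columns 0 and 2, the mask is cut off right after the highest wanted column, so trailing unread columns are not represented at all:

    // Hypothetical projection: read column IDs {0, 2} out of 5 non-partition columns.
    // Full mask:      { true, false, true, false, false }
    // Truncated mask: { true, false, true }              // length = highest wanted column + 1
    boolean[] columnsToIncludeTruncated = new boolean[] { true, false, true };
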
+ } + LOG.info("VectorMapOperator path: " + path + ", read type " + vectorPartDesc.getVectorMapOperatorReadType().name() + + ", vector deserialize type " + vectorPartDesc.getVectorDeserializeType().name() + ", aliases " + aliases); + + PartitionContext partitionContext; + if (!partitionContextMap.containsKey(partDesc)) { + partitionContext = CreateAndInitPartitionContext(partDesc, hconf, + tableRowPrimitiveTypeInfos); + partitionContextMap.put(partDesc, partitionContext); + } else { + partitionContext = partitionContextMap.get(partDesc); + } + + fileToPartitionContextMap.put(path, partitionContext); + } + + // Create list of one. + List> children = + new ArrayList>(); + children.add(oneRootOperator); + + setChildOperators(children); + } + + @Override + public void initializeMapOperator(Configuration hconf) throws HiveException { + super.initializeMapOperator(hconf); + + oneRootOperator.initialize(hconf, new ObjectInspector[] {tableStructObjectInspector}); + } + + public void initializeContexts() throws HiveException { + Path fpath = getExecContext().getCurrentInputPath(); + String nominalPath = getNominalPath(fpath); + currentPartContext = fileToPartitionContextMap.get(nominalPath); + setupPartitionContextVars(); + } + + // Find context for current input file + @Override + public void cleanUpInputFileChangedOp() throws HiveException { + super.cleanUpInputFileChangedOp(); + Path fpath = getExecContext().getCurrentInputPath(); + String nominalPath = getNominalPath(fpath); + currentPartContext = fileToPartitionContextMap.get(nominalPath); + setupPartitionContextVars(); + + // Add alias, table name, and partitions to hadoop conf so that their + // children will inherit these + oneRootOperator.setInputContext(nominalPath, currentPartContext.tableName, + currentPartContext.partName); + } + + private void setupPartitionContextVars() throws HiveException { + + PartitionDesc partDesc = currentPartContext.getPartDesc(); + VectorPartitionDesc vectorPartDesc = partDesc.getVectorPartitionDesc(); + currentReadType = vectorPartDesc.getVectorMapOperatorReadType(); + + if (currentReadType == VectorMapOperatorReadType.VECTORIZED_INPUT_FILE_FORMAT) { + + // We will get a pass-thru VectorizedRowBatch as a row from the reader. + + currentNonPartColumnCount = 0; + + currentDeserializeRead = null; + currentVectorDeserializeRow = null; + + currentPartDeserializer = null; + currentPartRawRowObjectInspector = null; + currentVectorAssign = null; + + } else { + + // We will get an un-deserialized row from the reader. + + if (batch.size > 0) { + + // Clear out any rows in the batch from previous partition since we are going to change + // the repeating partition column values. + + oneRootOperator.process(batch, 0); + if (oneRootOperator.getDone()) { + setDone(true); + } + batch.reset(); + } + + currentNonPartColumnCount = vectorPartDesc.getNonPartColumnCount(); + + if (currentNonPartColumnCount < nonPartitionColumnCount) { + + // Default default any ALTER TABLE ADD COLUMN(s) columns to NULL once for the file. + + for (int i = currentNonPartColumnCount; i < nonPartitionColumnCount; i++) { + ColumnVector colVector = batch.cols[i]; + colVector.isNull[batch.size] = true; + colVector.noNulls = false; + colVector.isRepeating = true; + } + } + + if (batchContext.getPartitionColumnCount() > 0) { + + // The partition columns are set once for the partition and are marked repeating. 
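
As background for the comment above: a repeating column vector stores a single value at index 0 that stands for every row in the batch, which is why any buffered rows are flushed before the partition changes. A minimal sketch of what addPartitionColsToBatch (further down in this patch) ends up doing for a bigint partition key; the concrete value is made up:

    // Sketch only; the real code switches on the partition column's primitive category
    // and also handles NULL partition values.
    LongColumnVector lcv = (LongColumnVector) batch.cols[colIndex];
    lcv.isRepeating = true;      // index 0 represents every row in the batch
    lcv.noNulls = true;
    lcv.vector[0] = 20150601L;   // hypothetical partition value
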
+ + batchContext.getPartitionValues(partDesc); + batchContext.addPartitionColsToBatch(batch); + } + + switch (currentReadType) { + case VECTOR_DESERIALIZE: + { + VectorDeserializePartitionContext vectorDeserPartContext = + (VectorDeserializePartitionContext) currentPartContext; + + currentDeserializeRead = vectorDeserPartContext.getDeserializeRead(); + currentVectorDeserializeRow = vectorDeserPartContext.getVectorDeserializeRow(); + + currentPartDeserializer = null; + currentPartRawRowObjectInspector = null; + currentVectorAssign = null; + + } + break; + + case ROW_DESERIALIZE: + { + RowDeserializePartitionContext rowDeserPartContext = + (RowDeserializePartitionContext) currentPartContext; + + currentDeserializeRead = null; + currentVectorDeserializeRow = null; + + currentPartDeserializer = rowDeserPartContext.getPartDeserializer(); + currentPartRawRowObjectInspector = rowDeserPartContext.getPartRawRowObjectInspector(); + currentVectorAssign = rowDeserPartContext.getVectorAssign(); + } + break; + + default: + throw new RuntimeException("Unexpected vector MapOperator read type " + + currentReadType.name()); + } + } + } + + @Override + public Deserializer getCurrentDeserializer() { + // Not applicable. + return null; + } + @Override public void process(Writable value) throws HiveException { + // A mapper can span multiple files/partitions. - // The serializers need to be reset if the input file changed + // The PartitionContext need to be changed if the input file changed ExecMapperContext context = getExecContext(); if (context != null && context.inputFileChanged()) { // The child operators cleanup if input file has changed cleanUpInputFileChanged(); } - // The row has been converted to comply with table schema, irrespective of partition schema. - // So, use tblOI (and not partOI) for forwarding - try { - int childrenDone = 0; - for (MapOpCtx current : currentCtxs) { - if (!current.forward(value)) { - childrenDone++; + if (!oneRootOperator.getDone()) { + try { + if (currentReadType == VectorMapOperatorReadType.VECTORIZED_INPUT_FILE_FORMAT) { + + // We pass-true VectorizedRowBatch as a row. + + oneRootOperator.process(value, 0); + if (oneRootOperator.getDone()) { + setDone(true); + return; + } + + } else { + + // We have an un-deserialized row from the reader. + + if (batch.size == batch.DEFAULT_SIZE) { + + // Feed full batch to operator tree. + oneRootOperator.process(batch, 0); + if (oneRootOperator.getDone()) { + setDone(true); + return; + } + + // Only reset the non-partition columns. 
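
The DeserializeRead / VectorDeserializeRow pairing selected above follows a small fixed contract that this process method relies on: point the DeserializeRead at the serialized bytes of one row, then let VectorDeserializeRow materialize that row directly into the batch at the next free index. Condensed from the VECTOR_DESERIALIZE case that follows, assuming value is the Writable handed to process():

    BinaryComparable binComp = (BinaryComparable) value;
    currentDeserializeRead.set(binComp.getBytes(), 0, binComp.getLength());
    currentVectorDeserializeRow.deserializeByValue(batch, batch.size);
    batch.size++;   // the row now lives in the batch's column vectors
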
+ for (int c = 0; c < nonPartitionColumnCount; c++) { + batch.cols[c].reset(); + batch.cols[c].init(); + } + batch.selectedInUse = false; + batch.size = 0; + batch.endOfFile = false; + } + + switch (currentReadType) { + case VECTOR_DESERIALIZE: + { + BinaryComparable binComp = (BinaryComparable) value; + currentDeserializeRead.set(binComp.getBytes(), 0, binComp.getLength()); + + currentVectorDeserializeRow.deserializeByValue(batch, batch.size); + } + break; + + case ROW_DESERIALIZE: + { + Object deserialized = currentPartDeserializer.deserialize(value); + currentVectorAssign.assignRow(batch.size, deserialized, + currentPartRawRowObjectInspector); + } + break; + + default: + throw new RuntimeException("Unexpected vector MapOperator read type " + + currentReadType.name()); + } + batch.size++; } + } catch (Exception e) { + throw new HiveException("Hive Runtime Error while processing row ", e); } + } + } + + @Override + public void process(Object row, int tag) throws HiveException { + throw new HiveException("Hive 2 Internal error: should not be called!"); + } - rowsForwarded(childrenDone, ((VectorizedRowBatch)value).size); - } catch (Exception e) { - throw new HiveException("Hive Runtime Error while processing row ", e); + @Override + public void closeOp(boolean abort) throws HiveException { + if (!abort && oneRootOperator != null && !oneRootOperator.getDone()) { + if (batch.size > 0) { + oneRootOperator.process(batch, 0); + } } + super.closeOp(abort); + } + + @Override + public String getName() { + return getOperatorName(); + } + + static public String getOperatorName() { + return "MAP"; + } + + @Override + public OperatorType getType() { + return null; } } diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorSMBMapJoinOperator.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorSMBMapJoinOperator.java index a2f8091..ce6a121 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorSMBMapJoinOperator.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorSMBMapJoinOperator.java @@ -135,7 +135,7 @@ public VectorSMBMapJoinOperator(VectorizationContext vContext, OperatorDesc conf Collection> result = super.initializeOp(hconf); vrbCtx = new VectorizedRowBatchCtx(); - vrbCtx.init(vOutContext.getScratchColumnTypeMap(), (StructObjectInspector) this.outputObjInspector); + vrbCtx.init((StructObjectInspector) this.outputObjInspector, vOutContext.getScratchColumnTypeNames()); outputBatch = vrbCtx.createVectorizedRowBatch(); diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java index 5b702bb..7fdb5b1 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java @@ -135,6 +135,8 @@ VectorExpressionDescriptor vMap; + private List initialColumnNames; + private List projectedColumns; private List projectionColumnNames; private Map projectionColumnMap; @@ -149,6 +151,7 @@ public VectorizationContext(String contextName, List initialColumnNames) this.contextName = contextName; level = 0; LOG.info("VectorizationContext consructor contextName " + contextName + " level " + level + " initialColumnNames " + initialColumnNames.toString()); + this.initialColumnNames = initialColumnNames; this.projectionColumnNames = initialColumnNames; projectedColumns = new ArrayList(); @@ -169,7 +172,8 @@ public VectorizationContext(String contextName) { this.contextName = 
contextName; level = 0; LOG.info("VectorizationContext consructor contextName " + contextName + " level " + level); - projectedColumns = new ArrayList(); + initialColumnNames = new ArrayList(); + projectedColumns = new ArrayList(); projectionColumnNames = new ArrayList(); projectionColumnMap = new HashMap(); this.ocm = new OutputColumnManager(0); @@ -184,6 +188,7 @@ public VectorizationContext(String contextName, VectorizationContext vContext) { this.contextName = contextName; level = vContext.level + 1; LOG.info("VectorizationContext consructor reference contextName " + contextName + " level " + level); + this.initialColumnNames = vContext.initialColumnNames; this.projectedColumns = new ArrayList(); this.projectionColumnNames = new ArrayList(); this.projectionColumnMap = new HashMap(); @@ -196,6 +201,7 @@ public VectorizationContext(String contextName, VectorizationContext vContext) { // Add an initial column to a vectorization context when // a vectorized row batch is being created. public void addInitialColumn(String columnName) { + initialColumnNames.add(columnName); int index = projectedColumns.size(); projectedColumns.add(index); projectionColumnNames.add(columnName); @@ -224,6 +230,10 @@ public void addProjectionColumn(String columnName, int vectorBatchColIndex) { projectionColumnMap.put(columnName, vectorBatchColIndex); } + public List getInitialColumnNames() { + return initialColumnNames; + } + public List getProjectedColumns() { return projectedColumns; } @@ -2235,13 +2245,16 @@ public int firstOutputColumnIndex() { return firstOutputColumnIndex; } - public Map getScratchColumnTypeMap() { - Map map = new HashMap(); + public String[] getScratchColumnTypeNames() { + String[] result = new String[ocm.outputColCount]; for (int i = 0; i < ocm.outputColCount; i++) { - String type = ocm.outputColumnsTypes[i]; - map.put(i+this.firstOutputColumnIndex, type); + String typeName = ocm.outputColumnsTypes[i]; + // if (typeName.equalsIgnoreCase("long")) { + // typeName = "bigint"; + // } + result[i] = typeName; } - return map; + return result; } @Override @@ -2261,9 +2274,7 @@ public int compare(Integer o1, Integer o2) { } sb.append("sorted projectionColumnMap ").append(sortedColumnMap).append(", "); - Map sortedScratchColumnTypeMap = new TreeMap(comparerInteger); - sortedScratchColumnTypeMap.putAll(getScratchColumnTypeMap()); - sb.append("sorted scratchColumnTypeMap ").append(sortedScratchColumnTypeMap); + sb.append("scratchColumnTypeNames ").append(getScratchColumnTypeNames().toString()); return sb.toString(); } diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizedBatchUtil.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizedBatchUtil.java index 3780113..50631f0 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizedBatchUtil.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizedBatchUtil.java @@ -199,13 +199,13 @@ public static VectorizedRowBatch constructVectorizedRowBatch( * struct object inspector, not just any struct object inspector. 
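
The practical effect of replacing getScratchColumnTypeMap with getScratchColumnTypeNames is that scratch columns are now described positionally: entry i corresponds to batch column firstOutputColumnIndex() + i, the same number the old map used as its key. A short sketch of how a caller recovers the batch column index, where vOutContext stands for an operator's output VectorizationContext:

    String[] scratchTypeNames = vOutContext.getScratchColumnTypeNames();
    int firstScratchColumn = vOutContext.firstOutputColumnIndex();
    for (int i = 0; i < scratchTypeNames.length; i++) {
      int batchColumnIndex = firstScratchColumn + i;   // where this scratch column lives in the batch
      // allocate or inspect batch.cols[batchColumnIndex] according to scratchTypeNames[i]
    }
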
* @param keyInspector * @param valueInspector - * @param vectorScratchColumnTypeMap + * @param vectorScratchColumnTypeNames * @return VectorizedRowBatch, OI * @throws HiveException */ public static ObjectPair constructVectorizedRowBatch( - StructObjectInspector keyInspector, StructObjectInspector valueInspector, Map vectorScratchColumnTypeMap) - throws HiveException { + StructObjectInspector keyInspector, StructObjectInspector valueInspector, + String[] vectorScratchColumnTypeNames) throws HiveException { ArrayList colNames = new ArrayList(); ArrayList ois = new ArrayList(); @@ -222,7 +222,7 @@ public static VectorizedRowBatch constructVectorizedRowBatch( StandardStructObjectInspector rowObjectInspector = ObjectInspectorFactory.getStandardStructObjectInspector(colNames, ois); VectorizedRowBatchCtx batchContext = new VectorizedRowBatchCtx(); - batchContext.init(vectorScratchColumnTypeMap, rowObjectInspector); + batchContext.init(rowObjectInspector, vectorScratchColumnTypeNames); return new ObjectPair<>(batchContext.createVectorizedRowBatch(), rowObjectInspector); } @@ -584,6 +584,34 @@ public static StandardStructObjectInspector convertToStandardStructObjectInspect return ObjectInspectorFactory.getStandardStructObjectInspector(columnNames,oids); } + public static String[] columnNamesFromStructObjectInspector( + StructObjectInspector structObjectInspector) throws HiveException { + + List fields = structObjectInspector.getAllStructFieldRefs(); + String[] result = new String[fields.size()]; + + int i = 0; + for(StructField field : fields) { + result[i++] = field.getFieldName(); + } + return result; + } + + public static TypeInfo[] typeInfosFromStructObjectInspector( + StructObjectInspector structObjectInspector) throws HiveException { + + List fields = structObjectInspector.getAllStructFieldRefs(); + TypeInfo[] result = new TypeInfo[fields.size()]; + + int i = 0; + for(StructField field : fields) { + TypeInfo typeInfo = TypeInfoUtils.getTypeInfoFromTypeString( + field.getFieldObjectInspector().getTypeName()); + result[i++] = typeInfo; + } + return result; + } + public static PrimitiveTypeInfo[] primitiveTypeInfosFromStructObjectInspector( StructObjectInspector structObjectInspector) throws HiveException { @@ -599,6 +627,28 @@ public static StandardStructObjectInspector convertToStandardStructObjectInspect return result; } + public static PrimitiveTypeInfo[] primitiveTypeInfosFromTypeInfos(TypeInfo[] typeInfos) { + PrimitiveTypeInfo[] primitiveTypeInfos = new PrimitiveTypeInfo[typeInfos.length]; + + for (int i = 0; i < typeInfos.length; i++) { + TypeInfo typeInfo = typeInfos[i]; + primitiveTypeInfos[i] = (PrimitiveTypeInfo) typeInfo; + } + return primitiveTypeInfos; + } + + public static TypeInfo[] typeInfosFromTypeNames( + String[] typeNames) throws HiveException { + + TypeInfo[] result = new TypeInfo[typeNames.length]; + + for(int i = 0; i < typeNames.length; i++) { + TypeInfo typeInfo = TypeInfoUtils.getTypeInfoFromTypeString(typeNames[i]); + result[i] = typeInfo; + } + return result; + } + public static PrimitiveTypeInfo[] primitiveTypeInfosFromTypeNames( String[] typeNames) throws HiveException { diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizedRowBatchCtx.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizedRowBatchCtx.java index 82d4a8f..c28f7c0 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizedRowBatchCtx.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizedRowBatchCtx.java @@ -42,6 +42,7 @@ import 
org.apache.hadoop.hive.ql.io.HiveFileFormatUtils; import org.apache.hadoop.hive.ql.io.IOPrepareCache; import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.plan.MapWork; import org.apache.hadoop.hive.ql.plan.PartitionDesc; import org.apache.hadoop.hive.serde.serdeConstants; import org.apache.hadoop.hive.serde2.ColumnProjectionUtils; @@ -58,6 +59,8 @@ import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; import org.apache.hadoop.hive.serde2.typeinfo.DecimalTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils; import org.apache.hadoop.io.DataOutputBuffer; @@ -66,7 +69,7 @@ import org.apache.hive.common.util.DateUtils; /** - * Context for Vectorized row batch. this calss does eager deserialization of row data using serde + * Context for Vectorized row batch. this class does eager deserialization of row data using serde * in the RecordReader layer. * It has supports partitions in this layer so that the vectorized batch is populated correctly * with the partition column. @@ -75,72 +78,72 @@ private static final Log LOG = LogFactory.getLog(VectorizedRowBatchCtx.class.getName()); - // OI for raw row data (EG without partition cols) - private StructObjectInspector rawRowOI; - - // OI for the row (Raw row OI + partition OI) - private StructObjectInspector rowOI; - - // Deserializer for the row data - private Deserializer deserializer; - - // Hash map of partition values. Key=TblColName value=PartitionValue - private Map partitionValues; - - //partition types - private Map partitionTypes; - - // partition column positions, for use by classes that need to know whether a given column is a - // partition column - private Set partitionCols; - // Column projection list - List of column indexes to include. This // list does not contain partition columns private List colsToInclude; - private Map scratchColumnTypeMap = null; + private String[] rowColumnNames; + private TypeInfo[] rowColumnTypeInfos; + private int nonPartitionColumnCount; + private int partitionColumnCount; + private Object[] partitionValues; - /** - * Constructor for VectorizedRowBatchCtx - * - * @param rawRowOI - * OI for raw row data (EG without partition cols) - * @param rowOI - * OI for the row (Raw row OI + partition OI) - * @param deserializer - * Deserializer for the row data - * @param partitionValues - * Hash map of partition values. 
Key=TblColName value=PartitionValue - */ - public VectorizedRowBatchCtx(StructObjectInspector rawRowOI, StructObjectInspector rowOI, - Deserializer deserializer, Map partitionValues, - Map partitionTypes) { - this.rowOI = rowOI; - this.rawRowOI = rawRowOI; - this.deserializer = deserializer; - this.partitionValues = partitionValues; - this.partitionTypes = partitionTypes; - } + private String[] scratchColumnTypeNames; /** * Constructor for VectorizedRowBatchCtx */ public VectorizedRowBatchCtx() { + } + public String[] getRowColumnNames() { + return rowColumnNames; + } + + public TypeInfo[] getRowColumnTypeInfos() { + return rowColumnTypeInfos; + } + + public int getNonPartitionColumnCount() { + return nonPartitionColumnCount; + } + + public int getPartitionColumnCount() { + return partitionColumnCount; } /** - * Initializes the VectorizedRowBatch context based on an scratch column type map and + * Initializes the VectorizedRowBatch context based on an scratch column type names and * object inspector. - * @param scratchColumnTypeMap - * @param rowOI + * @param structObjectInspector + * @param scratchColumnTypeNames * Object inspector that shapes the column types + * @throws HiveException */ - public void init(Map scratchColumnTypeMap, - StructObjectInspector rowOI) { - this.scratchColumnTypeMap = scratchColumnTypeMap; - this.rowOI= rowOI; - this.rawRowOI = rowOI; + public void init(StructObjectInspector structObjectInspector, String[] scratchColumnTypeNames) + throws HiveException { + + // Row column information. + rowColumnNames = VectorizedBatchUtil.columnNamesFromStructObjectInspector(structObjectInspector); + rowColumnTypeInfos = VectorizedBatchUtil.typeInfosFromStructObjectInspector(structObjectInspector); + partitionColumnCount = 0; + nonPartitionColumnCount = rowColumnTypeInfos.length; + + // Scratch column information. + this.scratchColumnTypeNames = scratchColumnTypeNames; + } + + public void init(MapWork mapWork) { + + // Row column information. + rowColumnNames = mapWork.getVectorColumnNames(); + rowColumnTypeInfos = mapWork.getVectorColumnTypeInfos(); + partitionColumnCount = mapWork.getVectorPartitionColumnCount(); + nonPartitionColumnCount = rowColumnTypeInfos.length - partitionColumnCount; + partitionValues = new Object[partitionColumnCount]; + + // Scratch column information. 
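
Taken together with the operator hunks earlier in this patch (VectorGroupByOperator, the vectorized map-join operators, VectorSMBMapJoinOperator), the operator-side call sequence against this class now looks like the following sketch:

    VectorizedRowBatchCtx vrbCtx = new VectorizedRowBatchCtx();
    // Row columns come from the operator's output object inspector,
    // scratch columns from its output vectorization context.
    vrbCtx.init((StructObjectInspector) outputObjInspector,
        vOutContext.getScratchColumnTypeNames());
    VectorizedRowBatch outputBatch = vrbCtx.createVectorizedRowBatch();
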
+ scratchColumnTypeNames = mapWork.getVectorScratchColumnTypeNames(); } /** @@ -151,254 +154,102 @@ public void init(Map scratchColumnTypeMap, * Hive configuration using Hive plan is extracted * @param split * File split of the file being read - * @throws ClassNotFoundException - * @throws IOException - * @throws SerDeException - * @throws InstantiationException - * @throws IllegalAccessException - * @throws HiveException + * @throws IOException */ - public void init(Configuration hiveConf, FileSplit split) throws ClassNotFoundException, - IOException, - SerDeException, - InstantiationException, - IllegalAccessException, - HiveException { - - Map pathToPartitionInfo = Utilities - .getMapWork(hiveConf).getPathToPartitionInfo(); - - PartitionDesc part = HiveFileFormatUtils - .getPartitionDescFromPathRecursively(pathToPartitionInfo, - split.getPath(), IOPrepareCache.get().getPartitionDescMap()); - - String partitionPath = split.getPath().getParent().toString(); - scratchColumnTypeMap = Utilities.getMapWorkVectorScratchColumnTypeMap(hiveConf); - // LOG.info("VectorizedRowBatchCtx init scratchColumnTypeMap " + scratchColumnTypeMap.toString()); - - Properties partProps = - (part.getPartSpec() == null || part.getPartSpec().isEmpty()) ? - part.getTableDesc().getProperties() : part.getProperties(); - - Class serdeclass = hiveConf.getClassByName(part.getSerdeClassName()); - Deserializer partDeserializer = (Deserializer) serdeclass.newInstance(); - SerDeUtils.initializeSerDe(partDeserializer, hiveConf, part.getTableDesc().getProperties(), - partProps); - StructObjectInspector partRawRowObjectInspector = (StructObjectInspector) partDeserializer - .getObjectInspector(); - - deserializer = partDeserializer; - - // Check to see if this split is part of a partition of a table - String pcols = partProps.getProperty(hive_metastoreConstants.META_TABLE_PARTITION_COLUMNS); - - String[] partKeys = null; - if (pcols != null && pcols.length() > 0) { - - // Partitions exist for this table. Get the partition object inspector and - // raw row object inspector (row with out partition col) - LinkedHashMap partSpec = part.getPartSpec(); - partKeys = pcols.trim().split("/"); - String pcolTypes = partProps.getProperty(hive_metastoreConstants.META_TABLE_PARTITION_COLUMN_TYPES); - String[] partKeyTypes = pcolTypes.trim().split(":"); - - if (partKeys.length > partKeyTypes.length) { - throw new HiveException("Internal error : partKeys length, " +partKeys.length + - " greater than partKeyTypes length, " + partKeyTypes.length); - } - - List partNames = new ArrayList(partKeys.length); - List partObjectInspectors = new ArrayList(partKeys.length); - partitionValues = new LinkedHashMap(); - partitionTypes = new LinkedHashMap(); - for (int i = 0; i < partKeys.length; i++) { - String key = partKeys[i]; - partNames.add(key); - ObjectInspector objectInspector = null; - Object objectVal; - if (partSpec == null) { - // for partitionless table, initialize partValue to empty string. - // We can have partitionless table even if we have partition keys - // when there is only only partition selected and the partition key is not - // part of the projection/include list. 
- objectVal = null; - objectInspector = PrimitiveObjectInspectorFactory.javaStringObjectInspector; - partitionTypes.put(key, PrimitiveCategory.STRING); - } else { - // Create a Standard java object Inspector - objectInspector = TypeInfoUtils.getStandardJavaObjectInspectorFromTypeInfo( - TypeInfoFactory.getPrimitiveTypeInfo(partKeyTypes[i])); - objectVal = - ObjectInspectorConverters. - getConverter(PrimitiveObjectInspectorFactory. - javaStringObjectInspector, objectInspector). - convert(partSpec.get(key)); - partitionTypes.put(key, TypeInfoFactory.getPrimitiveTypeInfo(partKeyTypes[i]).getPrimitiveCategory()); - } - if (LOG.isDebugEnabled()) { - LOG.debug("Partition column: name: " + key + ", value: " + objectVal + ", type: " + partitionTypes.get(key)); - } - partitionValues.put(key, objectVal); - partObjectInspectors.add(objectInspector); - } + public void init(Configuration hiveConf, FileSplit split) throws IOException { - // Create partition OI - StructObjectInspector partObjectInspector = ObjectInspectorFactory - .getStandardStructObjectInspector(partNames, partObjectInspectors); - - // Get row OI from partition OI and raw row OI - StructObjectInspector rowObjectInspector = ObjectInspectorFactory - .getUnionStructObjectInspector(Arrays - .asList(new StructObjectInspector[] {partRawRowObjectInspector, partObjectInspector})); - rowOI = rowObjectInspector; - rawRowOI = partRawRowObjectInspector; - - // We have to do this after we've set rowOI, as getColIndexBasedOnColName uses it - partitionCols = new HashSet(); - if (pcols != null && pcols.length() > 0) { - for (int i = 0; i < partKeys.length; i++) { - partitionCols.add(getColIndexBasedOnColName(partKeys[i])); - } - } + MapWork mapWork = Utilities.getMapWork(hiveConf); + + // Row column information. + rowColumnNames = mapWork.getVectorColumnNames(); + rowColumnTypeInfos = mapWork.getVectorColumnTypeInfos(); + partitionColumnCount = mapWork.getVectorPartitionColumnCount(); + nonPartitionColumnCount = rowColumnTypeInfos.length - partitionColumnCount; - } else { + // Scratch column information. + scratchColumnTypeNames = mapWork.getVectorScratchColumnTypeNames(); - // No partitions for this table, hence row OI equals raw row OI - rowOI = partRawRowObjectInspector; - rawRowOI = partRawRowObjectInspector; + if (partitionColumnCount > 0) { + + // Need to extract partition values. + + Map pathToPartitionInfo = Utilities + .getMapWork(hiveConf).getPathToPartitionInfo(); + + PartitionDesc part = HiveFileFormatUtils + .getPartitionDescFromPathRecursively(pathToPartitionInfo, + split.getPath(), IOPrepareCache.get().getPartitionDescMap()); + + partitionValues = new Object[partitionColumnCount]; + + getPartitionValues(part); } colsToInclude = ColumnProjectionUtils.getReadColumnIDs(hiveConf); } - - /** - * Creates a Vectorized row batch and the column vectors. - * - * @return VectorizedRowBatch - * @throws HiveException - */ - public VectorizedRowBatch createVectorizedRowBatch() throws HiveException - { - List fieldRefs = rowOI.getAllStructFieldRefs(); - VectorizedRowBatch result = new VectorizedRowBatch(fieldRefs.size()); - for (int j = 0; j < fieldRefs.size(); j++) { - // If the column is included in the include list or if the column is a - // partition column then create the column vector. Also note that partition columns are not - // in the included list. 
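
On the reader side, the rewritten init(Configuration, FileSplit) keeps the same calling shape as before; it is, for instance, what the deleted VectorizedRCFileRecordReader further down in this patch did. The difference is that the row, partition, and scratch column descriptions are now taken from the vectorized metadata in MapWork instead of being re-derived from the partition's deserializer:

    VectorizedRowBatchCtx rbCtx = new VectorizedRowBatchCtx();
    rbCtx.init(conf, fileSplit);                       // row/partition/scratch info comes from MapWork
    VectorizedRowBatch batch = rbCtx.createVectorizedRowBatch();
    rbCtx.addPartitionColsToBatch(batch);              // partition values repeat for the whole split
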
- if ((colsToInclude == null) || colsToInclude.contains(j) - || ((partitionValues != null) && - partitionValues.containsKey(fieldRefs.get(j).getFieldName()))) { - ObjectInspector foi = fieldRefs.get(j).getFieldObjectInspector(); - switch (foi.getCategory()) { - case PRIMITIVE: { - PrimitiveObjectInspector poi = (PrimitiveObjectInspector) foi; - // Vectorization currently only supports the following data types: - // BOOLEAN, BYTE, SHORT, INT, LONG, FLOAT, DOUBLE, BINARY, STRING, CHAR, VARCHAR, TIMESTAMP, - // DATE and DECIMAL - switch (poi.getPrimitiveCategory()) { - case BOOLEAN: - case BYTE: - case SHORT: - case INT: - case LONG: - case TIMESTAMP: - case DATE: - case INTERVAL_YEAR_MONTH: - case INTERVAL_DAY_TIME: - result.cols[j] = new LongColumnVector(VectorizedRowBatch.DEFAULT_SIZE); - break; - case FLOAT: - case DOUBLE: - result.cols[j] = new DoubleColumnVector(VectorizedRowBatch.DEFAULT_SIZE); - break; - case BINARY: - case STRING: - case CHAR: - case VARCHAR: - result.cols[j] = new BytesColumnVector(VectorizedRowBatch.DEFAULT_SIZE); - break; - case DECIMAL: - DecimalTypeInfo tInfo = (DecimalTypeInfo) poi.getTypeInfo(); - result.cols[j] = new DecimalColumnVector(VectorizedRowBatch.DEFAULT_SIZE, - tInfo.precision(), tInfo.scale()); - break; - default: - throw new RuntimeException("Vectorizaton is not supported for datatype:" - + poi.getPrimitiveCategory()); - } - break; - } - case LIST: - case MAP: - case STRUCT: - case UNION: - throw new HiveException("Vectorizaton is not supported for datatype:" - + foi.getCategory()); - default: - throw new HiveException("Unknown ObjectInspector category!"); - } + + public void getPartitionValues(PartitionDesc partDesc) { + + LinkedHashMap partSpec = partDesc.getPartSpec(); + + for (int i = 0; i < partitionColumnCount; i++) { + Object objectValue; + if (partSpec == null) { + // For partition-less table, initialize partValue to empty string. + // We can have partition-less table even if we have partition keys + // when there is only only partition selected and the partition key is not + // part of the projection/include list. + objectValue = null; + } else { + String key = rowColumnNames[nonPartitionColumnCount + i]; + + // Create a Standard java object Inspector + ObjectInspector objectInspector = + TypeInfoUtils.getStandardJavaObjectInspectorFromTypeInfo( + rowColumnTypeInfos[nonPartitionColumnCount + i]); + objectValue = + ObjectInspectorConverters. + getConverter(PrimitiveObjectInspectorFactory. + javaStringObjectInspector, objectInspector). + convert(partSpec.get(key)); } + partitionValues[i] = objectValue; } - result.numCols = fieldRefs.size(); - this.addScratchColumnsToBatch(result); - result.reset(); - return result; } /** - * Adds the row to the batch after deserializing the row + * Creates a Vectorized row batch and the column vectors. 
* - * @param rowIndex - * Row index in the batch to which the row is added - * @param rowBlob - * Row blob (serialized version of row) - * @param batch - * Vectorized batch to which the row is added - * @param buffer a buffer to copy strings into + * @return VectorizedRowBatch * @throws HiveException - * @throws SerDeException */ - public void addRowToBatch(int rowIndex, Writable rowBlob, - VectorizedRowBatch batch, - DataOutputBuffer buffer - ) throws HiveException, SerDeException + public VectorizedRowBatch createVectorizedRowBatch() throws HiveException { - Object row = this.deserializer.deserialize(rowBlob); - VectorizedBatchUtil.addRowToBatch(row, this.rawRowOI, rowIndex, batch, buffer); - } + int totalColumnCount = rowColumnTypeInfos.length + scratchColumnTypeNames.length; - /** - * Deserialized set of rows and populates the batch - * - * @param rowBlob - * to deserialize - * @param batch - * Vectorized row batch which contains deserialized data - * @throws SerDeException - */ - public void convertRowBatchBlobToVectorizedBatch(Object rowBlob, int rowsInBlob, - VectorizedRowBatch batch) - throws SerDeException { - - if (deserializer instanceof VectorizedSerde) { - ((VectorizedSerde) deserializer).deserializeVector(rowBlob, rowsInBlob, batch); - } else { - throw new SerDeException( - "Not able to deserialize row batch. Serde does not implement VectorizedSerde"); - } - } + VectorizedRowBatch result = new VectorizedRowBatch(totalColumnCount); + for (int i = 0; i < totalColumnCount; i++) { - private int getColIndexBasedOnColName(String colName) throws HiveException - { - List fieldRefs = rowOI.getAllStructFieldRefs(); - for (int i = 0; i < fieldRefs.size(); i++) { - if (fieldRefs.get(i).getFieldName().equals(colName)) { - return i; + // If the column is NOT included in the include list, then do not allocate the column. + // It will not be used. Also note that partition columns are not in the included list. 
+ if (i < nonPartitionColumnCount && + colsToInclude != null && !colsToInclude.contains(i)) { + continue; } + + if (i < rowColumnTypeInfos.length) { + TypeInfo typeInfo = rowColumnTypeInfos[i]; + result.cols[i] = allocateColumnVector(typeInfo, result.DEFAULT_SIZE); + } else { + String typeName = scratchColumnTypeNames[i - rowColumnTypeInfos.length]; + result.cols[i] = allocateColumnVector(typeName, result.DEFAULT_SIZE); + } + } - throw new HiveException("Not able to find column name in row object inspector"); + result.reset(); + return result; } - + /** * Add the partition values to the batch * @@ -407,16 +258,14 @@ private int getColIndexBasedOnColName(String colName) throws HiveException */ public void addPartitionColsToBatch(VectorizedRowBatch batch) throws HiveException { - int colIndex; - Object value; - PrimitiveCategory pCategory; if (partitionValues != null) { - for (String key : partitionValues.keySet()) { - colIndex = getColIndexBasedOnColName(key); - value = partitionValues.get(key); - pCategory = partitionTypes.get(key); - - switch (pCategory) { + for (int i = 0; i < partitionColumnCount; i++) { + Object value = partitionValues[i]; + + int colIndex = nonPartitionColumnCount + i; + String partitionColumnName = rowColumnNames[colIndex]; + PrimitiveTypeInfo primitiveTypeInfo = (PrimitiveTypeInfo) rowColumnTypeInfos[colIndex]; + switch (primitiveTypeInfo.getPrimitiveCategory()) { case BOOLEAN: { LongColumnVector lcv = (LongColumnVector) batch.cols[colIndex]; if (value == null) { @@ -604,8 +453,8 @@ public void addPartitionColsToBatch(VectorizedRowBatch batch) throws HiveExcepti break; default: - throw new HiveException("Unable to recognize the partition type " + pCategory + - " for column " + key); + throw new HiveException("Unable to recognize the partition type " + primitiveTypeInfo.getPrimitiveCategory() + + " for column " + partitionColumnName); } } } @@ -613,64 +462,68 @@ public void addPartitionColsToBatch(VectorizedRowBatch batch) throws HiveExcepti /** * Determine whether a given column is a partition column - * @param colnum column number in + * @param colNum column number in * {@link org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch}s created by this context. * @return true if it is a partition column, false otherwise */ - public final boolean isPartitionCol(int colnum) { - return (partitionCols == null) ? false : partitionCols.contains(colnum); + public final boolean isPartitionCol(int colNum) { + return colNum >= nonPartitionColumnCount && colNum < rowColumnTypeInfos.length; } - private void addScratchColumnsToBatch(VectorizedRowBatch vrb) throws HiveException { - if (scratchColumnTypeMap != null && !scratchColumnTypeMap.isEmpty()) { - int origNumCols = vrb.numCols; - int newNumCols = vrb.cols.length+scratchColumnTypeMap.keySet().size(); - vrb.cols = Arrays.copyOf(vrb.cols, newNumCols); - for (int i = origNumCols; i < newNumCols; i++) { - String typeName = scratchColumnTypeMap.get(i); - if (typeName == null) { - throw new HiveException("No type entry found for column " + i + " in map " + scratchColumnTypeMap.toString()); - } - vrb.cols[i] = allocateColumnVector(typeName, - VectorizedRowBatch.DEFAULT_SIZE); - } - vrb.numCols = vrb.cols.length; + public static ColumnVector allocateColumnVector(String typeName, int defaultSize) throws HiveException { + typeName = typeName.toLowerCase(); + + // Allow undecorated CHAR and VARCHAR to support scratch column type names. 
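
The resulting batch layout is: non-partition (data) columns first, then partition columns, then scratch columns, which is what the simplified isPartitionCol above relies on. A small sketch of the index arithmetic, plus the string-typed allocation helper that backs the scratch columns; vrbCtx stands for an initialized VectorizedRowBatchCtx, and the type names and sizes are illustrative:

    int dataColumnCount      = vrbCtx.getNonPartitionColumnCount();
    int partitionColumnCount = vrbCtx.getPartitionColumnCount();
    int rowColumnCount       = dataColumnCount + partitionColumnCount;   // rowColumnTypeInfos.length
    // Column c is a partition column iff dataColumnCount <= c < rowColumnCount; scratch columns follow.

    // "long" is accepted as an undecorated scratch type name and treated as bigint.
    ColumnVector scratchLong =
        VectorizedRowBatchCtx.allocateColumnVector("long", VectorizedRowBatch.DEFAULT_SIZE);
    ColumnVector scratchDecimal =
        VectorizedRowBatchCtx.allocateColumnVector("decimal(20,10)", VectorizedRowBatch.DEFAULT_SIZE);
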
+ if (typeName.equals("char") || typeName.equals("varchar")) { + return new BytesColumnVector(defaultSize); + } else if (typeName.equals("long")) { + typeName = "bigint"; } - } - /** - * Get the scale and precision for the given decimal type string. The decimal type is assumed to be - * of the format decimal(precision,scale) e.g. decimal(20,10). - * @param decimalType The given decimal type string. - * @return An integer array of size 2 with first element set to precision and second set to scale. - */ - private static int[] getScalePrecisionFromDecimalType(String decimalType) { - Pattern p = Pattern.compile("\\d+"); - Matcher m = p.matcher(decimalType); - m.find(); - int precision = Integer.parseInt(m.group()); - m.find(); - int scale = Integer.parseInt(m.group()); - int [] precScale = { precision, scale }; - return precScale; + TypeInfo typeInfo = (TypeInfo) TypeInfoUtils.getTypeInfoFromTypeString(typeName); + return allocateColumnVector(typeInfo, defaultSize); } - public static ColumnVector allocateColumnVector(String type, int defaultSize) { - if (type.equalsIgnoreCase("double")) { - return new DoubleColumnVector(defaultSize); - } else if (VectorizationContext.isStringFamily(type)) { - return new BytesColumnVector(defaultSize); - } else if (VectorizationContext.decimalTypePattern.matcher(type).matches()){ - int [] precisionScale = getScalePrecisionFromDecimalType(type); - return new DecimalColumnVector(defaultSize, precisionScale[0], precisionScale[1]); - } else if (type.equalsIgnoreCase("long") || - type.equalsIgnoreCase("date") || - type.equalsIgnoreCase("timestamp") || - type.equalsIgnoreCase(serdeConstants.INTERVAL_YEAR_MONTH_TYPE_NAME) || - type.equalsIgnoreCase(serdeConstants.INTERVAL_DAY_TIME_TYPE_NAME)) { - return new LongColumnVector(defaultSize); - } else { - throw new RuntimeException("Cannot allocate vector column for " + type); + public static ColumnVector allocateColumnVector(TypeInfo typeInfo, int defaultSize) throws HiveException { + switch (typeInfo.getCategory()) { + case PRIMITIVE: { + PrimitiveTypeInfo primitiveTypeInfo = (PrimitiveTypeInfo) typeInfo; + switch (primitiveTypeInfo.getPrimitiveCategory()) { + case BOOLEAN: + case BYTE: + case SHORT: + case INT: + case LONG: + case TIMESTAMP: + case DATE: + case INTERVAL_YEAR_MONTH: + case INTERVAL_DAY_TIME: + return new LongColumnVector(defaultSize); + case FLOAT: + case DOUBLE: + return new DoubleColumnVector(defaultSize); + case BINARY: + case STRING: + case CHAR: + case VARCHAR: + return new BytesColumnVector(defaultSize); + case DECIMAL: + DecimalTypeInfo decimalTypeInfo = (DecimalTypeInfo) typeInfo; + return new DecimalColumnVector(defaultSize, + decimalTypeInfo.precision(), decimalTypeInfo.scale()); + default: + throw new RuntimeException("Vectorizaton is not supported for datatype:" + + primitiveTypeInfo.getPrimitiveCategory().name()); + } + } + case LIST: + case MAP: + case STRUCT: + case UNION: + throw new HiveException("Vectorizaton is not supported for datatype:" + + typeInfo.getCategory()); + default: + throw new HiveException("Unknown type category " + typeInfo.getCategory().name()); } } } diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinCommonOperator.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinCommonOperator.java index 87ebcf2..52e7e76 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinCommonOperator.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinCommonOperator.java @@ -665,22 
+665,12 @@ protected HashTableLoader getHashTableLoader(Configuration hconf) { * build join output results in. */ protected VectorizedRowBatch setupOverflowBatch() throws HiveException { + + int initialColumnCount = vContext.firstOutputColumnIndex(); VectorizedRowBatch overflowBatch; - Map scratchColumnTypeMap = vOutContext.getScratchColumnTypeMap(); - int maxColumn = 0; - for (int i = 0; i < outputProjection.length; i++) { - int outputColumn = outputProjection[i]; - if (maxColumn < outputColumn) { - maxColumn = outputColumn; - } - } - for (int outputColumn : scratchColumnTypeMap.keySet()) { - if (maxColumn < outputColumn) { - maxColumn = outputColumn; - } - } - overflowBatch = new VectorizedRowBatch(maxColumn + 1); + int totalNumColumns = initialColumnCount + vOutContext.getScratchColumnTypeNames().length; + overflowBatch = new VectorizedRowBatch(totalNumColumns); // First, just allocate just the projection columns we will be using. for (int i = 0; i < outputProjection.length; i++) { @@ -690,9 +680,9 @@ protected VectorizedRowBatch setupOverflowBatch() throws HiveException { } // Now, add any scratch columns needed for children operators. - for (int outputColumn : scratchColumnTypeMap.keySet()) { - String typeName = scratchColumnTypeMap.get(outputColumn); - allocateOverflowBatchColumnVector(overflowBatch, outputColumn, typeName); + int outputColumn = initialColumnCount; + for (String typeName : vOutContext.getScratchColumnTypeNames()) { + allocateOverflowBatchColumnVector(overflowBatch, outputColumn++, typeName); } overflowBatch.projectedColumns = outputProjection; diff --git ql/src/java/org/apache/hadoop/hive/ql/io/VectorizedRCFileInputFormat.java ql/src/java/org/apache/hadoop/hive/ql/io/VectorizedRCFileInputFormat.java deleted file mode 100644 index faad5f2..0000000 --- ql/src/java/org/apache/hadoop/hive/ql/io/VectorizedRCFileInputFormat.java +++ /dev/null @@ -1,80 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.hive.ql.io; - -import java.io.IOException; -import java.util.ArrayList; - -import org.apache.hadoop.fs.FileStatus; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.hive.conf.HiveConf; -import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; -import org.apache.hadoop.io.NullWritable; -import org.apache.hadoop.io.SequenceFile; -import org.apache.hadoop.mapred.FileInputFormat; -import org.apache.hadoop.mapred.FileSplit; -import org.apache.hadoop.mapred.InputSplit; -import org.apache.hadoop.mapred.JobConf; -import org.apache.hadoop.mapred.RecordReader; -import org.apache.hadoop.mapred.Reporter; - -/** - * A MapReduce/Hive Vectorized input format for RC files. 
- */ -public class VectorizedRCFileInputFormat extends FileInputFormat - implements InputFormatChecker { - - public VectorizedRCFileInputFormat() { - setMinSplitSize(SequenceFile.SYNC_INTERVAL); - } - - @Override - @SuppressWarnings("unchecked") - public RecordReader getRecordReader(InputSplit split, JobConf job, - Reporter reporter) throws IOException { - - reporter.setStatus(split.toString()); - - return new VectorizedRCFileRecordReader(job, (FileSplit) split); - } - - @Override - public boolean validateInput(FileSystem fs, HiveConf conf, - ArrayList files) throws IOException { - if (files.size() <= 0) { - return false; - } - for (int fileId = 0; fileId < files.size(); fileId++) { - RCFile.Reader reader = null; - try { - reader = new RCFile.Reader(fs, files.get(fileId) - .getPath(), conf); - reader.close(); - reader = null; - } catch (IOException e) { - return false; - } finally { - if (null != reader) { - reader.close(); - } - } - } - return true; - } -} diff --git ql/src/java/org/apache/hadoop/hive/ql/io/VectorizedRCFileRecordReader.java ql/src/java/org/apache/hadoop/hive/ql/io/VectorizedRCFileRecordReader.java deleted file mode 100644 index 4cc1c2f..0000000 --- ql/src/java/org/apache/hadoop/hive/ql/io/VectorizedRCFileRecordReader.java +++ /dev/null @@ -1,261 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.hive.ql.io; - -import java.io.IOException; -import java.util.Collections; -import java.util.Map; -import java.util.WeakHashMap; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.hive.conf.HiveConf; -import org.apache.hadoop.hive.ql.exec.vector.VectorizedBatchUtil; -import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; -import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatchCtx; -import org.apache.hadoop.hive.ql.io.RCFile.KeyBuffer; -import org.apache.hadoop.hive.ql.io.RCFile.Reader; -import org.apache.hadoop.hive.ql.metadata.HiveException; -import org.apache.hadoop.hive.serde2.columnar.BytesRefArrayWritable; -import org.apache.hadoop.io.DataOutputBuffer; -import org.apache.hadoop.io.LongWritable; -import org.apache.hadoop.io.NullWritable; -import org.apache.hadoop.io.Writable; -import org.apache.hadoop.mapred.FileSplit; -import org.apache.hadoop.mapred.RecordReader; - -/** - * RCFileRecordReader. 
- */ -public class VectorizedRCFileRecordReader implements RecordReader { - - private final Reader in; - private final long start; - private final long end; - private boolean more = true; - protected Configuration conf; - private final FileSplit split; - private final boolean useCache; - private VectorizedRowBatchCtx rbCtx; - private final LongWritable keyCache = new LongWritable(); - private final BytesRefArrayWritable colsCache = new BytesRefArrayWritable(); - private boolean addPartitionCols = true; - private final DataOutputBuffer buffer = new DataOutputBuffer(); - - private static RCFileSyncCache syncCache = new RCFileSyncCache(); - - private static final class RCFileSyncEntry { - long end; - long endSync; - } - - private static final class RCFileSyncCache { - - private final Map cache; - - public RCFileSyncCache() { - cache = Collections.synchronizedMap(new WeakHashMap()); - } - - public void put(FileSplit split, long endSync) { - Path path = split.getPath(); - long end = split.getStart() + split.getLength(); - String key = path.toString() + "+" + String.format("%d", end); - - RCFileSyncEntry entry = new RCFileSyncEntry(); - entry.end = end; - entry.endSync = endSync; - if (entry.endSync >= entry.end) { - cache.put(key, entry); - } - } - - public long get(FileSplit split) { - Path path = split.getPath(); - long start = split.getStart(); - String key = path.toString() + "+" + String.format("%d", start); - RCFileSyncEntry entry = cache.get(key); - if (entry != null) { - return entry.endSync; - } - return -1; - } - } - - public VectorizedRCFileRecordReader(Configuration conf, FileSplit split) - throws IOException { - - Path path = split.getPath(); - FileSystem fs = path.getFileSystem(conf); - this.in = new RCFile.Reader(fs, path, conf); - this.end = split.getStart() + split.getLength(); - this.conf = conf; - this.split = split; - - useCache = HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVEUSERCFILESYNCCACHE); - - if (split.getStart() > in.getPosition()) { - long oldSync = useCache ? syncCache.get(split) : -1; - if (oldSync == -1) { - in.sync(split.getStart()); // sync to start - } else { - in.seek(oldSync); - } - } - - this.start = in.getPosition(); - - more = start < end; - try { - rbCtx = new VectorizedRowBatchCtx(); - rbCtx.init(conf, split); - } catch (Exception e) { - throw new RuntimeException(e); - } - } - - public Class getKeyClass() { - return LongWritable.class; - } - - public Class getValueClass() { - return BytesRefArrayWritable.class; - } - - @Override - public NullWritable createKey() { - return NullWritable.get(); - } - - @Override - public VectorizedRowBatch createValue() { - VectorizedRowBatch result; - try { - result = rbCtx.createVectorizedRowBatch(); - } catch (HiveException e) { - throw new RuntimeException("Error creating a batch", e); - } - return result; - } - - public boolean nextBlock() throws IOException { - return in.nextBlock(); - } - - @Override - public boolean next(NullWritable key, VectorizedRowBatch value) throws IOException { - - // Reset column fields noNull values to true - VectorizedBatchUtil.setNoNullFields(value); - buffer.reset(); - value.selectedInUse = false; - for (int i = 0; i < value.numCols; i++) { - value.cols[i].isRepeating = false; - } - - int i = 0; - try { - - for (; i < VectorizedRowBatch.DEFAULT_SIZE; i++) { - more = next(keyCache); - if (more) { - // Check and update partition cols if necessary. Ideally this should be done - // in CreateValue() as the partition is constant per split. 
But since Hive uses - // CombineHiveRecordReader and as this does not call CreateValue() for - // each new RecordReader it creates, this check is required in next() - if (addPartitionCols) { - rbCtx.addPartitionColsToBatch(value); - addPartitionCols = false; - } - in.getCurrentRow(colsCache); - // Currently RCFile reader does not support reading vectorized - // data. Populating the batch by adding one row at a time. - rbCtx.addRowToBatch(i, (Writable) colsCache, value, buffer); - } else { - break; - } - } - } catch (Exception e) { - throw new RuntimeException("Error while getting next row", e); - } - value.size = i; - return more; - } - - protected boolean next(LongWritable key) throws IOException { - if (!more) { - return false; - } - - more = in.next(key); - - long lastSeenSyncPos = in.lastSeenSyncPos(); - - if (lastSeenSyncPos >= end) { - if (useCache) { - syncCache.put(split, lastSeenSyncPos); - } - more = false; - return more; - } - return more; - } - - /** - * Return the progress within the input split. - * - * @return 0.0 to 1.0 of the input byte range - */ - public float getProgress() throws IOException { - if (end == start) { - return 0.0f; - } else { - return Math.min(1.0f, (in.getPosition() - start) / (float) (end - start)); - } - } - - public long getPos() throws IOException { - return in.getPosition(); - } - - public KeyBuffer getKeyBuffer() { - return in.getCurrentKeyBufferObj(); - } - - protected void seek(long pos) throws IOException { - in.seek(pos); - } - - public void sync(long pos) throws IOException { - in.sync(pos); - } - - public void resetBuffer() { - in.resetBuffer(); - } - - public long getStart() { - return start; - } - - public void close() throws IOException { - in.close(); - } -} diff --git ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java index 4e6dd7a..c362e5c 100644 --- ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java +++ ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java @@ -322,7 +322,7 @@ public boolean validateInput(FileSystem fs, HiveConf conf, ArrayList files ) throws IOException { - if (Utilities.isVectorMode(conf)) { + if (Utilities.getUseVectorizedInputFileFormat(conf)) { return new VectorizedOrcInputFormat().validateInput(fs, conf, files); } @@ -1109,7 +1109,7 @@ private static void cancelFutures(List> futures) { public org.apache.hadoop.mapred.RecordReader getRecordReader(InputSplit inputSplit, JobConf conf, Reporter reporter) throws IOException { - boolean vectorMode = Utilities.isVectorMode(conf); + boolean vectorMode = Utilities.getUseVectorizedInputFileFormat(conf); // if HiveCombineInputFormat gives us FileSplits instead of OrcSplits, // we know it is not ACID. 
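The ORC read path above is now gated on Utilities.getUseVectorizedInputFileFormat(conf) instead of Utilities.isVectorMode(conf). The helper itself is not part of this hunk; purely as an assumption for illustration, the kind of check it implies combines global vectorized execution with the new pass-through flag added by this patch:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.conf.HiveConf;

public class VectorizedReaderGate {

  // Assumed shape only -- the real helper lives in Utilities and may consult
  // the serialized plan rather than raw configuration flags.
  static boolean useVectorizedInputFileFormat(Configuration conf) {
    boolean vectorMode =
        HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_VECTORIZATION_ENABLED);
    boolean passThrough = HiveConf.getBoolVar(conf,
        HiveConf.ConfVars.HIVE_VECTORIZATION_USE_VECTORIZED_INPUT_FILE_FORMAT);
    return vectorMode && passThrough;
  }
}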
(see a check in CombineHiveInputFormat.getSplits() that assures this) diff --git ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcStruct.java ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcStruct.java index b1a32bc..b25cf9c 100644 --- ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcStruct.java +++ ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcStruct.java @@ -617,4 +617,97 @@ static ObjectInspector createObjectInspector(int columnId, type.getKind()); } } + + static TypeInfo getTypeInfo(int columnId, + List types){ + OrcProto.Type type = types.get(columnId); + switch (type.getKind()) { + case FLOAT: + return TypeInfoFactory.floatTypeInfo; + case DOUBLE: + return TypeInfoFactory.doubleTypeInfo; + case BOOLEAN: + return TypeInfoFactory.booleanTypeInfo; + case BYTE: + return TypeInfoFactory.byteTypeInfo; + case SHORT: + return TypeInfoFactory.shortTypeInfo; + case INT: + return TypeInfoFactory.intTypeInfo; + case LONG: + return TypeInfoFactory.longTypeInfo; + case BINARY: + return TypeInfoFactory.binaryTypeInfo; + case STRING: + return TypeInfoFactory.stringTypeInfo; + case CHAR: + if (!type.hasMaximumLength()) { + throw new UnsupportedOperationException( + "Illegal use of char type without length in ORC type definition."); + } + return TypeInfoFactory.getCharTypeInfo(type.getMaximumLength()); + case VARCHAR: + if (!type.hasMaximumLength()) { + throw new UnsupportedOperationException( + "Illegal use of varchar type without length in ORC type definition."); + } + return TypeInfoFactory.getVarcharTypeInfo(type.getMaximumLength()); + case TIMESTAMP: + return TypeInfoFactory.timestampTypeInfo; + case DATE: + return TypeInfoFactory.dateTypeInfo; + case DECIMAL: + int precision = type.hasPrecision() ? type.getPrecision() : HiveDecimal.SYSTEM_DEFAULT_PRECISION; + int scale = type.hasScale()? 
type.getScale() : HiveDecimal.SYSTEM_DEFAULT_SCALE; + return TypeInfoFactory.getDecimalTypeInfo(precision, scale); + case STRUCT: + return getStructTypeInfo(columnId, types); + case UNION: + return getUnionTypeInfo(columnId, types); + case MAP: + return getMapTypeInfo(columnId, types); + case LIST: + return getListTypeInfo(columnId, types); + default: + throw new UnsupportedOperationException("Unknown type " + + type.getKind()); + } + } + + static TypeInfo getStructTypeInfo(int columnId, List types) { + OrcProto.Type type = types.get(columnId); + int fieldCount = type.getSubtypesCount(); + ArrayList fieldTypeInfos = new ArrayList(fieldCount); + for (int i = 0; i < fieldCount; ++i) { + int fieldType = type.getSubtypes(i); + TypeInfo fieldTypeInfo = getTypeInfo(fieldType, types); + fieldTypeInfos.add(fieldTypeInfo); + } + return TypeInfoFactory.getStructTypeInfo(type.getFieldNamesList(), fieldTypeInfos); + } + + static TypeInfo getUnionTypeInfo(int columnId, List types) { + OrcProto.Type type = types.get(columnId); + ArrayList childrenTypeInfos = new ArrayList(type.getSubtypesCount()); + for (int i = 0; i < type.getSubtypesCount(); ++i) { + childrenTypeInfos.add(getTypeInfo(type.getSubtypes(i), types)); + } + + return TypeInfoFactory.getUnionTypeInfo(childrenTypeInfos); + } + + static TypeInfo getMapTypeInfo(int columnId, List types) { + OrcProto.Type type = types.get(columnId); + TypeInfo keyTypeInfo = getTypeInfo(type.getSubtypes(0), types); + TypeInfo valueTypeInfo = getTypeInfo(type.getSubtypes(1), types); + + return TypeInfoFactory.getMapTypeInfo(keyTypeInfo, valueTypeInfo); + } + + static TypeInfo getListTypeInfo(int columnId, List types) { + OrcProto.Type type = types.get(columnId); + TypeInfo elementTypeInfo = getTypeInfo(type.getSubtypes(0), types); + return TypeInfoFactory.getListTypeInfo(elementTypeInfo); + } + } diff --git ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReader.java ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReader.java index dba9071..e65c47b 100644 --- ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReader.java +++ ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReader.java @@ -20,6 +20,7 @@ import java.io.IOException; import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; /** * A row-by-row iterator for ORC files. @@ -41,6 +42,14 @@ Object next(Object previous) throws IOException; /** + * Set the vector column information expected by the reader. + * @param rowColumnNames the expected column names. + * @param rowColumnTypeInfos the type information for the columns. + * @throws IOException + */ + void setVectorColumnInfo(String[] rowColumnNames, TypeInfo[] rowColumnTypeInfos) throws IOException; + + /** * Read the next row batch. The size of the batch to read cannot be controlled * by the callers. Caller need to look at VectorizedRowBatch.size of the retunred * object to know the batch size read. 
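getTypeInfo() above maps the ORC footer's type tree onto the canonical TypeInfo instances handed out by TypeInfoFactory, which is what makes the plain equals() comparisons between file and table schemas later in this patch behave as expected. A small self-contained illustration (not Hive code, just API usage):

import java.util.List;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;

public class TypeInfoEquality {
  public static void main(String[] args) {
    // Parse a colon-separated type string the way the planner does.
    List<TypeInfo> parsed =
        TypeInfoUtils.getTypeInfosFromTypeString("int:string:decimal(10,2)");

    // Each parsed entry equals the corresponding factory value.
    System.out.println(parsed.get(0).equals(TypeInfoFactory.intTypeInfo));               // true
    System.out.println(parsed.get(1).equals(TypeInfoFactory.stringTypeInfo));            // true
    System.out.println(parsed.get(2).equals(TypeInfoFactory.getDecimalTypeInfo(10, 2))); // true
  }
}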
diff --git ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReaderImpl.java ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReaderImpl.java index 0d765b1..cd7f740 100644 --- ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReaderImpl.java +++ ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReaderImpl.java @@ -40,6 +40,7 @@ import org.apache.hadoop.hive.common.DiskRangeList.DiskRangeListCreateHelper; import org.apache.hadoop.hive.common.type.HiveDecimal; import org.apache.hadoop.hive.ql.exec.vector.ColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedBatchUtil; import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; import org.apache.hadoop.hive.ql.io.filters.BloomFilterIO; import org.apache.hadoop.hive.ql.io.orc.RecordReaderUtils.ByteBufferAllocatorPool; @@ -47,9 +48,13 @@ import org.apache.hadoop.hive.ql.io.sarg.PredicateLeaf; import org.apache.hadoop.hive.ql.io.sarg.SearchArgument; import org.apache.hadoop.hive.ql.io.sarg.SearchArgument.TruthValue; +import org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc; +import org.apache.hadoop.hive.ql.plan.PartitionConversion; import org.apache.hadoop.hive.serde2.io.DateWritable; import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable; import org.apache.hadoop.hive.serde2.io.TimestampWritable; +import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; import org.apache.hadoop.hive.shims.HadoopShims.ZeroCopyReaderShim; import org.apache.hadoop.io.Text; @@ -89,6 +94,12 @@ private final ByteBufferAllocatorPool pool = new ByteBufferAllocatorPool(); private final ZeroCopyReaderShim zcr; + private String[] vectorizedRowBatchColumnNames; + private TypeInfo[] vectorizedRowBatchColumnTypeInfos; + private TypeInfo[] actualVectorColumnTypeInfos; + private boolean[] conversionFlags; + private boolean vectorNeedsDefaultNulls; + public final static class Index { OrcProto.RowIndex[] rowGroupIndex; OrcProto.BloomFilterIndex[] bloomFilterIndex; @@ -1071,6 +1082,38 @@ public Object next(Object previous) throws IOException { } @Override + public void setVectorColumnInfo(String[] vectorizedRowBatchColumnNames, + TypeInfo[] vectorizedRowBatchColumnTypeInfos) throws IOException { + + this.vectorizedRowBatchColumnNames = vectorizedRowBatchColumnNames; + this.vectorizedRowBatchColumnTypeInfos = vectorizedRowBatchColumnTypeInfos; + + // Convert ORC metadata to TypeInfo. + StructTypeInfo structTypeInfo = (StructTypeInfo) OrcStruct.getTypeInfo(0, types); + actualVectorColumnTypeInfos = + structTypeInfo.getAllStructFieldTypeInfos().toArray(new TypeInfo[0]); + vectorNeedsDefaultNulls = + (vectorizedRowBatchColumnTypeInfos.length > actualVectorColumnTypeInfos.length); + + // Verify data type conversion is ok. + + // NOTE: The Vectorizer class will validate the conversions from the partition's meta-data + // point of view and not vectorize unsupported conversions. + // However, we must check because the Schema-On-Read ORC metadata could be anything. 
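setVectorColumnInfo() above unwraps the file's root struct into per-column TypeInfos before comparing them with the columns the query expects. An illustrative-only sketch of that unwrapping against a hand-built struct:

import java.util.Arrays;
import java.util.List;
import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;

public class StructFieldTypes {
  public static void main(String[] args) {
    // Stand-in for OrcStruct.getTypeInfo(0, types): the root of a file schema
    // is a struct whose fields are the file's columns.
    StructTypeInfo fileSchema = (StructTypeInfo) TypeInfoFactory.getStructTypeInfo(
        Arrays.asList("id", "name"),
        Arrays.asList((TypeInfo) TypeInfoFactory.longTypeInfo, TypeInfoFactory.stringTypeInfo));

    // The field TypeInfos become the "actual" per-column types the reader fills.
    List<TypeInfo> actual = fileSchema.getAllStructFieldTypeInfos();
    System.out.println(Arrays.toString(actual.toArray(new TypeInfo[0])));  // [bigint, string]
  }
}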
+ + PartitionConversion partitionConversion = new PartitionConversion(); + partitionConversion.validateConversion(actualVectorColumnTypeInfos, + this.vectorizedRowBatchColumnTypeInfos); + if (!partitionConversion.getValidConversion()) { + throw new IOException( + String.format("ORC data type conversion from %s to %s is not supported", + partitionConversion.getInvalidFromTypeInfo().toString(), + partitionConversion.getInvalidToTypeInfo()).toString()); + } + conversionFlags = partitionConversion.getResultConversionFlags(); + } + + @Override public VectorizedRowBatch nextBatch(VectorizedRowBatch previous) throws IOException { try { final VectorizedRowBatch result; @@ -1093,6 +1136,22 @@ public VectorizedRowBatch nextBatch(VectorizedRowBatch previous) throws IOExcept } result.size = (int) batchSize; + + if (vectorNeedsDefaultNulls) { + + // We don't have a contract about whether the same batch is passed in each time, + // so to be safe, set the repeating default each time. + + for (int i = actualVectorColumnTypeInfos.length; + i < vectorizedRowBatchColumnTypeInfos.length; + i++) { + ColumnVector colVector = result.cols[i]; + colVector.isNull[0] = true; + colVector.noNulls = false; + colVector.isRepeating = true; + } + } + advanceToNextRow(reader, rowInStripe + rowBaseInStripe, true); return result; } catch (IOException e) { diff --git ql/src/java/org/apache/hadoop/hive/ql/io/orc/VectorizedOrcAcidRowReader.java ql/src/java/org/apache/hadoop/hive/ql/io/orc/VectorizedOrcAcidRowReader.java index a8e5c2e..bdd5f79 100644 --- ql/src/java/org/apache/hadoop/hive/ql/io/orc/VectorizedOrcAcidRowReader.java +++ ql/src/java/org/apache/hadoop/hive/ql/io/orc/VectorizedOrcAcidRowReader.java @@ -58,19 +58,7 @@ this.rowBatchCtx = new VectorizedRowBatchCtx(); this.value = inner.createValue(); this.objectInspector = inner.getObjectInspector(); - try { - rowBatchCtx.init(conf, split); - } catch (ClassNotFoundException e) { - throw new IOException("Failed to initialize context", e); - } catch (SerDeException e) { - throw new IOException("Failed to initialize context", e); - } catch (InstantiationException e) { - throw new IOException("Failed to initialize context", e); - } catch (IllegalAccessException e) { - throw new IOException("Failed to initialize context", e); - } catch (HiveException e) { - throw new IOException("Failed to initialize context", e); - } + rowBatchCtx.init(conf, split); } @Override diff --git ql/src/java/org/apache/hadoop/hive/ql/io/orc/VectorizedOrcInputFormat.java ql/src/java/org/apache/hadoop/hive/ql/io/orc/VectorizedOrcInputFormat.java index 3992d8c..d714521 100644 --- ql/src/java/org/apache/hadoop/hive/ql/io/orc/VectorizedOrcInputFormat.java +++ ql/src/java/org/apache/hadoop/hive/ql/io/orc/VectorizedOrcInputFormat.java @@ -20,6 +20,7 @@ import java.io.IOException; import java.util.ArrayList; +import java.util.Arrays; import java.util.List; import org.apache.hadoop.conf.Configuration; @@ -35,6 +36,7 @@ import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatchCtx; import org.apache.hadoop.hive.ql.io.InputFormatChecker; import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; import org.apache.hadoop.io.NullWritable; import org.apache.hadoop.mapred.FileInputFormat; import org.apache.hadoop.mapred.FileSplit; @@ -75,6 +77,16 @@ } catch (Exception e) { throw new RuntimeException(e); } + + // Tell the vectorized ORC reader which column we are expecting to read. 
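nextBatch() above marks every trailing column that the file does not contain as a repeating NULL. A minimal standalone demonstration of that ColumnVector convention -- entry 0 stands in for every row once isRepeating is set:

import org.apache.hadoop.hive.ql.exec.vector.ColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;

public class RepeatingNullColumn {
  public static void main(String[] args) {
    VectorizedRowBatch batch = new VectorizedRowBatch(2);
    batch.cols[0] = new LongColumnVector();
    batch.cols[1] = new LongColumnVector();   // pretend this column is absent from the file
    batch.size = VectorizedRowBatch.DEFAULT_SIZE;

    ColumnVector missing = batch.cols[1];
    missing.isNull[0] = true;     // row 0 carries the value for the whole batch
    missing.noNulls = false;      // tell consumers that nulls are present
    missing.isRepeating = true;   // ...and that row 0 repeats for every row
  }
}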
+ int nonPartitionColumnCount = rbCtx.getNonPartitionColumnCount(); + String[] rowColumnNames = rbCtx.getRowColumnNames(); + TypeInfo[] rowColumnTypeInfos = rbCtx.getRowColumnTypeInfos(); + if (nonPartitionColumnCount < rowColumnNames.length){ + rowColumnNames = Arrays.copyOf(rowColumnNames, nonPartitionColumnCount); + rowColumnTypeInfos = Arrays.copyOf(rowColumnTypeInfos, nonPartitionColumnCount); + } + reader.setVectorColumnInfo(rowColumnNames, rowColumnTypeInfos); } @Override diff --git ql/src/java/org/apache/hadoop/hive/ql/io/parquet/MapredParquetInputFormat.java ql/src/java/org/apache/hadoop/hive/ql/io/parquet/MapredParquetInputFormat.java index d82e93c..d212752 100644 --- ql/src/java/org/apache/hadoop/hive/ql/io/parquet/MapredParquetInputFormat.java +++ ql/src/java/org/apache/hadoop/hive/ql/io/parquet/MapredParquetInputFormat.java @@ -59,7 +59,7 @@ protected MapredParquetInputFormat(final ParquetInputFormat input final org.apache.hadoop.mapred.Reporter reporter ) throws IOException { try { - if (Utilities.isVectorMode(job)) { + if (Utilities.getUseVectorizedInputFileFormat(job)) { if (LOG.isDebugEnabled()) { LOG.debug("Using vectorized record reader"); } diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java index 82c3e50..a5306ae 100644 --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java @@ -26,11 +26,13 @@ import java.util.LinkedHashMap; import java.util.List; import java.util.Map; +import java.util.Map.Entry; import java.util.Properties; import java.util.Set; import java.util.Stack; import java.util.regex.Pattern; +import org.apache.commons.lang3.tuple.ImmutablePair; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.hive.conf.HiveConf; @@ -86,6 +88,7 @@ import org.apache.hadoop.hive.ql.plan.MapJoinDesc; import org.apache.hadoop.hive.ql.plan.MapWork; import org.apache.hadoop.hive.ql.plan.OperatorDesc; +import org.apache.hadoop.hive.ql.plan.PartitionConversion; import org.apache.hadoop.hive.ql.plan.PartitionDesc; import org.apache.hadoop.hive.ql.plan.ReduceWork; import org.apache.hadoop.hive.ql.plan.SMBJoinDesc; @@ -98,6 +101,8 @@ import org.apache.hadoop.hive.ql.plan.VectorMapJoinDesc.HashTableImplementationType; import org.apache.hadoop.hive.ql.plan.VectorMapJoinDesc.HashTableKeyType; import org.apache.hadoop.hive.ql.plan.VectorMapJoinDesc.HashTableKind; +import org.apache.hadoop.hive.ql.plan.VectorPartitionDesc; +import org.apache.hadoop.hive.ql.plan.VectorPartitionDesc.VectorDeserializeType; import org.apache.hadoop.hive.ql.plan.api.OperatorType; import org.apache.hadoop.hive.ql.udf.UDFAcos; import org.apache.hadoop.hive.ql.udf.UDFAsin; @@ -138,11 +143,18 @@ import org.apache.hadoop.hive.ql.udf.UDFYear; import org.apache.hadoop.hive.ql.udf.generic.*; import org.apache.hadoop.hive.serde.serdeConstants; +import org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe; +import org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.StructField; import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category; +import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory; +import 
org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils; +import org.apache.hadoop.mapred.SequenceFileInputFormat; +import org.apache.hadoop.mapred.TextInputFormat; public class Vectorizer implements PhysicalPlanResolver { @@ -304,17 +316,53 @@ public Vectorizer() { supportedAggregationUdfs.add("stddev_samp"); } + private class VectorTaskColumnInfo { + List columnNames; + List typeInfos; + int partitionColumnCount; + boolean useVectorizedInputFileFormat; + + String[] scratchTypeNameArray; + + VectorTaskColumnInfo() { + partitionColumnCount = 0; + } + + public void setColumnNames(List columnNames) { + this.columnNames = columnNames; + } + public void setTypeInfos(List typeInfos) { + this.typeInfos = typeInfos; + } + public void setPartitionColumnCount(int partitionColumnCount) { + this.partitionColumnCount = partitionColumnCount; + } + public void setScratchTypeNameArray(String[] scratchTypeNameArray) { + this.scratchTypeNameArray = scratchTypeNameArray; + } + public void setUseVectorizedInputFileFormat(boolean useVectorizedInputFileFormat) { + this.useVectorizedInputFileFormat = useVectorizedInputFileFormat; + } + + public void transferToBaseWork(BaseWork baseWork) { + String[] columnNameArray = columnNames.toArray(new String[0]); + baseWork.setVectorColumnNames(columnNameArray); + TypeInfo[] typeInfoArray = typeInfos.toArray(new TypeInfo[0]); + baseWork.setVectorColumnTypeInfos(typeInfoArray); + baseWork.setVectorPartitionColumnCount(partitionColumnCount); + + baseWork.setUseVectorizedInputFileFormat(useVectorizedInputFileFormat); + + baseWork.setVectorScratchColumnTypeNames(scratchTypeNameArray); + } + } + class VectorizationDispatcher implements Dispatcher { private final PhysicalContext physicalContext; - private List reduceColumnNames; - private List reduceTypeInfos; - public VectorizationDispatcher(PhysicalContext physicalContext) { this.physicalContext = physicalContext; - reduceColumnNames = null; - reduceTypeInfos = null; } @Override @@ -352,9 +400,10 @@ public Object dispatch(Node nd, Stack stack, Object... nodeOutputs) } private void convertMapWork(MapWork mapWork, boolean isTez) throws SemanticException { - boolean ret = validateMapWork(mapWork, isTez); + VectorTaskColumnInfo vectorTaskColumnInfo = new VectorTaskColumnInfo(); + boolean ret = validateMapWork(mapWork, vectorTaskColumnInfo, isTez); if (ret) { - vectorizeMapWork(mapWork, isTez); + vectorizeMapWork(mapWork, vectorTaskColumnInfo, isTez); } } @@ -365,40 +414,418 @@ private void addMapWorkRules(Map opRules, NodeProcessor np) + ReduceSinkOperator.getOperatorName()), np); } - private boolean validateMapWork(MapWork mapWork, boolean isTez) throws SemanticException { - LOG.info("Validating MapWork..."); + private ImmutablePair verifyOnlyOneTableScanOperator(MapWork mapWork) { // Eliminate MR plans with more than one TableScanOperator. + LinkedHashMap> aliasToWork = mapWork.getAliasToWork(); if ((aliasToWork == null) || (aliasToWork.size() == 0)) { - return false; + return null; } int tableScanCount = 0; - for (Operator op : aliasToWork.values()) { + String alias = ""; + TableScanOperator tableScanOperator = null; + for (Entry> entry : aliasToWork.entrySet()) { + Operator op = entry.getValue(); if (op == null) { LOG.warn("Map work has invalid aliases to work with. 
Fail validation!"); - return false; + return null; } if (op instanceof TableScanOperator) { tableScanCount++; + alias = entry.getKey(); + tableScanOperator = (TableScanOperator) op; } } if (tableScanCount > 1) { - LOG.warn("Map work has more than 1 TableScanOperator aliases to work with. Fail validation!"); - return false; + LOG.warn("Map work has more than 1 TableScanOperator. Fail validation!"); + return null; } + return new ImmutablePair(alias, tableScanOperator); + } + + private void getTableScanOperatorSchemaInfo(TableScanOperator tableScanOperator, + List logicalColumnNameList, List logicalTypeInfoList) { + + TableScanDesc tableScanDesc = tableScanOperator.getConf(); + + // Add all non-virtual columns to make a vectorization context for + // the TableScan operator. + RowSchema rowSchema = tableScanOperator.getSchema(); + for (ColumnInfo c : rowSchema.getSignature()) { + // Validation will later exclude vectorization of virtual columns usage (HIVE-5560). + if (!isVirtualColumn(c)) { + String columnName = c.getInternalName(); + String typeName = c.getTypeName(); + TypeInfo typeInfo = TypeInfoUtils.getTypeInfoFromTypeString(typeName); + + logicalColumnNameList.add(columnName); + logicalTypeInfoList.add(typeInfo); + } + } + } + + private String getColumnsString(List columnNames) { + StringBuilder sb = new StringBuilder(); + for (int i = 0; i < columnNames.size(); i++) { + if (i > 0) { + sb.append(","); + } + sb.append(columnNames.get(i)); + } + return sb.toString(); + } + + private String getTypesString(List typeInfos) { + StringBuilder sb = new StringBuilder(); + for (int i = 0; i < typeInfos.size(); i++) { + if (i > 0) { + sb.append(":"); + } + sb.append(typeInfos.get(i).getTypeName()); + } + return sb.toString(); + } + + private String getColumns(List columnNames, int start, int length, + Character separator) { + StringBuilder sb = new StringBuilder(); + for (int i = start; i < start + length; i++) { + if (i > start) { + sb.append(separator); + } + sb.append(columnNames.get(i)); + } + return sb.toString(); + } + + private String getTypes(List typeInfos, int start, int length, + Character separator) { + StringBuilder sb = new StringBuilder(); + for (int i = start; i < start + length; i++) { + if (i > start) { + sb.append(separator); + } + sb.append(typeInfos.get(i).getTypeName()); + } + return sb.toString(); + } + + private boolean verifyInputFormat(PartitionDesc pd, boolean useVectorizedInputFileFormat, + boolean useVectorDeserialize, boolean useRowDeserialize) { + + // Look for Pass-Thru case where InputFileFormat has VectorizedInputFormatInterface + // and reads VectorizedRowBatch as a "row". 
+ + if (useVectorizedInputFileFormat) { - // Validate the input format - for (String path : mapWork.getPathToPartitionInfo().keySet()) { - PartitionDesc pd = mapWork.getPathToPartitionInfo().get(path); List> interfaceList = Arrays.asList(pd.getInputFileFormatClass().getInterfaces()); - if (!interfaceList.contains(VectorizedInputFormatInterface.class)) { - LOG.info("Input format: " + pd.getInputFileFormatClassName() - + ", doesn't provide vectorized input"); + if (interfaceList.contains(VectorizedInputFormatInterface.class)) { + + pd.setVectorPartitionDesc(VectorPartitionDesc.VectorizedInputFileFormat()); + + return true; + } + } + + String inputFileFormatClassName = pd.getInputFileFormatClassName(); + String deserializerClassName = pd.getDeserializerClassName(); + + // Look for InputFileFormat / Serde combinations we can deserialize more efficiently + // using VectorDeserializeRow and a deserialize class with the DeserializeRead interface. + // + // Do the "vectorized" row-by-row deserialization into a VectorizedRowBatch in the + // VectorMapOperator. + + if (useVectorDeserialize) { + + // Currently, we support LazySimple deserialization: + // + // org.apache.hadoop.mapred.TextInputFormat + // org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + // + // AND + // + // org.apache.hadoop.mapred.SequenceFileInputFormat + // org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe + + if (inputFileFormatClassName.equals(TextInputFormat.class.getName()) && + deserializerClassName.equals(LazySimpleSerDe.class.getName())) { + + pd.setVectorPartitionDesc( + VectorPartitionDesc.VectorDeserialize(VectorDeserializeType.LAZY_SIMPLE)); + + return true; + } else if (inputFileFormatClassName.equals(SequenceFileInputFormat.class.getName()) && + deserializerClassName.equals(LazyBinarySerDe.class.getName())) { + + pd.setVectorPartitionDesc( + VectorPartitionDesc.VectorDeserialize(VectorDeserializeType.LAZY_BINARY)); + + return true; + } + } + + // Otherwise, if enabled, deserialize rows using regular Serde and add the object + // inspectable Object[] row to a VectorizedRowBatch in the VectorMapOperator. + + if (useRowDeserialize) { + + pd.setVectorPartitionDesc(VectorPartitionDesc.RowDeserialize()); + + return true; + + } + + // UNDONE: LOG.info ??? + return false; + } + + private boolean validateInputFormatAndSchemaEvolution(MapWork mapWork, String alias, + TableScanOperator tableScanOperator, VectorTaskColumnInfo vectorTaskColumnInfo) { + + boolean useVectorizedInputFileFormat = + HiveConf.getBoolVar(hiveConf, + HiveConf.ConfVars.HIVE_VECTORIZATION_USE_VECTORIZED_INPUT_FILE_FORMAT); + boolean useVectorDeserialize = + HiveConf.getBoolVar(hiveConf, + HiveConf.ConfVars.HIVE_VECTORIZATION_USE_VECTOR_DESERIALIZE); + boolean useRowDeserialize = + HiveConf.getBoolVar(hiveConf, + HiveConf.ConfVars.HIVE_VECTORIZATION_USE_ROW_DESERIALIZE); + + final List logicalColumnNameList = new ArrayList(); + final List logicalTypeInfoList = new ArrayList(); + + getTableScanOperatorSchemaInfo(tableScanOperator, + logicalColumnNameList, logicalTypeInfoList); + final int logicalCount = logicalColumnNameList.size(); + + // Validate input format and schema evolution capability. + + // For the table, enter a null value in the multi-key map indicating no conversion necessary + // if the schema matches the table. 
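The conversion cache introduced below keys on an ImmutablePair of the partition's column-name string and its type list, so each distinct schema is validated only once. A small aside on why that works (simplified here to a pair of strings):

import java.util.HashMap;
import org.apache.commons.lang3.tuple.ImmutablePair;

public class PairKeyedCache {
  public static void main(String[] args) {
    // ImmutablePair implements equals()/hashCode() over both elements, so an
    // equal pair built later for another partition hits the same entry.
    HashMap<ImmutablePair<String, String>, boolean[]> cache = new HashMap<>();
    cache.put(new ImmutablePair<>("a,b,c", "int:string:double"), null);

    System.out.println(cache.containsKey(
        new ImmutablePair<>("a,b,c", "int:string:double")));   // true
  }
}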
+ + HashMap conversionMap = new HashMap(); + + boolean isFirst = true; + int nonPartitionColumnCount = 0; + int partitionColumnCount = 0; + + List nonPartColumnList = null; + String nonPartColumnsString = ""; + List nonPartTypeInfoList = null; + String partColumnsString = ""; + String partTypesString = ""; + + /* + for (String path : mapWork.getPathToPartitionInfo().keySet()) { + PartitionDesc partDesc = mapWork.getPathToPartitionInfo().get(path); + LOG.info("PathToPartitionInfo path: " + path); + LOG.info("PathToPartitionInfo path: " + path + " deserializer " + partDesc.getDeserializerClassName()); + LOG.info("PathToPartitionInfo path: " + path + " input file format " + partDesc.getInputFileFormatClassName()); + LOG.info("PathToPartitionInfo path: " + path + " output file format " + partDesc.getOutputFileFormatClassName()); + LOG.info("PathToPartitionInfo path: " + path + " serde " + partDesc.getSerdeClassName()); + LOG.info("PathToPartitionInfo path: " + path + " base file name " + partDesc.getBaseFileName()); + } + */ + + // Validate the input format + PartitionConversion partitionConversion = new PartitionConversion(); + LinkedHashMap> pathToAliases = mapWork.getPathToAliases(); + LinkedHashMap pathToPartitionInfo = mapWork.getPathToPartitionInfo(); + for (Entry> entry: pathToAliases.entrySet()) { + String path = entry.getKey(); + List aliases = entry.getValue(); + boolean isPresent = (aliases != null && aliases.indexOf(alias) != -1); + if (!isPresent) { + LOG.info("Alias " + alias + " not present in aliases " + aliases); + return false; + } + PartitionDesc partDesc = pathToPartitionInfo.get(path); + if (partDesc.getVectorPartitionDesc() != null) { + // We seen this already. + continue; + } + if (!verifyInputFormat(partDesc, useVectorizedInputFileFormat, useVectorDeserialize, + useRowDeserialize)) { return false; } + VectorPartitionDesc vectorPartDesc = partDesc.getVectorPartitionDesc(); + LOG.info("Vectorizer path: " + path + ", read type " + vectorPartDesc.getVectorMapOperatorReadType().name() + + ", vector deserialize type " + vectorPartDesc.getVectorDeserializeType().name() + ", aliases " + aliases); + + Properties partProps = partDesc.getProperties(); + + String nextNonPartColumnsString = + partProps.getProperty(hive_metastoreConstants.META_TABLE_COLUMNS); + String[] nextNonPartColumns = nextNonPartColumnsString.split(","); + + String nextNonPartTypesString = + partProps.getProperty(hive_metastoreConstants.META_TABLE_COLUMN_TYPES); + + // We convert to an array of TypeInfo using a library routine since it parses the information + // and can handle use of different separators, etc. We cannot use the raw type string + // for comparison in the map because of the different separators used. + List nextNonPartTypeInfoList = + TypeInfoUtils.getTypeInfosFromTypeString(nextNonPartTypesString); + + String nextPartColumnsString = + partProps.getProperty(hive_metastoreConstants.META_TABLE_PARTITION_COLUMNS); + String nextPartTypesString = + partProps.getProperty(hive_metastoreConstants.META_TABLE_PARTITION_COLUMN_TYPES); + + if (isFirst) { + + // We establish with the first one whether the table is partitioned or not. 
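The partition-count bookkeeping established below splits the '/'-separated partition-column property and treats the tail of the logical schema as partition columns. A tiny standalone illustration with made-up values:

public class PartitionColumnCount {
  public static void main(String[] args) {
    int logicalCount = 5;                    // e.g. a, b, c, ds, hr
    String partColumnsProperty = "ds/hr";    // '/'-separated partition column names

    int partitionColumnCount =
        (partColumnsProperty == null) ? 0 : partColumnsProperty.split("/").length;
    int nonPartitionColumnCount = logicalCount - partitionColumnCount;

    System.out.println(partitionColumnCount);     // 2
    System.out.println(nonPartitionColumnCount);  // 3
  }
}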
+ + if (nextPartColumnsString == null) { + partitionColumnCount = 0; + nonPartitionColumnCount = logicalCount; + } else { + partitionColumnCount = nextPartColumnsString.split("/").length; + nonPartitionColumnCount = logicalCount - partitionColumnCount; + } + + nonPartColumnList = logicalColumnNameList.subList(0, nonPartitionColumnCount); + nonPartColumnsString = getColumns(logicalColumnNameList, 0, nonPartitionColumnCount, ','); + nonPartTypeInfoList = logicalTypeInfoList.subList(0, nonPartitionColumnCount); + + if (partitionColumnCount > 0) { + partColumnsString = + getColumns(logicalColumnNameList, nonPartitionColumnCount, partitionColumnCount, '/'); + partTypesString = + getTypes(logicalTypeInfoList, nonPartitionColumnCount, partitionColumnCount, ':'); + + if (!partColumnsString.equalsIgnoreCase(nextPartColumnsString)) { + LOG.info( + String.format("Could not vectorize partition %s. Its partition column names %s do not match the table's partition column names %s", + path, nextPartColumnsString, partColumnsString)); + return false; + } + if (!partTypesString.equalsIgnoreCase(nextPartTypesString)) { + LOG.info( + String.format("Could not vectorize partition %s. Its partition column types %s do not match the table's partition column types %s", + path, nextPartTypesString, partTypesString)); + return false; + } + } + + // Add the table (non-partitioned) columns and types into the map as not needing + // conversion (i.e. null). + conversionMap.put( + new ImmutablePair(nonPartColumnsString, nonPartTypeInfoList), null); + + isFirst = false; + } else { + if (partitionColumnCount > 0) { + if (!partColumnsString.equalsIgnoreCase(nextPartColumnsString)) { + LOG.info( + String.format("Could not vectorize partition %s. Its partition column names %s do not match the other partition column names %s", + path, nextPartColumnsString, partColumnsString)); + return false; + } + if (!partTypesString.equalsIgnoreCase(nextPartTypesString)) { + LOG.info( + String.format("Could not vectorize partition %s. Its partition column types %s do not match the other partition column types %s", + path, nextPartTypesString, partTypesString)); + return false; + } + } + } + + ImmutablePair columnNamesAndTypesCombination = + new ImmutablePair(nextNonPartColumnsString, nextNonPartTypeInfoList); + + boolean[] conversionFlags; + if (conversionMap.containsKey(columnNamesAndTypesCombination)) { + + conversionFlags = conversionMap.get(columnNamesAndTypesCombination); + + } else { + + List nextNonPartColumnList = Arrays.asList(nextNonPartColumns); + + // Validate the column names that are present are the same. Missing columns will be + // implicitly defaulted to null. + + if (nextNonPartColumnList.size() > nonPartColumnList.size()) { + LOG.info( + String.format("Could not vectorize partition %s. The partition column names %d is greater than the number of table columns %d", + path, nextNonPartColumnList.size(), nonPartColumnList.size())); + return false; + } + for (int i = 0; i < nextNonPartColumnList.size(); i++) { + String nextColumnName = nextNonPartColumnList.get(i); + String tableColumnName = nonPartColumnList.get(i); + if (!nextColumnName.equals(tableColumnName)) { + LOG.info( + String.format("Could not vectorize partition %s. The partition column name %s is does not match table column name %s", + path, nextColumnName, tableColumnName)); + return false; + } + } + + // The table column types might have been changed with ALTER. There are restrictions + // here for vectorization. 
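The name check above only accepts partitions whose column list is a prefix of the table's; missing trailing columns are later defaulted to NULL, while extra or renamed partition columns fail validation. A standalone sketch of that check:

import java.util.Arrays;
import java.util.List;

public class ColumnPrefixCheck {

  static boolean isPrefixOf(List<String> partitionCols, List<String> tableCols) {
    if (partitionCols.size() > tableCols.size()) {
      return false;                       // partition may not have extra columns
    }
    for (int i = 0; i < partitionCols.size(); i++) {
      if (!partitionCols.get(i).equals(tableCols.get(i))) {
        return false;                     // names must match position by position
      }
    }
    return true;
  }

  public static void main(String[] args) {
    List<String> table = Arrays.asList("a", "b", "c", "d");   // e.g. after ALTER TABLE ADD COLUMNS
    System.out.println(isPrefixOf(Arrays.asList("a", "b", "c"), table));  // true
    System.out.println(isPrefixOf(Arrays.asList("a", "x"), table));       // false
  }
}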
+ + // Some readers / deserializers take responsibility for conversion themselves. + + // If we need to check for conversion, the conversion object may come back null + // indicating from a vectorization point of view the conversion is implicit. That is, + // all implicit integer upgrades. + + if (vectorPartDesc.getNeedsDataTypeConversionCheck() && + !nextNonPartTypeInfoList.equals(nonPartTypeInfoList)) { + + // The results will be in 2 members: validConversion and conversionFlags + partitionConversion.validateConversion(nextNonPartTypeInfoList, nonPartTypeInfoList); + if (!partitionConversion.getValidConversion()) { + return false; + } + conversionFlags = partitionConversion.getResultConversionFlags(); + } else { + conversionFlags = null; + } + + // We enter this in our map so we don't have to check again for subsequent partitions. + + conversionMap.put(columnNamesAndTypesCombination, conversionFlags); + } + + vectorPartDesc.setConversionFlags(conversionFlags); + + vectorPartDesc.setTypeInfos(nextNonPartTypeInfoList); + } + + vectorTaskColumnInfo.setColumnNames(logicalColumnNameList); + List typeInfoList = new ArrayList(); + vectorTaskColumnInfo.setTypeInfos(logicalTypeInfoList); + vectorTaskColumnInfo.setPartitionColumnCount(partitionColumnCount); + vectorTaskColumnInfo.setUseVectorizedInputFileFormat(useVectorizedInputFileFormat); + + return true; + } + + private boolean validateMapWork(MapWork mapWork, VectorTaskColumnInfo vectorTaskColumnInfo, boolean isTez) + throws SemanticException { + + LOG.info("Validating MapWork..."); + + ImmutablePair pair = verifyOnlyOneTableScanOperator(mapWork); + if (pair == null) { + return false; } + String alias = pair.left; + TableScanOperator tableScanOperator = pair.right; + + // This call fills in the column names, types, and partition column count in + // vectorTaskColumnInfo. 
+ if (!validateInputFormatAndSchemaEvolution(mapWork, alias, tableScanOperator, vectorTaskColumnInfo)) { + return false; + } + Map opRules = new LinkedHashMap(); MapWorkValidationNodeProcessor vnp = new MapWorkValidationNodeProcessor(mapWork, isTez); addMapWorkRules(opRules, vnp); @@ -420,11 +847,14 @@ private boolean validateMapWork(MapWork mapWork, boolean isTez) throws SemanticE return true; } - private void vectorizeMapWork(MapWork mapWork, boolean isTez) throws SemanticException { + private void vectorizeMapWork(MapWork mapWork, VectorTaskColumnInfo vectorTaskColumnInfo, + boolean isTez) throws SemanticException { + LOG.info("Vectorizing MapWork..."); mapWork.setVectorMode(true); Map opRules = new LinkedHashMap(); - MapWorkVectorizationNodeProcessor vnp = new MapWorkVectorizationNodeProcessor(mapWork, isTez); + MapWorkVectorizationNodeProcessor vnp = + new MapWorkVectorizationNodeProcessor(mapWork, isTez, vectorTaskColumnInfo); addMapWorkRules(opRules, vnp); Dispatcher disp = new DefaultRuleDispatcher(vnp, opRules, null); GraphWalker ogw = new PreOrderWalker(disp); @@ -434,9 +864,9 @@ private void vectorizeMapWork(MapWork mapWork, boolean isTez) throws SemanticExc HashMap nodeOutput = new HashMap(); ogw.startWalking(topNodes, nodeOutput); - mapWork.setVectorColumnNameMap(vnp.getVectorColumnNameMap()); - mapWork.setVectorColumnTypeMap(vnp.getVectorColumnTypeMap()); - mapWork.setVectorScratchColumnTypeMap(vnp.getVectorScratchColumnTypeMap()); + vectorTaskColumnInfo.setScratchTypeNameArray(vnp.getVectorScratchColumnTypeNames()); + + vectorTaskColumnInfo.transferToBaseWork(mapWork); if (LOG.isDebugEnabled()) { debugDisplayAllMaps(mapWork); @@ -446,13 +876,19 @@ private void vectorizeMapWork(MapWork mapWork, boolean isTez) throws SemanticExc } private void convertReduceWork(ReduceWork reduceWork, boolean isTez) throws SemanticException { - boolean ret = validateReduceWork(reduceWork); + VectorTaskColumnInfo vectorTaskColumnInfo = new VectorTaskColumnInfo(); + boolean ret = validateReduceWork(reduceWork, vectorTaskColumnInfo, isTez); if (ret) { - vectorizeReduceWork(reduceWork, isTez); + vectorizeReduceWork(reduceWork, vectorTaskColumnInfo, isTez); } } - private boolean getOnlyStructObjectInspectors(ReduceWork reduceWork) throws SemanticException { + private boolean getOnlyStructObjectInspectors(ReduceWork reduceWork, + VectorTaskColumnInfo vectorTaskColumnInfo) throws SemanticException { + + ArrayList reduceColumnNames = new ArrayList(); + ArrayList reduceTypeInfos = new ArrayList(); + try { // Check key ObjectInspector. ObjectInspector keyObjectInspector = reduceWork.getKeyObjectInspector(); @@ -476,9 +912,6 @@ private boolean getOnlyStructObjectInspectors(ReduceWork reduceWork) throws Sema StructObjectInspector valueStructObjectInspector = (StructObjectInspector)valueObjectInspector; List valueFields = valueStructObjectInspector.getAllStructFieldRefs(); - reduceColumnNames = new ArrayList(); - reduceTypeInfos = new ArrayList(); - for (StructField field: keyFields) { reduceColumnNames.add(Utilities.ReduceField.KEY.toString() + "." 
+ field.getFieldName()); reduceTypeInfos.add(TypeInfoUtils.getTypeInfoFromTypeString(field.getFieldObjectInspector().getTypeName())); @@ -490,6 +923,10 @@ private boolean getOnlyStructObjectInspectors(ReduceWork reduceWork) throws Sema } catch (Exception e) { throw new SemanticException(e); } + + vectorTaskColumnInfo.setColumnNames(reduceColumnNames); + vectorTaskColumnInfo.setTypeInfos(reduceTypeInfos); + return true; } @@ -498,11 +935,13 @@ private void addReduceWorkRules(Map opRules, NodeProcessor opRules.put(new RuleRegExp("R2", SelectOperator.getOperatorName() + ".*"), np); } - private boolean validateReduceWork(ReduceWork reduceWork) throws SemanticException { + private boolean validateReduceWork(ReduceWork reduceWork, + VectorTaskColumnInfo vectorTaskColumnInfo, boolean isTez) throws SemanticException { + LOG.info("Validating ReduceWork..."); // Validate input to ReduceWork. - if (!getOnlyStructObjectInspectors(reduceWork)) { + if (!getOnlyStructObjectInspectors(reduceWork, vectorTaskColumnInfo)) { return false; } // Now check the reduce operator tree. @@ -526,7 +965,9 @@ private boolean validateReduceWork(ReduceWork reduceWork) throws SemanticExcepti return true; } - private void vectorizeReduceWork(ReduceWork reduceWork, boolean isTez) throws SemanticException { + private void vectorizeReduceWork(ReduceWork reduceWork, + VectorTaskColumnInfo vectorTaskColumnInfo, boolean isTez) throws SemanticException { + LOG.info("Vectorizing ReduceWork..."); reduceWork.setVectorMode(true); @@ -535,7 +976,7 @@ private void vectorizeReduceWork(ReduceWork reduceWork, boolean isTez) throws Se // VectorizationContext... Do we use PreOrderWalker instead of DefaultGraphWalker. Map opRules = new LinkedHashMap(); ReduceWorkVectorizationNodeProcessor vnp = - new ReduceWorkVectorizationNodeProcessor(reduceColumnNames, reduceTypeInfos, isTez); + new ReduceWorkVectorizationNodeProcessor(vectorTaskColumnInfo, isTez); addReduceWorkRules(opRules, vnp); Dispatcher disp = new DefaultRuleDispatcher(vnp, opRules, null); GraphWalker ogw = new PreOrderWalker(disp); @@ -550,9 +991,9 @@ private void vectorizeReduceWork(ReduceWork reduceWork, boolean isTez) throws Se // Necessary since we are vectorizing the root operator in reduce. reduceWork.setReducer(vnp.getRootVectorOp()); - reduceWork.setVectorColumnNameMap(vnp.getVectorColumnNameMap()); - reduceWork.setVectorColumnTypeMap(vnp.getVectorColumnTypeMap()); - reduceWork.setVectorScratchColumnTypeMap(vnp.getVectorScratchColumnTypeMap()); + vectorTaskColumnInfo.setScratchTypeNameArray(vnp.getVectorScratchColumnTypeNames()); + + vectorTaskColumnInfo.transferToBaseWork(reduceWork); if (LOG.isDebugEnabled()) { debugDisplayAllMaps(reduceWork); @@ -615,23 +1056,11 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, // The vectorization context for the Map or Reduce task. protected VectorizationContext taskVectorizationContext; - // The input projection column type name map for the Map or Reduce task. 
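The reduce-side logical column names gathered above are the key and value struct fields prefixed with KEY. and VALUE., matching the names the reduce operator tree already uses. A purely illustrative sketch with made-up field names:

import java.util.ArrayList;
import java.util.List;

public class ReduceShuffleColumnNames {
  public static void main(String[] args) {
    List<String> reduceColumnNames = new ArrayList<>();
    for (String keyField : new String[] {"reducesinkkey0"}) {
      reduceColumnNames.add("KEY." + keyField);          // Utilities.ReduceField.KEY
    }
    for (String valueField : new String[] {"_col0", "_col1"}) {
      reduceColumnNames.add("VALUE." + valueField);      // Utilities.ReduceField.VALUE
    }
    System.out.println(reduceColumnNames);  // [KEY.reducesinkkey0, VALUE._col0, VALUE._col1]
  }
}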
- protected Map taskColumnTypeNameMap; - VectorizationNodeProcessor() { - taskColumnTypeNameMap = new HashMap(); } - public Map getVectorColumnNameMap() { - return taskVectorizationContext.getProjectionColumnMap(); - } - - public Map getVectorColumnTypeMap() { - return taskColumnTypeNameMap; - } - - public Map getVectorScratchColumnTypeMap() { - return taskVectorizationContext.getScratchColumnTypeMap(); + public String[] getVectorScratchColumnTypeNames() { + return taskVectorizationContext.getScratchColumnTypeNames(); } protected final Set> opsDone = @@ -701,11 +1130,14 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, class MapWorkVectorizationNodeProcessor extends VectorizationNodeProcessor { private final MapWork mWork; + private VectorTaskColumnInfo vectorTaskColumnInfo; private final boolean isTez; - public MapWorkVectorizationNodeProcessor(MapWork mWork, boolean isTez) { + public MapWorkVectorizationNodeProcessor(MapWork mWork, boolean isTez, + VectorTaskColumnInfo vectorTaskColumnInfo) { super(); this.mWork = mWork; + this.vectorTaskColumnInfo = vectorTaskColumnInfo; this.isTez = isTez; } @@ -719,8 +1151,7 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, if (op instanceof TableScanOperator) { if (taskVectorizationContext == null) { - taskVectorizationContext = getVectorizationContext(op.getSchema(), op.getName(), - taskColumnTypeNameMap); + taskVectorizationContext = getVectorizationContext(op.getName(), vectorTaskColumnInfo); } vContext = taskVectorizationContext; } else { @@ -761,8 +1192,7 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, class ReduceWorkVectorizationNodeProcessor extends VectorizationNodeProcessor { - private final List reduceColumnNames; - private final List reduceTypeInfos; + private VectorTaskColumnInfo vectorTaskColumnInfo; private boolean isTez; @@ -772,11 +1202,11 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, return rootVectorOp; } - public ReduceWorkVectorizationNodeProcessor(List reduceColumnNames, - List reduceTypeInfos, boolean isTez) { + public ReduceWorkVectorizationNodeProcessor(VectorTaskColumnInfo vectorTaskColumnInfo, + boolean isTez) { + super(); - this.reduceColumnNames = reduceColumnNames; - this.reduceTypeInfos = reduceTypeInfos; + this.vectorTaskColumnInfo = vectorTaskColumnInfo; rootVectorOp = null; this.isTez = isTez; } @@ -792,15 +1222,11 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, boolean saveRootVectorOp = false; if (op.getParentOperators().size() == 0) { - LOG.info("ReduceWorkVectorizationNodeProcessor process reduceColumnNames " + reduceColumnNames.toString()); + LOG.info("ReduceWorkVectorizationNodeProcessor process reduceColumnNames " + vectorTaskColumnInfo.columnNames.toString()); - vContext = new VectorizationContext("__Reduce_Shuffle__", reduceColumnNames); + vContext = new VectorizationContext("__Reduce_Shuffle__", vectorTaskColumnInfo.columnNames); taskVectorizationContext = vContext; - int i = 0; - for (TypeInfo typeInfo : reduceTypeInfos) { - taskColumnTypeNameMap.put(i, typeInfo.getTypeName()); - i++; - } + saveRootVectorOp = true; if (LOG.isDebugEnabled()) { @@ -1361,23 +1787,11 @@ private boolean validateDataType(String type, VectorExpressionDescriptor.Mode mo return result; } - private VectorizationContext getVectorizationContext(RowSchema rowSchema, String contextName, - Map typeNameMap) { - - VectorizationContext vContext = new VectorizationContext(contextName); + private VectorizationContext 
getVectorizationContext(String contextName, + VectorTaskColumnInfo vectorTaskColumnInfo) { - // Add all non-virtual columns to make a vectorization context for - // the TableScan operator. - int i = 0; - for (ColumnInfo c : rowSchema.getSignature()) { - // Earlier, validation code should have eliminated virtual columns usage (HIVE-5560). - if (!isVirtualColumn(c)) { - vContext.addInitialColumn(c.getInternalName()); - typeNameMap.put(i, c.getTypeName()); - i++; - } - } - vContext.finishedAddingInitialColumns(); + // UNDONE: And, the types??? + VectorizationContext vContext = new VectorizationContext(contextName, vectorTaskColumnInfo.columnNames); return vContext; } @@ -1736,12 +2150,14 @@ private boolean isVirtualColumn(ColumnInfo column) { public void debugDisplayAllMaps(BaseWork work) { - Map columnNameMap = work.getVectorColumnNameMap(); - Map columnTypeMap = work.getVectorColumnTypeMap(); - Map scratchColumnTypeMap = work.getVectorScratchColumnTypeMap(); + String[] columnNames = work.getVectorColumnNames(); + Object columnTypeInfos = work.getVectorColumnTypeInfos(); + int partitionColumnCount = work.getVectorPartitionColumnCount(); + String[] scratchColumnTypeNames = work.getVectorScratchColumnTypeNames(); - LOG.debug("debugDisplayAllMaps columnNameMap " + columnNameMap.toString()); - LOG.debug("debugDisplayAllMaps columnTypeMap " + columnTypeMap.toString()); - LOG.debug("debugDisplayAllMaps scratchColumnTypeMap " + scratchColumnTypeMap.toString()); + LOG.debug("debugDisplayAllMaps columnNames " + Arrays.toString(columnNames)); + LOG.debug("debugDisplayAllMaps columnTypeInfos " + Arrays.deepToString((Object[]) columnTypeInfos)); + LOG.debug("debugDisplayAllMaps partitionColumnCount " + partitionColumnCount); + LOG.debug("debugDisplayAllMaps scratchColumnTypeNames " + Arrays.toString(scratchColumnTypeNames)); } } diff --git ql/src/java/org/apache/hadoop/hive/ql/plan/BaseWork.java ql/src/java/org/apache/hadoop/hive/ql/plan/BaseWork.java index d574c5c..627b19b 100644 --- ql/src/java/org/apache/hadoop/hive/ql/plan/BaseWork.java +++ ql/src/java/org/apache/hadoop/hive/ql/plan/BaseWork.java @@ -32,6 +32,7 @@ import org.apache.hadoop.hive.ql.exec.Operator; import org.apache.hadoop.mapred.JobConf; import org.apache.hadoop.hive.ql.plan.Explain.Level; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; /** @@ -64,9 +65,13 @@ public BaseWork(String name) { // Vectorization. 
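A quick aside on the debugDisplayAllMaps() logging above: Arrays.toString() is enough for flat arrays such as the String[] members, while Arrays.deepToString() also descends into nested arrays; for the flat TypeInfo[] logged there (cast to Object[]) the two print the same thing. For example:

import java.util.Arrays;

public class ArrayLogging {
  public static void main(String[] args) {
    String[] names = {"a", "b"};
    Object[][] nested = {{"int"}, {"string"}};

    System.out.println(Arrays.toString(names));       // [a, b]
    System.out.println(Arrays.toString(nested));      // element identity strings, not contents
    System.out.println(Arrays.deepToString(nested));  // [[int], [string]]
  }
}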
- protected Map vectorColumnNameMap; - protected Map vectorColumnTypeMap; - protected Map vectorScratchColumnTypeMap; + protected String[] vectorColumnNames; + protected TypeInfo[] vectorColumnTypeInfos; + protected int vectorPartitionColumnCount; + + protected boolean useVectorizedInputFileFormat; + + protected String[] vectorScratchColumnTypeNames; public void setGatheringStats(boolean gatherStats) { this.gatheringStats = gatherStats; @@ -152,30 +157,50 @@ public void addDummyOp(HashTableDummyOperator dummyOp) { return returnSet; } - public Map getVectorColumnNameMap() { - return vectorColumnNameMap; + // ----------------------------------------------------------------------------------------------- + + public String[] getVectorColumnNames() { + return vectorColumnNames; + } + + public void setVectorColumnNames(String[] vectorColumnNames) { + this.vectorColumnNames = vectorColumnNames; + } + + public TypeInfo[] getVectorColumnTypeInfos() { + return vectorColumnTypeInfos; + } + + public void setVectorColumnTypeInfos(TypeInfo[] vectorColumnTypeInfos) { + this.vectorColumnTypeInfos = vectorColumnTypeInfos; } - public void setVectorColumnNameMap(Map vectorColumnNameMap) { - this.vectorColumnNameMap = vectorColumnNameMap; + public int getVectorPartitionColumnCount() { + return vectorPartitionColumnCount; } - public Map getVectorColumnTypeMap() { - return vectorColumnTypeMap; + public void setVectorPartitionColumnCount(int vectorPartitionColumnCount) { + this.vectorPartitionColumnCount = vectorPartitionColumnCount; } - public void setVectorColumnTypeMap(Map vectorColumnTypeMap) { - this.vectorColumnTypeMap = vectorColumnTypeMap; + public void setUseVectorizedInputFileFormat(boolean useVectorizedInputFileFormat) { + this.useVectorizedInputFileFormat = useVectorizedInputFileFormat; } - public Map getVectorScratchColumnTypeMap() { - return vectorScratchColumnTypeMap; + public boolean getUseVectorizedInputFileFormat() { + return useVectorizedInputFileFormat; } - public void setVectorScratchColumnTypeMap(Map vectorScratchColumnTypeMap) { - this.vectorScratchColumnTypeMap = vectorScratchColumnTypeMap; + public String[] getVectorScratchColumnTypeNames() { + return vectorScratchColumnTypeNames; } + public void setVectorScratchColumnTypeNames(String[] vectorScratchColumnTypeNames) { + this.vectorScratchColumnTypeNames = vectorScratchColumnTypeNames; + } + + // ----------------------------------------------------------------------------------------------- + /** * @return the mapredLocalWork */ diff --git ql/src/java/org/apache/hadoop/hive/ql/plan/PartitionConversion.java ql/src/java/org/apache/hadoop/hive/ql/plan/PartitionConversion.java new file mode 100644 index 0000000..629aaf6 --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/plan/PartitionConversion.java @@ -0,0 +1,178 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.plan; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; + +import org.apache.commons.lang.ArrayUtils; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category; +import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory; +import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; + +/** + * PartitionConversion. + * + */ +public class PartitionConversion { + + private static long serialVersionUID = 1L; + + private boolean validConversion; + private boolean[] resultConversionFlags; + + private TypeInfo invalidFromTypeInfo; + private TypeInfo invalidToTypeInfo; + + public boolean getValidConversion() { + return validConversion; + } + + public boolean[] getResultConversionFlags() { + return resultConversionFlags; + } + + public TypeInfo getInvalidFromTypeInfo() { + return invalidFromTypeInfo; + } + + public TypeInfo getInvalidToTypeInfo() { + return invalidToTypeInfo; + } + + // Currently, we only support these no-precision-loss or promotion data type conversions: + // + // UNDONE: And, all of them stay within the vector column type (Long, Double, Bytes, Decimal) + // UNDONE: for now. + // + // Short -> Int IMPLICIT WITH VECTORIZATION + // Short -> BigInt IMPLICIT WITH VECTORIZATION + // Float -> Double IMPLICIT WITH VECTORIZATION + // Int --> BigInt IMPLICIT WITH VECTORIZATION + // (Char | VarChar) -> String IMPLICIT WITH VECTORIZATION + // + private static HashMap validFromPrimitiveMap = + new HashMap(); + static { + validFromPrimitiveMap.put( + PrimitiveCategory.SHORT, + new PrimitiveCategory[] { PrimitiveCategory.INT, PrimitiveCategory.LONG }); + validFromPrimitiveMap.put( + PrimitiveCategory.INT, + new PrimitiveCategory[] { PrimitiveCategory.LONG }); + validFromPrimitiveMap.put( + PrimitiveCategory.FLOAT, + new PrimitiveCategory[] { PrimitiveCategory.DOUBLE } ); + validFromPrimitiveMap.put( + PrimitiveCategory.CHAR, + new PrimitiveCategory[] { PrimitiveCategory.STRING } ); + validFromPrimitiveMap.put( + PrimitiveCategory.VARCHAR, + new PrimitiveCategory[] { PrimitiveCategory.STRING } ); + } + + private boolean validateOne(TypeInfo fromTypeInfo, TypeInfo toTypeInfo) { + + if (fromTypeInfo.equals(toTypeInfo)) { + return false; + } + + if (fromTypeInfo.getCategory() == Category.PRIMITIVE && + toTypeInfo.getCategory() == Category.PRIMITIVE) { + + PrimitiveCategory fromPrimitiveCategory = ((PrimitiveTypeInfo) fromTypeInfo).getPrimitiveCategory(); + PrimitiveCategory toPrimitiveCategory = ((PrimitiveTypeInfo) toTypeInfo).getPrimitiveCategory(); + + PrimitiveCategory[] toPrimitiveCategories = + validFromPrimitiveMap.get(fromPrimitiveCategory); + if (toPrimitiveCategories == null || + !ArrayUtils.contains(toPrimitiveCategories, toPrimitiveCategory)) { + invalidFromTypeInfo = fromTypeInfo; + invalidToTypeInfo = toTypeInfo; + + // Tell caller a bad one was found. + validConversion = false; + return false; + } + } else { + // Ignore checking complex types. Assume they will not be included in the query. 
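The promotion table above allows only lossless widenings between a partition column's type and the table's. A standalone restatement using plain type-name strings, so it runs without Hive on the classpath:

import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class AllowedPromotions {

  // smallint/int/float/char/varchar may widen; everything else must match exactly.
  private static final Map<String, List<String>> WIDENINGS = new HashMap<>();
  static {
    WIDENINGS.put("smallint", Arrays.asList("int", "bigint"));
    WIDENINGS.put("int", Arrays.asList("bigint"));
    WIDENINGS.put("float", Arrays.asList("double"));
    WIDENINGS.put("char", Arrays.asList("string"));
    WIDENINGS.put("varchar", Arrays.asList("string"));
  }

  static boolean isAcceptable(String partitionType, String tableType) {
    if (partitionType.equals(tableType)) {
      return true;                               // no conversion needed at all
    }
    List<String> targets = WIDENINGS.get(partitionType);
    return targets != null && targets.contains(tableType);
  }

  public static void main(String[] args) {
    System.out.println(isAcceptable("smallint", "bigint")); // true
    System.out.println(isAcceptable("bigint", "int"));      // false (narrowing)
  }
}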
+ } + + return true; + } + + public void validateConversion(List fromTypeInfoList, + List toTypeInfoList) { + + final int columnCount = fromTypeInfoList.size(); + resultConversionFlags = new boolean[columnCount]; + + // The method validateOne will turn this off when invalid conversion is found. + validConversion = true; + + boolean atLeastOneConversion = false; + for (int i = 0; i < columnCount; i++) { + TypeInfo fromTypeInfo = fromTypeInfoList.get(i); + TypeInfo toTypeInfo = toTypeInfoList.get(i); + + resultConversionFlags[i] = validateOne(fromTypeInfo, toTypeInfo); + if (!validConversion) { + return; + } + } + + if (atLeastOneConversion) { + // Leave resultConversionFlags set. + } else { + resultConversionFlags = null; + } + } + + public void validateConversion(TypeInfo[] fromTypeInfos, TypeInfo[] toTypeInfos) { + + final int columnCount = fromTypeInfos.length; + resultConversionFlags = new boolean[columnCount]; + + // The method validateOne will turn this off when invalid conversion is found. + validConversion = true; + + boolean atLeastOneConversion = false; + for (int i = 0; i < columnCount; i++) { + TypeInfo fromTypeInfo = fromTypeInfos[i]; + TypeInfo toTypeInfo = toTypeInfos[i]; + + resultConversionFlags[i] = validateOne(fromTypeInfo, toTypeInfo); + if (!validConversion) { + return; + } + if (resultConversionFlags[i]) { + atLeastOneConversion = true; + } + } + + if (atLeastOneConversion) { + // Leave resultConversionFlags set. + } else { + resultConversionFlags = null; + } + } +} diff --git ql/src/java/org/apache/hadoop/hive/ql/plan/PartitionDesc.java ql/src/java/org/apache/hadoop/hive/ql/plan/PartitionDesc.java index 864301c..b032349 100644 --- ql/src/java/org/apache/hadoop/hive/ql/plan/PartitionDesc.java +++ ql/src/java/org/apache/hadoop/hive/ql/plan/PartitionDesc.java @@ -68,11 +68,13 @@ private String baseFileName; + private VectorPartitionDesc vectorPartitionDesc; + public void setBaseFileName(String baseFileName) { this.baseFileName = baseFileName; } - public PartitionDesc() { + public PartitionDesc() { } public PartitionDesc(final TableDesc table, final LinkedHashMap partSpec) { @@ -271,6 +273,9 @@ public PartitionDesc clone() { ret.partSpec = new java.util.LinkedHashMap(); ret.partSpec.putAll(partSpec); } + if (vectorPartitionDesc != null) { + ret.vectorPartitionDesc = vectorPartitionDesc.clone(); + } return ret; } @@ -300,4 +305,12 @@ public void deriveBaseFileName(String path) { public void intern(Interner interner) { this.tableDesc = interner.intern(tableDesc); } + + public void setVectorPartitionDesc(VectorPartitionDesc vectorPartitionDesc) { + this.vectorPartitionDesc = vectorPartitionDesc; + } + + public VectorPartitionDesc getVectorPartitionDesc() { + return vectorPartitionDesc; + } } diff --git ql/src/java/org/apache/hadoop/hive/ql/plan/VectorPartitionDesc.java ql/src/java/org/apache/hadoop/hive/ql/plan/VectorPartitionDesc.java new file mode 100644 index 0000000..ff77c4e --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/plan/VectorPartitionDesc.java @@ -0,0 +1,164 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
diff --git ql/src/java/org/apache/hadoop/hive/ql/plan/VectorPartitionDesc.java ql/src/java/org/apache/hadoop/hive/ql/plan/VectorPartitionDesc.java
new file mode 100644
index 0000000..ff77c4e
--- /dev/null
+++ ql/src/java/org/apache/hadoop/hive/ql/plan/VectorPartitionDesc.java
@@ -0,0 +1,164 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.plan;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
+
+/**
+ * VectorPartitionDesc.
+ *
+ * Extra vector information just for the PartitionDesc.
+ *
+ */
+public class VectorPartitionDesc {
+
+  private static final long serialVersionUID = 1L;
+
+  // Data Type Conversion Needed?
+  //
+  // VECTORIZED_INPUT_FILE_FORMAT:
+  //    No data type conversion check?  Assume ALTER TABLE prevented conversions that
+  //    VectorizedInputFileFormat cannot handle...
+  //
+  // VECTOR_DESERIALIZE:
+  //    LAZY_SIMPLE:
+  //        Capable of converting on its own.
+  //    LAZY_BINARY:
+  //        Partition schema assumed to match file contents.
+  //        Conversion necessary from partition field values to vector columns.
+  // ROW_DESERIALIZE:
+  //    Partition schema assumed to match file contents.
+  //    Conversion necessary from partition field values to vector columns.
+  //
+
+  public static enum VectorMapOperatorReadType {
+    VECTORIZED_INPUT_FILE_FORMAT,
+    VECTOR_DESERIALIZE,
+    ROW_DESERIALIZE
+  }
+
+  public static enum VectorDeserializeType {
+    NONE,
+    LAZY_SIMPLE,
+    LAZY_BINARY
+  }
+
+  private final VectorMapOperatorReadType vectorMapOperatorReadType;
+  private final VectorDeserializeType vectorDeserializeType;
+
+  private final boolean needsDataTypeConversionCheck;
+
+  private boolean[] conversionFlags;
+
+  private TypeInfo[] typeInfos;
+
+  private VectorPartitionDesc(VectorMapOperatorReadType vectorMapOperatorReadType,
+      boolean needsDataTypeConversionCheck) {
+    this.vectorMapOperatorReadType = vectorMapOperatorReadType;
+    this.vectorDeserializeType = VectorDeserializeType.NONE;
+    this.needsDataTypeConversionCheck = needsDataTypeConversionCheck;
+
+    conversionFlags = null;
+    typeInfos = null;
+  }
+
+  private VectorPartitionDesc(VectorMapOperatorReadType vectorMapOperatorReadType,
+      VectorDeserializeType vectorDeserializeType, boolean needsDataTypeConversionCheck) {
+    this.vectorMapOperatorReadType = vectorMapOperatorReadType;
+    this.vectorDeserializeType = vectorDeserializeType;
+    this.needsDataTypeConversionCheck = needsDataTypeConversionCheck;
+
+    conversionFlags = null;
+    typeInfos = null;
+  }
+
+  public static VectorPartitionDesc VectorizedInputFileFormat() {
+    return new VectorPartitionDesc(VectorMapOperatorReadType.VECTORIZED_INPUT_FILE_FORMAT, true);
+  }
+
+  public static VectorPartitionDesc VectorDeserialize(VectorDeserializeType vectorDeserializeType) {
+    boolean needsDataTypeConversionCheck;
+    switch (vectorDeserializeType) {
+    case LAZY_SIMPLE:
+      needsDataTypeConversionCheck = false;
+      break;
+    case LAZY_BINARY:
+      needsDataTypeConversionCheck = true;
+      break;
+    default:
+      throw new RuntimeException("Unexpected vector deserialize type " +
+          vectorDeserializeType.name());
+    }
+    return new VectorPartitionDesc(
+        VectorMapOperatorReadType.VECTOR_DESERIALIZE, vectorDeserializeType,
+        needsDataTypeConversionCheck);
+  }
+
+  public static VectorPartitionDesc RowDeserialize() {
+    return new VectorPartitionDesc(VectorMapOperatorReadType.ROW_DESERIALIZE, true);
+  }
+
+  @Override
+  public VectorPartitionDesc clone() {
+    VectorPartitionDesc result =
+        new VectorPartitionDesc(vectorMapOperatorReadType, vectorDeserializeType,
+            needsDataTypeConversionCheck);
+    result.conversionFlags =
+        (conversionFlags == null ? null :
+          Arrays.copyOf(conversionFlags, conversionFlags.length));
+    result.typeInfos = (typeInfos == null ? null : Arrays.copyOf(typeInfos, typeInfos.length));
+    return result;
+  }
+
+  public VectorMapOperatorReadType getVectorMapOperatorReadType() {
+    return vectorMapOperatorReadType;
+  }
+
+  public VectorDeserializeType getVectorDeserializeType() {
+    return vectorDeserializeType;
+  }
+
+  public boolean getNeedsDataTypeConversionCheck() {
+    return needsDataTypeConversionCheck;
+  }
+
+  public void setConversionFlags(boolean[] conversionFlags) {
+    this.conversionFlags = conversionFlags;
+  }
+
+  public boolean[] getConversionFlags() {
+    return conversionFlags;
+  }
+
+  public TypeInfo[] getTypeInfos() {
+    return typeInfos;
+  }
+
+  public void setTypeInfos(List<TypeInfo> typeInfoList) {
+    typeInfos = typeInfoList.toArray(new TypeInfo[0]);
+  }
+
+  public int getNonPartColumnCount() {
+    return typeInfos.length;
+  }
+}
diff --git ql/src/test/org/apache/hadoop/hive/ql/exec/vector/TestVectorRowObject.java ql/src/test/org/apache/hadoop/hive/ql/exec/vector/TestVectorRowObject.java
index 0f8712e..c076e6c 100644
--- ql/src/test/org/apache/hadoop/hive/ql/exec/vector/TestVectorRowObject.java
+++ ql/src/test/org/apache/hadoop/hive/ql/exec/vector/TestVectorRowObject.java
@@ -24,6 +24,7 @@
 
 import org.apache.hadoop.hive.ql.metadata.HiveException;
 import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
 
 import junit.framework.TestCase;
 
@@ -50,13 +51,13 @@ void examineBatch(VectorizedRowBatch batch, VectorExtractRowSameBatch vectorExtr
 
   void testVectorRowObject(int caseNum, Random r) throws HiveException {
 
-    Map emptyScratchMap = new HashMap();
+    String[] emptyScratchTypeNames = new String[0];
 
     RandomRowObjectSource source = new RandomRowObjectSource();
     source.init(r);
 
     VectorizedRowBatchCtx batchContext = new VectorizedRowBatchCtx();
-    batchContext.init(emptyScratchMap, source.rowStructObjectInspector());
+    batchContext.init(source.rowStructObjectInspector(), emptyScratchTypeNames);
     VectorizedRowBatch batch = batchContext.createVectorizedRowBatch();
 
     VectorAssignRowSameBatch vectorAssignRow = new VectorAssignRowSameBatch();
diff --git ql/src/test/org/apache/hadoop/hive/ql/exec/vector/TestVectorSerDeRow.java ql/src/test/org/apache/hadoop/hive/ql/exec/vector/TestVectorSerDeRow.java
index 23e44f0..d3dc30d 100644
--- ql/src/test/org/apache/hadoop/hive/ql/exec/vector/TestVectorSerDeRow.java
+++ ql/src/test/org/apache/hadoop/hive/ql/exec/vector/TestVectorSerDeRow.java
@@ -71,6 +71,7 @@
 import org.apache.hadoop.hive.serde2.typeinfo.CharTypeInfo;
 import org.apache.hadoop.hive.serde2.typeinfo.DecimalTypeInfo;
 import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
 import org.apache.hadoop.hive.serde2.typeinfo.VarcharTypeInfo;
 import org.apache.hadoop.hive.serde2.fast.SerializeWrite;
 import org.apache.hadoop.io.BooleanWritable;
@@ -331,13 +332,13 @@ void serializeBatch(VectorizedRowBatch batch, VectorSerializeRow vectorSerialize
 
   void testVectorSerializeRow(int caseNum, Random r, SerializationType serializationType)
       throws HiveException, IOException, SerDeException {
 
-    Map emptyScratchMap = new HashMap();
+    String[] emptyScratchTypeNames = new String[0];
 
     RandomRowObjectSource source = new
RandomRowObjectSource(); source.init(r); VectorizedRowBatchCtx batchContext = new VectorizedRowBatchCtx(); - batchContext.init(emptyScratchMap, source.rowStructObjectInspector()); + batchContext.init(source.rowStructObjectInspector(), emptyScratchTypeNames); VectorizedRowBatch batch = batchContext.createVectorizedRowBatch(); VectorAssignRowSameBatch vectorAssignRow = new VectorAssignRowSameBatch(); @@ -563,13 +564,13 @@ private LazySerDeParameters getSerDeParams(StructObjectInspector rowObjectInspec void testVectorDeserializeRow(int caseNum, Random r, SerializationType serializationType) throws HiveException, IOException, SerDeException { - Map emptyScratchMap = new HashMap(); + String[] emptyScratchTypeNames = new String[0]; RandomRowObjectSource source = new RandomRowObjectSource(); source.init(r); VectorizedRowBatchCtx batchContext = new VectorizedRowBatchCtx(); - batchContext.init(emptyScratchMap, source.rowStructObjectInspector()); + batchContext.init(source.rowStructObjectInspector(), emptyScratchTypeNames); VectorizedRowBatch batch = batchContext.createVectorizedRowBatch(); int fieldCount = source.typeNames().size(); diff --git ql/src/test/org/apache/hadoop/hive/ql/exec/vector/TestVectorizedRowBatchCtx.java ql/src/test/org/apache/hadoop/hive/ql/exec/vector/TestVectorizedRowBatchCtx.java deleted file mode 100644 index 3321823..0000000 --- ql/src/test/org/apache/hadoop/hive/ql/exec/vector/TestVectorizedRowBatchCtx.java +++ /dev/null @@ -1,357 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.hadoop.hive.ql.exec.vector; - -import java.io.File; -import java.io.IOException; -import java.sql.Timestamp; -import java.util.Arrays; -import java.util.Calendar; -import java.util.List; -import java.util.Properties; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.hive.ql.io.RCFile; -import org.apache.hadoop.hive.ql.io.RCFileOutputFormat; -import org.apache.hadoop.hive.ql.metadata.HiveException; -import org.apache.hadoop.hive.serde.serdeConstants; -import org.apache.hadoop.hive.serde2.SerDeException; -import org.apache.hadoop.hive.serde2.SerDeUtils; -import org.apache.hadoop.hive.serde2.columnar.BytesRefArrayWritable; -import org.apache.hadoop.hive.serde2.columnar.BytesRefWritable; -import org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe; -import org.apache.hadoop.hive.serde2.io.ByteWritable; -import org.apache.hadoop.hive.serde2.io.DoubleWritable; -import org.apache.hadoop.hive.serde2.io.ShortWritable; -import org.apache.hadoop.hive.serde2.io.TimestampWritable; -import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category; -import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.StructField; -import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; -import org.apache.hadoop.io.BooleanWritable; -import org.apache.hadoop.io.BytesWritable; -import org.apache.hadoop.io.DataOutputBuffer; -import org.apache.hadoop.io.FloatWritable; -import org.apache.hadoop.io.IntWritable; -import org.apache.hadoop.io.LongWritable; -import org.apache.hadoop.io.ObjectWritable; -import org.apache.hadoop.io.Text; -import org.apache.hadoop.io.Writable; -import org.apache.hadoop.io.compress.DefaultCodec; -import org.junit.Assert; -import org.junit.Before; -import org.junit.Test; - -/** - * Class that tests the functionality of VectorizedRowBatchCtx. 
- */ -public class TestVectorizedRowBatchCtx { - - private Configuration conf; - private FileSystem fs; - private Path testFilePath; - private int colCount; - private ColumnarSerDe serDe; - private Properties tbl; - - @Before - public void openFileSystem() throws Exception { - conf = new Configuration(); - fs = FileSystem.getLocal(conf); - Path workDir = new Path(System.getProperty("test.tmp.dir", - "target" + File.separator + "test" + File.separator + "tmp")); - fs.setWorkingDirectory(workDir); - testFilePath = new Path("TestVectorizedRowBatchCtx.testDump.rc"); - fs.delete(testFilePath, false); - } - - private void initSerde() { - tbl = new Properties(); - - // Set the configuration parameters - tbl.setProperty(serdeConstants.SERIALIZATION_FORMAT, "6"); - tbl.setProperty("columns", - "ashort,aint,along,adouble,afloat,astring,abyte,aboolean,atimestamp"); - tbl.setProperty("columns.types", - "smallint:int:bigint:double:float:string:tinyint:boolean:timestamp"); - colCount = 9; - tbl.setProperty(serdeConstants.SERIALIZATION_NULL_FORMAT, "NULL"); - - try { - serDe = new ColumnarSerDe(); - SerDeUtils.initializeSerDe(serDe, conf, tbl, null); - } catch (SerDeException e) { - throw new RuntimeException(e); - } - } - - private void WriteRCFile(FileSystem fs, Path file, Configuration conf) - throws IOException, SerDeException { - fs.delete(file, true); - - RCFileOutputFormat.setColumnNumber(conf, colCount); - RCFile.Writer writer = - new RCFile.Writer(fs, conf, file, null, null, - new DefaultCodec()); - - for (int i = 0; i < 10; ++i) { - BytesRefArrayWritable bytes = new BytesRefArrayWritable(colCount); - BytesRefWritable cu; - - if (i % 3 != 0) { - //if (i < 100) { - cu = new BytesRefWritable((i + "").getBytes("UTF-8"), 0, (i + "").getBytes("UTF-8").length); - bytes.set(0, cu); - - cu = new BytesRefWritable((i + 100 + "").getBytes("UTF-8"), 0, - (i + 100 + "").getBytes("UTF-8").length); - bytes.set(1, cu); - - cu = new BytesRefWritable((i + 200 + "").getBytes("UTF-8"), 0, - (i + 200 + "").getBytes("UTF-8").length); - bytes.set(2, cu); - - cu = new BytesRefWritable((i + 1.23 + "").getBytes("UTF-8"), 0, - (i + 1.23 + "").getBytes("UTF-8").length); - bytes.set(3, cu); - - cu = new BytesRefWritable((i + 2.23 + "").getBytes("UTF-8"), 0, - (i + 2.23 + "").getBytes("UTF-8").length); - bytes.set(4, cu); - - cu = new BytesRefWritable(("Test string").getBytes("UTF-8"), 0, - ("Test string").getBytes("UTF-8").length); - bytes.set(5, cu); - - cu = new BytesRefWritable((1 + "").getBytes("UTF-8"), 0, - (1 + "").getBytes("UTF-8").length); - bytes.set(6, cu); - - cu = new BytesRefWritable(("true").getBytes("UTF-8"), 0, - ("true").getBytes("UTF-8").length); - bytes.set(7, cu); - - Timestamp t = new Timestamp(Calendar.getInstance().getTime().getTime()); - cu = new BytesRefWritable(t.toString().getBytes("UTF-8"), 0, - t.toString().getBytes("UTF-8").length); - bytes.set(8, cu); - - } else { - cu = new BytesRefWritable((i + "").getBytes("UTF-8"), 0, (i + "").getBytes("UTF-8").length); - bytes.set(0, cu); - - cu = new BytesRefWritable(new byte[0], 0, 0); - bytes.set(1, cu); - - cu = new BytesRefWritable(new byte[0], 0, 0); - bytes.set(2, cu); - - cu = new BytesRefWritable(new byte[0], 0, 0); - bytes.set(3, cu); - - cu = new BytesRefWritable(new byte[0], 0, 0); - bytes.set(4, cu); - - cu = new BytesRefWritable(("Test string").getBytes("UTF-8"), 0, - ("Test string").getBytes("UTF-8").length); - bytes.set(5, cu); - - cu = new BytesRefWritable(new byte[0], 0, 0); - bytes.set(6, cu); - - cu = new BytesRefWritable(new byte[0], 
0, 0); - bytes.set(7, cu); - -// cu = new BytesRefWritable(new byte[0], 0, 0); -// bytes.set(8, cu); - Timestamp t = new Timestamp(Calendar.getInstance().getTime().getTime()); - cu = new BytesRefWritable(t.toString().getBytes("UTF-8"), 0, - t.toString().getBytes("UTF-8").length); - bytes.set(8, cu); - } - writer.append(bytes); - } - writer.close(); - } - - private VectorizedRowBatch GetRowBatch() throws SerDeException, HiveException, IOException { - - RCFile.Reader reader = new RCFile.Reader(fs, this.testFilePath, conf); - DataOutputBuffer buffer = new DataOutputBuffer(); - - // Get object inspector - StructObjectInspector oi = (StructObjectInspector) serDe - .getObjectInspector(); - List fieldRefs = oi.getAllStructFieldRefs(); - - Assert.assertEquals("Field size should be 9", colCount, fieldRefs.size()); - - // Create the context - VectorizedRowBatchCtx ctx = new VectorizedRowBatchCtx(oi, oi, serDe, null, null); - VectorizedRowBatch batch = ctx.createVectorizedRowBatch(); - VectorizedBatchUtil.setNoNullFields(batch); - - // Iterate thru the rows and populate the batch - LongWritable rowID = new LongWritable(); - for (int i = 0; i < 10; i++) { - reader.next(rowID); - BytesRefArrayWritable cols = new BytesRefArrayWritable(); - reader.getCurrentRow(cols); - cols.resetValid(colCount); - ctx.addRowToBatch(i, cols, batch, buffer); - } - reader.close(); - batch.size = 10; - return batch; - } - - void ValidateRowBatch(VectorizedRowBatch batch) throws IOException, SerDeException { - - LongWritable rowID = new LongWritable(); - RCFile.Reader reader = new RCFile.Reader(fs, this.testFilePath, conf); - for (int i = 0; i < batch.size; i++) { - reader.next(rowID); - BytesRefArrayWritable cols = new BytesRefArrayWritable(); - reader.getCurrentRow(cols); - cols.resetValid(colCount); - Object row = serDe.deserialize(cols); - - StructObjectInspector oi = (StructObjectInspector) serDe - .getObjectInspector(); - List fieldRefs = oi.getAllStructFieldRefs(); - - for (int j = 0; j < fieldRefs.size(); j++) { - Object fieldData = oi.getStructFieldData(row, fieldRefs.get(j)); - ObjectInspector foi = fieldRefs.get(j).getFieldObjectInspector(); - - // Vectorization only supports PRIMITIVE data types. Assert the same - Assert.assertEquals(true, foi.getCategory() == Category.PRIMITIVE); - - PrimitiveObjectInspector poi = (PrimitiveObjectInspector) foi; - Object writableCol = poi.getPrimitiveWritableObject(fieldData); - if (writableCol != null) { - switch (poi.getPrimitiveCategory()) { - case BOOLEAN: { - LongColumnVector lcv = (LongColumnVector) batch.cols[j]; - Assert.assertEquals(true, lcv.vector[i] == (((BooleanWritable) writableCol).get() ? 
1 - : 0)); - } - break; - case BYTE: { - LongColumnVector lcv = (LongColumnVector) batch.cols[j]; - Assert.assertEquals(true, lcv.vector[i] == (long) ((ByteWritable) writableCol).get()); - } - break; - case SHORT: { - LongColumnVector lcv = (LongColumnVector) batch.cols[j]; - Assert.assertEquals(true, lcv.vector[i] == ((ShortWritable) writableCol).get()); - } - break; - case INT: { - LongColumnVector lcv = (LongColumnVector) batch.cols[j]; - Assert.assertEquals(true, lcv.vector[i] == ((IntWritable) writableCol).get()); - } - break; - case LONG: { - LongColumnVector lcv = (LongColumnVector) batch.cols[j]; - Assert.assertEquals(true, lcv.vector[i] == ((LongWritable) writableCol).get()); - } - break; - case FLOAT: { - DoubleColumnVector dcv = (DoubleColumnVector) batch.cols[j]; - Assert.assertEquals(true, dcv.vector[i] == ((FloatWritable) writableCol).get()); - } - break; - case DOUBLE: { - DoubleColumnVector dcv = (DoubleColumnVector) batch.cols[j]; - Assert.assertEquals(true, dcv.vector[i] == ((DoubleWritable) writableCol).get()); - } - break; - case BINARY: { - BytesColumnVector bcv = (BytesColumnVector) batch.cols[j]; - BytesWritable colBinary = (BytesWritable) writableCol; - BytesWritable batchBinary = new BytesWritable(); - batchBinary.set(bcv.vector[i], bcv.start[i], bcv.length[i]); - byte[] a = colBinary.getBytes(); - byte[] b = batchBinary.getBytes(); - Assert.assertEquals(true, Arrays.equals(a, b)); - } - break; - case STRING: { - BytesColumnVector bcv = (BytesColumnVector) batch.cols[j]; - Text colText = (Text) writableCol; - Text batchText = new Text(); - batchText.set(bcv.vector[i], bcv.start[i], bcv.length[i]); - String a = colText.toString(); - String b = batchText.toString(); - Assert.assertEquals(true, a.equals(b)); - } - break; - case TIMESTAMP: { - LongColumnVector tcv = (LongColumnVector) batch.cols[j]; - Timestamp t = ((TimestampWritable) writableCol).getTimestamp(); - long timeInNanoSec = (t.getTime() * 1000000) + (t.getNanos() % 1000000); - Assert.assertEquals(true, tcv.vector[i] == timeInNanoSec); - } - break; - default: - Assert.assertTrue("Unknown type", false); - } - } else { - Assert.assertEquals(true, batch.cols[j].isNull[i]); - } - } - - // Check repeating - Assert.assertEquals(false, batch.cols[0].isRepeating); - Assert.assertEquals(false, batch.cols[1].isRepeating); - Assert.assertEquals(false, batch.cols[2].isRepeating); - Assert.assertEquals(false, batch.cols[3].isRepeating); - Assert.assertEquals(false, batch.cols[4].isRepeating); - - // Check non null - Assert.assertEquals(true, batch.cols[0].noNulls); - Assert.assertEquals(false, batch.cols[1].noNulls); - Assert.assertEquals(false, batch.cols[2].noNulls); - Assert.assertEquals(false, batch.cols[3].noNulls); - Assert.assertEquals(false, batch.cols[4].noNulls); - } - reader.close(); - } - - @Test - public void TestCtx() throws Exception { - initSerde(); - WriteRCFile(this.fs, this.testFilePath, this.conf); - VectorizedRowBatch batch = GetRowBatch(); - ValidateRowBatch(batch); - - // Test VectorizedColumnarSerDe - VectorizedColumnarSerDe vcs = new VectorizedColumnarSerDe(); - SerDeUtils.initializeSerDe(vcs, this.conf, tbl, null); - Writable w = vcs.serializeVector(batch, (StructObjectInspector) serDe - .getObjectInspector()); - BytesRefArrayWritable[] refArray = (BytesRefArrayWritable[]) ((ObjectWritable) w).get(); - vcs.deserializeVector(refArray, 10, batch); - ValidateRowBatch(batch); - } -} diff --git ql/src/test/queries/clientpositive/vector_schema_evolution.q 
ql/src/test/queries/clientpositive/vector_schema_evolution.q new file mode 100644 index 0000000..b4bab48 --- /dev/null +++ ql/src/test/queries/clientpositive/vector_schema_evolution.q @@ -0,0 +1,24 @@ +set hive.vectorized.execution.enabled=true; +set hive.fetch.task.conversion=none; + +create table orc_tab ( foo string, bar string ) partitioned by (dt string) stored as orc; +describe extended orc_tab; + +alter table orc_tab add partition( dt='20150101' ) ; +describe extended orc_tab; + +insert into table orc_tab partition( dt='20150101' ) values("1", "one"),("2", "two"), ("3", "three"); + +explain extended +select * from orc_tab; + +select * from orc_tab; + +alter table orc_tab add columns( goo string ); +describe extended orc_tab; + + +explain extended +select * from orc_tab; + +select * from orc_tab; diff --git ql/src/test/queries/clientpositive/vector_schema_evolution2.q ql/src/test/queries/clientpositive/vector_schema_evolution2.q new file mode 100644 index 0000000..ae1f0a4 --- /dev/null +++ ql/src/test/queries/clientpositive/vector_schema_evolution2.q @@ -0,0 +1,24 @@ +set hive.vectorized.execution.enabled=true; +set hive.fetch.task.conversion=none; + +create table orc_tab ( foo string, bar string ) partitioned by (dt string); +describe extended orc_tab; + +alter table orc_tab add partition( dt='20150101' ) ; +describe extended orc_tab; + +insert into table orc_tab partition( dt='20150101' ) values("1", "one"),("2", "two"), ("3", "three"); + +explain extended +select * from orc_tab; + +select * from orc_tab; + +alter table orc_tab add columns( goo string ); +describe extended orc_tab; + + +explain extended +select * from orc_tab; + +select * from orc_tab; diff --git ql/src/test/queries/clientpositive/vector_schema_evolution3.q ql/src/test/queries/clientpositive/vector_schema_evolution3.q new file mode 100644 index 0000000..1744a61 --- /dev/null +++ ql/src/test/queries/clientpositive/vector_schema_evolution3.q @@ -0,0 +1,24 @@ +set hive.vectorized.execution.enabled=true; +set hive.fetch.task.conversion=none; + +create table orc_tab ( foo string, bar string ) partitioned by (dt string); +describe extended orc_tab; + +alter table orc_tab add partition( dt='20150101' ) ; +describe extended orc_tab; + +insert into table orc_tab partition( dt='20150101' ) values("1", "one"),("2", "two"), ("3", "three"); + +explain extended +select * from orc_tab; + +select * from orc_tab; + +alter table orc_tab CHANGE COLUMN foo foo int; +describe extended orc_tab; + + +explain extended +select * from orc_tab; + +select * from orc_tab; diff --git ql/src/test/queries/clientpositive/vector_schema_evolution4.q ql/src/test/queries/clientpositive/vector_schema_evolution4.q new file mode 100644 index 0000000..8b479d0 --- /dev/null +++ ql/src/test/queries/clientpositive/vector_schema_evolution4.q @@ -0,0 +1,25 @@ +set hive.vectorized.execution.enabled=true; +set hive.fetch.task.conversion=none; + +create table orc_tab ( foo smallint, bar string ) partitioned by (dt string) +ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe' STORED AS SEQUENCEFILE; +describe extended orc_tab; + +alter table orc_tab add partition( dt='20150101' ) ; +describe extended orc_tab; + +insert into table orc_tab partition( dt='20150101' ) values("1", "one"),("2", "two"), ("3", "three"); + +explain extended +select * from orc_tab; + +select * from orc_tab; + +alter table orc_tab CHANGE COLUMN foo foo int; +describe extended orc_tab; + + +explain extended +select * from orc_tab; + +select * from orc_tab; diff --git 
ql/src/test/queries/clientpositive/vector_schema_evolution5.q ql/src/test/queries/clientpositive/vector_schema_evolution5.q new file mode 100644 index 0000000..43694f0 --- /dev/null +++ ql/src/test/queries/clientpositive/vector_schema_evolution5.q @@ -0,0 +1,24 @@ +set hive.vectorized.execution.enabled=true; +set hive.fetch.task.conversion=none; + +create table orc_tab ( foo int, bar string ) partitioned by (dt string) stored as orc; +describe extended orc_tab; + +alter table orc_tab add partition( dt='20150101' ) ; +describe extended orc_tab; + +insert into table orc_tab partition( dt='20150101' ) values("1", "one"),("2", "two"), ("3", "three"); + +explain extended +select * from orc_tab; + +select * from orc_tab; + +alter table orc_tab CHANGE COLUMN foo foo bigint; +describe extended orc_tab; + + +explain extended +select * from orc_tab; + +select * from orc_tab; diff --git ql/src/test/queries/clientpositive/vector_schema_evolution6.q ql/src/test/queries/clientpositive/vector_schema_evolution6.q new file mode 100644 index 0000000..6c671b4 --- /dev/null +++ ql/src/test/queries/clientpositive/vector_schema_evolution6.q @@ -0,0 +1,24 @@ +set hive.vectorized.execution.enabled=true; +set hive.fetch.task.conversion=none; + +create table orc_tab ( foo int, bar string ) partitioned by (dt string) stored as orc; +describe extended orc_tab; + +alter table orc_tab add partition( dt='20150101' ) ; +describe extended orc_tab; + +insert into table orc_tab partition( dt='20150101' ) values("1", "one"),("2", "two"), ("3", "three"); + +explain extended +select * from orc_tab; + +select * from orc_tab; + +alter table orc_tab CHANGE COLUMN foo foo double; +describe extended orc_tab; + + +explain extended +select * from orc_tab; + +select * from orc_tab; diff --git ql/src/test/results/clientpositive/partition_char.q.out ql/src/test/results/clientpositive/partition_char.q.out index 763e5d4..2223825 100644 --- ql/src/test/results/clientpositive/partition_char.q.out +++ ql/src/test/results/clientpositive/partition_char.q.out @@ -2,11 +2,11 @@ PREHOOK: query: drop table partition_char_1 PREHOOK: type: DROPTABLE POSTHOOK: query: drop table partition_char_1 POSTHOOK: type: DROPTABLE -PREHOOK: query: create table partition_char_1 (key string, value char(20)) partitioned by (dt char(10), region int) +PREHOOK: query: create table partition_char_1 (key string, value char(20)) partitioned by (dt string, region int) PREHOOK: type: CREATETABLE PREHOOK: Output: database:default PREHOOK: Output: default@partition_char_1 -POSTHOOK: query: create table partition_char_1 (key string, value char(20)) partitioned by (dt char(10), region int) +POSTHOOK: query: create table partition_char_1 (key string, value char(20)) partitioned by (dt string, region int) POSTHOOK: type: CREATETABLE POSTHOOK: Output: database:default POSTHOOK: Output: default@partition_char_1 @@ -22,17 +22,64 @@ POSTHOOK: Input: default@src POSTHOOK: Output: default@partition_char_1@dt=2000-01-01/region=1 POSTHOOK: Lineage: partition_char_1 PARTITION(dt=2000-01-01,region=1).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ] POSTHOOK: Lineage: partition_char_1 PARTITION(dt=2000-01-01,region=1).value EXPRESSION [(src)src.FieldSchema(name:value, type:string, comment:default), ] -PREHOOK: query: select * from partition_char_1 limit 1 +_col0 _col1 +PREHOOK: query: explain +select * from partition_char_1 +PREHOOK: type: QUERY +POSTHOOK: query: explain +select * from partition_char_1 +POSTHOOK: type: QUERY +Explain +STAGE DEPENDENCIES: + 
Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: partition_char_1 + Statistics: Num rows: 10 Data size: 237 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key (type: string), value (type: char(20)), dt (type: string), region (type: int) + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 10 Data size: 237 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 10 Data size: 237 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select * from partition_char_1 PREHOOK: type: QUERY PREHOOK: Input: default@partition_char_1 PREHOOK: Input: default@partition_char_1@dt=2000-01-01/region=1 #### A masked pattern was here #### -POSTHOOK: query: select * from partition_char_1 limit 1 +POSTHOOK: query: select * from partition_char_1 POSTHOOK: type: QUERY POSTHOOK: Input: default@partition_char_1 POSTHOOK: Input: default@partition_char_1@dt=2000-01-01/region=1 #### A masked pattern was here #### +partition_char_1.key partition_char_1.value partition_char_1.dt partition_char_1.region 238 val_238 2000-01-01 1 +86 val_86 2000-01-01 1 +311 val_311 2000-01-01 1 +27 val_27 2000-01-01 1 +165 val_165 2000-01-01 1 +409 val_409 2000-01-01 1 +255 val_255 2000-01-01 1 +278 val_278 2000-01-01 1 +98 val_98 2000-01-01 1 +484 val_484 2000-01-01 1 PREHOOK: query: drop table partition_char_1 PREHOOK: type: DROPTABLE PREHOOK: Input: default@partition_char_1 diff --git ql/src/test/results/clientpositive/vector_schema_evolution.q.out ql/src/test/results/clientpositive/vector_schema_evolution.q.out new file mode 100644 index 0000000..2409ff7 --- /dev/null +++ ql/src/test/results/clientpositive/vector_schema_evolution.q.out @@ -0,0 +1,342 @@ +PREHOOK: query: create table orc_tab ( foo string, bar string ) partitioned by (dt string) stored as orc +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@orc_tab +POSTHOOK: query: create table orc_tab ( foo string, bar string ) partitioned by (dt string) stored as orc +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@orc_tab +PREHOOK: query: describe extended orc_tab +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@orc_tab +POSTHOOK: query: describe extended orc_tab +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@orc_tab +foo string +bar string +dt string + +# Partition Information +# col_name data_type comment + +dt string + +#### A masked pattern was here #### +PREHOOK: query: alter table orc_tab add partition( dt='20150101' ) +PREHOOK: type: ALTERTABLE_ADDPARTS +PREHOOK: Output: default@orc_tab +POSTHOOK: query: alter table orc_tab add partition( dt='20150101' ) +POSTHOOK: type: ALTERTABLE_ADDPARTS +POSTHOOK: Output: default@orc_tab +POSTHOOK: Output: default@orc_tab@dt=20150101 +PREHOOK: query: describe extended orc_tab +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@orc_tab +POSTHOOK: query: describe extended orc_tab +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@orc_tab +foo string +bar string +dt string + +# Partition Information +# col_name data_type comment + +dt string + +#### A masked pattern was here 
#### +PREHOOK: query: insert into table orc_tab partition( dt='20150101' ) values("1", "one"),("2", "two"), ("3", "three") +PREHOOK: type: QUERY +PREHOOK: Input: default@values__tmp__table__1 +PREHOOK: Output: default@orc_tab@dt=20150101 +POSTHOOK: query: insert into table orc_tab partition( dt='20150101' ) values("1", "one"),("2", "two"), ("3", "three") +POSTHOOK: type: QUERY +POSTHOOK: Input: default@values__tmp__table__1 +POSTHOOK: Output: default@orc_tab@dt=20150101 +POSTHOOK: Lineage: orc_tab PARTITION(dt=20150101).bar SIMPLE [(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col2, type:string, comment:), ] +POSTHOOK: Lineage: orc_tab PARTITION(dt=20150101).foo SIMPLE [(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col1, type:string, comment:), ] +PREHOOK: query: explain extended +select * from orc_tab +PREHOOK: type: QUERY +POSTHOOK: query: explain extended +select * from orc_tab +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + +TOK_QUERY + TOK_FROM + TOK_TABREF + TOK_TABNAME + orc_tab + TOK_INSERT + TOK_DESTINATION + TOK_DIR + TOK_TMP_FILE + TOK_SELECT + TOK_SELEXPR + TOK_ALLCOLREF + + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: orc_tab + Statistics: Num rows: 3 Data size: 516 Basic stats: COMPLETE Column stats: NONE + GatherStats: false + Select Operator + expressions: foo (type: string), bar (type: string), dt (type: string) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 3 Data size: 516 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + Statistics: Num rows: 3 Data size: 516 Basic stats: COMPLETE Column stats: NONE +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0,_col1,_col2 + columns.types string:string:string + escape.delim \ + hive.serialization.extend.additional.nesting.levels true + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: dt=20150101 + input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat + output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat + partition values: + dt 20150101 + properties: + COLUMN_STATS_ACCURATE true + bucket_count -1 + columns foo,bar + columns.comments + columns.types string:string +#### A masked pattern was here #### + name default.orc_tab + numFiles 1 + numRows 3 + partition_columns dt + partition_columns.types string + rawDataSize 516 + serialization.ddl struct orc_tab { string foo, string bar} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.ql.io.orc.OrcSerde + totalSize 315 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde + + input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat + output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat + properties: + bucket_count -1 + columns foo,bar + columns.comments + columns.types string:string +#### A masked pattern was here #### + name 
default.orc_tab + partition_columns dt + partition_columns.types string + serialization.ddl struct orc_tab { string foo, string bar} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.ql.io.orc.OrcSerde +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde + name: default.orc_tab + name: default.orc_tab + Truncated Path -> Alias: + /orc_tab/dt=20150101 [orc_tab] + Execution mode: vectorized + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select * from orc_tab +PREHOOK: type: QUERY +PREHOOK: Input: default@orc_tab +PREHOOK: Input: default@orc_tab@dt=20150101 +#### A masked pattern was here #### +POSTHOOK: query: select * from orc_tab +POSTHOOK: type: QUERY +POSTHOOK: Input: default@orc_tab +POSTHOOK: Input: default@orc_tab@dt=20150101 +#### A masked pattern was here #### +1 one 20150101 +2 two 20150101 +3 three 20150101 +PREHOOK: query: alter table orc_tab add columns( goo string ) +PREHOOK: type: ALTERTABLE_ADDCOLS +PREHOOK: Input: default@orc_tab +PREHOOK: Output: default@orc_tab +POSTHOOK: query: alter table orc_tab add columns( goo string ) +POSTHOOK: type: ALTERTABLE_ADDCOLS +POSTHOOK: Input: default@orc_tab +POSTHOOK: Output: default@orc_tab +PREHOOK: query: describe extended orc_tab +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@orc_tab +POSTHOOK: query: describe extended orc_tab +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@orc_tab +foo string +bar string +goo string +dt string + +# Partition Information +# col_name data_type comment + +dt string + +#### A masked pattern was here #### +PREHOOK: query: explain extended +select * from orc_tab +PREHOOK: type: QUERY +POSTHOOK: query: explain extended +select * from orc_tab +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + +TOK_QUERY + TOK_FROM + TOK_TABREF + TOK_TABNAME + orc_tab + TOK_INSERT + TOK_DESTINATION + TOK_DIR + TOK_TMP_FILE + TOK_SELECT + TOK_SELEXPR + TOK_ALLCOLREF + + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: orc_tab + Statistics: Num rows: 3 Data size: 516 Basic stats: COMPLETE Column stats: NONE + GatherStats: false + Select Operator + expressions: foo (type: string), bar (type: string), goo (type: string), dt (type: string) + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 3 Data size: 516 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + Statistics: Num rows: 3 Data size: 516 Basic stats: COMPLETE Column stats: NONE +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0,_col1,_col2,_col3 + columns.types string:string:string:string + escape.delim \ + hive.serialization.extend.additional.nesting.levels true + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: dt=20150101 + input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat + output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat + 
partition values: + dt 20150101 + properties: + COLUMN_STATS_ACCURATE true + bucket_count -1 + columns foo,bar + columns.comments + columns.types string:string +#### A masked pattern was here #### + name default.orc_tab + numFiles 1 + numRows 3 + partition_columns dt + partition_columns.types string + rawDataSize 516 + serialization.ddl struct orc_tab { string foo, string bar} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.ql.io.orc.OrcSerde + totalSize 315 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde + + input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat + output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat + properties: + bucket_count -1 + columns foo,bar,goo + columns.comments + columns.types string:string:string +#### A masked pattern was here #### + name default.orc_tab + partition_columns dt + partition_columns.types string + serialization.ddl struct orc_tab { string foo, string bar, string goo} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.ql.io.orc.OrcSerde +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde + name: default.orc_tab + name: default.orc_tab + Truncated Path -> Alias: + /orc_tab/dt=20150101 [orc_tab] + Execution mode: vectorized + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select * from orc_tab +PREHOOK: type: QUERY +PREHOOK: Input: default@orc_tab +PREHOOK: Input: default@orc_tab@dt=20150101 +#### A masked pattern was here #### +POSTHOOK: query: select * from orc_tab +POSTHOOK: type: QUERY +POSTHOOK: Input: default@orc_tab +POSTHOOK: Input: default@orc_tab@dt=20150101 +#### A masked pattern was here #### +1 one NULL 20150101 +2 two NULL 20150101 +3 three NULL 20150101 diff --git ql/src/test/results/clientpositive/vector_schema_evolution2.q.out ql/src/test/results/clientpositive/vector_schema_evolution2.q.out new file mode 100644 index 0000000..699cce5 --- /dev/null +++ ql/src/test/results/clientpositive/vector_schema_evolution2.q.out @@ -0,0 +1,342 @@ +PREHOOK: query: create table orc_tab ( foo string, bar string ) partitioned by (dt string) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@orc_tab +POSTHOOK: query: create table orc_tab ( foo string, bar string ) partitioned by (dt string) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@orc_tab +PREHOOK: query: describe extended orc_tab +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@orc_tab +POSTHOOK: query: describe extended orc_tab +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@orc_tab +foo string +bar string +dt string + +# Partition Information +# col_name data_type comment + +dt string + +#### A masked pattern was here #### +PREHOOK: query: alter table orc_tab add partition( dt='20150101' ) +PREHOOK: type: ALTERTABLE_ADDPARTS +PREHOOK: Output: default@orc_tab +POSTHOOK: query: alter table orc_tab add partition( dt='20150101' ) +POSTHOOK: type: ALTERTABLE_ADDPARTS +POSTHOOK: Output: default@orc_tab +POSTHOOK: Output: default@orc_tab@dt=20150101 +PREHOOK: query: describe extended orc_tab +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@orc_tab +POSTHOOK: query: describe extended orc_tab +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@orc_tab +foo string +bar string +dt string + +# Partition Information +# col_name data_type comment + +dt string + +#### A masked pattern was here #### +PREHOOK: query: insert into table 
orc_tab partition( dt='20150101' ) values("1", "one"),("2", "two"), ("3", "three") +PREHOOK: type: QUERY +PREHOOK: Input: default@values__tmp__table__1 +PREHOOK: Output: default@orc_tab@dt=20150101 +POSTHOOK: query: insert into table orc_tab partition( dt='20150101' ) values("1", "one"),("2", "two"), ("3", "three") +POSTHOOK: type: QUERY +POSTHOOK: Input: default@values__tmp__table__1 +POSTHOOK: Output: default@orc_tab@dt=20150101 +POSTHOOK: Lineage: orc_tab PARTITION(dt=20150101).bar SIMPLE [(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col2, type:string, comment:), ] +POSTHOOK: Lineage: orc_tab PARTITION(dt=20150101).foo SIMPLE [(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col1, type:string, comment:), ] +PREHOOK: query: explain extended +select * from orc_tab +PREHOOK: type: QUERY +POSTHOOK: query: explain extended +select * from orc_tab +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + +TOK_QUERY + TOK_FROM + TOK_TABREF + TOK_TABNAME + orc_tab + TOK_INSERT + TOK_DESTINATION + TOK_DIR + TOK_TMP_FILE + TOK_SELECT + TOK_SELEXPR + TOK_ALLCOLREF + + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: orc_tab + Statistics: Num rows: 3 Data size: 17 Basic stats: COMPLETE Column stats: NONE + GatherStats: false + Select Operator + expressions: foo (type: string), bar (type: string), dt (type: string) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 3 Data size: 17 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + Statistics: Num rows: 3 Data size: 17 Basic stats: COMPLETE Column stats: NONE +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0,_col1,_col2 + columns.types string:string:string + escape.delim \ + hive.serialization.extend.additional.nesting.levels true + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: dt=20150101 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + dt 20150101 + properties: + COLUMN_STATS_ACCURATE true + bucket_count -1 + columns foo,bar + columns.comments + columns.types string:string +#### A masked pattern was here #### + name default.orc_tab + numFiles 1 + numRows 3 + partition_columns dt + partition_columns.types string + rawDataSize 17 + serialization.ddl struct orc_tab { string foo, string bar} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 20 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + bucket_count -1 + columns foo,bar + columns.comments + columns.types string:string +#### A masked pattern was here #### + name default.orc_tab + 
partition_columns dt + partition_columns.types string + serialization.ddl struct orc_tab { string foo, string bar} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.orc_tab + name: default.orc_tab + Truncated Path -> Alias: + /orc_tab/dt=20150101 [orc_tab] + Execution mode: vectorized + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select * from orc_tab +PREHOOK: type: QUERY +PREHOOK: Input: default@orc_tab +PREHOOK: Input: default@orc_tab@dt=20150101 +#### A masked pattern was here #### +POSTHOOK: query: select * from orc_tab +POSTHOOK: type: QUERY +POSTHOOK: Input: default@orc_tab +POSTHOOK: Input: default@orc_tab@dt=20150101 +#### A masked pattern was here #### +1 one 20150101 +2 two 20150101 +3 three 20150101 +PREHOOK: query: alter table orc_tab add columns( goo string ) +PREHOOK: type: ALTERTABLE_ADDCOLS +PREHOOK: Input: default@orc_tab +PREHOOK: Output: default@orc_tab +POSTHOOK: query: alter table orc_tab add columns( goo string ) +POSTHOOK: type: ALTERTABLE_ADDCOLS +POSTHOOK: Input: default@orc_tab +POSTHOOK: Output: default@orc_tab +PREHOOK: query: describe extended orc_tab +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@orc_tab +POSTHOOK: query: describe extended orc_tab +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@orc_tab +foo string +bar string +goo string +dt string + +# Partition Information +# col_name data_type comment + +dt string + +#### A masked pattern was here #### +PREHOOK: query: explain extended +select * from orc_tab +PREHOOK: type: QUERY +POSTHOOK: query: explain extended +select * from orc_tab +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + +TOK_QUERY + TOK_FROM + TOK_TABREF + TOK_TABNAME + orc_tab + TOK_INSERT + TOK_DESTINATION + TOK_DIR + TOK_TMP_FILE + TOK_SELECT + TOK_SELEXPR + TOK_ALLCOLREF + + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: orc_tab + Statistics: Num rows: 3 Data size: 17 Basic stats: COMPLETE Column stats: NONE + GatherStats: false + Select Operator + expressions: foo (type: string), bar (type: string), goo (type: string), dt (type: string) + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 3 Data size: 17 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + Statistics: Num rows: 3 Data size: 17 Basic stats: COMPLETE Column stats: NONE +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0,_col1,_col2,_col3 + columns.types string:string:string:string + escape.delim \ + hive.serialization.extend.additional.nesting.levels true + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: dt=20150101 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + 
partition values: + dt 20150101 + properties: + COLUMN_STATS_ACCURATE true + bucket_count -1 + columns foo,bar + columns.comments + columns.types string:string +#### A masked pattern was here #### + name default.orc_tab + numFiles 1 + numRows 3 + partition_columns dt + partition_columns.types string + rawDataSize 17 + serialization.ddl struct orc_tab { string foo, string bar} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 20 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + bucket_count -1 + columns foo,bar,goo + columns.comments + columns.types string:string:string +#### A masked pattern was here #### + name default.orc_tab + partition_columns dt + partition_columns.types string + serialization.ddl struct orc_tab { string foo, string bar, string goo} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.orc_tab + name: default.orc_tab + Truncated Path -> Alias: + /orc_tab/dt=20150101 [orc_tab] + Execution mode: vectorized + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select * from orc_tab +PREHOOK: type: QUERY +PREHOOK: Input: default@orc_tab +PREHOOK: Input: default@orc_tab@dt=20150101 +#### A masked pattern was here #### +POSTHOOK: query: select * from orc_tab +POSTHOOK: type: QUERY +POSTHOOK: Input: default@orc_tab +POSTHOOK: Input: default@orc_tab@dt=20150101 +#### A masked pattern was here #### +1 one NULL 20150101 +2 two NULL 20150101 +3 three NULL 20150101 diff --git ql/src/test/results/clientpositive/vector_schema_evolution3.q.out ql/src/test/results/clientpositive/vector_schema_evolution3.q.out new file mode 100644 index 0000000..c27421c --- /dev/null +++ ql/src/test/results/clientpositive/vector_schema_evolution3.q.out @@ -0,0 +1,341 @@ +PREHOOK: query: create table orc_tab ( foo string, bar string ) partitioned by (dt string) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@orc_tab +POSTHOOK: query: create table orc_tab ( foo string, bar string ) partitioned by (dt string) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@orc_tab +PREHOOK: query: describe extended orc_tab +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@orc_tab +POSTHOOK: query: describe extended orc_tab +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@orc_tab +foo string +bar string +dt string + +# Partition Information +# col_name data_type comment + +dt string + +#### A masked pattern was here #### +PREHOOK: query: alter table orc_tab add partition( dt='20150101' ) +PREHOOK: type: ALTERTABLE_ADDPARTS +PREHOOK: Output: default@orc_tab +POSTHOOK: query: alter table orc_tab add partition( dt='20150101' ) +POSTHOOK: type: ALTERTABLE_ADDPARTS +POSTHOOK: Output: default@orc_tab +POSTHOOK: Output: default@orc_tab@dt=20150101 +PREHOOK: query: describe extended orc_tab +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@orc_tab +POSTHOOK: query: describe extended orc_tab +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@orc_tab +foo string +bar string +dt string + +# Partition Information +# col_name data_type comment + +dt string + +#### A masked pattern was here #### 
+PREHOOK: query: insert into table orc_tab partition( dt='20150101' ) values("1", "one"),("2", "two"), ("3", "three") +PREHOOK: type: QUERY +PREHOOK: Input: default@values__tmp__table__1 +PREHOOK: Output: default@orc_tab@dt=20150101 +POSTHOOK: query: insert into table orc_tab partition( dt='20150101' ) values("1", "one"),("2", "two"), ("3", "three") +POSTHOOK: type: QUERY +POSTHOOK: Input: default@values__tmp__table__1 +POSTHOOK: Output: default@orc_tab@dt=20150101 +POSTHOOK: Lineage: orc_tab PARTITION(dt=20150101).bar SIMPLE [(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col2, type:string, comment:), ] +POSTHOOK: Lineage: orc_tab PARTITION(dt=20150101).foo SIMPLE [(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col1, type:string, comment:), ] +PREHOOK: query: explain extended +select * from orc_tab +PREHOOK: type: QUERY +POSTHOOK: query: explain extended +select * from orc_tab +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + +TOK_QUERY + TOK_FROM + TOK_TABREF + TOK_TABNAME + orc_tab + TOK_INSERT + TOK_DESTINATION + TOK_DIR + TOK_TMP_FILE + TOK_SELECT + TOK_SELEXPR + TOK_ALLCOLREF + + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: orc_tab + Statistics: Num rows: 3 Data size: 17 Basic stats: COMPLETE Column stats: NONE + GatherStats: false + Select Operator + expressions: foo (type: string), bar (type: string), dt (type: string) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 3 Data size: 17 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + Statistics: Num rows: 3 Data size: 17 Basic stats: COMPLETE Column stats: NONE +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0,_col1,_col2 + columns.types string:string:string + escape.delim \ + hive.serialization.extend.additional.nesting.levels true + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: dt=20150101 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + dt 20150101 + properties: + COLUMN_STATS_ACCURATE true + bucket_count -1 + columns foo,bar + columns.comments + columns.types string:string +#### A masked pattern was here #### + name default.orc_tab + numFiles 1 + numRows 3 + partition_columns dt + partition_columns.types string + rawDataSize 17 + serialization.ddl struct orc_tab { string foo, string bar} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 20 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + bucket_count -1 + columns foo,bar + columns.comments + columns.types string:string +#### A masked pattern was here #### + 
name default.orc_tab + partition_columns dt + partition_columns.types string + serialization.ddl struct orc_tab { string foo, string bar} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.orc_tab + name: default.orc_tab + Truncated Path -> Alias: + /orc_tab/dt=20150101 [orc_tab] + Execution mode: vectorized + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select * from orc_tab +PREHOOK: type: QUERY +PREHOOK: Input: default@orc_tab +PREHOOK: Input: default@orc_tab@dt=20150101 +#### A masked pattern was here #### +POSTHOOK: query: select * from orc_tab +POSTHOOK: type: QUERY +POSTHOOK: Input: default@orc_tab +POSTHOOK: Input: default@orc_tab@dt=20150101 +#### A masked pattern was here #### +1 one 20150101 +2 two 20150101 +3 three 20150101 +PREHOOK: query: alter table orc_tab CHANGE COLUMN foo foo int +PREHOOK: type: ALTERTABLE_RENAMECOL +PREHOOK: Input: default@orc_tab +PREHOOK: Output: default@orc_tab +POSTHOOK: query: alter table orc_tab CHANGE COLUMN foo foo int +POSTHOOK: type: ALTERTABLE_RENAMECOL +POSTHOOK: Input: default@orc_tab +POSTHOOK: Output: default@orc_tab +PREHOOK: query: describe extended orc_tab +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@orc_tab +POSTHOOK: query: describe extended orc_tab +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@orc_tab +foo int +bar string +dt string + +# Partition Information +# col_name data_type comment + +dt string + +#### A masked pattern was here #### +PREHOOK: query: explain extended +select * from orc_tab +PREHOOK: type: QUERY +POSTHOOK: query: explain extended +select * from orc_tab +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + +TOK_QUERY + TOK_FROM + TOK_TABREF + TOK_TABNAME + orc_tab + TOK_INSERT + TOK_DESTINATION + TOK_DIR + TOK_TMP_FILE + TOK_SELECT + TOK_SELEXPR + TOK_ALLCOLREF + + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: orc_tab + Statistics: Num rows: 3 Data size: 17 Basic stats: COMPLETE Column stats: NONE + GatherStats: false + Select Operator + expressions: foo (type: int), bar (type: string), dt (type: string) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 3 Data size: 17 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + Statistics: Num rows: 3 Data size: 17 Basic stats: COMPLETE Column stats: NONE +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0,_col1,_col2 + columns.types int:string:string + escape.delim \ + hive.serialization.extend.additional.nesting.levels true + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: dt=20150101 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + dt 20150101 + 
properties: + COLUMN_STATS_ACCURATE true + bucket_count -1 + columns foo,bar + columns.comments + columns.types string:string +#### A masked pattern was here #### + name default.orc_tab + numFiles 1 + numRows 3 + partition_columns dt + partition_columns.types string + rawDataSize 17 + serialization.ddl struct orc_tab { string foo, string bar} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 20 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + bucket_count -1 + columns foo,bar + columns.comments + columns.types int:string +#### A masked pattern was here #### + name default.orc_tab + partition_columns dt + partition_columns.types string + serialization.ddl struct orc_tab { i32 foo, string bar} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.orc_tab + name: default.orc_tab + Truncated Path -> Alias: + /orc_tab/dt=20150101 [orc_tab] + Execution mode: vectorized + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select * from orc_tab +PREHOOK: type: QUERY +PREHOOK: Input: default@orc_tab +PREHOOK: Input: default@orc_tab@dt=20150101 +#### A masked pattern was here #### +POSTHOOK: query: select * from orc_tab +POSTHOOK: type: QUERY +POSTHOOK: Input: default@orc_tab +POSTHOOK: Input: default@orc_tab@dt=20150101 +#### A masked pattern was here #### +1 one 20150101 +2 two 20150101 +3 three 20150101 diff --git ql/src/test/results/clientpositive/vector_schema_evolution4.q.out ql/src/test/results/clientpositive/vector_schema_evolution4.q.out new file mode 100644 index 0000000..b6f70b4 --- /dev/null +++ ql/src/test/results/clientpositive/vector_schema_evolution4.q.out @@ -0,0 +1,343 @@ +PREHOOK: query: create table orc_tab ( foo smallint, bar string ) partitioned by (dt string) +ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe' STORED AS SEQUENCEFILE +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@orc_tab +POSTHOOK: query: create table orc_tab ( foo smallint, bar string ) partitioned by (dt string) +ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe' STORED AS SEQUENCEFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@orc_tab +PREHOOK: query: describe extended orc_tab +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@orc_tab +POSTHOOK: query: describe extended orc_tab +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@orc_tab +foo smallint +bar string +dt string + +# Partition Information +# col_name data_type comment + +dt string + +#### A masked pattern was here #### +PREHOOK: query: alter table orc_tab add partition( dt='20150101' ) +PREHOOK: type: ALTERTABLE_ADDPARTS +PREHOOK: Output: default@orc_tab +POSTHOOK: query: alter table orc_tab add partition( dt='20150101' ) +POSTHOOK: type: ALTERTABLE_ADDPARTS +POSTHOOK: Output: default@orc_tab +POSTHOOK: Output: default@orc_tab@dt=20150101 +PREHOOK: query: describe extended orc_tab +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@orc_tab +POSTHOOK: query: describe extended orc_tab +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@orc_tab +foo smallint +bar 
string +dt string + +# Partition Information +# col_name data_type comment + +dt string + +#### A masked pattern was here #### +PREHOOK: query: insert into table orc_tab partition( dt='20150101' ) values("1", "one"),("2", "two"), ("3", "three") +PREHOOK: type: QUERY +PREHOOK: Input: default@values__tmp__table__1 +PREHOOK: Output: default@orc_tab@dt=20150101 +POSTHOOK: query: insert into table orc_tab partition( dt='20150101' ) values("1", "one"),("2", "two"), ("3", "three") +POSTHOOK: type: QUERY +POSTHOOK: Input: default@values__tmp__table__1 +POSTHOOK: Output: default@orc_tab@dt=20150101 +POSTHOOK: Lineage: orc_tab PARTITION(dt=20150101).bar SIMPLE [(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col2, type:string, comment:), ] +POSTHOOK: Lineage: orc_tab PARTITION(dt=20150101).foo EXPRESSION [(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col1, type:string, comment:), ] +PREHOOK: query: explain extended +select * from orc_tab +PREHOOK: type: QUERY +POSTHOOK: query: explain extended +select * from orc_tab +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + +TOK_QUERY + TOK_FROM + TOK_TABREF + TOK_TABNAME + orc_tab + TOK_INSERT + TOK_DESTINATION + TOK_DIR + TOK_TMP_FILE + TOK_SELECT + TOK_SELEXPR + TOK_ALLCOLREF + + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: orc_tab + Statistics: Num rows: 3 Data size: 23 Basic stats: COMPLETE Column stats: NONE + GatherStats: false + Select Operator + expressions: foo (type: smallint), bar (type: string), dt (type: string) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 3 Data size: 23 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + Statistics: Num rows: 3 Data size: 23 Basic stats: COMPLETE Column stats: NONE +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0,_col1,_col2 + columns.types smallint:string:string + escape.delim \ + hive.serialization.extend.additional.nesting.levels true + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: dt=20150101 + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + partition values: + dt 20150101 + properties: + COLUMN_STATS_ACCURATE true + bucket_count -1 + columns foo,bar + columns.comments + columns.types smallint:string +#### A masked pattern was here #### + name default.orc_tab + numFiles 1 + numRows 3 + partition_columns dt + partition_columns.types string + rawDataSize 23 + serialization.ddl struct orc_tab { i16 foo, string bar} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe + totalSize 167 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe + + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: 
org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + bucket_count -1 + columns foo,bar + columns.comments + columns.types smallint:string +#### A masked pattern was here #### + name default.orc_tab + partition_columns dt + partition_columns.types string + serialization.ddl struct orc_tab { i16 foo, string bar} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe + name: default.orc_tab + name: default.orc_tab + Truncated Path -> Alias: + /orc_tab/dt=20150101 [orc_tab] + Execution mode: vectorized + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select * from orc_tab +PREHOOK: type: QUERY +PREHOOK: Input: default@orc_tab +PREHOOK: Input: default@orc_tab@dt=20150101 +#### A masked pattern was here #### +POSTHOOK: query: select * from orc_tab +POSTHOOK: type: QUERY +POSTHOOK: Input: default@orc_tab +POSTHOOK: Input: default@orc_tab@dt=20150101 +#### A masked pattern was here #### +1 one 20150101 +2 two 20150101 +3 three 20150101 +PREHOOK: query: alter table orc_tab CHANGE COLUMN foo foo int +PREHOOK: type: ALTERTABLE_RENAMECOL +PREHOOK: Input: default@orc_tab +PREHOOK: Output: default@orc_tab +POSTHOOK: query: alter table orc_tab CHANGE COLUMN foo foo int +POSTHOOK: type: ALTERTABLE_RENAMECOL +POSTHOOK: Input: default@orc_tab +POSTHOOK: Output: default@orc_tab +PREHOOK: query: describe extended orc_tab +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@orc_tab +POSTHOOK: query: describe extended orc_tab +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@orc_tab +foo int +bar string +dt string + +# Partition Information +# col_name data_type comment + +dt string + +#### A masked pattern was here #### +PREHOOK: query: explain extended +select * from orc_tab +PREHOOK: type: QUERY +POSTHOOK: query: explain extended +select * from orc_tab +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + +TOK_QUERY + TOK_FROM + TOK_TABREF + TOK_TABNAME + orc_tab + TOK_INSERT + TOK_DESTINATION + TOK_DIR + TOK_TMP_FILE + TOK_SELECT + TOK_SELEXPR + TOK_ALLCOLREF + + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: orc_tab + Statistics: Num rows: 3 Data size: 23 Basic stats: COMPLETE Column stats: NONE + GatherStats: false + Select Operator + expressions: foo (type: int), bar (type: string), dt (type: string) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 3 Data size: 23 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + Statistics: Num rows: 3 Data size: 23 Basic stats: COMPLETE Column stats: NONE +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0,_col1,_col2 + columns.types int:string:string + escape.delim \ + hive.serialization.extend.additional.nesting.levels true + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + 
Partition + base file name: dt=20150101 + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + partition values: + dt 20150101 + properties: + COLUMN_STATS_ACCURATE true + bucket_count -1 + columns foo,bar + columns.comments + columns.types smallint:string +#### A masked pattern was here #### + name default.orc_tab + numFiles 1 + numRows 3 + partition_columns dt + partition_columns.types string + rawDataSize 23 + serialization.ddl struct orc_tab { i16 foo, string bar} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe + totalSize 167 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe + + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + bucket_count -1 + columns foo,bar + columns.comments + columns.types int:string +#### A masked pattern was here #### + name default.orc_tab + partition_columns dt + partition_columns.types string + serialization.ddl struct orc_tab { i32 foo, string bar} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe + name: default.orc_tab + name: default.orc_tab + Truncated Path -> Alias: + /orc_tab/dt=20150101 [orc_tab] + Execution mode: vectorized + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select * from orc_tab +PREHOOK: type: QUERY +PREHOOK: Input: default@orc_tab +PREHOOK: Input: default@orc_tab@dt=20150101 +#### A masked pattern was here #### +POSTHOOK: query: select * from orc_tab +POSTHOOK: type: QUERY +POSTHOOK: Input: default@orc_tab +POSTHOOK: Input: default@orc_tab@dt=20150101 +#### A masked pattern was here #### +1 one 20150101 +2 two 20150101 +3 three 20150101 diff --git ql/src/test/results/clientpositive/vector_schema_evolution5.q.out ql/src/test/results/clientpositive/vector_schema_evolution5.q.out new file mode 100644 index 0000000..307659c --- /dev/null +++ ql/src/test/results/clientpositive/vector_schema_evolution5.q.out @@ -0,0 +1,341 @@ +PREHOOK: query: create table orc_tab ( foo int, bar string ) partitioned by (dt string) stored as orc +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@orc_tab +POSTHOOK: query: create table orc_tab ( foo int, bar string ) partitioned by (dt string) stored as orc +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@orc_tab +PREHOOK: query: describe extended orc_tab +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@orc_tab +POSTHOOK: query: describe extended orc_tab +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@orc_tab +foo int +bar string +dt string + +# Partition Information +# col_name data_type comment + +dt string + +#### A masked pattern was here #### +PREHOOK: query: alter table orc_tab add partition( dt='20150101' ) +PREHOOK: type: ALTERTABLE_ADDPARTS +PREHOOK: Output: default@orc_tab +POSTHOOK: query: alter table orc_tab add partition( dt='20150101' ) +POSTHOOK: type: ALTERTABLE_ADDPARTS +POSTHOOK: Output: default@orc_tab +POSTHOOK: Output: default@orc_tab@dt=20150101 +PREHOOK: query: describe extended orc_tab +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@orc_tab +POSTHOOK: query: describe extended orc_tab +POSTHOOK: type: 
DESCTABLE +POSTHOOK: Input: default@orc_tab +foo int +bar string +dt string + +# Partition Information +# col_name data_type comment + +dt string + +#### A masked pattern was here #### +PREHOOK: query: insert into table orc_tab partition( dt='20150101' ) values("1", "one"),("2", "two"), ("3", "three") +PREHOOK: type: QUERY +PREHOOK: Input: default@values__tmp__table__1 +PREHOOK: Output: default@orc_tab@dt=20150101 +POSTHOOK: query: insert into table orc_tab partition( dt='20150101' ) values("1", "one"),("2", "two"), ("3", "three") +POSTHOOK: type: QUERY +POSTHOOK: Input: default@values__tmp__table__1 +POSTHOOK: Output: default@orc_tab@dt=20150101 +POSTHOOK: Lineage: orc_tab PARTITION(dt=20150101).bar SIMPLE [(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col2, type:string, comment:), ] +POSTHOOK: Lineage: orc_tab PARTITION(dt=20150101).foo EXPRESSION [(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col1, type:string, comment:), ] +PREHOOK: query: explain extended +select * from orc_tab +PREHOOK: type: QUERY +POSTHOOK: query: explain extended +select * from orc_tab +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + +TOK_QUERY + TOK_FROM + TOK_TABREF + TOK_TABNAME + orc_tab + TOK_INSERT + TOK_DESTINATION + TOK_DIR + TOK_TMP_FILE + TOK_SELECT + TOK_SELEXPR + TOK_ALLCOLREF + + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: orc_tab + Statistics: Num rows: 3 Data size: 273 Basic stats: COMPLETE Column stats: NONE + GatherStats: false + Select Operator + expressions: foo (type: int), bar (type: string), dt (type: string) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 3 Data size: 273 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + Statistics: Num rows: 3 Data size: 273 Basic stats: COMPLETE Column stats: NONE +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0,_col1,_col2 + columns.types int:string:string + escape.delim \ + hive.serialization.extend.additional.nesting.levels true + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: dt=20150101 + input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat + output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat + partition values: + dt 20150101 + properties: + COLUMN_STATS_ACCURATE true + bucket_count -1 + columns foo,bar + columns.comments + columns.types int:string +#### A masked pattern was here #### + name default.orc_tab + numFiles 1 + numRows 3 + partition_columns dt + partition_columns.types string + rawDataSize 273 + serialization.ddl struct orc_tab { i32 foo, string bar} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.ql.io.orc.OrcSerde + totalSize 301 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde + + input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat + output format: 
org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat + properties: + bucket_count -1 + columns foo,bar + columns.comments + columns.types int:string +#### A masked pattern was here #### + name default.orc_tab + partition_columns dt + partition_columns.types string + serialization.ddl struct orc_tab { i32 foo, string bar} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.ql.io.orc.OrcSerde +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde + name: default.orc_tab + name: default.orc_tab + Truncated Path -> Alias: + /orc_tab/dt=20150101 [orc_tab] + Execution mode: vectorized + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select * from orc_tab +PREHOOK: type: QUERY +PREHOOK: Input: default@orc_tab +PREHOOK: Input: default@orc_tab@dt=20150101 +#### A masked pattern was here #### +POSTHOOK: query: select * from orc_tab +POSTHOOK: type: QUERY +POSTHOOK: Input: default@orc_tab +POSTHOOK: Input: default@orc_tab@dt=20150101 +#### A masked pattern was here #### +1 one 20150101 +2 two 20150101 +3 three 20150101 +PREHOOK: query: alter table orc_tab CHANGE COLUMN foo foo bigint +PREHOOK: type: ALTERTABLE_RENAMECOL +PREHOOK: Input: default@orc_tab +PREHOOK: Output: default@orc_tab +POSTHOOK: query: alter table orc_tab CHANGE COLUMN foo foo bigint +POSTHOOK: type: ALTERTABLE_RENAMECOL +POSTHOOK: Input: default@orc_tab +POSTHOOK: Output: default@orc_tab +PREHOOK: query: describe extended orc_tab +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@orc_tab +POSTHOOK: query: describe extended orc_tab +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@orc_tab +foo bigint +bar string +dt string + +# Partition Information +# col_name data_type comment + +dt string + +#### A masked pattern was here #### +PREHOOK: query: explain extended +select * from orc_tab +PREHOOK: type: QUERY +POSTHOOK: query: explain extended +select * from orc_tab +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + +TOK_QUERY + TOK_FROM + TOK_TABREF + TOK_TABNAME + orc_tab + TOK_INSERT + TOK_DESTINATION + TOK_DIR + TOK_TMP_FILE + TOK_SELECT + TOK_SELEXPR + TOK_ALLCOLREF + + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: orc_tab + Statistics: Num rows: 3 Data size: 273 Basic stats: COMPLETE Column stats: NONE + GatherStats: false + Select Operator + expressions: foo (type: bigint), bar (type: string), dt (type: string) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 3 Data size: 273 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + Statistics: Num rows: 3 Data size: 273 Basic stats: COMPLETE Column stats: NONE +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0,_col1,_col2 + columns.types bigint:string:string + escape.delim \ + hive.serialization.extend.additional.nesting.levels true + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: 
dt=20150101 + input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat + output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat + partition values: + dt 20150101 + properties: + COLUMN_STATS_ACCURATE true + bucket_count -1 + columns foo,bar + columns.comments + columns.types int:string +#### A masked pattern was here #### + name default.orc_tab + numFiles 1 + numRows 3 + partition_columns dt + partition_columns.types string + rawDataSize 273 + serialization.ddl struct orc_tab { i32 foo, string bar} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.ql.io.orc.OrcSerde + totalSize 301 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde + + input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat + output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat + properties: + bucket_count -1 + columns foo,bar + columns.comments + columns.types bigint:string +#### A masked pattern was here #### + name default.orc_tab + partition_columns dt + partition_columns.types string + serialization.ddl struct orc_tab { i64 foo, string bar} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.ql.io.orc.OrcSerde +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde + name: default.orc_tab + name: default.orc_tab + Truncated Path -> Alias: + /orc_tab/dt=20150101 [orc_tab] + Execution mode: vectorized + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select * from orc_tab +PREHOOK: type: QUERY +PREHOOK: Input: default@orc_tab +PREHOOK: Input: default@orc_tab@dt=20150101 +#### A masked pattern was here #### +POSTHOOK: query: select * from orc_tab +POSTHOOK: type: QUERY +POSTHOOK: Input: default@orc_tab +POSTHOOK: Input: default@orc_tab@dt=20150101 +#### A masked pattern was here #### +1 one 20150101 +2 two 20150101 +3 three 20150101 diff --git ql/src/test/results/clientpositive/vector_schema_evolution6.q.out ql/src/test/results/clientpositive/vector_schema_evolution6.q.out new file mode 100644 index 0000000..371ec87 --- /dev/null +++ ql/src/test/results/clientpositive/vector_schema_evolution6.q.out @@ -0,0 +1,340 @@ +PREHOOK: query: create table orc_tab ( foo int, bar string ) partitioned by (dt string) stored as orc +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@orc_tab +POSTHOOK: query: create table orc_tab ( foo int, bar string ) partitioned by (dt string) stored as orc +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@orc_tab +PREHOOK: query: describe extended orc_tab +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@orc_tab +POSTHOOK: query: describe extended orc_tab +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@orc_tab +foo int +bar string +dt string + +# Partition Information +# col_name data_type comment + +dt string + +#### A masked pattern was here #### +PREHOOK: query: alter table orc_tab add partition( dt='20150101' ) +PREHOOK: type: ALTERTABLE_ADDPARTS +PREHOOK: Output: default@orc_tab +POSTHOOK: query: alter table orc_tab add partition( dt='20150101' ) +POSTHOOK: type: ALTERTABLE_ADDPARTS +POSTHOOK: Output: default@orc_tab +POSTHOOK: Output: default@orc_tab@dt=20150101 +PREHOOK: query: describe extended orc_tab +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@orc_tab +POSTHOOK: query: describe extended orc_tab +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@orc_tab +foo int +bar string +dt string + +# Partition Information +# 
col_name data_type comment + +dt string + +#### A masked pattern was here #### +PREHOOK: query: insert into table orc_tab partition( dt='20150101' ) values("1", "one"),("2", "two"), ("3", "three") +PREHOOK: type: QUERY +PREHOOK: Input: default@values__tmp__table__1 +PREHOOK: Output: default@orc_tab@dt=20150101 +POSTHOOK: query: insert into table orc_tab partition( dt='20150101' ) values("1", "one"),("2", "two"), ("3", "three") +POSTHOOK: type: QUERY +POSTHOOK: Input: default@values__tmp__table__1 +POSTHOOK: Output: default@orc_tab@dt=20150101 +POSTHOOK: Lineage: orc_tab PARTITION(dt=20150101).bar SIMPLE [(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col2, type:string, comment:), ] +POSTHOOK: Lineage: orc_tab PARTITION(dt=20150101).foo EXPRESSION [(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col1, type:string, comment:), ] +PREHOOK: query: explain extended +select * from orc_tab +PREHOOK: type: QUERY +POSTHOOK: query: explain extended +select * from orc_tab +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + +TOK_QUERY + TOK_FROM + TOK_TABREF + TOK_TABNAME + orc_tab + TOK_INSERT + TOK_DESTINATION + TOK_DIR + TOK_TMP_FILE + TOK_SELECT + TOK_SELEXPR + TOK_ALLCOLREF + + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: orc_tab + Statistics: Num rows: 3 Data size: 273 Basic stats: COMPLETE Column stats: NONE + GatherStats: false + Select Operator + expressions: foo (type: int), bar (type: string), dt (type: string) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 3 Data size: 273 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + Statistics: Num rows: 3 Data size: 273 Basic stats: COMPLETE Column stats: NONE +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0,_col1,_col2 + columns.types int:string:string + escape.delim \ + hive.serialization.extend.additional.nesting.levels true + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: dt=20150101 + input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat + output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat + partition values: + dt 20150101 + properties: + COLUMN_STATS_ACCURATE true + bucket_count -1 + columns foo,bar + columns.comments + columns.types int:string +#### A masked pattern was here #### + name default.orc_tab + numFiles 1 + numRows 3 + partition_columns dt + partition_columns.types string + rawDataSize 273 + serialization.ddl struct orc_tab { i32 foo, string bar} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.ql.io.orc.OrcSerde + totalSize 301 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde + + input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat + output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat + properties: + bucket_count -1 + columns foo,bar + columns.comments + columns.types 
int:string +#### A masked pattern was here #### + name default.orc_tab + partition_columns dt + partition_columns.types string + serialization.ddl struct orc_tab { i32 foo, string bar} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.ql.io.orc.OrcSerde +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde + name: default.orc_tab + name: default.orc_tab + Truncated Path -> Alias: + /orc_tab/dt=20150101 [orc_tab] + Execution mode: vectorized + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select * from orc_tab +PREHOOK: type: QUERY +PREHOOK: Input: default@orc_tab +PREHOOK: Input: default@orc_tab@dt=20150101 +#### A masked pattern was here #### +POSTHOOK: query: select * from orc_tab +POSTHOOK: type: QUERY +POSTHOOK: Input: default@orc_tab +POSTHOOK: Input: default@orc_tab@dt=20150101 +#### A masked pattern was here #### +1 one 20150101 +2 two 20150101 +3 three 20150101 +PREHOOK: query: alter table orc_tab CHANGE COLUMN foo foo double +PREHOOK: type: ALTERTABLE_RENAMECOL +PREHOOK: Input: default@orc_tab +PREHOOK: Output: default@orc_tab +POSTHOOK: query: alter table orc_tab CHANGE COLUMN foo foo double +POSTHOOK: type: ALTERTABLE_RENAMECOL +POSTHOOK: Input: default@orc_tab +POSTHOOK: Output: default@orc_tab +PREHOOK: query: describe extended orc_tab +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@orc_tab +POSTHOOK: query: describe extended orc_tab +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@orc_tab +foo double +bar string +dt string + +# Partition Information +# col_name data_type comment + +dt string + +#### A masked pattern was here #### +PREHOOK: query: explain extended +select * from orc_tab +PREHOOK: type: QUERY +POSTHOOK: query: explain extended +select * from orc_tab +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + +TOK_QUERY + TOK_FROM + TOK_TABREF + TOK_TABNAME + orc_tab + TOK_INSERT + TOK_DESTINATION + TOK_DIR + TOK_TMP_FILE + TOK_SELECT + TOK_SELEXPR + TOK_ALLCOLREF + + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: orc_tab + Statistics: Num rows: 3 Data size: 273 Basic stats: COMPLETE Column stats: NONE + GatherStats: false + Select Operator + expressions: foo (type: double), bar (type: string), dt (type: string) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 3 Data size: 273 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + Statistics: Num rows: 3 Data size: 273 Basic stats: COMPLETE Column stats: NONE +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0,_col1,_col2 + columns.types double:string:string + escape.delim \ + hive.serialization.extend.additional.nesting.levels true + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: dt=20150101 + input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat + output format: 
org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat + partition values: + dt 20150101 + properties: + COLUMN_STATS_ACCURATE true + bucket_count -1 + columns foo,bar + columns.comments + columns.types int:string +#### A masked pattern was here #### + name default.orc_tab + numFiles 1 + numRows 3 + partition_columns dt + partition_columns.types string + rawDataSize 273 + serialization.ddl struct orc_tab { i32 foo, string bar} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.ql.io.orc.OrcSerde + totalSize 301 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde + + input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat + output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat + properties: + bucket_count -1 + columns foo,bar + columns.comments + columns.types double:string +#### A masked pattern was here #### + name default.orc_tab + partition_columns dt + partition_columns.types string + serialization.ddl struct orc_tab { double foo, string bar} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.ql.io.orc.OrcSerde +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde + name: default.orc_tab + name: default.orc_tab + Truncated Path -> Alias: + /orc_tab/dt=20150101 [orc_tab] + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select * from orc_tab +PREHOOK: type: QUERY +PREHOOK: Input: default@orc_tab +PREHOOK: Input: default@orc_tab@dt=20150101 +#### A masked pattern was here #### +POSTHOOK: query: select * from orc_tab +POSTHOOK: type: QUERY +POSTHOOK: Input: default@orc_tab +POSTHOOK: Input: default@orc_tab@dt=20150101 +#### A masked pattern was here #### +1.0 one 20150101 +2.0 two 20150101 +3.0 three 20150101 diff --git serde/src/java/org/apache/hadoop/hive/serde2/binarysortable/fast/BinarySortableDeserializeRead.java serde/src/java/org/apache/hadoop/hive/serde2/binarysortable/fast/BinarySortableDeserializeRead.java index 2b6d9c0..1e80211 100644 --- serde/src/java/org/apache/hadoop/hive/serde2/binarysortable/fast/BinarySortableDeserializeRead.java +++ serde/src/java/org/apache/hadoop/hive/serde2/binarysortable/fast/BinarySortableDeserializeRead.java @@ -18,7 +18,6 @@ package org.apache.hadoop.hive.serde2.binarysortable.fast; -import java.io.EOFException; import java.io.IOException; import java.math.BigInteger; import java.util.Arrays; @@ -36,9 +35,7 @@ import org.apache.hadoop.hive.serde2.io.HiveIntervalYearMonthWritable; import org.apache.hadoop.hive.serde2.io.HiveVarcharWritable; import org.apache.hadoop.hive.serde2.io.TimestampWritable; -import org.apache.hadoop.hive.serde2.lazybinary.LazyBinaryUtils; import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory; -import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableHiveDecimalObjectInspector; import org.apache.hadoop.hive.serde2.typeinfo.CharTypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.DecimalTypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; @@ -87,6 +84,18 @@ private InputByteBuffer inputByteBuffer = new InputByteBuffer(); + private boolean[] columnsToInclude; + + // Temporary objects to use when skipping for columnsToInclude. 
+ private ReadDateResults dummyReadDateResults; + private ReadTimestampResults dummyReadTimestampResults; + private ReadStringResults dummyReadStringResults; + private ReadHiveCharResults dummyReadHiveCharResults; + private ReadHiveVarcharResults dummyReadHiveVarcharResults; + private ReadBinaryResults dummyReadBinaryResults; + private ReadIntervalYearMonthResults dummyReadIntervalYearMonthResults; + private ReadIntervalDayTimeResults dummyReadIntervalDayTimeResults; + /* * Use this constructor when only ascending sort order is used. */ @@ -108,6 +117,8 @@ public BinarySortableDeserializeRead(PrimitiveTypeInfo[] primitiveTypeInfos, readBeyondConfiguredFieldsWarned = false; readBeyondBufferRangeWarned = false; bufferRangeHasExtraDataWarned = false; + + columnsToInclude = null; } // Not public since we must have column information. @@ -122,6 +133,16 @@ private BinarySortableDeserializeRead() { } /* + * If some fields are are not going to be used by the query, use this routine to specify + * the columns to return. The readCheckNull method will automatically return NULL for the + * other columns. + */ + @Override + public void setColumnsToInclude(boolean[] columnsToInclude) { + this.columnsToInclude = columnsToInclude; + } + + /* * Set the range of bytes to be deserialized. */ @Override @@ -176,6 +197,89 @@ public boolean readCheckNull() throws IOException { // We have a field and are positioned to it. + // Do we want this field? + if (columnsToInclude != null && !columnsToInclude[fieldIndex]) { + + // We must read through the value to ignore it... + PrimitiveCategory primitiveCategory = primitiveTypeInfos[fieldIndex].getPrimitiveCategory(); + switch (primitiveCategory) { + case BOOLEAN: + readBoolean(); + break; + case BYTE: + readByte(); + break; + case SHORT: + readShort(); + break; + case INT: + readInt(); + break; + case LONG: + readLong(); + break; + case DATE: + if (dummyReadDateResults == null) { + dummyReadDateResults = createReadDateResults(); + } + readDate(dummyReadDateResults); + break; + case TIMESTAMP: + if (dummyReadTimestampResults == null) { + dummyReadTimestampResults = createReadTimestampResults(); + } + readTimestamp(dummyReadTimestampResults); + break; + case FLOAT: + readFloat(); + break; + case DOUBLE: + readDouble(); + break; + case STRING: + if (dummyReadStringResults == null) { + dummyReadStringResults = createReadStringResults(); + } + readString(dummyReadStringResults); + break; + case CHAR: + if (dummyReadHiveCharResults == null) { + dummyReadHiveCharResults = createReadHiveCharResults(); + } + readHiveChar(dummyReadHiveCharResults); + break; + case VARCHAR: + if (dummyReadHiveVarcharResults == null) { + dummyReadHiveVarcharResults = createReadHiveVarcharResults(); + } + readHiveVarchar(dummyReadHiveVarcharResults); + break; + case BINARY: + if (dummyReadBinaryResults == null) { + dummyReadBinaryResults = createReadBinaryResults(); + } + readBinary(dummyReadBinaryResults); + break; + case INTERVAL_YEAR_MONTH: + if (dummyReadIntervalYearMonthResults == null) { + dummyReadIntervalYearMonthResults = createReadIntervalYearMonthResults(); + } + readIntervalYearMonth(dummyReadIntervalYearMonthResults); + break; + case INTERVAL_DAY_TIME: + if (dummyReadIntervalDayTimeResults == null) { + dummyReadIntervalDayTimeResults = createReadIntervalDayTimeResults(); + } + readIntervalDayTime(dummyReadIntervalDayTimeResults); + break; + case DECIMAL: + earlyReadHiveDecimal(); + break; + default: + throw new RuntimeException("Unexpected primitive type category " + 
primitiveCategory); + } + } + if (primitiveTypeInfos[fieldIndex].getPrimitiveCategory() != PrimitiveCategory.DECIMAL) { return false; } diff --git serde/src/java/org/apache/hadoop/hive/serde2/fast/DeserializeRead.java serde/src/java/org/apache/hadoop/hive/serde2/fast/DeserializeRead.java index b187aff..4b45aab 100644 --- serde/src/java/org/apache/hadoop/hive/serde2/fast/DeserializeRead.java +++ serde/src/java/org/apache/hadoop/hive/serde2/fast/DeserializeRead.java @@ -60,6 +60,13 @@ PrimitiveTypeInfo[] primitiveTypeInfos(); /* + * If some fields are are not going to be used by the query, use this routine to specify + * the columns to return. The readCheckNull method will automatically return NULL for the + * other columns. + */ + void setColumnsToInclude(boolean[] columnsToInclude); + + /* * Set the range of bytes to be deserialized. */ void set(byte[] bytes, int offset, int length); diff --git serde/src/java/org/apache/hadoop/hive/serde2/lazy/fast/LazySimpleDeserializeRead.java serde/src/java/org/apache/hadoop/hive/serde2/lazy/fast/LazySimpleDeserializeRead.java index dc76c7d..379734c 100644 --- serde/src/java/org/apache/hadoop/hive/serde2/lazy/fast/LazySimpleDeserializeRead.java +++ serde/src/java/org/apache/hadoop/hive/serde2/lazy/fast/LazySimpleDeserializeRead.java @@ -113,8 +113,11 @@ private boolean readBeyondBufferRangeWarned; private boolean bufferRangeHasExtraDataWarned; + private boolean[] columnsToInclude; + public LazySimpleDeserializeRead(PrimitiveTypeInfo[] primitiveTypeInfos, byte separator, LazySerDeParameters lazyParams) { + this(); this.primitiveTypeInfos = primitiveTypeInfos; @@ -132,6 +135,13 @@ public LazySimpleDeserializeRead(PrimitiveTypeInfo[] primitiveTypeInfos, readBeyondConfiguredFieldsWarned = false; readBeyondBufferRangeWarned = false; bufferRangeHasExtraDataWarned = false; + + columnsToInclude = null; + } + + public LazySimpleDeserializeRead(PrimitiveTypeInfo[] primitiveTypeInfos, + LazySerDeParameters lazyParams) { + this(primitiveTypeInfos, lazyParams.getSeparators()[0], lazyParams); } // Not public since we must have the field count so every 8 fields NULL bytes can be navigated. @@ -146,6 +156,16 @@ private LazySimpleDeserializeRead() { } /* + * If some fields are are not going to be used by the query, use this routine to specify + * the columns to return. The readCheckNull method will automatically return NULL for the + * other columns. + */ + @Override + public void setColumnsToInclude(boolean[] columnsToInclude) { + this.columnsToInclude = columnsToInclude; + } + + /* * Set the range of bytes to be deserialized. */ @Override @@ -186,11 +206,6 @@ public boolean readCheckNull() { readBeyondBufferRangeWarned = true; } - // char[] charsBuffer = new char[end - start]; - // for (int c = 0; c < charsBuffer.length; c++) { - // charsBuffer[c] = (char) (bytes[start + c] & 0xFF); - // } - return true; } @@ -213,9 +228,14 @@ public boolean readCheckNull() { } } - char[] charField = new char[fieldLength]; - for (int c = 0; c < charField.length; c++) { - charField[c] = (char) (bytes[fieldStart + c] & 0xFF); + // char[] charField = new char[fieldLength]; + // for (int c = 0; c < charField.length; c++) { + // charField[c] = (char) (bytes[fieldStart + c] & 0xFF); + // } + + // Do we want this field? + if (columnsToInclude != null && !columnsToInclude[fieldIndex]) { + return true; } // Is the field the configured string representing NULL? 
diff --git serde/src/java/org/apache/hadoop/hive/serde2/lazybinary/fast/LazyBinaryDeserializeRead.java serde/src/java/org/apache/hadoop/hive/serde2/lazybinary/fast/LazyBinaryDeserializeRead.java index 1f3806e..563ba56 100644 --- serde/src/java/org/apache/hadoop/hive/serde2/lazybinary/fast/LazyBinaryDeserializeRead.java +++ serde/src/java/org/apache/hadoop/hive/serde2/lazybinary/fast/LazyBinaryDeserializeRead.java @@ -81,6 +81,8 @@ private boolean readBeyondBufferRangeWarned; private boolean bufferRangeHasExtraDataWarned; + private boolean[] columnsToInclude; + public LazyBinaryDeserializeRead(PrimitiveTypeInfo[] primitiveTypeInfos) { this.primitiveTypeInfos = primitiveTypeInfos; fieldCount = primitiveTypeInfos.length; @@ -89,6 +91,8 @@ public LazyBinaryDeserializeRead(PrimitiveTypeInfo[] primitiveTypeInfos) { readBeyondConfiguredFieldsWarned = false; readBeyondBufferRangeWarned = false; bufferRangeHasExtraDataWarned = false; + + columnsToInclude = null; } // Not public since we must have the field count so every 8 fields NULL bytes can be navigated. @@ -103,6 +107,16 @@ private LazyBinaryDeserializeRead() { } /* + * If some fields are are not going to be used by the query, use this routine to specify + * the columns to return. The readCheckNull method will automatically return NULL for the + * other columns. + */ + @Override + public void setColumnsToInclude(boolean[] columnsToInclude) { + this.columnsToInclude = columnsToInclude; + } + + /* * Set the range of bytes to be deserialized. */ @Override @@ -154,6 +168,26 @@ public boolean readCheckNull() throws IOException { // We have a field and are positioned to it. + // Do we want this field? + if (columnsToInclude != null && !columnsToInclude[fieldIndex]) { + + // When NULL, we need to move past this field. + fieldIndex++; + + // Every 8 fields we read a new NULL byte. + if (fieldIndex < fieldCount) { + if ((fieldIndex % 8) == 0) { + // Get next null byte. + if (offset >= end) { + warnBeyondEof(); + } + nullByte = bytes[offset++]; + } + } + + return true; + } + if (primitiveTypeInfos[fieldIndex].getPrimitiveCategory() != PrimitiveCategory.DECIMAL) { return false; }
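
The LazyBinaryDeserializeRead hunk above carries one piece of bookkeeping that is easy to overlook when a column is skipped: LazyBinary packs the NULL indicators for each group of eight fields into a single byte ("Every 8 fields we read a new NULL byte"), so after advancing fieldIndex past an excluded column the reader must still pull the next packed byte whenever it crosses into a new group of eight. The standalone snippet below illustrates only that indexing arithmetic; the bit polarity and the flat nullBytes array are simplifications for the sketch, not LazyBinary's actual on-disk layout.

// Simplified illustration of the "one NULL byte per 8 fields" bookkeeping that the
// LazyBinaryDeserializeRead change has to preserve while skipping columns. nullBytes
// is a pre-extracted array of the packed flag bytes (bit i of byte k covers field
// 8*k + i); the real format interleaves these bytes with the field data.
public class NullByteSketch {
  public static void main(String[] args) {
    int fieldCount = 12;
    byte[] nullBytes = new byte[] {0b00000101, 0b00001000};

    byte nullByte = nullBytes[0];
    for (int fieldIndex = 0; fieldIndex < fieldCount; fieldIndex++) {
      // Refresh the packed byte each time we enter a new group of 8 fields,
      // exactly the step the patch repeats inside its skip branch.
      if (fieldIndex > 0 && fieldIndex % 8 == 0) {
        nullByte = nullBytes[fieldIndex / 8];
      }
      boolean isNull = (nullByte & (1 << (fieldIndex % 8))) != 0;
      System.out.println("field " + fieldIndex + (isNull ? " -> NULL" : " -> value"));
    }
  }
}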
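
Taken together, the BinarySortable, LazySimple, and LazyBinary changes implement the same contract for the new DeserializeRead.setColumnsToInclude() hook: the caller supplies a boolean mask over the row schema, and readCheckNull() then consumes (or cheaply skips) every excluded field but reports it as NULL, so downstream code still sees a correctly aligned row with only the projected columns populated. The sketch below mimics that calling pattern with a hypothetical fixed-width reader rather than the real Hive classes; the type name, the set(byte[]) signature, and the three-column int schema are illustrative stand-ins only.

import java.util.Arrays;

// Hypothetical stand-in for a DeserializeRead-style reader: fixed-width int fields,
// a boolean mask chooses which columns the caller actually wants. Excluded fields are
// still consumed (the offset advances) so later fields stay correctly positioned, but
// they are reported back as NULL -- the same convention the patch adds to the Hive readers.
public class ColumnIncludeSketch {

  static final int FIELD_COUNT = 3;

  byte[] bytes;
  int offset;
  int fieldIndex;
  boolean[] columnsToInclude;   // null means "include everything"
  int currentValue;             // valid only when readCheckNull() returned false

  void setColumnsToInclude(boolean[] columnsToInclude) {
    this.columnsToInclude = columnsToInclude;
  }

  void set(byte[] bytes) {
    this.bytes = bytes;
    this.offset = 0;
    this.fieldIndex = 0;
  }

  // Returns true when the current field should be treated as NULL (here: when it was
  // excluded by the mask); returns false when currentValue holds a real value.
  boolean readCheckNull() {
    int value = ((bytes[offset] & 0xFF) << 24) | ((bytes[offset + 1] & 0xFF) << 16)
        | ((bytes[offset + 2] & 0xFF) << 8) | (bytes[offset + 3] & 0xFF);
    offset += 4;                              // consume the field's bytes either way
    boolean skip = columnsToInclude != null && !columnsToInclude[fieldIndex];
    fieldIndex++;
    if (skip) {
      return true;                            // caller sees NULL for excluded columns
    }
    currentValue = value;
    return false;
  }

  public static void main(String[] args) {
    ColumnIncludeSketch reader = new ColumnIncludeSketch();
    reader.setColumnsToInclude(new boolean[] {true, false, true});    // project columns 0 and 2
    reader.set(new byte[] {0, 0, 0, 1,  0, 0, 0, 2,  0, 0, 0, 3});    // encoded row (1, 2, 3)

    Integer[] row = new Integer[FIELD_COUNT];
    for (int i = 0; i < FIELD_COUNT; i++) {
      if (reader.readCheckNull()) {
        row[i] = null;                        // excluded column comes back as NULL
      } else {
        row[i] = reader.currentValue;
      }
    }
    System.out.println(Arrays.toString(row)); // [1, null, 3]
  }
}

In the real readers the per-field decoding is type-directed via primitiveTypeInfos, as the BinarySortable switch statement above shows, but the include-mask check and the "skipped means NULL" convention are the same across all three implementations.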