diff --git common/src/java/org/apache/hadoop/hive/conf/HiveConf.java common/src/java/org/apache/hadoop/hive/conf/HiveConf.java index bf48f69..68efdcd 100644 --- common/src/java/org/apache/hadoop/hive/conf/HiveConf.java +++ common/src/java/org/apache/hadoop/hive/conf/HiveConf.java @@ -1096,6 +1096,9 @@ public void setSparkConfigUpdated(boolean isSparkConfigUpdated) { HIVE_ORC_ZEROCOPY("hive.exec.orc.zerocopy", false, "Use zerocopy reads with ORC. (This requires Hadoop 2.3 or later.)"), + HIVE_ORC_SCHEMA_EVOLUTION("hive.exec.orc.schema.evolution", true, + "Use schema evolution to convert ORC file data to the schema desired by the reader."), + HIVE_LAZYSIMPLE_EXTENDED_BOOLEAN_LITERAL("hive.lazysimple.extended_boolean_literal", false, "LazySimpleSerde uses this property to determine if it treats 'T', 't', 'F', 'f',\n" + "'1', and '0' as extened, legal boolean literal, in addition to 'TRUE' and 'FALSE'.\n" + diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/MapOperator.java ql/src/java/org/apache/hadoop/hive/ql/exec/MapOperator.java index d5ea96a..989e27d 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/MapOperator.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/MapOperator.java @@ -39,6 +39,7 @@ import org.apache.hadoop.hive.ql.exec.mr.ExecMapperContext; import org.apache.hadoop.hive.ql.exec.tez.MapRecordProcessor; import org.apache.hadoop.hive.ql.io.RecordIdentifier; +import org.apache.hadoop.hive.ql.io.orc.OrcInputFormat; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.metadata.VirtualColumn; import org.apache.hadoop.hive.ql.plan.MapWork; @@ -61,6 +62,7 @@ import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.Writable; +import org.apache.hadoop.mapred.InputFormat; import org.apache.hadoop.mapred.JobConf; import org.apache.hadoop.util.StringUtils; @@ -181,6 +183,16 @@ void initializeAsRoot(JobConf hconf, MapWork mapWork) throws Exception { initializeMapOperator(hconf); } + private boolean isInputFileFormatSelfDescribing(PartitionDesc pd) { + + // For now, ORC is the only format that handles schema evolution. 
+ Class inputFileFormatClass = pd.getInputFileFormatClass(); + if (OrcInputFormat.class == inputFileFormatClass) { + return true; + } + return false; + } + private MapOpCtx initObjectInspector(Configuration hconf, MapOpCtx opCtx, StructObjectInspector tableRowOI) throws Exception { PartitionDesc pd = opCtx.partDesc; @@ -198,8 +210,13 @@ private MapOpCtx initObjectInspector(Configuration hconf, MapOpCtx opCtx, opCtx.partName = String.valueOf(partSpec); opCtx.deserializer = pd.getDeserializer(hconf); - StructObjectInspector partRawRowObjectInspector = - (StructObjectInspector) opCtx.deserializer.getObjectInspector(); + StructObjectInspector partRawRowObjectInspector; + if (isInputFileFormatSelfDescribing(pd)) { + partRawRowObjectInspector = tableRowOI; + } else { + partRawRowObjectInspector = + (StructObjectInspector) opCtx.deserializer.getObjectInspector(); + } opCtx.partTblObjectInspectorConverter = ObjectInspectorConverters.getConverter(partRawRowObjectInspector, tableRowOI); @@ -300,8 +317,15 @@ private MapOpCtx initObjectInspector(Configuration hconf, MapOpCtx opCtx, PartitionDesc pd = conf.getPathToPartitionInfo().get(onefile); TableDesc tableDesc = pd.getTableDesc(); Deserializer partDeserializer = pd.getDeserializer(hconf); - StructObjectInspector partRawRowObjectInspector = - (StructObjectInspector) partDeserializer.getObjectInspector(); + + StructObjectInspector partRawRowObjectInspector; + if (isInputFileFormatSelfDescribing(pd)) { + Deserializer tblDeserializer = tableDesc.getDeserializer(hconf); + partRawRowObjectInspector = (StructObjectInspector) tblDeserializer.getObjectInspector(); + } else { + partRawRowObjectInspector = + (StructObjectInspector) partDeserializer.getObjectInspector(); + } StructObjectInspector tblRawRowObjectInspector = tableDescOI.get(tableDesc); if ((tblRawRowObjectInspector == null) || diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java index 5b21af9..164df50 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java @@ -160,6 +160,7 @@ import org.apache.hadoop.hive.ql.plan.SparkEdgeProperty; import org.apache.hadoop.hive.ql.plan.SparkWork; import org.apache.hadoop.hive.ql.plan.TableDesc; +import org.apache.hadoop.hive.ql.plan.TableScanDesc; import org.apache.hadoop.hive.ql.plan.api.Adjacency; import org.apache.hadoop.hive.ql.plan.api.Graph; import org.apache.hadoop.hive.ql.session.SessionState; @@ -473,11 +474,6 @@ private static BaseWork getBaseWork(Configuration conf, String name) { } } - public static Map getMapWorkVectorScratchColumnTypeMap(Configuration hiveConf) { - MapWork mapWork = getMapWork(hiveConf); - return mapWork.getVectorScratchColumnTypeMap(); - } - public static void setWorkflowAdjacencies(Configuration conf, QueryPlan plan) { try { Graph stageGraph = plan.getQueryPlan().getStageGraph(); @@ -753,7 +749,7 @@ private static Path setBaseWork(Configuration conf, BaseWork w, Path hiveScratch } } - private static Path getPlanPath(Configuration conf, String name) { + public static Path getPlanPath(Configuration conf, String name) { Path planPath = getPlanPath(conf); if (planPath == null) { return null; diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/spark/SparkReduceRecordHandler.java ql/src/java/org/apache/hadoop/hive/ql/exec/spark/SparkReduceRecordHandler.java index bedccc3..fe3243b 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/spark/SparkReduceRecordHandler.java +++ 
ql/src/java/org/apache/hadoop/hive/ql/exec/spark/SparkReduceRecordHandler.java @@ -155,7 +155,7 @@ public void init(JobConf job, OutputCollector output, Reporter reporter) throws ObjectPair pair = VectorizedBatchUtil. constructVectorizedRowBatch(keyStructInspector, - valueStructInspectors[tag], gWork.getVectorScratchColumnTypeMap()); + valueStructInspectors[tag], gWork.getVectorizedRowBatchCtx()); batches[tag] = pair.getFirst(); final int totalColumns = keysColumnOffset + valueStructInspectors[tag].getAllStructFieldRefs().size(); diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/tez/ReduceRecordProcessor.java ql/src/java/org/apache/hadoop/hive/ql/exec/tez/ReduceRecordProcessor.java index 91ba2bb..eda860a 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/tez/ReduceRecordProcessor.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/tez/ReduceRecordProcessor.java @@ -236,7 +236,7 @@ private void initializeSourceForTag(ReduceWork redWork, int tag, ObjectInspector boolean vectorizedRecordSource = (tag == bigTablePosition) && redWork.getVectorMode(); sources[tag].init(jconf, redWork.getReducer(), vectorizedRecordSource, keyTableDesc, valueTableDesc, reader, tag == bigTablePosition, (byte) tag, - redWork.getVectorScratchColumnTypeMap()); + redWork.getVectorizedRowBatchCtx()); ois[tag] = sources[tag].getObjectInspector(); } diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/tez/ReduceRecordSource.java ql/src/java/org/apache/hadoop/hive/ql/exec/tez/ReduceRecordSource.java index 1f2f9f9..392a654 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/tez/ReduceRecordSource.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/tez/ReduceRecordSource.java @@ -35,6 +35,7 @@ import org.apache.hadoop.hive.ql.exec.vector.VectorDeserializeRow; import org.apache.hadoop.hive.ql.exec.vector.VectorizedBatchUtil; import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatchCtx; import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpressionWriter; import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpressionWriterFactory; import org.apache.hadoop.hive.ql.log.PerfLogger; @@ -124,7 +125,7 @@ void init(JobConf jconf, Operator reducer, boolean vectorized, TableDesc keyTableDesc, TableDesc valueTableDesc, Reader reader, boolean handleGroupKey, byte tag, - Map vectorScratchColumnTypeMap) + VectorizedRowBatchCtx batchContext) throws Exception { ObjectInspector keyObjectInspector; @@ -176,7 +177,8 @@ void init(JobConf jconf, Operator reducer, boolean vectorized, TableDesc keyT .genVectorStructExpressionWritables(valueStructInspectors))); ObjectPair pair = - VectorizedBatchUtil.constructVectorizedRowBatch(keyStructInspector, valueStructInspectors, vectorScratchColumnTypeMap); + VectorizedBatchUtil.constructVectorizedRowBatch(keyStructInspector, + valueStructInspectors, batchContext); rowObjectInspector = pair.getSecond(); batch = pair.getFirst(); diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorExtractRow.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorExtractRow.java index ee6939d..a34e682 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorExtractRow.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorExtractRow.java @@ -449,6 +449,8 @@ Object extract(int batchIndex) { } } + static int fake = 0; + private class StringExtractorByValue extends AbstractBytesExtractor { // Use org.apache.hadoop.io.Text as our helper to go from byte[] to String. 
diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorGroupByOperator.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorGroupByOperator.java index 7a552b8..85f798c 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorGroupByOperator.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorGroupByOperator.java @@ -814,7 +814,7 @@ public VectorGroupByOperator() { outputFieldNames, objectInspectors); if (isVectorOutput) { vrbCtx = new VectorizedRowBatchCtx(); - vrbCtx.init(vOutContext.getScratchColumnTypeMap(), (StructObjectInspector) outputObjInspector); + vrbCtx.init((StructObjectInspector) outputObjInspector, vOutContext.getScratchColumnTypeNames()); outputBatch = vrbCtx.createVectorizedRowBatch(); vectorAssignRowSameBatch = new VectorAssignRowSameBatch(); vectorAssignRowSameBatch.init((StructObjectInspector) outputObjInspector, vOutContext.getProjectedColumns()); diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorMapJoinBaseOperator.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorMapJoinBaseOperator.java index 0baec2c..d9f5d2f 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorMapJoinBaseOperator.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorMapJoinBaseOperator.java @@ -91,7 +91,7 @@ public VectorMapJoinBaseOperator (VectorizationContext vContext, OperatorDesc co Collection> result = super.initializeOp(hconf); vrbCtx = new VectorizedRowBatchCtx(); - vrbCtx.init(vOutContext.getScratchColumnTypeMap(), (StructObjectInspector) this.outputObjInspector); + vrbCtx.init((StructObjectInspector) this.outputObjInspector, vOutContext.getScratchColumnTypeNames()); outputBatch = vrbCtx.createVectorizedRowBatch(); diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorSMBMapJoinOperator.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorSMBMapJoinOperator.java index 804ba17..66190ae 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorSMBMapJoinOperator.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorSMBMapJoinOperator.java @@ -146,7 +146,7 @@ public VectorSMBMapJoinOperator(VectorizationContext vContext, OperatorDesc conf Collection> result = super.initializeOp(hconf); vrbCtx = new VectorizedRowBatchCtx(); - vrbCtx.init(vOutContext.getScratchColumnTypeMap(), (StructObjectInspector) this.outputObjInspector); + vrbCtx.init((StructObjectInspector) this.outputObjInspector, vOutContext.getScratchColumnTypeNames()); outputBatch = vrbCtx.createVectorizedRowBatch(); diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java index 46c2a78..51a6f36 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java @@ -144,6 +144,8 @@ VectorExpressionDescriptor vMap; + private List initialColumnNames; + private List projectedColumns; private List projectionColumnNames; private Map projectionColumnMap; @@ -158,6 +160,7 @@ public VectorizationContext(String contextName, List initialColumnNames) this.contextName = contextName; level = 0; LOG.info("VectorizationContext consructor contextName " + contextName + " level " + level + " initialColumnNames " + initialColumnNames.toString()); + this.initialColumnNames = initialColumnNames; this.projectionColumnNames = initialColumnNames; projectedColumns = new ArrayList(); @@ -178,7 +181,8 @@ public 
VectorizationContext(String contextName) { this.contextName = contextName; level = 0; LOG.info("VectorizationContext consructor contextName " + contextName + " level " + level); - projectedColumns = new ArrayList(); + initialColumnNames = new ArrayList(); + projectedColumns = new ArrayList(); projectionColumnNames = new ArrayList(); projectionColumnMap = new HashMap(); this.ocm = new OutputColumnManager(0); @@ -193,6 +197,7 @@ public VectorizationContext(String contextName, VectorizationContext vContext) { this.contextName = contextName; level = vContext.level + 1; LOG.info("VectorizationContext consructor reference contextName " + contextName + " level " + level); + this.initialColumnNames = vContext.initialColumnNames; this.projectedColumns = new ArrayList(); this.projectionColumnNames = new ArrayList(); this.projectionColumnMap = new HashMap(); @@ -205,6 +210,7 @@ public VectorizationContext(String contextName, VectorizationContext vContext) { // Add an initial column to a vectorization context when // a vectorized row batch is being created. public void addInitialColumn(String columnName) { + initialColumnNames.add(columnName); int index = projectedColumns.size(); projectedColumns.add(index); projectionColumnNames.add(columnName); @@ -233,6 +239,10 @@ public void addProjectionColumn(String columnName, int vectorBatchColIndex) { projectionColumnMap.put(columnName, vectorBatchColIndex); } + public List getInitialColumnNames() { + return initialColumnNames; + } + public List getProjectedColumns() { return projectedColumns; } @@ -1032,7 +1042,9 @@ private VectorExpression getVectorExpressionForUdf(GenericUDF genericeUdf, VectorExpressionDescriptor.Descriptor descriptor = builder.build(); Class vclass = this.vMap.getVectorExpressionClass(udfClass, descriptor); if (vclass == null) { - LOG.info("No vector udf found for " + udfClass.getSimpleName() + ", descriptor: " + descriptor); + if (LOG.isDebugEnabled()) { + LOG.debug("No vector udf found for "+udfClass.getSimpleName() + ", descriptor: "+descriptor); + } return null; } Mode childrenMode = getChildrenMode(mode, udfClass); @@ -2295,7 +2307,7 @@ public static String mapTypeNameSynonyms(String typeName) { } } - public static ColumnVector.Type getColumnVectorTypeFromTypeInfo(TypeInfo typeInfo) throws HiveException { + public static ColumnVector.Type getColumnVectorTypeFromTypeInfo(TypeInfo typeInfo) { PrimitiveTypeInfo primitiveTypeInfo = (PrimitiveTypeInfo) typeInfo; PrimitiveCategory primitiveCategory = primitiveTypeInfo.getPrimitiveCategory(); @@ -2325,7 +2337,7 @@ public static String mapTypeNameSynonyms(String typeName) { return ColumnVector.Type.DECIMAL; default: - throw new HiveException("Unexpected primitive type category " + primitiveCategory); + throw new RuntimeException("Unexpected primitive type category " + primitiveCategory); } } @@ -2435,13 +2447,13 @@ public int firstOutputColumnIndex() { return firstOutputColumnIndex; } - public Map getScratchColumnTypeMap() { - Map map = new HashMap(); + public String[] getScratchColumnTypeNames() { + String[] result = new String[ocm.outputColCount]; for (int i = 0; i < ocm.outputColCount; i++) { - String type = ocm.outputColumnsTypes[i]; - map.put(i+this.firstOutputColumnIndex, type); + String typeName = ocm.outputColumnsTypes[i]; + result[i] = typeName; } - return map; + return result; } @Override @@ -2461,9 +2473,7 @@ public int compare(Integer o1, Integer o2) { } sb.append("sorted projectionColumnMap ").append(sortedColumnMap).append(", "); - Map sortedScratchColumnTypeMap = new 
TreeMap(comparerInteger); - sortedScratchColumnTypeMap.putAll(getScratchColumnTypeMap()); - sb.append("sorted scratchColumnTypeMap ").append(sortedScratchColumnTypeMap); + sb.append("scratchColumnTypeNames ").append(getScratchColumnTypeNames().toString()); return sb.toString(); } diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizedBatchUtil.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizedBatchUtil.java index 3780113..8eff110 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizedBatchUtil.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizedBatchUtil.java @@ -199,13 +199,13 @@ public static VectorizedRowBatch constructVectorizedRowBatch( * struct object inspector, not just any struct object inspector. * @param keyInspector * @param valueInspector - * @param vectorScratchColumnTypeMap + * @param batchContext * @return VectorizedRowBatch, OI * @throws HiveException */ public static ObjectPair constructVectorizedRowBatch( - StructObjectInspector keyInspector, StructObjectInspector valueInspector, Map vectorScratchColumnTypeMap) - throws HiveException { + StructObjectInspector keyInspector, StructObjectInspector valueInspector, + VectorizedRowBatchCtx batchContext) throws HiveException { ArrayList colNames = new ArrayList(); ArrayList ois = new ArrayList(); @@ -221,8 +221,6 @@ public static VectorizedRowBatch constructVectorizedRowBatch( } StandardStructObjectInspector rowObjectInspector = ObjectInspectorFactory.getStandardStructObjectInspector(colNames, ois); - VectorizedRowBatchCtx batchContext = new VectorizedRowBatchCtx(); - batchContext.init(vectorScratchColumnTypeMap, rowObjectInspector); return new ObjectPair<>(batchContext.createVectorizedRowBatch(), rowObjectInspector); } @@ -584,6 +582,34 @@ public static StandardStructObjectInspector convertToStandardStructObjectInspect return ObjectInspectorFactory.getStandardStructObjectInspector(columnNames,oids); } + public static String[] columnNamesFromStructObjectInspector( + StructObjectInspector structObjectInspector) throws HiveException { + + List fields = structObjectInspector.getAllStructFieldRefs(); + String[] result = new String[fields.size()]; + + int i = 0; + for(StructField field : fields) { + result[i++] = field.getFieldName(); + } + return result; + } + + public static TypeInfo[] typeInfosFromStructObjectInspector( + StructObjectInspector structObjectInspector) throws HiveException { + + List fields = structObjectInspector.getAllStructFieldRefs(); + TypeInfo[] result = new TypeInfo[fields.size()]; + + int i = 0; + for(StructField field : fields) { + TypeInfo typeInfo = TypeInfoUtils.getTypeInfoFromTypeString( + field.getFieldObjectInspector().getTypeName()); + result[i++] = typeInfo; + } + return result; + } + public static PrimitiveTypeInfo[] primitiveTypeInfosFromStructObjectInspector( StructObjectInspector structObjectInspector) throws HiveException { @@ -599,6 +625,28 @@ public static StandardStructObjectInspector convertToStandardStructObjectInspect return result; } + public static PrimitiveTypeInfo[] primitiveTypeInfosFromTypeInfos(TypeInfo[] typeInfos) { + PrimitiveTypeInfo[] primitiveTypeInfos = new PrimitiveTypeInfo[typeInfos.length]; + + for (int i = 0; i < typeInfos.length; i++) { + TypeInfo typeInfo = typeInfos[i]; + primitiveTypeInfos[i] = (PrimitiveTypeInfo) typeInfo; + } + return primitiveTypeInfos; + } + + public static TypeInfo[] typeInfosFromTypeNames( + String[] typeNames) throws HiveException { + + TypeInfo[] result = new 
TypeInfo[typeNames.length]; + + for(int i = 0; i < typeNames.length; i++) { + TypeInfo typeInfo = TypeInfoUtils.getTypeInfoFromTypeString(typeNames[i]); + result[i] = typeInfo; + } + return result; + } + public static PrimitiveTypeInfo[] primitiveTypeInfosFromTypeNames( String[] typeNames) throws HiveException { diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizedRowBatchCtx.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizedRowBatchCtx.java index 82d4a8f..e8d2108 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizedRowBatchCtx.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizedRowBatchCtx.java @@ -39,9 +39,11 @@ import org.apache.hadoop.hive.common.type.HiveIntervalYearMonth; import org.apache.hadoop.hive.metastore.api.hive_metastoreConstants; import org.apache.hadoop.hive.ql.exec.Utilities; +import org.apache.hadoop.hive.ql.exec.vector.ColumnVector.Type; import org.apache.hadoop.hive.ql.io.HiveFileFormatUtils; import org.apache.hadoop.hive.ql.io.IOPrepareCache; import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.plan.MapWork; import org.apache.hadoop.hive.ql.plan.PartitionDesc; import org.apache.hadoop.hive.serde.serdeConstants; import org.apache.hadoop.hive.serde2.ColumnProjectionUtils; @@ -58,6 +60,8 @@ import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; import org.apache.hadoop.hive.serde2.typeinfo.DecimalTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils; import org.apache.hadoop.io.DataOutputBuffer; @@ -66,357 +70,206 @@ import org.apache.hive.common.util.DateUtils; /** - * Context for Vectorized row batch. this calss does eager deserialization of row data using serde + * Context for Vectorized row batch. this class does eager deserialization of row data using serde * in the RecordReader layer. * It has supports partitions in this layer so that the vectorized batch is populated correctly * with the partition column. */ public class VectorizedRowBatchCtx { + private static final long serialVersionUID = 1L; + private static final Log LOG = LogFactory.getLog(VectorizedRowBatchCtx.class.getName()); - // OI for raw row data (EG without partition cols) - private StructObjectInspector rawRowOI; + // The following information is for creating VectorizedRowBatch and for helping with + // knowing how the table is partitioned. + // + // It will be stored in MapWork and ReduceWork. + private String[] rowColumnNames; + private TypeInfo[] rowColumnTypeInfos; + private int nonPartitionColumnCount; + private int partitionColumnCount; - // OI for the row (Raw row OI + partition OI) - private StructObjectInspector rowOI; + private String[] scratchColumnTypeNames; - // Deserializer for the row data - private Deserializer deserializer; + /** + * Constructor for VectorizedRowBatchCtx + */ + public VectorizedRowBatchCtx() { + } - // Hash map of partition values. 
Key=TblColName value=PartitionValue - private Map partitionValues; - - //partition types - private Map partitionTypes; + public VectorizedRowBatchCtx(String[] rowColumnNames, TypeInfo[] rowColumnTypeInfos, + int partitionColumnCount, String[] scratchColumnTypeNames) { + this.rowColumnNames = rowColumnNames; + this.rowColumnTypeInfos = rowColumnTypeInfos; + this.partitionColumnCount = partitionColumnCount; + this.scratchColumnTypeNames = scratchColumnTypeNames; - // partition column positions, for use by classes that need to know whether a given column is a - // partition column - private Set partitionCols; - - // Column projection list - List of column indexes to include. This - // list does not contain partition columns - private List colsToInclude; + nonPartitionColumnCount = rowColumnTypeInfos.length - partitionColumnCount; + } - private Map scratchColumnTypeMap = null; + public String[] getRowColumnNames() { + return rowColumnNames; + } - /** - * Constructor for VectorizedRowBatchCtx - * - * @param rawRowOI - * OI for raw row data (EG without partition cols) - * @param rowOI - * OI for the row (Raw row OI + partition OI) - * @param deserializer - * Deserializer for the row data - * @param partitionValues - * Hash map of partition values. Key=TblColName value=PartitionValue - */ - public VectorizedRowBatchCtx(StructObjectInspector rawRowOI, StructObjectInspector rowOI, - Deserializer deserializer, Map partitionValues, - Map partitionTypes) { - this.rowOI = rowOI; - this.rawRowOI = rawRowOI; - this.deserializer = deserializer; - this.partitionValues = partitionValues; - this.partitionTypes = partitionTypes; + public TypeInfo[] getRowColumnTypeInfos() { + return rowColumnTypeInfos; } - /** - * Constructor for VectorizedRowBatchCtx - */ - public VectorizedRowBatchCtx() { + public int getNonPartitionColumnCount() { + return nonPartitionColumnCount; + } + + public int getPartitionColumnCount() { + return partitionColumnCount; + } + public String[] getScratchColumnTypeNames() { + return scratchColumnTypeNames; } /** - * Initializes the VectorizedRowBatch context based on an scratch column type map and + * Initializes the VectorizedRowBatch context based on an scratch column type names and * object inspector. - * @param scratchColumnTypeMap - * @param rowOI + * @param structObjectInspector + * @param scratchColumnTypeNames * Object inspector that shapes the column types + * @throws HiveException */ - public void init(Map scratchColumnTypeMap, - StructObjectInspector rowOI) { - this.scratchColumnTypeMap = scratchColumnTypeMap; - this.rowOI= rowOI; - this.rawRowOI = rowOI; + public void init(StructObjectInspector structObjectInspector, String[] scratchColumnTypeNames) + throws HiveException { + + // Row column information. + rowColumnNames = VectorizedBatchUtil.columnNamesFromStructObjectInspector(structObjectInspector); + rowColumnTypeInfos = VectorizedBatchUtil.typeInfosFromStructObjectInspector(structObjectInspector); + partitionColumnCount = 0; + nonPartitionColumnCount = rowColumnTypeInfos.length; + + // Scratch column information. + this.scratchColumnTypeNames = scratchColumnTypeNames; } - /** - * Initializes VectorizedRowBatch context based on the - * split and Hive configuration (Job conf with hive Plan). 
- * - * @param hiveConf - * Hive configuration using Hive plan is extracted - * @param split - * File split of the file being read - * @throws ClassNotFoundException - * @throws IOException - * @throws SerDeException - * @throws InstantiationException - * @throws IllegalAccessException - * @throws HiveException - */ - public void init(Configuration hiveConf, FileSplit split) throws ClassNotFoundException, - IOException, - SerDeException, - InstantiationException, - IllegalAccessException, - HiveException { + public static void getPartitionValues(VectorizedRowBatchCtx vrbCtx, Configuration hiveConf, + FileSplit split, Object[] partitionValues) throws IOException { Map pathToPartitionInfo = Utilities .getMapWork(hiveConf).getPathToPartitionInfo(); - PartitionDesc part = HiveFileFormatUtils + PartitionDesc partDesc = HiveFileFormatUtils .getPartitionDescFromPathRecursively(pathToPartitionInfo, split.getPath(), IOPrepareCache.get().getPartitionDescMap()); - String partitionPath = split.getPath().getParent().toString(); - scratchColumnTypeMap = Utilities.getMapWorkVectorScratchColumnTypeMap(hiveConf); - // LOG.info("VectorizedRowBatchCtx init scratchColumnTypeMap " + scratchColumnTypeMap.toString()); - - Properties partProps = - (part.getPartSpec() == null || part.getPartSpec().isEmpty()) ? - part.getTableDesc().getProperties() : part.getProperties(); - - Class serdeclass = hiveConf.getClassByName(part.getSerdeClassName()); - Deserializer partDeserializer = (Deserializer) serdeclass.newInstance(); - SerDeUtils.initializeSerDe(partDeserializer, hiveConf, part.getTableDesc().getProperties(), - partProps); - StructObjectInspector partRawRowObjectInspector = (StructObjectInspector) partDeserializer - .getObjectInspector(); - - deserializer = partDeserializer; - - // Check to see if this split is part of a partition of a table - String pcols = partProps.getProperty(hive_metastoreConstants.META_TABLE_PARTITION_COLUMNS); - - String[] partKeys = null; - if (pcols != null && pcols.length() > 0) { - - // Partitions exist for this table. Get the partition object inspector and - // raw row object inspector (row with out partition col) - LinkedHashMap partSpec = part.getPartSpec(); - partKeys = pcols.trim().split("/"); - String pcolTypes = partProps.getProperty(hive_metastoreConstants.META_TABLE_PARTITION_COLUMN_TYPES); - String[] partKeyTypes = pcolTypes.trim().split(":"); - - if (partKeys.length > partKeyTypes.length) { - throw new HiveException("Internal error : partKeys length, " +partKeys.length + - " greater than partKeyTypes length, " + partKeyTypes.length); - } - - List partNames = new ArrayList(partKeys.length); - List partObjectInspectors = new ArrayList(partKeys.length); - partitionValues = new LinkedHashMap(); - partitionTypes = new LinkedHashMap(); - for (int i = 0; i < partKeys.length; i++) { - String key = partKeys[i]; - partNames.add(key); - ObjectInspector objectInspector = null; - Object objectVal; - if (partSpec == null) { - // for partitionless table, initialize partValue to empty string. - // We can have partitionless table even if we have partition keys - // when there is only only partition selected and the partition key is not - // part of the projection/include list. 
- objectVal = null; - objectInspector = PrimitiveObjectInspectorFactory.javaStringObjectInspector; - partitionTypes.put(key, PrimitiveCategory.STRING); - } else { - // Create a Standard java object Inspector - objectInspector = TypeInfoUtils.getStandardJavaObjectInspectorFromTypeInfo( - TypeInfoFactory.getPrimitiveTypeInfo(partKeyTypes[i])); - objectVal = - ObjectInspectorConverters. - getConverter(PrimitiveObjectInspectorFactory. - javaStringObjectInspector, objectInspector). - convert(partSpec.get(key)); - partitionTypes.put(key, TypeInfoFactory.getPrimitiveTypeInfo(partKeyTypes[i]).getPrimitiveCategory()); - } - if (LOG.isDebugEnabled()) { - LOG.debug("Partition column: name: " + key + ", value: " + objectVal + ", type: " + partitionTypes.get(key)); - } - partitionValues.put(key, objectVal); - partObjectInspectors.add(objectInspector); - } - - // Create partition OI - StructObjectInspector partObjectInspector = ObjectInspectorFactory - .getStandardStructObjectInspector(partNames, partObjectInspectors); - - // Get row OI from partition OI and raw row OI - StructObjectInspector rowObjectInspector = ObjectInspectorFactory - .getUnionStructObjectInspector(Arrays - .asList(new StructObjectInspector[] {partRawRowObjectInspector, partObjectInspector})); - rowOI = rowObjectInspector; - rawRowOI = partRawRowObjectInspector; - - // We have to do this after we've set rowOI, as getColIndexBasedOnColName uses it - partitionCols = new HashSet(); - if (pcols != null && pcols.length() > 0) { - for (int i = 0; i < partKeys.length; i++) { - partitionCols.add(getColIndexBasedOnColName(partKeys[i])); - } - } + getPartitionValues(vrbCtx, partDesc, partitionValues); - } else { + } - // No partitions for this table, hence row OI equals raw row OI - rowOI = partRawRowObjectInspector; - rawRowOI = partRawRowObjectInspector; + public static void getPartitionValues(VectorizedRowBatchCtx vrbCtx, PartitionDesc partDesc, + Object[] partitionValues) { + + LinkedHashMap partSpec = partDesc.getPartSpec(); + + for (int i = 0; i < vrbCtx.partitionColumnCount; i++) { + Object objectValue; + if (partSpec == null) { + // For partition-less table, initialize partValue to empty string. + // We can have partition-less table even if we have partition keys + // when there is only only partition selected and the partition key is not + // part of the projection/include list. + objectValue = null; + } else { + String key = vrbCtx.rowColumnNames[vrbCtx.nonPartitionColumnCount + i]; + + // Create a Standard java object Inspector + ObjectInspector objectInspector = + TypeInfoUtils.getStandardJavaObjectInspectorFromTypeInfo( + vrbCtx.rowColumnTypeInfos[vrbCtx.nonPartitionColumnCount + i]); + objectValue = + ObjectInspectorConverters. + getConverter(PrimitiveObjectInspectorFactory. + javaStringObjectInspector, objectInspector). + convert(partSpec.get(key)); + } + partitionValues[i] = objectValue; } - - colsToInclude = ColumnProjectionUtils.getReadColumnIDs(hiveConf); } - + /** * Creates a Vectorized row batch and the column vectors. * * @return VectorizedRowBatch * @throws HiveException */ - public VectorizedRowBatch createVectorizedRowBatch() throws HiveException + public VectorizedRowBatch createVectorizedRowBatch() { - List fieldRefs = rowOI.getAllStructFieldRefs(); - VectorizedRowBatch result = new VectorizedRowBatch(fieldRefs.size()); - for (int j = 0; j < fieldRefs.size(); j++) { - // If the column is included in the include list or if the column is a - // partition column then create the column vector. 
Also note that partition columns are not - // in the included list. - if ((colsToInclude == null) || colsToInclude.contains(j) - || ((partitionValues != null) && - partitionValues.containsKey(fieldRefs.get(j).getFieldName()))) { - ObjectInspector foi = fieldRefs.get(j).getFieldObjectInspector(); - switch (foi.getCategory()) { - case PRIMITIVE: { - PrimitiveObjectInspector poi = (PrimitiveObjectInspector) foi; - // Vectorization currently only supports the following data types: - // BOOLEAN, BYTE, SHORT, INT, LONG, FLOAT, DOUBLE, BINARY, STRING, CHAR, VARCHAR, TIMESTAMP, - // DATE and DECIMAL - switch (poi.getPrimitiveCategory()) { - case BOOLEAN: - case BYTE: - case SHORT: - case INT: - case LONG: - case TIMESTAMP: - case DATE: - case INTERVAL_YEAR_MONTH: - case INTERVAL_DAY_TIME: - result.cols[j] = new LongColumnVector(VectorizedRowBatch.DEFAULT_SIZE); - break; - case FLOAT: - case DOUBLE: - result.cols[j] = new DoubleColumnVector(VectorizedRowBatch.DEFAULT_SIZE); - break; - case BINARY: - case STRING: - case CHAR: - case VARCHAR: - result.cols[j] = new BytesColumnVector(VectorizedRowBatch.DEFAULT_SIZE); - break; - case DECIMAL: - DecimalTypeInfo tInfo = (DecimalTypeInfo) poi.getTypeInfo(); - result.cols[j] = new DecimalColumnVector(VectorizedRowBatch.DEFAULT_SIZE, - tInfo.precision(), tInfo.scale()); - break; - default: - throw new RuntimeException("Vectorizaton is not supported for datatype:" - + poi.getPrimitiveCategory()); - } - break; - } - case LIST: - case MAP: - case STRUCT: - case UNION: - throw new HiveException("Vectorizaton is not supported for datatype:" - + foi.getCategory()); - default: - throw new HiveException("Unknown ObjectInspector category!"); - } - } + int totalColumnCount = rowColumnTypeInfos.length + scratchColumnTypeNames.length; + VectorizedRowBatch result = new VectorizedRowBatch(totalColumnCount); + + for (int i = 0; i < rowColumnTypeInfos.length; i++) { + TypeInfo typeInfo = rowColumnTypeInfos[i]; + result.cols[i] = allocateColumnVector(typeInfo, result.DEFAULT_SIZE); + } + + for (int i = 0; i < scratchColumnTypeNames.length; i++) { + String typeName = scratchColumnTypeNames[i]; + result.cols[rowColumnTypeInfos.length + i] = + allocateColumnVector(typeName, result.DEFAULT_SIZE); } - result.numCols = fieldRefs.size(); - this.addScratchColumnsToBatch(result); + + result.setPartitionInfo(nonPartitionColumnCount, partitionColumnCount); + result.reset(); return result; } - /** - * Adds the row to the batch after deserializing the row - * - * @param rowIndex - * Row index in the batch to which the row is added - * @param rowBlob - * Row blob (serialized version of row) - * @param batch - * Vectorized batch to which the row is added - * @param buffer a buffer to copy strings into - * @throws HiveException - * @throws SerDeException - */ - public void addRowToBatch(int rowIndex, Writable rowBlob, - VectorizedRowBatch batch, - DataOutputBuffer buffer - ) throws HiveException, SerDeException + public VectorizedRowBatch createVectorizedRowBatch(boolean[] columnsToIncludeTruncated) + throws HiveException { - Object row = this.deserializer.deserialize(rowBlob); - VectorizedBatchUtil.addRowToBatch(row, this.rawRowOI, rowIndex, batch, buffer); - } - - /** - * Deserialized set of rows and populates the batch - * - * @param rowBlob - * to deserialize - * @param batch - * Vectorized row batch which contains deserialized data - * @throws SerDeException - */ - public void convertRowBatchBlobToVectorizedBatch(Object rowBlob, int rowsInBlob, - VectorizedRowBatch batch) - throws 
SerDeException { - - if (deserializer instanceof VectorizedSerde) { - ((VectorizedSerde) deserializer).deserializeVector(rowBlob, rowsInBlob, batch); - } else { - throw new SerDeException( - "Not able to deserialize row batch. Serde does not implement VectorizedSerde"); + if (columnsToIncludeTruncated == null) { + return createVectorizedRowBatch(); } - } - private int getColIndexBasedOnColName(String colName) throws HiveException - { - List fieldRefs = rowOI.getAllStructFieldRefs(); - for (int i = 0; i < fieldRefs.size(); i++) { - if (fieldRefs.get(i).getFieldName().equals(colName)) { - return i; + int totalColumnCount = rowColumnTypeInfos.length + scratchColumnTypeNames.length; + VectorizedRowBatch result = new VectorizedRowBatch(totalColumnCount); + + for (int i = 0; i < columnsToIncludeTruncated.length; i++) { + if (columnsToIncludeTruncated[i]) { + TypeInfo typeInfo = rowColumnTypeInfos[i]; + result.cols[i] = allocateColumnVector(typeInfo, result.DEFAULT_SIZE); } } - throw new HiveException("Not able to find column name in row object inspector"); + + for (int i = nonPartitionColumnCount; i < nonPartitionColumnCount + partitionColumnCount; i++) { + TypeInfo typeInfo = rowColumnTypeInfos[i]; + result.cols[i] = allocateColumnVector(typeInfo, result.DEFAULT_SIZE); + } + + for (int i = 0; i < scratchColumnTypeNames.length; i++) { + String typeName = scratchColumnTypeNames[i]; + result.cols[rowColumnTypeInfos.length + i] = + allocateColumnVector(typeName, result.DEFAULT_SIZE); + } + + result.reset(); + return result; } - + /** * Add the partition values to the batch * * @param batch * @throws HiveException */ - public void addPartitionColsToBatch(VectorizedRowBatch batch) throws HiveException + public void addPartitionColsToBatch(VectorizedRowBatch batch, Object[] partitionValues) + throws HiveException { - int colIndex; - Object value; - PrimitiveCategory pCategory; if (partitionValues != null) { - for (String key : partitionValues.keySet()) { - colIndex = getColIndexBasedOnColName(key); - value = partitionValues.get(key); - pCategory = partitionTypes.get(key); - - switch (pCategory) { + for (int i = 0; i < partitionColumnCount; i++) { + Object value = partitionValues[i]; + + int colIndex = nonPartitionColumnCount + i; + String partitionColumnName = rowColumnNames[colIndex]; + PrimitiveTypeInfo primitiveTypeInfo = (PrimitiveTypeInfo) rowColumnTypeInfos[colIndex]; + switch (primitiveTypeInfo.getPrimitiveCategory()) { case BOOLEAN: { LongColumnVector lcv = (LongColumnVector) batch.cols[colIndex]; if (value == null) { @@ -568,7 +421,7 @@ public void addPartitionColsToBatch(VectorizedRowBatch batch) throws HiveExcepti HiveDecimal hd = (HiveDecimal) value; dv.set(0, hd); dv.isRepeating = true; - dv.isNull[0] = false; + dv.isNull[0] = false; } } break; @@ -602,10 +455,10 @@ public void addPartitionColsToBatch(VectorizedRowBatch batch) throws HiveExcepti } } break; - + default: - throw new HiveException("Unable to recognize the partition type " + pCategory + - " for column " + key); + throw new HiveException("Unable to recognize the partition type " + primitiveTypeInfo.getPrimitiveCategory() + + " for column " + partitionColumnName); } } } @@ -613,64 +466,56 @@ public void addPartitionColsToBatch(VectorizedRowBatch batch) throws HiveExcepti /** * Determine whether a given column is a partition column - * @param colnum column number in + * @param colNum column number in * {@link org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch}s created by this context. 
* @return true if it is a partition column, false otherwise */ - public final boolean isPartitionCol(int colnum) { - return (partitionCols == null) ? false : partitionCols.contains(colnum); + public final boolean isPartitionCol(int colNum) { + return colNum >= nonPartitionColumnCount && colNum < rowColumnTypeInfos.length; } - private void addScratchColumnsToBatch(VectorizedRowBatch vrb) throws HiveException { - if (scratchColumnTypeMap != null && !scratchColumnTypeMap.isEmpty()) { - int origNumCols = vrb.numCols; - int newNumCols = vrb.cols.length+scratchColumnTypeMap.keySet().size(); - vrb.cols = Arrays.copyOf(vrb.cols, newNumCols); - for (int i = origNumCols; i < newNumCols; i++) { - String typeName = scratchColumnTypeMap.get(i); - if (typeName == null) { - throw new HiveException("No type entry found for column " + i + " in map " + scratchColumnTypeMap.toString()); - } - vrb.cols[i] = allocateColumnVector(typeName, - VectorizedRowBatch.DEFAULT_SIZE); - } - vrb.numCols = vrb.cols.length; + public static ColumnVector allocateColumnVector(String typeName, int defaultSize) { + typeName = typeName.toLowerCase(); + + // Allow undecorated CHAR and VARCHAR to support scratch column type names. + if (typeName.equals("char") || typeName.equals("varchar")) { + return new BytesColumnVector(defaultSize); + } else if (typeName.equals("long")) { + typeName = "bigint"; } - } - /** - * Get the scale and precision for the given decimal type string. The decimal type is assumed to be - * of the format decimal(precision,scale) e.g. decimal(20,10). - * @param decimalType The given decimal type string. - * @return An integer array of size 2 with first element set to precision and second set to scale. - */ - private static int[] getScalePrecisionFromDecimalType(String decimalType) { - Pattern p = Pattern.compile("\\d+"); - Matcher m = p.matcher(decimalType); - m.find(); - int precision = Integer.parseInt(m.group()); - m.find(); - int scale = Integer.parseInt(m.group()); - int [] precScale = { precision, scale }; - return precScale; + TypeInfo typeInfo = (TypeInfo) TypeInfoUtils.getTypeInfoFromTypeString(typeName); + return allocateColumnVector(typeInfo, defaultSize); } - public static ColumnVector allocateColumnVector(String type, int defaultSize) { - if (type.equalsIgnoreCase("double")) { - return new DoubleColumnVector(defaultSize); - } else if (VectorizationContext.isStringFamily(type)) { - return new BytesColumnVector(defaultSize); - } else if (VectorizationContext.decimalTypePattern.matcher(type).matches()){ - int [] precisionScale = getScalePrecisionFromDecimalType(type); - return new DecimalColumnVector(defaultSize, precisionScale[0], precisionScale[1]); - } else if (type.equalsIgnoreCase("long") || - type.equalsIgnoreCase("date") || - type.equalsIgnoreCase("timestamp") || - type.equalsIgnoreCase(serdeConstants.INTERVAL_YEAR_MONTH_TYPE_NAME) || - type.equalsIgnoreCase(serdeConstants.INTERVAL_DAY_TIME_TYPE_NAME)) { - return new LongColumnVector(defaultSize); - } else { - throw new RuntimeException("Cannot allocate vector column for " + type); + public static ColumnVector allocateColumnVector(TypeInfo typeInfo, int defaultSize) { + switch (typeInfo.getCategory()) { + case PRIMITIVE: { + Type vectorColumnType = VectorizationContext.getColumnVectorTypeFromTypeInfo(typeInfo); + switch (vectorColumnType) { + case LONG: + return new LongColumnVector(defaultSize); + case DOUBLE: + return new DoubleColumnVector(defaultSize); + case BYTES: + return new BytesColumnVector(defaultSize); + case DECIMAL: + 
DecimalTypeInfo decimalTypeInfo = (DecimalTypeInfo) typeInfo; + return new DecimalColumnVector(defaultSize, + decimalTypeInfo.precision(), decimalTypeInfo.scale()); + default: + throw new RuntimeException("Unknown vector column type " + vectorColumnType.name()); + } + } + case LIST: + case MAP: + case STRUCT: + case UNION: + // We assume here that since vectorization doesn't support complex types yet, the column + // will not be used. + return null; + default: + throw new RuntimeException("Unknown type category " + typeInfo.getCategory().name()); } } } diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinCommonOperator.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinCommonOperator.java index 87ebcf2..52e7e76 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinCommonOperator.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinCommonOperator.java @@ -665,22 +665,12 @@ protected HashTableLoader getHashTableLoader(Configuration hconf) { * build join output results in. */ protected VectorizedRowBatch setupOverflowBatch() throws HiveException { + + int initialColumnCount = vContext.firstOutputColumnIndex(); VectorizedRowBatch overflowBatch; - Map scratchColumnTypeMap = vOutContext.getScratchColumnTypeMap(); - int maxColumn = 0; - for (int i = 0; i < outputProjection.length; i++) { - int outputColumn = outputProjection[i]; - if (maxColumn < outputColumn) { - maxColumn = outputColumn; - } - } - for (int outputColumn : scratchColumnTypeMap.keySet()) { - if (maxColumn < outputColumn) { - maxColumn = outputColumn; - } - } - overflowBatch = new VectorizedRowBatch(maxColumn + 1); + int totalNumColumns = initialColumnCount + vOutContext.getScratchColumnTypeNames().length; + overflowBatch = new VectorizedRowBatch(totalNumColumns); // First, just allocate just the projection columns we will be using. for (int i = 0; i < outputProjection.length; i++) { @@ -690,9 +680,9 @@ protected VectorizedRowBatch setupOverflowBatch() throws HiveException { } // Now, add any scratch columns needed for children operators. - for (int outputColumn : scratchColumnTypeMap.keySet()) { - String typeName = scratchColumnTypeMap.get(outputColumn); - allocateOverflowBatchColumnVector(overflowBatch, outputColumn, typeName); + int outputColumn = initialColumnCount; + for (String typeName : vOutContext.getScratchColumnTypeNames()) { + allocateOverflowBatchColumnVector(overflowBatch, outputColumn++, typeName); } overflowBatch.projectedColumns = outputProjection; diff --git ql/src/java/org/apache/hadoop/hive/ql/io/VectorizedRCFileInputFormat.java ql/src/java/org/apache/hadoop/hive/ql/io/VectorizedRCFileInputFormat.java deleted file mode 100644 index e9e1d5a..0000000 --- ql/src/java/org/apache/hadoop/hive/ql/io/VectorizedRCFileInputFormat.java +++ /dev/null @@ -1,81 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.hive.ql.io; - -import java.io.IOException; -import java.util.ArrayList; -import java.util.List; - -import org.apache.hadoop.fs.FileStatus; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.hive.conf.HiveConf; -import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; -import org.apache.hadoop.io.NullWritable; -import org.apache.hadoop.io.SequenceFile; -import org.apache.hadoop.mapred.FileInputFormat; -import org.apache.hadoop.mapred.FileSplit; -import org.apache.hadoop.mapred.InputSplit; -import org.apache.hadoop.mapred.JobConf; -import org.apache.hadoop.mapred.RecordReader; -import org.apache.hadoop.mapred.Reporter; - -/** - * A MapReduce/Hive Vectorized input format for RC files. - */ -public class VectorizedRCFileInputFormat extends FileInputFormat - implements InputFormatChecker { - - public VectorizedRCFileInputFormat() { - setMinSplitSize(SequenceFile.SYNC_INTERVAL); - } - - @Override - @SuppressWarnings("unchecked") - public RecordReader getRecordReader(InputSplit split, JobConf job, - Reporter reporter) throws IOException { - - reporter.setStatus(split.toString()); - - return new VectorizedRCFileRecordReader(job, (FileSplit) split); - } - - @Override - public boolean validateInput(FileSystem fs, HiveConf conf, - List files) throws IOException { - if (files.size() <= 0) { - return false; - } - for (int fileId = 0; fileId < files.size(); fileId++) { - RCFile.Reader reader = null; - try { - reader = new RCFile.Reader(fs, files.get(fileId) - .getPath(), conf); - reader.close(); - reader = null; - } catch (IOException e) { - return false; - } finally { - if (null != reader) { - reader.close(); - } - } - } - return true; - } -} diff --git ql/src/java/org/apache/hadoop/hive/ql/io/VectorizedRCFileRecordReader.java ql/src/java/org/apache/hadoop/hive/ql/io/VectorizedRCFileRecordReader.java deleted file mode 100644 index 4cc1c2f..0000000 --- ql/src/java/org/apache/hadoop/hive/ql/io/VectorizedRCFileRecordReader.java +++ /dev/null @@ -1,261 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.hadoop.hive.ql.io; - -import java.io.IOException; -import java.util.Collections; -import java.util.Map; -import java.util.WeakHashMap; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.hive.conf.HiveConf; -import org.apache.hadoop.hive.ql.exec.vector.VectorizedBatchUtil; -import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; -import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatchCtx; -import org.apache.hadoop.hive.ql.io.RCFile.KeyBuffer; -import org.apache.hadoop.hive.ql.io.RCFile.Reader; -import org.apache.hadoop.hive.ql.metadata.HiveException; -import org.apache.hadoop.hive.serde2.columnar.BytesRefArrayWritable; -import org.apache.hadoop.io.DataOutputBuffer; -import org.apache.hadoop.io.LongWritable; -import org.apache.hadoop.io.NullWritable; -import org.apache.hadoop.io.Writable; -import org.apache.hadoop.mapred.FileSplit; -import org.apache.hadoop.mapred.RecordReader; - -/** - * RCFileRecordReader. - */ -public class VectorizedRCFileRecordReader implements RecordReader { - - private final Reader in; - private final long start; - private final long end; - private boolean more = true; - protected Configuration conf; - private final FileSplit split; - private final boolean useCache; - private VectorizedRowBatchCtx rbCtx; - private final LongWritable keyCache = new LongWritable(); - private final BytesRefArrayWritable colsCache = new BytesRefArrayWritable(); - private boolean addPartitionCols = true; - private final DataOutputBuffer buffer = new DataOutputBuffer(); - - private static RCFileSyncCache syncCache = new RCFileSyncCache(); - - private static final class RCFileSyncEntry { - long end; - long endSync; - } - - private static final class RCFileSyncCache { - - private final Map cache; - - public RCFileSyncCache() { - cache = Collections.synchronizedMap(new WeakHashMap()); - } - - public void put(FileSplit split, long endSync) { - Path path = split.getPath(); - long end = split.getStart() + split.getLength(); - String key = path.toString() + "+" + String.format("%d", end); - - RCFileSyncEntry entry = new RCFileSyncEntry(); - entry.end = end; - entry.endSync = endSync; - if (entry.endSync >= entry.end) { - cache.put(key, entry); - } - } - - public long get(FileSplit split) { - Path path = split.getPath(); - long start = split.getStart(); - String key = path.toString() + "+" + String.format("%d", start); - RCFileSyncEntry entry = cache.get(key); - if (entry != null) { - return entry.endSync; - } - return -1; - } - } - - public VectorizedRCFileRecordReader(Configuration conf, FileSplit split) - throws IOException { - - Path path = split.getPath(); - FileSystem fs = path.getFileSystem(conf); - this.in = new RCFile.Reader(fs, path, conf); - this.end = split.getStart() + split.getLength(); - this.conf = conf; - this.split = split; - - useCache = HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVEUSERCFILESYNCCACHE); - - if (split.getStart() > in.getPosition()) { - long oldSync = useCache ? 
syncCache.get(split) : -1; - if (oldSync == -1) { - in.sync(split.getStart()); // sync to start - } else { - in.seek(oldSync); - } - } - - this.start = in.getPosition(); - - more = start < end; - try { - rbCtx = new VectorizedRowBatchCtx(); - rbCtx.init(conf, split); - } catch (Exception e) { - throw new RuntimeException(e); - } - } - - public Class getKeyClass() { - return LongWritable.class; - } - - public Class getValueClass() { - return BytesRefArrayWritable.class; - } - - @Override - public NullWritable createKey() { - return NullWritable.get(); - } - - @Override - public VectorizedRowBatch createValue() { - VectorizedRowBatch result; - try { - result = rbCtx.createVectorizedRowBatch(); - } catch (HiveException e) { - throw new RuntimeException("Error creating a batch", e); - } - return result; - } - - public boolean nextBlock() throws IOException { - return in.nextBlock(); - } - - @Override - public boolean next(NullWritable key, VectorizedRowBatch value) throws IOException { - - // Reset column fields noNull values to true - VectorizedBatchUtil.setNoNullFields(value); - buffer.reset(); - value.selectedInUse = false; - for (int i = 0; i < value.numCols; i++) { - value.cols[i].isRepeating = false; - } - - int i = 0; - try { - - for (; i < VectorizedRowBatch.DEFAULT_SIZE; i++) { - more = next(keyCache); - if (more) { - // Check and update partition cols if necessary. Ideally this should be done - // in CreateValue() as the partition is constant per split. But since Hive uses - // CombineHiveRecordReader and as this does not call CreateValue() for - // each new RecordReader it creates, this check is required in next() - if (addPartitionCols) { - rbCtx.addPartitionColsToBatch(value); - addPartitionCols = false; - } - in.getCurrentRow(colsCache); - // Currently RCFile reader does not support reading vectorized - // data. Populating the batch by adding one row at a time. - rbCtx.addRowToBatch(i, (Writable) colsCache, value, buffer); - } else { - break; - } - } - } catch (Exception e) { - throw new RuntimeException("Error while getting next row", e); - } - value.size = i; - return more; - } - - protected boolean next(LongWritable key) throws IOException { - if (!more) { - return false; - } - - more = in.next(key); - - long lastSeenSyncPos = in.lastSeenSyncPos(); - - if (lastSeenSyncPos >= end) { - if (useCache) { - syncCache.put(split, lastSeenSyncPos); - } - more = false; - return more; - } - return more; - } - - /** - * Return the progress within the input split. 
- * - * @return 0.0 to 1.0 of the input byte range - */ - public float getProgress() throws IOException { - if (end == start) { - return 0.0f; - } else { - return Math.min(1.0f, (in.getPosition() - start) / (float) (end - start)); - } - } - - public long getPos() throws IOException { - return in.getPosition(); - } - - public KeyBuffer getKeyBuffer() { - return in.getCurrentKeyBufferObj(); - } - - protected void seek(long pos) throws IOException { - in.seek(pos); - } - - public void sync(long pos) throws IOException { - in.sync(pos); - } - - public void resetBuffer() { - in.resetBuffer(); - } - - public long getStart() { - return start; - } - - public void close() throws IOException { - in.close(); - } -} diff --git ql/src/java/org/apache/hadoop/hive/ql/io/orc/ConversionTreeReaderFactory.java ql/src/java/org/apache/hadoop/hive/ql/io/orc/ConversionTreeReaderFactory.java deleted file mode 100644 index aaf4eb4..0000000 --- ql/src/java/org/apache/hadoop/hive/ql/io/orc/ConversionTreeReaderFactory.java +++ /dev/null @@ -1,38 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.hadoop.hive.ql.io.orc; - -import java.io.IOException; -import java.util.List; - -/** - * Factory for creating ORC tree readers. These tree readers can handle type promotions and type - * conversions. - */ -public class ConversionTreeReaderFactory extends TreeReaderFactory { - - // TODO: This is currently only a place holder for type conversions. - - public static TreeReader createTreeReader(int columnId, - List types, - boolean[] included, - boolean skipCorrupt - ) throws IOException { - return TreeReaderFactory.createTreeReader(columnId, types, included, skipCorrupt); - } -} diff --git ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java index ef62a23..361a9db 100644 --- ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java +++ ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java @@ -1027,7 +1027,7 @@ private void populateAndCacheStripeDetails() throws IOException { Reader orcReader = createOrcReader(); stripes = orcReader.getStripes(); metadata = orcReader.getMetadata(); - types = orcReader.getTypes(); + types = orcReader.getFileTypes(); writerVersion = orcReader.getWriterVersion(); fileMetaInfo = context.footerInSplits ? ((ReaderImpl) orcReader).getFileMetaInfo() : null; diff --git ql/src/java/org/apache/hadoop/hive/ql/io/orc/Reader.java ql/src/java/org/apache/hadoop/hive/ql/io/orc/Reader.java index 187924d..2a5a922 100644 --- ql/src/java/org/apache/hadoop/hive/ql/io/orc/Reader.java +++ ql/src/java/org/apache/hadoop/hive/ql/io/orc/Reader.java @@ -134,6 +134,13 @@ * type in the list. 
* @return the list of flattened types */ + List getFileTypes(); + + /** + * Get the list of schema evolution types that will be returned by readers. The root type is + * the first type in the list. + * @return the list of flattened types + */ List getTypes(); /** diff --git ql/src/java/org/apache/hadoop/hive/ql/io/orc/ReaderImpl.java ql/src/java/org/apache/hadoop/hive/ql/io/orc/ReaderImpl.java index a36027e..07ae453 100644 --- ql/src/java/org/apache/hadoop/hive/ql/io/orc/ReaderImpl.java +++ ql/src/java/org/apache/hadoop/hive/ql/io/orc/ReaderImpl.java @@ -24,6 +24,8 @@ import java.util.Arrays; import java.util.Collections; import java.util.List; +import java.util.Map; +import java.util.Properties; import java.util.Set; import org.apache.commons.logging.Log; @@ -33,15 +35,36 @@ import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hive.common.DiskRange; +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.conf.HiveConf.ConfVars; +import org.apache.hadoop.hive.metastore.api.hive_metastoreConstants; +import org.apache.hadoop.hive.ql.exec.Utilities; import org.apache.hadoop.hive.ql.io.FileFormatException; +import org.apache.hadoop.hive.ql.io.HiveFileFormatUtils; +import org.apache.hadoop.hive.ql.io.IOPrepareCache; import org.apache.hadoop.hive.ql.io.orc.OrcFile.WriterVersion; import org.apache.hadoop.hive.ql.io.orc.OrcProto.Footer; import org.apache.hadoop.hive.ql.io.orc.OrcProto.Type; import org.apache.hadoop.hive.ql.io.orc.OrcProto.UserMetadataItem; import org.apache.hadoop.hive.ql.io.orc.RecordReaderImpl.BufferChunk; import org.apache.hadoop.hive.ql.io.sarg.SearchArgument; +import org.apache.hadoop.hive.ql.metadata.VirtualColumn; +import org.apache.hadoop.hive.ql.plan.MapWork; +import org.apache.hadoop.hive.ql.plan.PartitionDesc; import org.apache.hadoop.hive.ql.util.JavaDataModel; +import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.MapObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.StructField; +import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.UnionObjectInspector; +import org.apache.hadoop.hive.serde2.typeinfo.CharTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.DecimalTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils; +import org.apache.hadoop.hive.serde2.typeinfo.VarcharTypeInfo; import org.apache.hadoop.io.Text; import com.google.common.collect.Lists; @@ -68,6 +91,29 @@ private final List versionList; private final OrcFile.WriterVersion writerVersion; + /** + * The types in the file. + */ + private final List fileTypes; + + /** + * The types we will actually read from the file. + * + * If the configuration column names/types are fewer than the file types, the types we want + * to read internally will fewer. + */ + private final List readerTypes; + + /** + * The desired result column names/types from the (optional) configuration names/types. + */ + private final List schemaEvolutionTypes; + + /** + * The subtype of the row struct. Different than 0 for ACID. 
+ */ + private final int schemaEvolutionStructSubtype; + //serialized footer - Keeping this around for use by getFileMetaInfo() // will help avoid cpu cycles spend in deserializing at cost of increased // memory footprint. @@ -181,8 +227,13 @@ public long getContentLength() { } @Override + public List getFileTypes() { + return fileTypes; + } + + @Override public List getTypes() { - return footer.getTypesList(); + return schemaEvolutionTypes; } @Override @@ -326,9 +377,20 @@ public ReaderImpl(Path path, OrcFile.ReaderOptions options) throws IOException { this.metadataSize = rInfo.metadataSize; this.metadata = rInfo.metadata; this.footer = rInfo.footer; - this.inspector = rInfo.inspector; this.versionList = footerMetaData.versionList; this.writerVersion = footerMetaData.writerVersion; + + this.fileTypes = this.footer.getTypesList(); + + SchemaEvolution schemaEvolution = + new SchemaEvolution(this.fileTypes, path); + + this.readerTypes = schemaEvolution.readerTypes; + + this.schemaEvolutionTypes = schemaEvolution.schemaEvolutionTypes; + this.schemaEvolutionStructSubtype = schemaEvolution.schemaEvolutionStructSubtype; + + this.inspector = OrcStruct.createObjectInspector(0, schemaEvolutionTypes); } /** @@ -519,9 +581,8 @@ private static FileMetaInfo extractMetaInfoFromFooter(FileSystem fs, final int metadataSize; final OrcProto.Metadata metadata; final OrcProto.Footer footer; - final ObjectInspector inspector; - MetaInfoObjExtractor(String codecStr, int bufferSize, int metadataSize, + MetaInfoObjExtractor(String codecStr, int bufferSize, int metadataSize, ByteBuffer footerBuffer) throws IOException { this.compressionKind = CompressionKind.valueOf(codecStr); @@ -537,7 +598,6 @@ private static FileMetaInfo extractMetaInfoFromFooter(FileSystem fs, footerBuffer, position + metadataSize, footerBufferSize, codec, bufferSize); footerBuffer.position(position); - this.inspector = OrcStruct.createObjectInspector(0, footer.getTypesList()); } } @@ -627,15 +687,17 @@ public RecordReader rows() throws IOException { public RecordReader rowsOptions(Options options) throws IOException { LOG.info("Reading ORC rows from " + path + " with " + options); boolean[] include = options.getInclude(); + // if included columns is null, then include all columns if (include == null) { - include = new boolean[footer.getTypesCount()]; + include = new boolean[readerTypes.size()]; Arrays.fill(include, true); options.include(include); } + return new RecordReaderImpl(this.getStripes(), fileSystem, path, - options, footer.getTypesList(), codec, bufferSize, - footer.getRowIndexStride(), conf); + options, readerTypes, schemaEvolutionTypes, schemaEvolutionStructSubtype, + codec, bufferSize, footer.getRowIndexStride(), conf); } @@ -815,4 +877,468 @@ public MetadataReader metadata() throws IOException { public Footer getFooter() { return footer; } + + static List getAcidEventFields() { + return Lists.newArrayList("operation", "originalTransaction", "bucket", + "rowId", "currentTransaction", "row"); + } + + /** + * Take the file types and the (optional) configuration column names/types and see if there + * has been schema evolution. + */ + private class SchemaEvolution { + + /** + * The types we will actually read from the file. + * + * If the configuration column names/types are fewer than the file types, the types we want + * to read internally will fewer. + */ + final List readerTypes; + + /** + * The desired result column names/types from the (optional) configuration names/types. 
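The Reader interface now exposes two schemas: getFileTypes() returns the flattened types exactly as the writer stored them, while getTypes() returns the types after schema evolution has been applied. A minimal sketch of how a caller could tell whether evolution is in play (the helper class is illustrative, not part of the patch; it assumes a Reader obtained from the existing OrcFile factory):

import java.util.List;

import org.apache.hadoop.hive.ql.io.orc.OrcProto;
import org.apache.hadoop.hive.ql.io.orc.Reader;

/** Illustrative helper, not part of the patch. */
final class SchemaEvolutionProbe {
  private SchemaEvolutionProbe() {}

  /** True when the schema handed to readers differs from what the writer stored. */
  static boolean hasEvolved(Reader reader) {
    List<OrcProto.Type> fileTypes = reader.getFileTypes();  // flattened types as written in the file
    List<OrcProto.Type> readerTypes = reader.getTypes();    // flattened types after schema evolution
    return !fileTypes.equals(readerTypes);
  }
}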
+ */ + final List schemaEvolutionTypes; + + /** + * The subtype of the row struct. Different than 0 for ACID. + */ + final int schemaEvolutionStructSubtype; + + public SchemaEvolution(List fileTypes, Path path) throws IOException { + + List types = null; + + if (!HiveConf.getBoolVar(conf,ConfVars.HIVE_ORC_SCHEMA_EVOLUTION) || + !fileTypes.get(0).getKind().equals(OrcProto.Type.Kind.STRUCT)) { + + // No schema evolution requested or non-STRUCT. + readerTypes = fileTypes; + schemaEvolutionTypes = fileTypes; + schemaEvolutionStructSubtype = -1; + + LOG.info("No schema evolution requested or non-STRUCT: " + fileTypes.toString()); + + } else { + StructObjectInspector desiredObjectInspector = getDesiredRowObjectInspector(conf, path); + if (desiredObjectInspector == null) { + + // No desired schema found in configuration properties. + readerTypes = fileTypes; + schemaEvolutionTypes = fileTypes; + schemaEvolutionStructSubtype = -1; + + LOG.info("No desired schema for schema evolution found in configuration properties: " + fileTypes.toString()); + + } else { + + // For ACID, the row is the ROW field in the outer STRUCT. + final boolean isAcid = checkAcidSchema(fileTypes); + final List rowSchema; + int rowSubtype; + if (isAcid) { + rowSubtype = OrcRecordUpdater.ROW + 1; + rowSchema = fileTypes.subList(rowSubtype, fileTypes.size()); + } else { + rowSubtype = 0; + rowSchema = fileTypes; + } + + // Do checking on the overlap. Additional columns will be defaulted to NULL. + final List desiredRowSchema = getOrcTypes(desiredObjectInspector); + + int numFileColumns = rowSchema.get(0).getSubtypesCount(); + int numDesiredColumns = desiredRowSchema.get(0).getSubtypesCount(); + + int numReadColumns = Math.min(numFileColumns, numDesiredColumns); + + // Check type promotion. ORC can only support type promotions for integer types + // short -> int -> bigint as same integer readers are used for the above types. + + for (int i = 0; i < numReadColumns; i++) { + OrcProto.Type fColType = fileTypes.get(rowSubtype + i); + OrcProto.Type rColType = desiredRowSchema.get(i); + if (!fColType.getKind().equals(rColType.getKind())) { + + boolean ok = false; + if (fColType.getKind().equals(OrcProto.Type.Kind.SHORT)) { + + if (rColType.getKind().equals(OrcProto.Type.Kind.INT) || + rColType.getKind().equals(OrcProto.Type.Kind.LONG)) { + // type promotion possible, converting SHORT to INT/LONG requested type + ok = true; + } + } else if (fColType.getKind().equals(OrcProto.Type.Kind.INT)) { + + if (rColType.getKind().equals(OrcProto.Type.Kind.LONG)) { + // type promotion possible, converting INT to LONG requested type + ok = true; + } + } + + if (!ok) { + throw new IOException("ORC does not support type conversion from " + + fColType.getKind().name() + " to " + rColType.getKind().name()); + } + } + } + + if (numFileColumns <= numDesiredColumns) { + + readerTypes = fileTypes; + + } else { + + // File has more columns than the desired schema. + + readerTypes = new ArrayList(); + + if (isAcid) { + + // This copies the ACID struct type which is subtype = 0. + // It has field names "operation" through "row". + // And we copy the types for all fields EXCEPT ROW (which must be last!). + + for (int i = 0; i < rowSubtype; i++) { + readerTypes.add(fileTypes.get(i).toBuilder().build()); + } + } + + // Copy just the file schema's row columns up to numReadColumns... 
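The constructor above accepts only the integer widenings that ORC's existing integer readers can already decode, and rejects every other kind mismatch with an IOException. The rule in isolation (hypothetical helper, same SHORT to INT to LONG ladder as the patch):

import org.apache.hadoop.hive.ql.io.orc.OrcProto;

/** Illustrative only: integer promotions accepted during ORC schema evolution. */
final class IntegerPromotion {
  private IntegerPromotion() {}

  static boolean isAllowed(OrcProto.Type.Kind fileKind, OrcProto.Type.Kind readerKind) {
    if (fileKind == readerKind) {
      return true;                 // identical kinds need no conversion
    }
    switch (fileKind) {
      case SHORT:
        // short -> int -> bigint: the same integer tree reader covers all three widths
        return readerKind == OrcProto.Type.Kind.INT || readerKind == OrcProto.Type.Kind.LONG;
      case INT:
        return readerKind == OrcProto.Type.Kind.LONG;
      default:
        return false;              // any other mismatch is an error in the patch
    }
  }
}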
+ assert numReadColumns == numDesiredColumns; + copyPartialStruct(fileTypes, rowSubtype, readerTypes, numReadColumns); + } + + schemaEvolutionTypes = new ArrayList(); + + if (isAcid) { + + // This copies the ACID struct type which is subtype = 0. + // It has field names "operation" through "row". + // And we copy the types for all fields EXCEPT ROW (which must be last!). + + for (int i = 0; i < rowSubtype; i++) { + schemaEvolutionTypes.add(fileTypes.get(i).toBuilder().build()); + } + } + // Add the row struct type. + getOrcTypesAppend(schemaEvolutionTypes, desiredObjectInspector); + + schemaEvolutionStructSubtype = rowSubtype; + + LOG.info("Schema evolution: (fileTypes) " + fileTypes.toString() + + " (readerTypes) " + readerTypes.toString() + + " (schemaEvolutionTypes) " + schemaEvolutionTypes.toString()); + + } + } + } + + private void copyPartialStruct(List fileTypes, int rowSubtype, + List readerTypes, int numReadColumns) { + + Type structType = fileTypes.get(rowSubtype); + List subtypes = structType.getSubtypesList(); + + // Determine the end -- one beyond the last subtype we need. + Integer endSubtype = subtypes.get(numReadColumns); + int partialSubtypeCount = endSubtype - rowSubtype - 1; + subtypes = subtypes.subList(0, partialSubtypeCount); + + List fieldNames = structType.getFieldNamesList().subList(0, numReadColumns); + + // STRUCT type. + readerTypes.add(structType.toBuilder().clearSubtypes().addAllSubtypes(subtypes) + .clearFieldNames().addAllFieldNames(fieldNames).build()); + + // Add subtypes for just the fields we want. + for (int subtype = rowSubtype + 1; subtype < endSubtype; subtype++) { + readerTypes.add(fileTypes.get(subtype).toBuilder().build()); + } + } + + private boolean checkAcidSchema(List fileSchema) { + if (fileSchema.get(0).getKind().equals(OrcProto.Type.Kind.STRUCT)) { + List acidFields = getAcidEventFields(); + List rootFields = fileSchema.get(0).getFieldNamesList(); + if (acidFields.equals(rootFields)) { + return true; + } + } + return false; + } + + private StructObjectInspector getDesiredRowObjectInspector(Configuration conf, + Path path) throws IOException { + + String metaTableColumnProperty = null; + String metaTableTypeProperty = null; + String metaTableParitionColumnProperty = null; + + boolean fetchedInfoFromMapWork = false; + Path planPath = Utilities.getPlanPath(conf, Utilities.MAP_PLAN_NAME); + if (planPath != null) { + MapWork mapWork = Utilities.getMapWork(conf); + if (mapWork != null) { + + Map pathToPartitionInfo = mapWork.getPathToPartitionInfo(); + + PartitionDesc part = HiveFileFormatUtils + .getPartitionDescFromPathRecursively(pathToPartitionInfo, + path, IOPrepareCache.get().getPartitionDescMap()); + + // Only examine the table columns for ORC. 
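checkAcidSchema above identifies ACID files purely by the field names of the root event struct, where the actual row is the last field (hence rowSubtype = OrcRecordUpdater.ROW + 1 earlier in the constructor). A standalone sketch of that recognition step, illustrative only:

import java.util.Arrays;
import java.util.List;

import org.apache.hadoop.hive.ql.io.orc.OrcProto;

/** Illustrative only: recognizing the ACID event wrapper from its root struct. */
final class AcidSchemaCheck {
  private AcidSchemaCheck() {}

  private static final List<String> ACID_EVENT_FIELDS = Arrays.asList(
      "operation", "originalTransaction", "bucket", "rowId", "currentTransaction", "row");

  static boolean looksLikeAcid(List<OrcProto.Type> fileTypes) {
    OrcProto.Type root = fileTypes.get(0);
    return root.getKind() == OrcProto.Type.Kind.STRUCT
        && ACID_EVENT_FIELDS.equals(root.getFieldNamesList());
  }
}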
+ if (part.getTableDesc() != null) { + Properties properties = part.getTableDesc().getProperties(); + + metaTableColumnProperty = + properties.getProperty(hive_metastoreConstants.META_TABLE_COLUMNS); + metaTableTypeProperty = + properties.getProperty(hive_metastoreConstants.META_TABLE_COLUMN_TYPES); + fetchedInfoFromMapWork = + metaTableColumnProperty != null && metaTableTypeProperty != null; + if (fetchedInfoFromMapWork) { + metaTableParitionColumnProperty = + properties.getProperty(hive_metastoreConstants.META_TABLE_PARTITION_COLUMNS); + if (metaTableParitionColumnProperty != null) { + LOG.info("Partition descriptor partition columns " + + Arrays.toString(metaTableParitionColumnProperty.split("/"))); + } + } + } + } + } + + if (!fetchedInfoFromMapWork) { + return null; + } + + // NOTE: Configuration META_TABLE_COLUMNS (same property name as LIST_COLUMNS) include + // partitions -- which make it a useless property for what we are up to here... + // That is because we don'thave a way to determine which columns are partitions and remove + // them. + // + String confMetaTableColumnProperty = conf.get(hive_metastoreConstants.META_TABLE_COLUMNS); + String confMetaTableTypeProperty = conf.get(hive_metastoreConstants.META_TABLE_COLUMN_TYPES); + if (confMetaTableColumnProperty != null && confMetaTableTypeProperty != null) { + LOG.info("Configuration columns " + confMetaTableColumnProperty + ", types " + confMetaTableTypeProperty); + } + + ArrayList metaTableColumnNames = + Lists.newArrayList(metaTableColumnProperty.split(",")); + if (metaTableColumnNames.size() == 0) { + return null; + } + ArrayList metaTableTypes = + TypeInfoUtils.getTypeInfosFromTypeString(metaTableTypeProperty); + if (metaTableTypes.size() != metaTableColumnNames.size()) { + return null; + } + + // Find first virtual column and clip them off. + int virtualColumnClipNum = -1; + int columnNum = 0; + for (String metaTableColumn : metaTableColumnNames) { + if (VirtualColumn.VIRTUAL_COLUMN_NAMES.contains(metaTableColumn)) { + virtualColumnClipNum = columnNum; + break; + } + columnNum++; + } + if (virtualColumnClipNum != -1) { + + if (virtualColumnClipNum == 0) { + LOG.info("No non-virtual meta data columns!"); + return null; + } + + metaTableColumnNames = + Lists.newArrayList(metaTableColumnNames.subList(0, virtualColumnClipNum)); + metaTableTypes = Lists.newArrayList(metaTableTypes.subList(0, virtualColumnClipNum)); + } + + /* + if (metaTableParitionColumnProperty != null && !metaTableParitionColumnProperty.isEmpty()) { + String[] partitionColumns = metaTableParitionColumnProperty.split("/"); + + // Unfortunately, sometimes META_TABLE_COLUMNS include partition columns, sometimes they + // DO NOT... + + // Find first partition column and clip them off. + String firstPartitionColumn = partitionColumns[0]; + int partitionColumnClipNum = -1; + columnNum = 0; + for (String metaTableColumn : metaTableColumnNames) { + if (firstPartitionColumn.equals(metaTableColumn)) { + partitionColumnClipNum = columnNum; + break; + } + columnNum++; + } + if (partitionColumnClipNum != -1) { + metaTableColumnNames = + Lists.newArrayList(metaTableColumnNames.subList(0, partitionColumnClipNum)); + metaTableTypes = Lists.newArrayList(metaTableTypes.subList(0, partitionColumnClipNum)); + } + } + */ + + // Desired schema does not include virtual columns or partition columns. 
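getDesiredRowObjectInspector reads the table's column names and types from the partition descriptor found through the MapWork plan, then clips the lists at the first virtual column so the desired schema describes only real table columns. A sketch of the clipping step under those assumptions (the helper and its parameters are hypothetical; the patch uses VirtualColumn.VIRTUAL_COLUMN_NAMES as the lookup set):

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Set;

/** Illustrative only: clip a comma-separated column list at the first virtual column. */
final class ColumnClipper {
  private ColumnClipper() {}

  static List<String> clipVirtualColumns(String metaTableColumns, Set<String> virtualColumnNames) {
    List<String> names = new ArrayList<String>(Arrays.asList(metaTableColumns.split(",")));
    for (int i = 0; i < names.size(); i++) {
      if (virtualColumnNames.contains(names.get(i))) {
        // Virtual columns, and anything after them, are not part of the desired row schema.
        return new ArrayList<String>(names.subList(0, i));
      }
    }
    return names;
  }
}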
+ StructTypeInfo structTypeInfo = new StructTypeInfo(); + structTypeInfo.setAllStructFieldNames(metaTableColumnNames); + structTypeInfo.setAllStructFieldTypeInfos(metaTableTypes); + + StructObjectInspector desiredObjectInspector = + (StructObjectInspector) new OrcStruct.OrcStructInspector(structTypeInfo); + + return desiredObjectInspector; + } + + private List getOrcTypes(ObjectInspector inspector) { + List result = Lists.newArrayList(); + getOrcTypesAppend(result, inspector); + return result; + } + + private void getOrcTypesAppend(List result, ObjectInspector inspector) { + int subtype = result.size(); + OrcProto.Type.Builder type = OrcProto.Type.newBuilder(); + switch (inspector.getCategory()) { + case PRIMITIVE: + switch (((PrimitiveObjectInspector) inspector).getPrimitiveCategory()) { + case BOOLEAN: + type.setKind(OrcProto.Type.Kind.BOOLEAN); + break; + case BYTE: + type.setKind(OrcProto.Type.Kind.BYTE); + break; + case SHORT: + type.setKind(OrcProto.Type.Kind.SHORT); + break; + case INT: + type.setKind(OrcProto.Type.Kind.INT); + break; + case LONG: + type.setKind(OrcProto.Type.Kind.LONG); + break; + case FLOAT: + type.setKind(OrcProto.Type.Kind.FLOAT); + break; + case DOUBLE: + type.setKind(OrcProto.Type.Kind.DOUBLE); + break; + case STRING: + type.setKind(OrcProto.Type.Kind.STRING); + break; + case CHAR: + // The char length needs to be written to file and should be available + // from the object inspector + CharTypeInfo charTypeInfo = (CharTypeInfo) ((PrimitiveObjectInspector) inspector) + .getTypeInfo(); + type.setKind(OrcProto.Type.Kind.CHAR); + type.setMaximumLength(charTypeInfo.getLength()); + break; + case VARCHAR: + // The varchar length needs to be written to file and should be available + // from the object inspector + VarcharTypeInfo typeInfo = (VarcharTypeInfo) ((PrimitiveObjectInspector) inspector) + .getTypeInfo(); + type.setKind(OrcProto.Type.Kind.VARCHAR); + type.setMaximumLength(typeInfo.getLength()); + break; + case BINARY: + type.setKind(OrcProto.Type.Kind.BINARY); + break; + case TIMESTAMP: + type.setKind(OrcProto.Type.Kind.TIMESTAMP); + break; + case DATE: + type.setKind(OrcProto.Type.Kind.DATE); + break; + case DECIMAL: + DecimalTypeInfo decTypeInfo = (DecimalTypeInfo) ((PrimitiveObjectInspector) inspector) + .getTypeInfo(); + type.setKind(OrcProto.Type.Kind.DECIMAL); + type.setPrecision(decTypeInfo.precision()); + type.setScale(decTypeInfo.scale()); + break; + default: + throw new IllegalArgumentException("Unknown primitive category: " + + ((PrimitiveObjectInspector) inspector).getPrimitiveCategory()); + } + result.add(type.build()); + break; + case LIST: + type.setKind(OrcProto.Type.Kind.LIST); + type.addSubtypes(++subtype); + result.add(type.build()); + getOrcTypesAppend(result, ((ListObjectInspector) inspector).getListElementObjectInspector()); + break; + case MAP: + { + // Make room for MAP type. + result.add(null); + + // Add MAP type pair in order to determine their subtype values. + getOrcTypesAppend(result, ((MapObjectInspector) inspector).getMapKeyObjectInspector()); + int subtype2 = result.size(); + getOrcTypesAppend(result, ((MapObjectInspector) inspector).getMapValueObjectInspector()); + type.setKind(OrcProto.Type.Kind.MAP); + type.addSubtypes(subtype + 1); + type.addSubtypes(subtype2); + result.set(subtype, type.build()); + } + break; + case STRUCT: + { + List fields = + ((StructObjectInspector) inspector).getAllStructFieldRefs(); + + // Make room for STRUCT type. 
+ result.add(null); + + List fieldSubtypes = new ArrayList(fields.size()); + for (int i = 0 ; i < fields.size(); i++) { + int fieldSubtype = result.size(); + fieldSubtypes.add(fieldSubtype); + getOrcTypesAppend(result, fields.get(i).getFieldObjectInspector()); + } + + type.setKind(OrcProto.Type.Kind.STRUCT); + for (int i = 0 ; i < fields.size(); i++) { + type.addSubtypes(fieldSubtypes.get(i)); + type.addFieldNames(fields.get(i).getFieldName()); + } + result.set(subtype, type.build()); + } + break; + case UNION: + { + List unionInspectors = + ((UnionObjectInspector) inspector).getObjectInspectors(); + + // Make room for UNION type. + result.add(null); + + List unionSubtypes = new ArrayList(unionInspectors.size()); + for (int i = 0 ; i < unionInspectors.size(); i++) { + int unionSubtype = result.size(); + unionSubtypes.add(unionSubtype); + getOrcTypesAppend(result, unionInspectors.get(i)); + } + + type.setKind(OrcProto.Type.Kind.UNION); + for (int i = 0 ; i < unionInspectors.size(); i++) { + type.addSubtypes(unionSubtypes.get(i)); + } + result.set(subtype, type.build()); + } + break; + default: + throw new IllegalArgumentException("Unknown category: " + inspector.getCategory()); + } + } + } } diff --git ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReaderFactory.java ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReaderFactory.java deleted file mode 100644 index 23a9af4..0000000 --- ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReaderFactory.java +++ /dev/null @@ -1,274 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - *
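getOrcTypesAppend flattens an ObjectInspector tree into ORC's pre-order type list, reserving a slot for each compound type before recursing so that child subtype ids can be filled in afterwards. A worked illustration of the resulting layout for a simple row type (built by hand here, not produced by the patch):

import java.util.Arrays;

import org.apache.hadoop.hive.ql.io.orc.OrcProto;

/** Illustrative only: the flattened type list for struct<a:int, b:string>. */
final class FlattenedTypesExample {
  private FlattenedTypesExample() {}

  public static void main(String[] args) {
    OrcProto.Type root = OrcProto.Type.newBuilder()
        .setKind(OrcProto.Type.Kind.STRUCT)
        .addAllFieldNames(Arrays.asList("a", "b"))
        .addSubtypes(1).addSubtypes(2)    // children live at column ids 1 and 2
        .build();
    OrcProto.Type a = OrcProto.Type.newBuilder().setKind(OrcProto.Type.Kind.INT).build();
    OrcProto.Type b = OrcProto.Type.newBuilder().setKind(OrcProto.Type.Kind.STRING).build();
    // Pre-order list [STRUCT(1,2), INT, STRING]; a column id is its position in this list.
    System.out.println(Arrays.asList(root, a, b));
  }
}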
- * http://www.apache.org/licenses/LICENSE-2.0 - *
- * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.hadoop.hive.ql.io.orc; - -import java.io.IOException; -import java.util.ArrayList; -import java.util.List; - -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.hive.serde.serdeConstants; -import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.MapObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.StructField; -import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.UnionObjectInspector; -import org.apache.hadoop.hive.serde2.typeinfo.CharTypeInfo; -import org.apache.hadoop.hive.serde2.typeinfo.DecimalTypeInfo; -import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo; -import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; -import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils; -import org.apache.hadoop.hive.serde2.typeinfo.VarcharTypeInfo; - -import com.google.common.collect.Lists; - -/** - * Factory to create ORC tree readers. It also compares file schema with schema specified on read - * to see if type promotions are possible. - */ -public class RecordReaderFactory { - static final Log LOG = LogFactory.getLog(RecordReaderFactory.class); - private static final boolean isLogInfoEnabled = LOG.isInfoEnabled(); - - public static TreeReaderFactory.TreeReader createTreeReader(int colId, - Configuration conf, - List fileSchema, - boolean[] included, - boolean skipCorrupt) throws IOException { - final boolean isAcid = checkAcidSchema(fileSchema); - final List originalFileSchema; - if (isAcid) { - originalFileSchema = fileSchema.subList(fileSchema.get(0).getSubtypesCount(), - fileSchema.size()); - } else { - originalFileSchema = fileSchema; - } - final int numCols = originalFileSchema.get(0).getSubtypesCount(); - List schemaOnRead = getSchemaOnRead(numCols, conf); - List schemaUsed = getMatchingSchema(fileSchema, schemaOnRead); - if (schemaUsed == null) { - return TreeReaderFactory.createTreeReader(colId, fileSchema, included, skipCorrupt); - } else { - return ConversionTreeReaderFactory.createTreeReader(colId, schemaUsed, included, skipCorrupt); - } - } - - static List getAcidEventFields() { - return Lists.newArrayList("operation", "originalTransaction", "bucket", - "rowId", "currentTransaction", "row"); - } - - private static boolean checkAcidSchema(List fileSchema) { - if (fileSchema.get(0).getKind().equals(OrcProto.Type.Kind.STRUCT)) { - List acidFields = getAcidEventFields(); - List rootFields = fileSchema.get(0).getFieldNamesList(); - if (acidFields.equals(rootFields)) { - return true; - } - } - return false; - } - - private static List getMatchingSchema(List fileSchema, - List schemaOnRead) { - if (schemaOnRead == null) { - if (isLogInfoEnabled) { - LOG.info("Schema is not specified on read. 
Using file schema."); - } - return null; - } - - if (fileSchema.size() != schemaOnRead.size()) { - if (isLogInfoEnabled) { - LOG.info("Schema on read column count does not match file schema's column count." + - " Falling back to using file schema."); - } - return null; - } else { - List result = Lists.newArrayList(fileSchema); - // check type promotion. ORC can only support type promotions for integer types - // short -> int -> bigint as same integer readers are used for the above types. - boolean canPromoteType = false; - for (int i = 0; i < fileSchema.size(); i++) { - OrcProto.Type fColType = fileSchema.get(i); - OrcProto.Type rColType = schemaOnRead.get(i); - if (!fColType.getKind().equals(rColType.getKind())) { - - if (fColType.getKind().equals(OrcProto.Type.Kind.SHORT)) { - - if (rColType.getKind().equals(OrcProto.Type.Kind.INT) || - rColType.getKind().equals(OrcProto.Type.Kind.LONG)) { - // type promotion possible, converting SHORT to INT/LONG requested type - result.set(i, result.get(i).toBuilder().setKind(rColType.getKind()).build()); - canPromoteType = true; - } else { - canPromoteType = false; - } - - } else if (fColType.getKind().equals(OrcProto.Type.Kind.INT)) { - - if (rColType.getKind().equals(OrcProto.Type.Kind.LONG)) { - // type promotion possible, converting INT to LONG requested type - result.set(i, result.get(i).toBuilder().setKind(rColType.getKind()).build()); - canPromoteType = true; - } else { - canPromoteType = false; - } - - } else { - canPromoteType = false; - } - } - } - - if (canPromoteType) { - if (isLogInfoEnabled) { - LOG.info("Integer type promotion happened in ORC record reader. Using promoted schema."); - } - return result; - } - } - - return null; - } - - private static List getSchemaOnRead(int numCols, Configuration conf) { - String columnTypeProperty = conf.get(serdeConstants.LIST_COLUMN_TYPES); - final String columnNameProperty = conf.get(serdeConstants.LIST_COLUMNS); - if (columnTypeProperty == null || columnNameProperty == null) { - return null; - } - - ArrayList columnNames = Lists.newArrayList(columnNameProperty.split(",")); - ArrayList fieldTypes = TypeInfoUtils.getTypeInfosFromTypeString(columnTypeProperty); - StructTypeInfo structTypeInfo = new StructTypeInfo(); - // Column types from conf includes virtual and partition columns at the end. We consider only - // the actual columns in the file. 
- structTypeInfo.setAllStructFieldNames(Lists.newArrayList(columnNames.subList(0, numCols))); - structTypeInfo.setAllStructFieldTypeInfos(Lists.newArrayList(fieldTypes.subList(0, numCols))); - ObjectInspector oi = TypeInfoUtils.getStandardJavaObjectInspectorFromTypeInfo(structTypeInfo); - return getOrcTypes(oi); - } - - private static List getOrcTypes(ObjectInspector inspector) { - List result = Lists.newArrayList(); - getOrcTypesImpl(result, inspector); - return result; - } - - private static void getOrcTypesImpl(List result, ObjectInspector inspector) { - OrcProto.Type.Builder type = OrcProto.Type.newBuilder(); - switch (inspector.getCategory()) { - case PRIMITIVE: - switch (((PrimitiveObjectInspector) inspector).getPrimitiveCategory()) { - case BOOLEAN: - type.setKind(OrcProto.Type.Kind.BOOLEAN); - break; - case BYTE: - type.setKind(OrcProto.Type.Kind.BYTE); - break; - case SHORT: - type.setKind(OrcProto.Type.Kind.SHORT); - break; - case INT: - type.setKind(OrcProto.Type.Kind.INT); - break; - case LONG: - type.setKind(OrcProto.Type.Kind.LONG); - break; - case FLOAT: - type.setKind(OrcProto.Type.Kind.FLOAT); - break; - case DOUBLE: - type.setKind(OrcProto.Type.Kind.DOUBLE); - break; - case STRING: - type.setKind(OrcProto.Type.Kind.STRING); - break; - case CHAR: - // The char length needs to be written to file and should be available - // from the object inspector - CharTypeInfo charTypeInfo = (CharTypeInfo) ((PrimitiveObjectInspector) inspector) - .getTypeInfo(); - type.setKind(OrcProto.Type.Kind.CHAR); - type.setMaximumLength(charTypeInfo.getLength()); - break; - case VARCHAR: - // The varchar length needs to be written to file and should be available - // from the object inspector - VarcharTypeInfo typeInfo = (VarcharTypeInfo) ((PrimitiveObjectInspector) inspector) - .getTypeInfo(); - type.setKind(OrcProto.Type.Kind.VARCHAR); - type.setMaximumLength(typeInfo.getLength()); - break; - case BINARY: - type.setKind(OrcProto.Type.Kind.BINARY); - break; - case TIMESTAMP: - type.setKind(OrcProto.Type.Kind.TIMESTAMP); - break; - case DATE: - type.setKind(OrcProto.Type.Kind.DATE); - break; - case DECIMAL: - DecimalTypeInfo decTypeInfo = (DecimalTypeInfo) ((PrimitiveObjectInspector) inspector) - .getTypeInfo(); - type.setKind(OrcProto.Type.Kind.DECIMAL); - type.setPrecision(decTypeInfo.precision()); - type.setScale(decTypeInfo.scale()); - break; - default: - throw new IllegalArgumentException("Unknown primitive category: " + - ((PrimitiveObjectInspector) inspector).getPrimitiveCategory()); - } - result.add(type.build()); - break; - case LIST: - type.setKind(OrcProto.Type.Kind.LIST); - result.add(type.build()); - getOrcTypesImpl(result, ((ListObjectInspector) inspector).getListElementObjectInspector()); - break; - case MAP: - type.setKind(OrcProto.Type.Kind.MAP); - result.add(type.build()); - getOrcTypesImpl(result, ((MapObjectInspector) inspector).getMapKeyObjectInspector()); - getOrcTypesImpl(result, ((MapObjectInspector) inspector).getMapValueObjectInspector()); - break; - case STRUCT: - type.setKind(OrcProto.Type.Kind.STRUCT); - result.add(type.build()); - for (StructField field : ((StructObjectInspector) inspector).getAllStructFieldRefs()) { - getOrcTypesImpl(result, field.getFieldObjectInspector()); - } - break; - case UNION: - type.setKind(OrcProto.Type.Kind.UNION); - result.add(type.build()); - for (ObjectInspector oi : ((UnionObjectInspector) inspector).getObjectInspectors()) { - getOrcTypesImpl(result, oi); - } - break; - default: - throw new IllegalArgumentException("Unknown category: " 
+ inspector.getCategory()); - } - } -} diff --git ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReaderImpl.java ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReaderImpl.java index 77d2cc6..ccb1d66 100644 --- ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReaderImpl.java +++ ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReaderImpl.java @@ -66,7 +66,7 @@ private OrcProto.StripeFooter stripeFooter; private final long totalRowCount; private final CompressionCodec codec; - private final List types; + private final List readerTypes; private final int bufferSize; private final boolean[] included; private final long rowIndexStride; @@ -153,7 +153,9 @@ protected RecordReaderImpl(List stripes, FileSystem fileSystem, Path path, Reader.Options options, - List types, + List readerTypes, + List schemaEvolutionTypes, + int schemaEvolutionStructSubtype, CompressionCodec codec, int bufferSize, long strideRate, @@ -162,16 +164,16 @@ protected RecordReaderImpl(List stripes, this.path = path; this.file = fileSystem.open(path); this.codec = codec; - this.types = types; + this.readerTypes = readerTypes; this.bufferSize = bufferSize; this.included = options.getInclude(); this.conf = conf; this.rowIndexStride = strideRate; - this.metadata = new MetadataReader(file, codec, bufferSize, types.size()); + this.metadata = new MetadataReader(file, codec, bufferSize, readerTypes.size()); SearchArgument sarg = options.getSearchArgument(); if (sarg != null && strideRate != 0) { sargApp = new SargApplier( - sarg, options.getColumnNames(), strideRate, types, included.length); + sarg, options.getColumnNames(), strideRate, readerTypes, included.length); } else { sargApp = null; } @@ -202,9 +204,12 @@ protected RecordReaderImpl(List stripes, skipCorrupt = OrcConf.SKIP_CORRUPT_DATA.getBoolean(conf); } - reader = RecordReaderFactory.createTreeReader(0, conf, types, included, skipCorrupt); - indexes = new OrcProto.RowIndex[types.size()]; - bloomFilterIndices = new OrcProto.BloomFilterIndex[types.size()]; + reader = + TreeReaderFactory.createTreeReader(0, readerTypes, schemaEvolutionTypes, + schemaEvolutionStructSubtype, included, skipCorrupt); + + indexes = new OrcProto.RowIndex[readerTypes.size()]; + bloomFilterIndices = new OrcProto.BloomFilterIndex[readerTypes.size()]; advanceToNextRow(reader, 0L, true); } @@ -690,7 +695,7 @@ private static Object getBaseObjectForComparison(PredicateLeaf.Type type, Object // same as the above array, but indices are set to true private final boolean[] sargColumns; public SargApplier(SearchArgument sarg, String[] columnNames, long rowIndexStride, - List types, int includedCount) { + List readerTypes, int includedCount) { this.sarg = sarg; sargLeaves = sarg.getLeaves(); filterColumns = mapSargColumnsToOrcInternalColIdx(sargLeaves, columnNames, 0); @@ -909,7 +914,7 @@ public ByteBuffer getData() { * @param includedRowGroups which row groups are needed * @param isCompressed does the file have generic compression * @param encodings the encodings for each column - * @param types the types of the columns + * @param readerTypes the readerTypes of the columns * @param compressionSize the compression block size * @return the list of disk ranges that will be loaded */ @@ -920,12 +925,12 @@ public ByteBuffer getData() { boolean[] includedRowGroups, boolean isCompressed, List encodings, - List types, + List readerTypes, int compressionSize, boolean doMergeBuffers) { long offset = 0; // figure out which columns have a present stream - boolean[] hasNull = 
RecordReaderUtils.findPresentStreamsByColumn(streamList, types); + boolean[] hasNull = RecordReaderUtils.findPresentStreamsByColumn(streamList, readerTypes); DiskRangeListCreateHelper list = new DiskRangeListCreateHelper(); for (OrcProto.Stream stream : streamList) { long length = stream.getLength(); @@ -941,7 +946,7 @@ public ByteBuffer getData() { RecordReaderUtils.addEntireStreamToRanges(offset, length, list, doMergeBuffers); } else { RecordReaderUtils.addRgFilteredStreamToRanges(stream, includedRowGroups, - isCompressed, indexes[column], encodings.get(column), types.get(column), + isCompressed, indexes[column], encodings.get(column), readerTypes.get(column), compressionSize, hasNull[column], offset, length, list, doMergeBuffers); } } @@ -978,7 +983,7 @@ private void readPartialDataStreams(StripeInformation stripe) throws IOException List streamList = stripeFooter.getStreamsList(); DiskRangeList toRead = planReadPartialDataStreams(streamList, indexes, included, includedRowGroups, codec != null, - stripeFooter.getColumnsList(), types, bufferSize, true); + stripeFooter.getColumnsList(), readerTypes, bufferSize, true); if (LOG.isDebugEnabled()) { LOG.debug("chunks = " + RecordReaderUtils.stringifyDiskRanges(toRead)); } @@ -1087,6 +1092,7 @@ public VectorizedRowBatch nextBatch(VectorizedRowBatch previous) throws IOExcept } else { result = (VectorizedRowBatch) previous; result.selectedInUse = false; + reader.setVectorColumnCount(result.getNonPartitionColumnCount()); reader.nextVector(result.cols, (int) batchSize); } diff --git ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReaderUtils.java ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReaderUtils.java index 9c9a1c0..e46bf3a 100644 --- ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReaderUtils.java +++ ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReaderUtils.java @@ -46,13 +46,19 @@ /** * Stateless methods shared between RecordReaderImpl and EncodedReaderImpl. 
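On the vectorized path, nextBatch above now passes the batch's non-partition column count to the root tree reader via setVectorColumnCount before calling nextVector; the StructTreeReader changes that follow use that count to mark every column missing from the file as all-NULL. A condensed sketch of the defaulting step (the helper is illustrative; the field manipulation mirrors the patch):

import org.apache.hadoop.hive.ql.exec.vector.ColumnVector;

/** Illustrative only: defaulting evolved-in columns to NULL for a whole batch. */
final class NullColumnDefaulter {
  private NullColumnDefaulter() {}

  /** Columns [readerColumnCount, vectorColumnCount) have no data in the file. */
  static void defaultMissingColumns(ColumnVector[] cols, int readerColumnCount, int vectorColumnCount) {
    for (int i = readerColumnCount; i < vectorColumnCount; i++) {
      ColumnVector colVector = cols[i];
      colVector.isRepeating = true;   // one entry stands in for every row of the batch
      colVector.noNulls = false;
      colVector.isNull[0] = true;     // and that entry is NULL
    }
  }
}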
*/ + public class RecordReaderUtils { + static int fake = 0; + private static final HadoopShims SHIMS = ShimLoader.getHadoopShims(); static boolean[] findPresentStreamsByColumn( List streamList, List types) { boolean[] hasNull = new boolean[types.size()]; for(OrcProto.Stream stream: streamList) { if (stream.hasKind() && (stream.getKind() == OrcProto.Stream.Kind.PRESENT)) { + if (stream.getColumn() >= hasNull.length) { + fake++; + } hasNull[stream.getColumn()] = true; } } diff --git ql/src/java/org/apache/hadoop/hive/ql/io/orc/TreeReaderFactory.java ql/src/java/org/apache/hadoop/hive/ql/io/orc/TreeReaderFactory.java index 6d47532..7af23ab 100644 --- ql/src/java/org/apache/hadoop/hive/ql/io/orc/TreeReaderFactory.java +++ ql/src/java/org/apache/hadoop/hive/ql/io/orc/TreeReaderFactory.java @@ -37,6 +37,7 @@ import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector; import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector; import org.apache.hadoop.hive.ql.exec.vector.TimestampUtils; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedBatchUtil; import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; import org.apache.hadoop.hive.ql.exec.vector.expressions.StringExpr; import org.apache.hadoop.hive.serde2.io.ByteWritable; @@ -65,6 +66,7 @@ protected final int columnId; protected BitFieldReader present = null; protected boolean valuePresent = false; + protected int vectorColumnCount; TreeReader(int columnId) throws IOException { this(columnId, null); @@ -78,6 +80,11 @@ } else { present = new BitFieldReader(in, 1); } + vectorColumnCount = -1; + } + + void setVectorColumnCount(int vectorColumnCount) { + this.vectorColumnCount = vectorColumnCount; } void checkEncoding(OrcProto.ColumnEncoding encoding) throws IOException { @@ -1947,24 +1954,42 @@ public Object nextVector(Object previousVector, long batchSize) throws IOExcepti } protected static class StructTreeReader extends TreeReader { + private final int readerColumnCount; + private final int resultColumnCount; protected final TreeReader[] fields; private final String[] fieldNames; StructTreeReader(int columnId, - List types, + List readerTypes, + List schemaEvolutionTypes, + int schemaEvolutionSubtype, boolean[] included, boolean skipCorrupt) throws IOException { super(columnId); - OrcProto.Type type = types.get(columnId); - int fieldCount = type.getFieldNamesCount(); - this.fields = new TreeReader[fieldCount]; - this.fieldNames = new String[fieldCount]; - for (int i = 0; i < fieldCount; ++i) { - int subtype = type.getSubtypes(i); + + OrcProto.Type readerStructType = readerTypes.get(columnId); + readerColumnCount = readerStructType.getFieldNamesCount(); + + OrcProto.Type schemaEvolutionStructType = schemaEvolutionTypes.get(columnId); + + if (columnId == schemaEvolutionSubtype) { + // If there are more result columns than reader columns, we will default those additional + // columns to NULL. 
+ resultColumnCount = schemaEvolutionStructType.getFieldNamesCount(); + } else { + resultColumnCount = readerColumnCount; + } + + this.fields = new TreeReader[readerColumnCount]; + this.fieldNames = new String[readerColumnCount]; + for (int i = 0; i < readerColumnCount; ++i) { + int subtype = readerStructType.getSubtypes(i); if (included == null || included[subtype]) { - this.fields[i] = createTreeReader(subtype, types, included, skipCorrupt); + this.fields[i] = createTreeReader(subtype, readerTypes, schemaEvolutionTypes, + schemaEvolutionSubtype, included, skipCorrupt); } - this.fieldNames[i] = type.getFieldNames(i); + // Use the schema evolution name since file/reader types may not have the real column name. + this.fieldNames[i] = schemaEvolutionStructType.getFieldNames(i); } } @@ -1984,22 +2009,28 @@ Object next(Object previous) throws IOException { OrcStruct result = null; if (valuePresent) { if (previous == null) { - result = new OrcStruct(fields.length); + result = new OrcStruct(resultColumnCount); } else { result = (OrcStruct) previous; // If the input format was initialized with a file with a // different number of fields, the number of fields needs to // be updated to the correct number - if (result.getNumFields() != fields.length) { - result.setNumFields(fields.length); + if (result.getNumFields() != resultColumnCount) { + result.setNumFields(resultColumnCount); } } - for (int i = 0; i < fields.length; ++i) { + for (int i = 0; i < readerColumnCount; ++i) { if (fields[i] != null) { result.setFieldValue(i, fields[i].next(result.getFieldValue(i))); } } + if (resultColumnCount > readerColumnCount) { + for (int i = readerColumnCount; i < resultColumnCount; ++i) { + // Default new schema evolution fields to NULL. + result.setFieldValue(i, null); + } + } } return result; } @@ -2008,13 +2039,13 @@ Object next(Object previous) throws IOException { public Object nextVector(Object previousVector, long batchSize) throws IOException { final ColumnVector[] result; if (previousVector == null) { - result = new ColumnVector[fields.length]; + result = new ColumnVector[readerColumnCount]; } else { result = (ColumnVector[]) previousVector; } // Read all the members of struct as column vectors - for (int i = 0; i < fields.length; i++) { + for (int i = 0; i < readerColumnCount; i++) { if (fields[i] != null) { if (result[i] == null) { result[i] = (ColumnVector) fields[i].nextVector(null, batchSize); @@ -2023,6 +2054,17 @@ public Object nextVector(Object previousVector, long batchSize) throws IOExcepti } } } + + // Default additional schema evolution fields to NULL. 
+ if (vectorColumnCount != -1 && vectorColumnCount > readerColumnCount) { + for (int i = readerColumnCount; i < vectorColumnCount; ++i) { + ColumnVector colVector = result[i]; + colVector.isRepeating = true; + colVector.noNulls = false; + colVector.isNull[0] = true; + } + } + return result; } @@ -2054,17 +2096,20 @@ void skipRows(long items) throws IOException { protected RunLengthByteReader tags; UnionTreeReader(int columnId, - List types, + List readerTypes, + List schemaEvolutionTypes, + int schemaEvolutionSubtype, boolean[] included, boolean skipCorrupt) throws IOException { super(columnId); - OrcProto.Type type = types.get(columnId); + OrcProto.Type type = schemaEvolutionTypes.get(columnId); int fieldCount = type.getSubtypesCount(); this.fields = new TreeReader[fieldCount]; for (int i = 0; i < fieldCount; ++i) { int subtype = type.getSubtypes(i); if (included == null || included[subtype]) { - this.fields[i] = createTreeReader(subtype, types, included, skipCorrupt); + this.fields[i] = createTreeReader(subtype, readerTypes, schemaEvolutionTypes, + schemaEvolutionSubtype, included, skipCorrupt); } } } @@ -2134,12 +2179,15 @@ void skipRows(long items) throws IOException { protected IntegerReader lengths = null; ListTreeReader(int columnId, - List types, + List readerTypes, + List schemaEvolutionTypes, + int schemaEvolutionSubtype, boolean[] included, boolean skipCorrupt) throws IOException { super(columnId); - OrcProto.Type type = types.get(columnId); - elementReader = createTreeReader(type.getSubtypes(0), types, included, skipCorrupt); + OrcProto.Type type = schemaEvolutionTypes.get(columnId); + elementReader = createTreeReader(type.getSubtypes(0), readerTypes, schemaEvolutionTypes, + schemaEvolutionSubtype, included, skipCorrupt); } @Override @@ -2224,20 +2272,24 @@ void skipRows(long items) throws IOException { protected IntegerReader lengths = null; MapTreeReader(int columnId, - List types, + List readerTypes, + List schemaEvolutionTypes, + int schemaEvolutionSubtype, boolean[] included, boolean skipCorrupt) throws IOException { super(columnId); - OrcProto.Type type = types.get(columnId); + OrcProto.Type type = schemaEvolutionTypes.get(columnId); int keyColumn = type.getSubtypes(0); int valueColumn = type.getSubtypes(1); if (included == null || included[keyColumn]) { - keyReader = createTreeReader(keyColumn, types, included, skipCorrupt); + keyReader = createTreeReader(keyColumn, readerTypes, schemaEvolutionTypes, + schemaEvolutionSubtype, included, skipCorrupt); } else { keyReader = null; } if (included == null || included[valueColumn]) { - valueReader = createTreeReader(valueColumn, types, included, skipCorrupt); + valueReader = createTreeReader(valueColumn, readerTypes, schemaEvolutionTypes, + schemaEvolutionSubtype, included, skipCorrupt); } else { valueReader = null; } @@ -2317,11 +2369,13 @@ void skipRows(long items) throws IOException { } public static TreeReader createTreeReader(int columnId, - List types, + List readerTypes, + List schemaEvolutionTypes, + int schemaEvolutionSubtype, boolean[] included, boolean skipCorrupt ) throws IOException { - OrcProto.Type type = types.get(columnId); + OrcProto.Type type = schemaEvolutionTypes.get(columnId); switch (type.getKind()) { case BOOLEAN: return new BooleanTreeReader(columnId); @@ -2361,13 +2415,17 @@ public static TreeReader createTreeReader(int columnId, int scale = type.hasScale() ? 
type.getScale() : HiveDecimal.SYSTEM_DEFAULT_SCALE; return new DecimalTreeReader(columnId, precision, scale); case STRUCT: - return new StructTreeReader(columnId, types, included, skipCorrupt); + return new StructTreeReader(columnId, readerTypes, schemaEvolutionTypes, + schemaEvolutionSubtype, included, skipCorrupt); case LIST: - return new ListTreeReader(columnId, types, included, skipCorrupt); + return new ListTreeReader(columnId, readerTypes, schemaEvolutionTypes, + schemaEvolutionSubtype, included, skipCorrupt); case MAP: - return new MapTreeReader(columnId, types, included, skipCorrupt); + return new MapTreeReader(columnId, readerTypes, schemaEvolutionTypes, + schemaEvolutionSubtype, included, skipCorrupt); case UNION: - return new UnionTreeReader(columnId, types, included, skipCorrupt); + return new UnionTreeReader(columnId, readerTypes, schemaEvolutionTypes, + schemaEvolutionSubtype, included, skipCorrupt); default: throw new IllegalArgumentException("Unsupported type " + type.getKind()); diff --git ql/src/java/org/apache/hadoop/hive/ql/io/orc/VectorizedOrcAcidRowReader.java ql/src/java/org/apache/hadoop/hive/ql/io/orc/VectorizedOrcAcidRowReader.java index a8e5c2e..069ba54 100644 --- ql/src/java/org/apache/hadoop/hive/ql/io/orc/VectorizedOrcAcidRowReader.java +++ ql/src/java/org/apache/hadoop/hive/ql/io/orc/VectorizedOrcAcidRowReader.java @@ -19,12 +19,14 @@ package org.apache.hadoop.hive.ql.io.orc; import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hive.ql.exec.Utilities; import org.apache.hadoop.hive.ql.exec.vector.VectorizedBatchUtil; import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatchCtx; import org.apache.hadoop.hive.ql.io.AcidInputFormat; import org.apache.hadoop.hive.ql.io.RecordIdentifier; import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.plan.MapWork; import org.apache.hadoop.hive.serde2.SerDeException; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; @@ -46,7 +48,9 @@ private final AcidInputFormat.RowReader innerReader; private final RecordIdentifier key; private final OrcStruct value; - private final VectorizedRowBatchCtx rowBatchCtx; + private MapWork mapWork; + private VectorizedRowBatchCtx rbCtx; + private Object[] partitionValues; private final ObjectInspector objectInspector; private final DataOutputBuffer buffer = new DataOutputBuffer(); @@ -55,22 +59,15 @@ FileSplit split) throws IOException { this.innerReader = inner; this.key = inner.createKey(); - this.rowBatchCtx = new VectorizedRowBatchCtx(); + mapWork = Utilities.getMapWork(conf); + rbCtx = mapWork.getVectorizedRowBatchCtx(); + int partitionColumnCount = rbCtx.getPartitionColumnCount(); + if (partitionColumnCount > 0) { + partitionValues = new Object[partitionColumnCount]; + rbCtx.getPartitionValues(rbCtx, conf, split, partitionValues); + } this.value = inner.createValue(); this.objectInspector = inner.getObjectInspector(); - try { - rowBatchCtx.init(conf, split); - } catch (ClassNotFoundException e) { - throw new IOException("Failed to initialize context", e); - } catch (SerDeException e) { - throw new IOException("Failed to initialize context", e); - } catch (InstantiationException e) { - throw new IOException("Failed to initialize context", e); - } catch (IllegalAccessException e) { - throw new IOException("Failed to initialize context", e); - } catch (HiveException e) { - 
throw new IOException("Failed to initialize context", e); - } } @Override @@ -82,20 +79,22 @@ public boolean next(NullWritable nullWritable, if (!innerReader.next(key, value)) { return false; } - try { - rowBatchCtx.addPartitionColsToBatch(vectorizedRowBatch); - } catch (HiveException e) { - throw new IOException("Problem adding partition column", e); + if (partitionValues != null) { + try { + rbCtx.addPartitionColsToBatch(vectorizedRowBatch, partitionValues); + } catch (HiveException e) { + throw new IOException("Problem adding partition column", e); + } } try { VectorizedBatchUtil.acidAddRowToBatch(value, (StructObjectInspector) objectInspector, - vectorizedRowBatch.size++, vectorizedRowBatch, rowBatchCtx, buffer); + vectorizedRowBatch.size++, vectorizedRowBatch, rbCtx, buffer); while (vectorizedRowBatch.size < vectorizedRowBatch.selected.length && innerReader.next(key, value)) { VectorizedBatchUtil.acidAddRowToBatch(value, (StructObjectInspector) objectInspector, - vectorizedRowBatch.size++, vectorizedRowBatch, rowBatchCtx, buffer); + vectorizedRowBatch.size++, vectorizedRowBatch, rbCtx, buffer); } } catch (HiveException he) { throw new IOException("error iterating", he); @@ -110,11 +109,7 @@ public NullWritable createKey() { @Override public VectorizedRowBatch createValue() { - try { - return rowBatchCtx.createVectorizedRowBatch(); - } catch (HiveException e) { - throw new RuntimeException("Error creating a batch", e); - } + return rbCtx.createVectorizedRowBatch(); } @Override diff --git ql/src/java/org/apache/hadoop/hive/ql/io/orc/VectorizedOrcInputFormat.java ql/src/java/org/apache/hadoop/hive/ql/io/orc/VectorizedOrcInputFormat.java index bf09001..0a5f1f3 100644 --- ql/src/java/org/apache/hadoop/hive/ql/io/orc/VectorizedOrcInputFormat.java +++ ql/src/java/org/apache/hadoop/hive/ql/io/orc/VectorizedOrcInputFormat.java @@ -27,6 +27,7 @@ import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.ql.exec.Utilities; import org.apache.hadoop.hive.ql.exec.vector.ColumnVector; import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector; import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector; @@ -35,6 +36,8 @@ import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatchCtx; import org.apache.hadoop.hive.ql.io.InputFormatChecker; import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.plan.MapWork; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; import org.apache.hadoop.io.NullWritable; import org.apache.hadoop.mapred.FileInputFormat; import org.apache.hadoop.mapred.FileSplit; @@ -55,7 +58,9 @@ private final long offset; private final long length; private float progress = 0.0f; + private MapWork mapWork; private VectorizedRowBatchCtx rbCtx; + private Object[] partitionValues; private boolean addPartitionCols = true; VectorizedOrcRecordReader(Reader file, Configuration conf, @@ -69,11 +74,13 @@ OrcInputFormat.setSearchArgument(options, types, conf, true); this.reader = file.rowsOptions(options); - try { - rbCtx = new VectorizedRowBatchCtx(); - rbCtx.init(conf, fileSplit); - } catch (Exception e) { - throw new RuntimeException(e); + + mapWork = Utilities.getMapWork(conf); + rbCtx = mapWork.getVectorizedRowBatchCtx(); + int partitionColumnCount = rbCtx.getPartitionColumnCount(); + if (partitionColumnCount > 0) { + partitionValues = new Object[partitionColumnCount]; + rbCtx.getPartitionValues(rbCtx, conf, fileSplit, partitionValues); } } @@ -90,7 
+97,9 @@ public boolean next(NullWritable key, VectorizedRowBatch value) throws IOExcepti // as this does not call CreateValue for each new RecordReader it creates, this check is // required in next() if (addPartitionCols) { - rbCtx.addPartitionColsToBatch(value); + if (partitionValues != null) { + rbCtx.addPartitionColsToBatch(value, partitionValues); + } addPartitionCols = false; } reader.nextBatch(value); @@ -108,11 +117,7 @@ public NullWritable createKey() { @Override public VectorizedRowBatch createValue() { - try { - return rbCtx.createVectorizedRowBatch(); - } catch (HiveException e) { - throw new RuntimeException("Error creating a batch", e); - } + return rbCtx.createVectorizedRowBatch(); } @Override diff --git ql/src/java/org/apache/hadoop/hive/ql/io/parquet/VectorizedParquetInputFormat.java ql/src/java/org/apache/hadoop/hive/ql/io/parquet/VectorizedParquetInputFormat.java index ed99615..54735e6 100644 --- ql/src/java/org/apache/hadoop/hive/ql/io/parquet/VectorizedParquetInputFormat.java +++ ql/src/java/org/apache/hadoop/hive/ql/io/parquet/VectorizedParquetInputFormat.java @@ -14,8 +14,10 @@ package org.apache.hadoop.hive.ql.io.parquet; import java.io.IOException; + import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.hive.ql.exec.Utilities; import org.apache.hadoop.hive.ql.exec.vector.VectorColumnAssign; import org.apache.hadoop.hive.ql.exec.vector.VectorColumnAssignFactory; import org.apache.hadoop.hive.ql.exec.vector.VectorizedInputFormatInterface; @@ -23,6 +25,7 @@ import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatchCtx; import org.apache.hadoop.hive.ql.io.parquet.read.ParquetRecordReaderWrapper; import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.plan.MapWork; import org.apache.hadoop.io.ArrayWritable; import org.apache.hadoop.io.NullWritable; import org.apache.hadoop.io.Writable; @@ -32,7 +35,6 @@ import org.apache.hadoop.mapred.JobConf; import org.apache.hadoop.mapred.RecordReader; import org.apache.hadoop.mapred.Reporter; - import org.apache.parquet.hadoop.ParquetInputFormat; /** @@ -51,7 +53,9 @@ private static final Log LOG = LogFactory.getLog(VectorizedParquetRecordReader.class); private final ParquetRecordReaderWrapper internalReader; + private MapWork mapWork; private VectorizedRowBatchCtx rbCtx; + private Object[] partitionValues; private ArrayWritable internalValues; private NullWritable internalKey; private VectorColumnAssign[] assigners; @@ -65,11 +69,12 @@ public VectorizedParquetRecordReader( split, conf, reporter); - try { - rbCtx = new VectorizedRowBatchCtx(); - rbCtx.init(conf, split); - } catch (Exception e) { - throw new RuntimeException(e); + mapWork = Utilities.getMapWork(conf); + rbCtx = mapWork.getVectorizedRowBatchCtx(); + int partitionColumnCount = rbCtx.getPartitionColumnCount(); + if (partitionColumnCount > 0) { + partitionValues = new Object[partitionColumnCount]; + rbCtx.getPartitionValues(rbCtx, conf, split, partitionValues); } } @@ -81,13 +86,9 @@ public NullWritable createKey() { @Override public VectorizedRowBatch createValue() { - VectorizedRowBatch outputBatch = null; - try { - outputBatch = rbCtx.createVectorizedRowBatch(); - internalValues = internalReader.createValue(); - } catch (HiveException e) { - throw new RuntimeException("Error creating a batch", e); - } + VectorizedRowBatch outputBatch; + outputBatch = rbCtx.createVectorizedRowBatch(); + internalValues = internalReader.createValue(); return outputBatch; } diff --git 
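The vectorized ORC ACID, ORC, and Parquet readers above now share one initialization pattern: fetch the prebuilt VectorizedRowBatchCtx from the MapWork plan and compute the split's partition values once, instead of constructing and init()-ing a fresh context per reader. A condensed sketch of that pattern (the wrapper method is hypothetical; the VectorizedRowBatchCtx calls are the new ones this patch relies on, and the checked exceptions are assumed):

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatchCtx;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.plan.MapWork;
import org.apache.hadoop.mapred.FileSplit;

/** Illustrative only: shared batch-context setup used by the vectorized readers in this patch. */
final class BatchContextSetup {
  private BatchContextSetup() {}

  static VectorizedRowBatch createBatchWithPartitionCols(Configuration conf, FileSplit split)
      throws IOException, HiveException {
    MapWork mapWork = Utilities.getMapWork(conf);                // the plan carries the prebuilt context
    VectorizedRowBatchCtx rbCtx = mapWork.getVectorizedRowBatchCtx();
    VectorizedRowBatch batch = rbCtx.createVectorizedRowBatch(); // no per-reader init(conf, split) any more
    int partitionColumnCount = rbCtx.getPartitionColumnCount();
    if (partitionColumnCount > 0) {
      Object[] partitionValues = new Object[partitionColumnCount];
      rbCtx.getPartitionValues(rbCtx, conf, split, partitionValues);
      rbCtx.addPartitionColsToBatch(batch, partitionValues);     // partition values are constant per split
    }
    return batch;
  }
}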
ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java index da1d9eb..8584d22 100644 --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java @@ -26,11 +26,13 @@ import java.util.LinkedHashMap; import java.util.List; import java.util.Map; +import java.util.Map.Entry; import java.util.Properties; import java.util.Set; import java.util.Stack; import java.util.regex.Pattern; +import org.apache.commons.lang3.tuple.ImmutablePair; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.hive.conf.HiveConf; @@ -53,7 +55,6 @@ import org.apache.hadoop.hive.ql.exec.vector.mapjoin.VectorMapJoinOuterLongOperator; import org.apache.hadoop.hive.ql.exec.vector.mapjoin.VectorMapJoinOuterMultiKeyOperator; import org.apache.hadoop.hive.ql.exec.vector.mapjoin.VectorMapJoinOuterStringOperator; -import org.apache.hadoop.hive.ql.exec.vector.ColumnVector; import org.apache.hadoop.hive.ql.exec.vector.VectorMapJoinOperator; import org.apache.hadoop.hive.ql.exec.vector.VectorMapJoinOuterFilteredOperator; import org.apache.hadoop.hive.ql.exec.vector.VectorSMBMapJoinOperator; @@ -61,6 +62,7 @@ import org.apache.hadoop.hive.ql.exec.vector.VectorizationContext.InConstantType; import org.apache.hadoop.hive.ql.exec.vector.VectorizationContextRegion; import org.apache.hadoop.hive.ql.exec.vector.VectorizedInputFormatInterface; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatchCtx; import org.apache.hadoop.hive.ql.exec.vector.expressions.aggregates.VectorAggregateExpression; import org.apache.hadoop.hive.ql.lib.DefaultGraphWalker; import org.apache.hadoop.hive.ql.lib.DefaultRuleDispatcher; @@ -88,6 +90,7 @@ import org.apache.hadoop.hive.ql.plan.MapJoinDesc; import org.apache.hadoop.hive.ql.plan.MapWork; import org.apache.hadoop.hive.ql.plan.OperatorDesc; +import org.apache.hadoop.hive.ql.plan.VectorPartitionConversion; import org.apache.hadoop.hive.ql.plan.PartitionDesc; import org.apache.hadoop.hive.ql.plan.ReduceWork; import org.apache.hadoop.hive.ql.plan.SMBJoinDesc; @@ -100,6 +103,7 @@ import org.apache.hadoop.hive.ql.plan.VectorMapJoinDesc.HashTableImplementationType; import org.apache.hadoop.hive.ql.plan.VectorMapJoinDesc.HashTableKeyType; import org.apache.hadoop.hive.ql.plan.VectorMapJoinDesc.HashTableKind; +import org.apache.hadoop.hive.ql.plan.VectorPartitionDesc; import org.apache.hadoop.hive.ql.plan.api.OperatorType; import org.apache.hadoop.hive.ql.udf.UDFAcos; import org.apache.hadoop.hive.ql.udf.UDFAsin; @@ -309,14 +313,51 @@ public Vectorizer() { supportedAggregationUdfs.add("stddev_samp"); } + private class VectorTaskColumnInfo { + List columnNames; + List typeInfos; + int partitionColumnCount; + + String[] scratchTypeNameArray; + + VectorTaskColumnInfo() { + partitionColumnCount = 0; + } + + public void setColumnNames(List columnNames) { + this.columnNames = columnNames; + } + public void setTypeInfos(List typeInfos) { + this.typeInfos = typeInfos; + } + public void setPartitionColumnCount(int partitionColumnCount) { + this.partitionColumnCount = partitionColumnCount; + } + public void setScratchTypeNameArray(String[] scratchTypeNameArray) { + this.scratchTypeNameArray = scratchTypeNameArray; + } + + public void transferToBaseWork(BaseWork baseWork) { + + String[] columnNameArray = columnNames.toArray(new String[0]); + TypeInfo[] typeInfoArray = 
typeInfos.toArray(new TypeInfo[0]); + + VectorizedRowBatchCtx vectorizedRowBatchCtx = + new VectorizedRowBatchCtx( + columnNameArray, + typeInfoArray, + partitionColumnCount, + scratchTypeNameArray); + baseWork.setVectorizedRowBatchCtx(vectorizedRowBatchCtx); + } + } + class VectorizationDispatcher implements Dispatcher { - private List reduceColumnNames; - private List reduceTypeInfos; + private final PhysicalContext physicalContext; public VectorizationDispatcher(PhysicalContext physicalContext) { - reduceColumnNames = null; - reduceTypeInfos = null; + this.physicalContext = physicalContext; } @Override @@ -354,9 +395,10 @@ public Object dispatch(Node nd, Stack stack, Object... nodeOutputs) } private void convertMapWork(MapWork mapWork, boolean isTez) throws SemanticException { - boolean ret = validateMapWork(mapWork, isTez); + VectorTaskColumnInfo vectorTaskColumnInfo = new VectorTaskColumnInfo(); + boolean ret = validateMapWork(mapWork, vectorTaskColumnInfo, isTez); if (ret) { - vectorizeMapWork(mapWork, isTez); + vectorizeMapWork(mapWork, vectorTaskColumnInfo, isTez); } } @@ -367,40 +409,342 @@ private void addMapWorkRules(Map opRules, NodeProcessor np) + ReduceSinkOperator.getOperatorName()), np); } - private boolean validateMapWork(MapWork mapWork, boolean isTez) throws SemanticException { - LOG.info("Validating MapWork..."); + private ImmutablePair verifyOnlyOneTableScanOperator(MapWork mapWork) { // Eliminate MR plans with more than one TableScanOperator. + LinkedHashMap> aliasToWork = mapWork.getAliasToWork(); if ((aliasToWork == null) || (aliasToWork.size() == 0)) { - return false; + return null; } int tableScanCount = 0; - for (Operator op : aliasToWork.values()) { + String alias = ""; + TableScanOperator tableScanOperator = null; + for (Entry> entry : aliasToWork.entrySet()) { + Operator op = entry.getValue(); if (op == null) { LOG.warn("Map work has invalid aliases to work with. Fail validation!"); - return false; + return null; } if (op instanceof TableScanOperator) { tableScanCount++; + alias = entry.getKey(); + tableScanOperator = (TableScanOperator) op; } } if (tableScanCount > 1) { - LOG.warn("Map work has more than 1 TableScanOperator aliases to work with. Fail validation!"); - return false; + LOG.warn("Map work has more than 1 TableScanOperator. Fail validation!"); + return null; + } + return new ImmutablePair(alias, tableScanOperator); + } + + private void getTableScanOperatorSchemaInfo(TableScanOperator tableScanOperator, + List logicalColumnNameList, List logicalTypeInfoList) { + + TableScanDesc tableScanDesc = tableScanOperator.getConf(); + + // Add all non-virtual columns to make a vectorization context for + // the TableScan operator. + RowSchema rowSchema = tableScanOperator.getSchema(); + for (ColumnInfo c : rowSchema.getSignature()) { + // Validation will later exclude vectorization of virtual columns usage (HIVE-5560). 
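The loop that continues below gathers the TableScan schema for the task-level vectorization context by skipping virtual columns and recording each remaining column's name and type. A minimal, self-contained sketch of that filtering step, using a made-up Col stand-in rather than Hive's ColumnInfo/RowSchema classes:

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

// Simplified stand-in for a schema column; the real code uses Hive's ColumnInfo.
final class Col {
  final String name;
  final String typeName;
  final boolean virtual;
  Col(String name, String typeName, boolean virtual) {
    this.name = name; this.typeName = typeName; this.virtual = virtual;
  }
}

public class SchemaInfoSketch {
  // Collect names and type names of all non-virtual columns, preserving order.
  static void collectNonVirtual(List<Col> signature,
      List<String> names, List<String> typeNames) {
    for (Col c : signature) {
      if (!c.virtual) {
        names.add(c.name);
        typeNames.add(c.typeName);
      }
    }
  }

  public static void main(String[] args) {
    List<Col> signature = Arrays.asList(
        new Col("id", "bigint", false),
        new Col("name", "string", false),
        new Col("BLOCK__OFFSET__INSIDE__FILE", "bigint", true));  // virtual column is skipped
    List<String> names = new ArrayList<>();
    List<String> typeNames = new ArrayList<>();
    collectNonVirtual(signature, names, typeNames);
    System.out.println(names + " : " + typeNames);  // [id, name] : [bigint, string]
  }
}

The ordered name and type lists produced this way later become the row columns of the VectorizedRowBatchCtx, so preserving column order is the important property here.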
+ if (!isVirtualColumn(c)) { + String columnName = c.getInternalName(); + String typeName = c.getTypeName(); + TypeInfo typeInfo = TypeInfoUtils.getTypeInfoFromTypeString(typeName); + + logicalColumnNameList.add(columnName); + logicalTypeInfoList.add(typeInfo); + } } + } + + private String getColumnsString(List columnNames) { + StringBuilder sb = new StringBuilder(); + for (int i = 0; i < columnNames.size(); i++) { + if (i > 0) { + sb.append(","); + } + sb.append(columnNames.get(i)); + } + return sb.toString(); + } + + private String getTypesString(List typeInfos) { + StringBuilder sb = new StringBuilder(); + for (int i = 0; i < typeInfos.size(); i++) { + if (i > 0) { + sb.append(":"); + } + sb.append(typeInfos.get(i).getTypeName()); + } + return sb.toString(); + } + + private String getColumns(List columnNames, int start, int length, + Character separator) { + StringBuilder sb = new StringBuilder(); + for (int i = start; i < start + length; i++) { + if (i > start) { + sb.append(separator); + } + sb.append(columnNames.get(i)); + } + return sb.toString(); + } + + private String getTypes(List typeInfos, int start, int length, + Character separator) { + StringBuilder sb = new StringBuilder(); + for (int i = start; i < start + length; i++) { + if (i > start) { + sb.append(separator); + } + sb.append(typeInfos.get(i).getTypeName()); + } + return sb.toString(); + } + + private boolean verifyInputFormat(PartitionDesc pd) { + + // Look for Pass-Thru case where InputFileFormat has VectorizedInputFormatInterface + // and reads VectorizedRowBatch as a "row". + + List> interfaceList = + Arrays.asList(pd.getInputFileFormatClass().getInterfaces()); + if (interfaceList.contains(VectorizedInputFormatInterface.class)) { + + pd.setVectorPartitionDesc(VectorPartitionDesc.VectorizedInputFileFormat()); + + return true; + } + + LOG.info("Input format: " + pd.getInputFileFormatClassName() + + ", doesn't provide vectorized input"); + + return false; + } + + private boolean validateInputFormatAndSchemaEvolution(MapWork mapWork, String alias, + TableScanOperator tableScanOperator, VectorTaskColumnInfo vectorTaskColumnInfo) { + + final List logicalColumnNameList = new ArrayList(); + final List logicalTypeInfoList = new ArrayList(); + + getTableScanOperatorSchemaInfo(tableScanOperator, + logicalColumnNameList, logicalTypeInfoList); + final int logicalCount = logicalColumnNameList.size(); + + // Validate input format and schema evolution capability. + + // For the table, enter a null value in the multi-key map indicating no conversion necessary + // if the schema matches the table. 
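The cache set up just below is keyed by a partition's (column names, column types) combination; storing a null value records that the partition schema matches the table and needs no conversion. The patch keys an ImmutablePair of the raw column-names string and the parsed TypeInfo list; the sketch below flattens the key into a single string purely for illustration, and shows why containsKey must be consulted before get, since a cached null is itself a valid answer:

import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;

public class ConversionCacheSketch {
  // Cache keyed by the partition's (column names, column types) combination.
  // A key mapped to null means "schema matches the table; no conversion needed".
  private final Map<String, boolean[]> conversionMap = new HashMap<>();

  private static String key(String columnNames, String columnTypes) {
    return columnNames + "|" + columnTypes;
  }

  void markNoConversion(String columnNames, String columnTypes) {
    conversionMap.put(key(columnNames, columnTypes), null);
  }

  boolean seen(String columnNames, String columnTypes) {
    // containsKey distinguishes "cached as no-conversion" from "not cached yet".
    return conversionMap.containsKey(key(columnNames, columnTypes));
  }

  boolean[] lookup(String columnNames, String columnTypes) {
    return conversionMap.get(key(columnNames, columnTypes));
  }

  public static void main(String[] args) {
    ConversionCacheSketch cache = new ConversionCacheSketch();
    cache.markNoConversion("id,name", "bigint:string");
    // A partition whose first column was widened carries per-column conversion flags.
    cache.conversionMap.put(key("id,name", "int:string"), new boolean[] {true, false});
    System.out.println(cache.seen("id,name", "bigint:string"));                  // true
    System.out.println(cache.lookup("id,name", "bigint:string"));                // null -> no conversion
    System.out.println(Arrays.toString(cache.lookup("id,name", "int:string")));  // [true, false]
  }
}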
+ + HashMap conversionMap = new HashMap(); + + boolean isFirst = true; + int nonPartitionColumnCount = 0; + int partitionColumnCount = 0; + + List nonPartColumnList = null; + String nonPartColumnsString = ""; + List nonPartTypeInfoList = null; + String partColumnsString = ""; + String partTypesString = ""; // Validate the input format - for (String path : mapWork.getPathToPartitionInfo().keySet()) { - PartitionDesc pd = mapWork.getPathToPartitionInfo().get(path); - List> interfaceList = - Arrays.asList(pd.getInputFileFormatClass().getInterfaces()); - if (!interfaceList.contains(VectorizedInputFormatInterface.class)) { - LOG.info("Input format: " + pd.getInputFileFormatClassName() - + ", doesn't provide vectorized input"); + VectorPartitionConversion partitionConversion = new VectorPartitionConversion(); + LinkedHashMap> pathToAliases = mapWork.getPathToAliases(); + LinkedHashMap pathToPartitionInfo = mapWork.getPathToPartitionInfo(); + for (Entry> entry: pathToAliases.entrySet()) { + String path = entry.getKey(); + List aliases = entry.getValue(); + boolean isPresent = (aliases != null && aliases.indexOf(alias) != -1); + if (!isPresent) { + LOG.info("Alias " + alias + " not present in aliases " + aliases); + return false; + } + PartitionDesc partDesc = pathToPartitionInfo.get(path); + if (partDesc.getVectorPartitionDesc() != null) { + // We have seen this one already. + continue; + } + if (!verifyInputFormat(partDesc)) { return false; } + VectorPartitionDesc vectorPartDesc = partDesc.getVectorPartitionDesc(); + LOG.info("Vectorizer path: " + path + ", read type " + + vectorPartDesc.getVectorMapOperatorReadType().name() + ", aliases " + aliases); + + Properties partProps = partDesc.getProperties(); + + String nextNonPartColumnsString = + partProps.getProperty(hive_metastoreConstants.META_TABLE_COLUMNS); + String[] nextNonPartColumns = nextNonPartColumnsString.split(","); + + String nextNonPartTypesString = + partProps.getProperty(hive_metastoreConstants.META_TABLE_COLUMN_TYPES); + + // We convert to an array of TypeInfo using a library routine since it parses the information + // and can handle use of different separators, etc. We cannot use the raw type string + // for comparison in the map because of the different separators used. + List nextNonPartTypeInfoList = + TypeInfoUtils.getTypeInfosFromTypeString(nextNonPartTypesString); + + String nextPartColumnsString = + partProps.getProperty(hive_metastoreConstants.META_TABLE_PARTITION_COLUMNS); + String nextPartTypesString = + partProps.getProperty(hive_metastoreConstants.META_TABLE_PARTITION_COLUMN_TYPES); + + if (isFirst) { + + // We establish with the first one whether the table is partitioned or not. 
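The branch that follows derives the partition column count from the first partition's '/'-separated partition-columns property and splits the logical TableScan schema into data columns followed by partition columns. A small stand-alone sketch of that split; the property value and column names are made-up examples, not values read from a metastore:

import java.util.Arrays;
import java.util.List;

public class PartitionSplitSketch {
  public static void main(String[] args) {
    // Logical TableScan schema: non-partition columns first, then partition columns.
    List<String> logicalColumnNames = Arrays.asList("id", "name", "ds", "hr");

    // Metastore-style property (illustrative value): partition columns are '/'-separated.
    String partitionColumnsProperty = "ds/hr";   // null would mean an unpartitioned table

    int partitionColumnCount =
        (partitionColumnsProperty == null) ? 0 : partitionColumnsProperty.split("/").length;
    int nonPartitionColumnCount = logicalColumnNames.size() - partitionColumnCount;

    List<String> nonPartColumns = logicalColumnNames.subList(0, nonPartitionColumnCount);
    List<String> partColumns =
        logicalColumnNames.subList(nonPartitionColumnCount, logicalColumnNames.size());

    System.out.println("data columns:      " + nonPartColumns);  // [id, name]
    System.out.println("partition columns: " + partColumns);     // [ds, hr]
  }
}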
+ + if (nextPartColumnsString == null) { + partitionColumnCount = 0; + nonPartitionColumnCount = logicalCount; + } else { + partitionColumnCount = nextPartColumnsString.split("/").length; + nonPartitionColumnCount = logicalCount - partitionColumnCount; + } + + nonPartColumnList = logicalColumnNameList.subList(0, nonPartitionColumnCount); + nonPartColumnsString = getColumns(logicalColumnNameList, 0, nonPartitionColumnCount, ','); + nonPartTypeInfoList = logicalTypeInfoList.subList(0, nonPartitionColumnCount); + + if (partitionColumnCount > 0) { + partColumnsString = + getColumns(logicalColumnNameList, nonPartitionColumnCount, partitionColumnCount, '/'); + partTypesString = + getTypes(logicalTypeInfoList, nonPartitionColumnCount, partitionColumnCount, ':'); + + if (!partColumnsString.equalsIgnoreCase(nextPartColumnsString)) { + LOG.info( + String.format("Could not vectorize partition %s. Its partition column names %s do not match the table's partition column names %s", + path, nextPartColumnsString, partColumnsString)); + return false; + } + if (!partTypesString.equalsIgnoreCase(nextPartTypesString)) { + LOG.info( + String.format("Could not vectorize partition %s. Its partition column types %s do not match the table's partition column types %s", + path, nextPartTypesString, partTypesString)); + return false; + } + } + + // Add the table (non-partitioned) columns and types into the map as not needing + // conversion (i.e. null). + conversionMap.put( + new ImmutablePair(nonPartColumnsString, nonPartTypeInfoList), null); + + isFirst = false; + } else { + if (partitionColumnCount > 0) { + if (!partColumnsString.equalsIgnoreCase(nextPartColumnsString)) { + LOG.info( + String.format("Could not vectorize partition %s. Its partition column names %s do not match the other partition column names %s", + path, nextPartColumnsString, partColumnsString)); + return false; + } + if (!partTypesString.equalsIgnoreCase(nextPartTypesString)) { + LOG.info( + String.format("Could not vectorize partition %s. Its partition column types %s do not match the other partition column types %s", + path, nextPartTypesString, partTypesString)); + return false; + } + } + } + + ImmutablePair columnNamesAndTypesCombination = + new ImmutablePair(nextNonPartColumnsString, nextNonPartTypeInfoList); + + boolean[] conversionFlags; + if (conversionMap.containsKey(columnNamesAndTypesCombination)) { + + conversionFlags = conversionMap.get(columnNamesAndTypesCombination); + + } else { + + List nextNonPartColumnList = Arrays.asList(nextNonPartColumns); + + // Validate that the column names that are present are the same. Missing columns will be + // implicitly defaulted to null. + + if (nextNonPartColumnList.size() > nonPartColumnList.size()) { + LOG.info( + String.format("Could not vectorize partition %s. The number of partition columns %d is greater than the number of table columns %d", + path, nextNonPartColumnList.size(), nonPartColumnList.size())); + return false; + } + for (int i = 0; i < nextNonPartColumnList.size(); i++) { + String nextColumnName = nextNonPartColumnList.get(i); + String tableColumnName = nonPartColumnList.get(i); + if (!nextColumnName.equals(tableColumnName)) { + LOG.info( + String.format("Could not vectorize partition %s. The partition column name %s does not match the table column name %s", + path, nextColumnName, tableColumnName)); + return false; + } + } + + // The table column types might have been changed with ALTER. There are restrictions + // here for vectorization. 
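Whether an ALTERed partition type can still be read is decided by the VectorPartitionConversion helper this patch introduces; its exact rules are not visible in this hunk. As a rough, hypothetical illustration only, a checker that accepts nothing but integer-family widening and reports per-column conversion flags could look like the sketch below (the real code signals an invalid conversion through validation failure rather than an exception):

import java.util.Arrays;
import java.util.List;

public class ImplicitConversionSketch {
  // Integer family ordered by width; a change from a narrower to a wider type is
  // treated as an implicit (safe) conversion in this sketch.
  private static final List<String> INTEGER_FAMILY =
      Arrays.asList("tinyint", "smallint", "int", "bigint");

  // Returns per-column flags (true = column needs conversion), or null when the
  // partition and table types are identical. partitionTypes is assumed to be no
  // longer than tableTypes (the caller checks sizes first). Illustration only.
  static boolean[] checkConversion(List<String> partitionTypes, List<String> tableTypes) {
    boolean[] flags = new boolean[tableTypes.size()];
    boolean anyConversion = false;
    for (int i = 0; i < partitionTypes.size(); i++) {
      String from = partitionTypes.get(i);
      String to = tableTypes.get(i);
      if (from.equals(to)) {
        continue;
      }
      int fromIdx = INTEGER_FAMILY.indexOf(from);
      int toIdx = INTEGER_FAMILY.indexOf(to);
      if (fromIdx >= 0 && toIdx >= 0 && fromIdx < toIdx) {
        flags[i] = true;          // implicit widening, e.g. int -> bigint
        anyConversion = true;
      } else {
        throw new IllegalStateException("Not an implicit conversion: " + from + " -> " + to);
      }
    }
    return anyConversion ? flags : null;
  }

  public static void main(String[] args) {
    System.out.println(Arrays.toString(
        checkConversion(Arrays.asList("int", "string"), Arrays.asList("bigint", "string"))));
    // [true, false]
  }
}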
+ + // Some readers / deserializers take responsibility for conversion themselves. + + // If we need to check for conversion, the conversion object may come back null + // indicating from a vectorization point of view the conversion is implicit. That is, + // all implicit integer upgrades. + + if (vectorPartDesc.getNeedsDataTypeConversionCheck() && + !nextNonPartTypeInfoList.equals(nonPartTypeInfoList)) { + + // The results will be in 2 members: validConversion and conversionFlags + partitionConversion.validateConversion(nextNonPartTypeInfoList, nonPartTypeInfoList); + if (!partitionConversion.getValidConversion()) { + return false; + } + conversionFlags = partitionConversion.getResultConversionFlags(); + } else { + conversionFlags = null; + } + + // We enter this in our map so we don't have to check again for subsequent partitions. + + conversionMap.put(columnNamesAndTypesCombination, conversionFlags); + } + + vectorPartDesc.setConversionFlags(conversionFlags); + + vectorPartDesc.setTypeInfos(nextNonPartTypeInfoList); } + + vectorTaskColumnInfo.setColumnNames(logicalColumnNameList); + vectorTaskColumnInfo.setTypeInfos(logicalTypeInfoList); + vectorTaskColumnInfo.setPartitionColumnCount(partitionColumnCount); + + return true; + } + + private boolean validateMapWork(MapWork mapWork, VectorTaskColumnInfo vectorTaskColumnInfo, boolean isTez) + throws SemanticException { + + LOG.info("Validating MapWork..."); + + ImmutablePair pair = verifyOnlyOneTableScanOperator(mapWork); + if (pair == null) { + return false; + } + String alias = pair.left; + TableScanOperator tableScanOperator = pair.right; + + // This call fills in the column names, types, and partition column count in + // vectorTaskColumnInfo. + if (!validateInputFormatAndSchemaEvolution(mapWork, alias, tableScanOperator, vectorTaskColumnInfo)) { + return false; + } + Map opRules = new LinkedHashMap(); MapWorkValidationNodeProcessor vnp = new MapWorkValidationNodeProcessor(mapWork, isTez); addMapWorkRules(opRules, vnp); @@ -422,11 +766,14 @@ private boolean validateMapWork(MapWork mapWork, boolean isTez) throws SemanticE return true; } - private void vectorizeMapWork(MapWork mapWork, boolean isTez) throws SemanticException { + private void vectorizeMapWork(MapWork mapWork, VectorTaskColumnInfo vectorTaskColumnInfo, + boolean isTez) throws SemanticException { + LOG.info("Vectorizing MapWork..."); mapWork.setVectorMode(true); Map opRules = new LinkedHashMap(); - MapWorkVectorizationNodeProcessor vnp = new MapWorkVectorizationNodeProcessor(mapWork, isTez); + MapWorkVectorizationNodeProcessor vnp = + new MapWorkVectorizationNodeProcessor(mapWork, isTez, vectorTaskColumnInfo); addMapWorkRules(opRules, vnp); Dispatcher disp = new DefaultRuleDispatcher(vnp, opRules, null); GraphWalker ogw = new PreOrderOnceWalker(disp); @@ -436,9 +783,9 @@ private void vectorizeMapWork(MapWork mapWork, boolean isTez) throws SemanticExc HashMap nodeOutput = new HashMap(); ogw.startWalking(topNodes, nodeOutput); - mapWork.setVectorColumnNameMap(vnp.getVectorColumnNameMap()); - mapWork.setVectorColumnTypeMap(vnp.getVectorColumnTypeMap()); - mapWork.setVectorScratchColumnTypeMap(vnp.getVectorScratchColumnTypeMap()); + vectorTaskColumnInfo.setScratchTypeNameArray(vnp.getVectorScratchColumnTypeNames()); + + vectorTaskColumnInfo.transferToBaseWork(mapWork); if (LOG.isDebugEnabled()) { debugDisplayAllMaps(mapWork); @@ -448,13 +795,19 @@ private void vectorizeMapWork(MapWork mapWork, boolean isTez) throws SemanticExc } private void convertReduceWork(ReduceWork 
reduceWork, boolean isTez) throws SemanticException { - boolean ret = validateReduceWork(reduceWork); + VectorTaskColumnInfo vectorTaskColumnInfo = new VectorTaskColumnInfo(); + boolean ret = validateReduceWork(reduceWork, vectorTaskColumnInfo, isTez); if (ret) { - vectorizeReduceWork(reduceWork, isTez); + vectorizeReduceWork(reduceWork, vectorTaskColumnInfo, isTez); } } - private boolean getOnlyStructObjectInspectors(ReduceWork reduceWork) throws SemanticException { + private boolean getOnlyStructObjectInspectors(ReduceWork reduceWork, + VectorTaskColumnInfo vectorTaskColumnInfo) throws SemanticException { + + ArrayList reduceColumnNames = new ArrayList(); + ArrayList reduceTypeInfos = new ArrayList(); + try { // Check key ObjectInspector. ObjectInspector keyObjectInspector = reduceWork.getKeyObjectInspector(); @@ -478,9 +831,6 @@ private boolean getOnlyStructObjectInspectors(ReduceWork reduceWork) throws Sema StructObjectInspector valueStructObjectInspector = (StructObjectInspector)valueObjectInspector; List valueFields = valueStructObjectInspector.getAllStructFieldRefs(); - reduceColumnNames = new ArrayList(); - reduceTypeInfos = new ArrayList(); - for (StructField field: keyFields) { reduceColumnNames.add(Utilities.ReduceField.KEY.toString() + "." + field.getFieldName()); reduceTypeInfos.add(TypeInfoUtils.getTypeInfoFromTypeString(field.getFieldObjectInspector().getTypeName())); @@ -492,6 +842,10 @@ private boolean getOnlyStructObjectInspectors(ReduceWork reduceWork) throws Sema } catch (Exception e) { throw new SemanticException(e); } + + vectorTaskColumnInfo.setColumnNames(reduceColumnNames); + vectorTaskColumnInfo.setTypeInfos(reduceTypeInfos); + return true; } @@ -500,11 +854,13 @@ private void addReduceWorkRules(Map opRules, NodeProcessor opRules.put(new RuleRegExp("R2", SelectOperator.getOperatorName() + ".*"), np); } - private boolean validateReduceWork(ReduceWork reduceWork) throws SemanticException { + private boolean validateReduceWork(ReduceWork reduceWork, + VectorTaskColumnInfo vectorTaskColumnInfo, boolean isTez) throws SemanticException { + LOG.info("Validating ReduceWork..."); // Validate input to ReduceWork. - if (!getOnlyStructObjectInspectors(reduceWork)) { + if (!getOnlyStructObjectInspectors(reduceWork, vectorTaskColumnInfo)) { return false; } // Now check the reduce operator tree. @@ -528,7 +884,9 @@ private boolean validateReduceWork(ReduceWork reduceWork) throws SemanticExcepti return true; } - private void vectorizeReduceWork(ReduceWork reduceWork, boolean isTez) throws SemanticException { + private void vectorizeReduceWork(ReduceWork reduceWork, + VectorTaskColumnInfo vectorTaskColumnInfo, boolean isTez) throws SemanticException { + LOG.info("Vectorizing ReduceWork..."); reduceWork.setVectorMode(true); @@ -537,7 +895,7 @@ private void vectorizeReduceWork(ReduceWork reduceWork, boolean isTez) throws Se // VectorizationContext... Do we use PreOrderWalker instead of DefaultGraphWalker. Map opRules = new LinkedHashMap(); ReduceWorkVectorizationNodeProcessor vnp = - new ReduceWorkVectorizationNodeProcessor(reduceColumnNames, reduceTypeInfos, isTez); + new ReduceWorkVectorizationNodeProcessor(vectorTaskColumnInfo, isTez); addReduceWorkRules(opRules, vnp); Dispatcher disp = new DefaultRuleDispatcher(vnp, opRules, null); GraphWalker ogw = new PreOrderWalker(disp); @@ -552,9 +910,9 @@ private void vectorizeReduceWork(ReduceWork reduceWork, boolean isTez) throws Se // Necessary since we are vectorizing the root operator in reduce. 
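The getOnlyStructObjectInspectors hunk above builds the reduce-side column list by prefixing key fields with KEY. and value fields with VALUE.; those names then seed the __Reduce_Shuffle__ vectorization context. A minimal sketch of that naming scheme, with illustrative field names:

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

public class ReduceShuffleColumnsSketch {
  // Key fields first, prefixed "KEY.", then value fields, prefixed "VALUE.", in order.
  static List<String> reduceColumnNames(List<String> keyFields, List<String> valueFields) {
    List<String> names = new ArrayList<>();
    for (String f : keyFields) {
      names.add("KEY." + f);
    }
    for (String f : valueFields) {
      names.add("VALUE." + f);
    }
    return names;
  }

  public static void main(String[] args) {
    System.out.println(reduceColumnNames(
        Arrays.asList("reducesinkkey0"), Arrays.asList("_col0", "_col1")));
    // [KEY.reducesinkkey0, VALUE._col0, VALUE._col1]
  }
}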
reduceWork.setReducer(vnp.getRootVectorOp()); - reduceWork.setVectorColumnNameMap(vnp.getVectorColumnNameMap()); - reduceWork.setVectorColumnTypeMap(vnp.getVectorColumnTypeMap()); - reduceWork.setVectorScratchColumnTypeMap(vnp.getVectorScratchColumnTypeMap()); + vectorTaskColumnInfo.setScratchTypeNameArray(vnp.getVectorScratchColumnTypeNames()); + + vectorTaskColumnInfo.transferToBaseWork(reduceWork); if (LOG.isDebugEnabled()) { debugDisplayAllMaps(reduceWork); @@ -622,23 +980,11 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, // The vectorization context for the Map or Reduce task. protected VectorizationContext taskVectorizationContext; - // The input projection column type name map for the Map or Reduce task. - protected Map taskColumnTypeNameMap; - VectorizationNodeProcessor() { - taskColumnTypeNameMap = new HashMap(); - } - - public Map getVectorColumnNameMap() { - return taskVectorizationContext.getProjectionColumnMap(); - } - - public Map getVectorColumnTypeMap() { - return taskColumnTypeNameMap; } - public Map getVectorScratchColumnTypeMap() { - return taskVectorizationContext.getScratchColumnTypeMap(); + public String[] getVectorScratchColumnTypeNames() { + return taskVectorizationContext.getScratchColumnTypeNames(); } protected final Set> opsDone = @@ -707,10 +1053,15 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, class MapWorkVectorizationNodeProcessor extends VectorizationNodeProcessor { + private final MapWork mWork; + private VectorTaskColumnInfo vectorTaskColumnInfo; private final boolean isTez; - public MapWorkVectorizationNodeProcessor(MapWork mWork, boolean isTez) { + public MapWorkVectorizationNodeProcessor(MapWork mWork, boolean isTez, + VectorTaskColumnInfo vectorTaskColumnInfo) { super(); + this.mWork = mWork; + this.vectorTaskColumnInfo = vectorTaskColumnInfo; this.isTez = isTez; } @@ -724,8 +1075,7 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, if (op instanceof TableScanOperator) { if (taskVectorizationContext == null) { - taskVectorizationContext = getVectorizationContext(op.getSchema(), op.getName(), - taskColumnTypeNameMap); + taskVectorizationContext = getVectorizationContext(op.getName(), vectorTaskColumnInfo); } vContext = taskVectorizationContext; } else { @@ -766,10 +1116,9 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, class ReduceWorkVectorizationNodeProcessor extends VectorizationNodeProcessor { - private final List reduceColumnNames; - private final List reduceTypeInfos; + private VectorTaskColumnInfo vectorTaskColumnInfo; - private final boolean isTez; + private boolean isTez; private Operator rootVectorOp; @@ -777,11 +1126,11 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, return rootVectorOp; } - public ReduceWorkVectorizationNodeProcessor(List reduceColumnNames, - List reduceTypeInfos, boolean isTez) { + public ReduceWorkVectorizationNodeProcessor(VectorTaskColumnInfo vectorTaskColumnInfo, + boolean isTez) { + super(); - this.reduceColumnNames = reduceColumnNames; - this.reduceTypeInfos = reduceTypeInfos; + this.vectorTaskColumnInfo = vectorTaskColumnInfo; rootVectorOp = null; this.isTez = isTez; } @@ -797,15 +1146,11 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, boolean saveRootVectorOp = false; if (op.getParentOperators().size() == 0) { - LOG.info("ReduceWorkVectorizationNodeProcessor process reduceColumnNames " + reduceColumnNames.toString()); + LOG.info("ReduceWorkVectorizationNodeProcessor 
process reduceColumnNames " + vectorTaskColumnInfo.columnNames.toString()); - vContext = new VectorizationContext("__Reduce_Shuffle__", reduceColumnNames); + vContext = new VectorizationContext("__Reduce_Shuffle__", vectorTaskColumnInfo.columnNames); taskVectorizationContext = vContext; - int i = 0; - for (TypeInfo typeInfo : reduceTypeInfos) { - taskColumnTypeNameMap.put(i, typeInfo.getTypeName()); - i++; - } + saveRootVectorOp = true; if (LOG.isDebugEnabled()) { @@ -869,6 +1214,7 @@ protected int getInputColumnIndex(ExprNodeColumnDesc colExpr) { @Override public PhysicalContext resolve(PhysicalContext physicalContext) throws SemanticException { + hiveConf = physicalContext.getConf(); boolean vectorPath = HiveConf.getBoolVar(hiveConf, @@ -1422,23 +1768,10 @@ private boolean validateDataType(String type, VectorExpressionDescriptor.Mode mo return result; } - private VectorizationContext getVectorizationContext(RowSchema rowSchema, String contextName, - Map typeNameMap) { + private VectorizationContext getVectorizationContext(String contextName, + VectorTaskColumnInfo vectorTaskColumnInfo) { - VectorizationContext vContext = new VectorizationContext(contextName); - - // Add all non-virtual columns to make a vectorization context for - // the TableScan operator. - int i = 0; - for (ColumnInfo c : rowSchema.getSignature()) { - // Earlier, validation code should have eliminated virtual columns usage (HIVE-5560). - if (!isVirtualColumn(c)) { - vContext.addInitialColumn(c.getInternalName()); - typeNameMap.put(i, c.getTypeName()); - i++; - } - } - vContext.finishedAddingInitialColumns(); + VectorizationContext vContext = new VectorizationContext(contextName, vectorTaskColumnInfo.columnNames); return vContext; } @@ -1794,12 +2127,16 @@ private boolean isVirtualColumn(ColumnInfo column) { public void debugDisplayAllMaps(BaseWork work) { - Map columnNameMap = work.getVectorColumnNameMap(); - Map columnTypeMap = work.getVectorColumnTypeMap(); - Map scratchColumnTypeMap = work.getVectorScratchColumnTypeMap(); + VectorizedRowBatchCtx vectorizedRowBatchCtx = work.getVectorizedRowBatchCtx(); + + String[] columnNames = vectorizedRowBatchCtx.getRowColumnNames(); + Object columnTypeInfos = vectorizedRowBatchCtx.getRowColumnTypeInfos(); + int partitionColumnCount = vectorizedRowBatchCtx.getPartitionColumnCount(); + String[] scratchColumnTypeNames =vectorizedRowBatchCtx.getScratchColumnTypeNames(); - LOG.debug("debugDisplayAllMaps columnNameMap " + columnNameMap.toString()); - LOG.debug("debugDisplayAllMaps columnTypeMap " + columnTypeMap.toString()); - LOG.debug("debugDisplayAllMaps scratchColumnTypeMap " + scratchColumnTypeMap.toString()); + LOG.debug("debugDisplayAllMaps columnNames " + Arrays.toString(columnNames)); + LOG.debug("debugDisplayAllMaps columnTypeInfos " + Arrays.deepToString((Object[]) columnTypeInfos)); + LOG.debug("debugDisplayAllMaps partitionColumnCount " + partitionColumnCount); + LOG.debug("debugDisplayAllMaps scratchColumnTypeNames " + Arrays.toString(scratchColumnTypeNames)); } } diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java.orig ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java.orig deleted file mode 100644 index 0d4c1d8..0000000 --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java.orig +++ /dev/null @@ -1,1744 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. 
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.hive.ql.optimizer.physical; - -import java.io.Serializable; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.HashMap; -import java.util.HashSet; -import java.util.LinkedHashMap; -import java.util.List; -import java.util.Map; -import java.util.Properties; -import java.util.Set; -import java.util.Stack; -import java.util.regex.Pattern; - -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; -import org.apache.hadoop.hive.conf.HiveConf; -import org.apache.hadoop.hive.metastore.api.hive_metastoreConstants; -import org.apache.hadoop.hive.ql.exec.*; -import org.apache.hadoop.hive.ql.exec.mr.MapRedTask; -import org.apache.hadoop.hive.ql.exec.persistence.MapJoinKey; -import org.apache.hadoop.hive.ql.exec.spark.SparkTask; -import org.apache.hadoop.hive.ql.exec.tez.TezTask; -import org.apache.hadoop.hive.ql.exec.vector.VectorExpressionDescriptor; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.VectorMapJoinInnerBigOnlyLongOperator; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.VectorMapJoinInnerBigOnlyMultiKeyOperator; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.VectorMapJoinInnerBigOnlyStringOperator; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.VectorMapJoinInnerLongOperator; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.VectorMapJoinInnerMultiKeyOperator; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.VectorMapJoinInnerStringOperator; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.VectorMapJoinLeftSemiLongOperator; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.VectorMapJoinLeftSemiMultiKeyOperator; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.VectorMapJoinLeftSemiStringOperator; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.VectorMapJoinOuterLongOperator; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.VectorMapJoinOuterMultiKeyOperator; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.VectorMapJoinOuterStringOperator; -import org.apache.hadoop.hive.ql.exec.vector.VectorMapJoinOperator; -import org.apache.hadoop.hive.ql.exec.vector.VectorMapJoinOuterFilteredOperator; -import org.apache.hadoop.hive.ql.exec.vector.VectorSMBMapJoinOperator; -import org.apache.hadoop.hive.ql.exec.vector.VectorizationContext; -import org.apache.hadoop.hive.ql.exec.vector.VectorizationContextRegion; -import org.apache.hadoop.hive.ql.exec.vector.VectorizedInputFormatInterface; -import org.apache.hadoop.hive.ql.exec.vector.expressions.aggregates.VectorAggregateExpression; -import org.apache.hadoop.hive.ql.lib.DefaultGraphWalker; -import org.apache.hadoop.hive.ql.lib.DefaultRuleDispatcher; -import org.apache.hadoop.hive.ql.lib.Dispatcher; -import org.apache.hadoop.hive.ql.lib.GraphWalker; -import org.apache.hadoop.hive.ql.lib.Node; -import 
org.apache.hadoop.hive.ql.lib.NodeProcessor; -import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx; -import org.apache.hadoop.hive.ql.lib.PreOrderOnceWalker; -import org.apache.hadoop.hive.ql.lib.PreOrderWalker; -import org.apache.hadoop.hive.ql.lib.Rule; -import org.apache.hadoop.hive.ql.lib.RuleRegExp; -import org.apache.hadoop.hive.ql.lib.TaskGraphWalker; -import org.apache.hadoop.hive.ql.metadata.HiveException; -import org.apache.hadoop.hive.ql.metadata.VirtualColumn; -import org.apache.hadoop.hive.ql.parse.SemanticException; -import org.apache.hadoop.hive.ql.plan.AbstractOperatorDesc; -import org.apache.hadoop.hive.ql.plan.AggregationDesc; -import org.apache.hadoop.hive.ql.plan.BaseWork; -import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc; -import org.apache.hadoop.hive.ql.plan.ExprNodeDesc; -import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc; -import org.apache.hadoop.hive.ql.plan.GroupByDesc; -import org.apache.hadoop.hive.ql.plan.JoinDesc; -import org.apache.hadoop.hive.ql.plan.MapJoinDesc; -import org.apache.hadoop.hive.ql.plan.MapWork; -import org.apache.hadoop.hive.ql.plan.OperatorDesc; -import org.apache.hadoop.hive.ql.plan.PartitionDesc; -import org.apache.hadoop.hive.ql.plan.ReduceWork; -import org.apache.hadoop.hive.ql.plan.SMBJoinDesc; -import org.apache.hadoop.hive.ql.plan.SparkHashTableSinkDesc; -import org.apache.hadoop.hive.ql.plan.SparkWork; -import org.apache.hadoop.hive.ql.plan.TableScanDesc; -import org.apache.hadoop.hive.ql.plan.TezWork; -import org.apache.hadoop.hive.ql.plan.VectorGroupByDesc; -import org.apache.hadoop.hive.ql.plan.VectorMapJoinDesc; -import org.apache.hadoop.hive.ql.plan.VectorMapJoinDesc.HashTableImplementationType; -import org.apache.hadoop.hive.ql.plan.VectorMapJoinDesc.HashTableKeyType; -import org.apache.hadoop.hive.ql.plan.VectorMapJoinDesc.HashTableKind; -import org.apache.hadoop.hive.ql.plan.api.OperatorType; -import org.apache.hadoop.hive.ql.udf.UDFAcos; -import org.apache.hadoop.hive.ql.udf.UDFAsin; -import org.apache.hadoop.hive.ql.udf.UDFAtan; -import org.apache.hadoop.hive.ql.udf.UDFBin; -import org.apache.hadoop.hive.ql.udf.UDFConv; -import org.apache.hadoop.hive.ql.udf.UDFCos; -import org.apache.hadoop.hive.ql.udf.UDFDayOfMonth; -import org.apache.hadoop.hive.ql.udf.UDFDegrees; -import org.apache.hadoop.hive.ql.udf.UDFExp; -import org.apache.hadoop.hive.ql.udf.UDFHex; -import org.apache.hadoop.hive.ql.udf.UDFHour; -import org.apache.hadoop.hive.ql.udf.UDFLength; -import org.apache.hadoop.hive.ql.udf.UDFLike; -import org.apache.hadoop.hive.ql.udf.UDFLn; -import org.apache.hadoop.hive.ql.udf.UDFLog; -import org.apache.hadoop.hive.ql.udf.UDFLog10; -import org.apache.hadoop.hive.ql.udf.UDFLog2; -import org.apache.hadoop.hive.ql.udf.UDFMinute; -import org.apache.hadoop.hive.ql.udf.UDFMonth; -import org.apache.hadoop.hive.ql.udf.UDFRadians; -import org.apache.hadoop.hive.ql.udf.UDFRand; -import org.apache.hadoop.hive.ql.udf.UDFSecond; -import org.apache.hadoop.hive.ql.udf.UDFSign; -import org.apache.hadoop.hive.ql.udf.UDFSin; -import org.apache.hadoop.hive.ql.udf.UDFSqrt; -import org.apache.hadoop.hive.ql.udf.UDFSubstr; -import org.apache.hadoop.hive.ql.udf.UDFTan; -import org.apache.hadoop.hive.ql.udf.UDFToBoolean; -import org.apache.hadoop.hive.ql.udf.UDFToByte; -import org.apache.hadoop.hive.ql.udf.UDFToDouble; -import org.apache.hadoop.hive.ql.udf.UDFToFloat; -import org.apache.hadoop.hive.ql.udf.UDFToInteger; -import org.apache.hadoop.hive.ql.udf.UDFToLong; -import org.apache.hadoop.hive.ql.udf.UDFToShort; -import 
org.apache.hadoop.hive.ql.udf.UDFToString; -import org.apache.hadoop.hive.ql.udf.UDFWeekOfYear; -import org.apache.hadoop.hive.ql.udf.UDFYear; -import org.apache.hadoop.hive.ql.udf.generic.*; -import org.apache.hadoop.hive.serde.serdeConstants; -import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.StructField; -import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; -import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; -import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils; - -public class Vectorizer implements PhysicalPlanResolver { - - protected static transient final Log LOG = LogFactory.getLog(Vectorizer.class); - - Pattern supportedDataTypesPattern; - List> vectorizableTasks = - new ArrayList>(); - Set> supportedGenericUDFs = new HashSet>(); - - Set supportedAggregationUdfs = new HashSet(); - - private HiveConf hiveConf; - - public Vectorizer() { - - StringBuilder patternBuilder = new StringBuilder(); - patternBuilder.append("int"); - patternBuilder.append("|smallint"); - patternBuilder.append("|tinyint"); - patternBuilder.append("|bigint"); - patternBuilder.append("|integer"); - patternBuilder.append("|long"); - patternBuilder.append("|short"); - patternBuilder.append("|timestamp"); - patternBuilder.append("|" + serdeConstants.INTERVAL_YEAR_MONTH_TYPE_NAME); - patternBuilder.append("|" + serdeConstants.INTERVAL_DAY_TIME_TYPE_NAME); - patternBuilder.append("|boolean"); - patternBuilder.append("|binary"); - patternBuilder.append("|string"); - patternBuilder.append("|byte"); - patternBuilder.append("|float"); - patternBuilder.append("|double"); - patternBuilder.append("|date"); - patternBuilder.append("|void"); - - // Decimal types can be specified with different precision and scales e.g. decimal(10,5), - // as opposed to other data types which can be represented by constant strings. - // The regex matches only the "decimal" prefix of the type. - patternBuilder.append("|decimal.*"); - - // CHAR and VARCHAR types can be specified with maximum length. 
- patternBuilder.append("|char.*"); - patternBuilder.append("|varchar.*"); - - supportedDataTypesPattern = Pattern.compile(patternBuilder.toString()); - - supportedGenericUDFs.add(GenericUDFOPPlus.class); - supportedGenericUDFs.add(GenericUDFOPMinus.class); - supportedGenericUDFs.add(GenericUDFOPMultiply.class); - supportedGenericUDFs.add(GenericUDFOPDivide.class); - supportedGenericUDFs.add(GenericUDFOPMod.class); - supportedGenericUDFs.add(GenericUDFOPNegative.class); - supportedGenericUDFs.add(GenericUDFOPPositive.class); - - supportedGenericUDFs.add(GenericUDFOPEqualOrLessThan.class); - supportedGenericUDFs.add(GenericUDFOPEqualOrGreaterThan.class); - supportedGenericUDFs.add(GenericUDFOPGreaterThan.class); - supportedGenericUDFs.add(GenericUDFOPLessThan.class); - supportedGenericUDFs.add(GenericUDFOPNot.class); - supportedGenericUDFs.add(GenericUDFOPNotEqual.class); - supportedGenericUDFs.add(GenericUDFOPNotNull.class); - supportedGenericUDFs.add(GenericUDFOPNull.class); - supportedGenericUDFs.add(GenericUDFOPOr.class); - supportedGenericUDFs.add(GenericUDFOPAnd.class); - supportedGenericUDFs.add(GenericUDFOPEqual.class); - supportedGenericUDFs.add(UDFLength.class); - - supportedGenericUDFs.add(UDFYear.class); - supportedGenericUDFs.add(UDFMonth.class); - supportedGenericUDFs.add(UDFDayOfMonth.class); - supportedGenericUDFs.add(UDFHour.class); - supportedGenericUDFs.add(UDFMinute.class); - supportedGenericUDFs.add(UDFSecond.class); - supportedGenericUDFs.add(UDFWeekOfYear.class); - supportedGenericUDFs.add(GenericUDFToUnixTimeStamp.class); - - supportedGenericUDFs.add(GenericUDFDateAdd.class); - supportedGenericUDFs.add(GenericUDFDateSub.class); - supportedGenericUDFs.add(GenericUDFDate.class); - supportedGenericUDFs.add(GenericUDFDateDiff.class); - - supportedGenericUDFs.add(UDFLike.class); - supportedGenericUDFs.add(GenericUDFRegExp.class); - supportedGenericUDFs.add(UDFSubstr.class); - supportedGenericUDFs.add(GenericUDFLTrim.class); - supportedGenericUDFs.add(GenericUDFRTrim.class); - supportedGenericUDFs.add(GenericUDFTrim.class); - - supportedGenericUDFs.add(UDFSin.class); - supportedGenericUDFs.add(UDFCos.class); - supportedGenericUDFs.add(UDFTan.class); - supportedGenericUDFs.add(UDFAsin.class); - supportedGenericUDFs.add(UDFAcos.class); - supportedGenericUDFs.add(UDFAtan.class); - supportedGenericUDFs.add(UDFDegrees.class); - supportedGenericUDFs.add(UDFRadians.class); - supportedGenericUDFs.add(GenericUDFFloor.class); - supportedGenericUDFs.add(GenericUDFCeil.class); - supportedGenericUDFs.add(UDFExp.class); - supportedGenericUDFs.add(UDFLn.class); - supportedGenericUDFs.add(UDFLog2.class); - supportedGenericUDFs.add(UDFLog10.class); - supportedGenericUDFs.add(UDFLog.class); - supportedGenericUDFs.add(GenericUDFPower.class); - supportedGenericUDFs.add(GenericUDFRound.class); - supportedGenericUDFs.add(GenericUDFBRound.class); - supportedGenericUDFs.add(GenericUDFPosMod.class); - supportedGenericUDFs.add(UDFSqrt.class); - supportedGenericUDFs.add(UDFSign.class); - supportedGenericUDFs.add(UDFRand.class); - supportedGenericUDFs.add(UDFBin.class); - supportedGenericUDFs.add(UDFHex.class); - supportedGenericUDFs.add(UDFConv.class); - - supportedGenericUDFs.add(GenericUDFLower.class); - supportedGenericUDFs.add(GenericUDFUpper.class); - supportedGenericUDFs.add(GenericUDFConcat.class); - supportedGenericUDFs.add(GenericUDFAbs.class); - supportedGenericUDFs.add(GenericUDFBetween.class); - supportedGenericUDFs.add(GenericUDFIn.class); - 
supportedGenericUDFs.add(GenericUDFCase.class); - supportedGenericUDFs.add(GenericUDFWhen.class); - supportedGenericUDFs.add(GenericUDFCoalesce.class); - supportedGenericUDFs.add(GenericUDFElt.class); - supportedGenericUDFs.add(GenericUDFInitCap.class); - - // For type casts - supportedGenericUDFs.add(UDFToLong.class); - supportedGenericUDFs.add(UDFToInteger.class); - supportedGenericUDFs.add(UDFToShort.class); - supportedGenericUDFs.add(UDFToByte.class); - supportedGenericUDFs.add(UDFToBoolean.class); - supportedGenericUDFs.add(UDFToFloat.class); - supportedGenericUDFs.add(UDFToDouble.class); - supportedGenericUDFs.add(UDFToString.class); - supportedGenericUDFs.add(GenericUDFTimestamp.class); - supportedGenericUDFs.add(GenericUDFToDecimal.class); - supportedGenericUDFs.add(GenericUDFToDate.class); - supportedGenericUDFs.add(GenericUDFToChar.class); - supportedGenericUDFs.add(GenericUDFToVarchar.class); - supportedGenericUDFs.add(GenericUDFToIntervalYearMonth.class); - supportedGenericUDFs.add(GenericUDFToIntervalDayTime.class); - - // For conditional expressions - supportedGenericUDFs.add(GenericUDFIf.class); - - supportedAggregationUdfs.add("min"); - supportedAggregationUdfs.add("max"); - supportedAggregationUdfs.add("count"); - supportedAggregationUdfs.add("sum"); - supportedAggregationUdfs.add("avg"); - supportedAggregationUdfs.add("variance"); - supportedAggregationUdfs.add("var_pop"); - supportedAggregationUdfs.add("var_samp"); - supportedAggregationUdfs.add("std"); - supportedAggregationUdfs.add("stddev"); - supportedAggregationUdfs.add("stddev_pop"); - supportedAggregationUdfs.add("stddev_samp"); - } - - class VectorizationDispatcher implements Dispatcher { - - private List reduceColumnNames; - private List reduceTypeInfos; - - public VectorizationDispatcher(PhysicalContext physicalContext) { - reduceColumnNames = null; - reduceTypeInfos = null; - } - - @Override - public Object dispatch(Node nd, Stack stack, Object... nodeOutputs) - throws SemanticException { - Task currTask = (Task) nd; - if (currTask instanceof MapRedTask) { - convertMapWork(((MapRedTask) currTask).getWork().getMapWork(), false); - } else if (currTask instanceof TezTask) { - TezWork work = ((TezTask) currTask).getWork(); - for (BaseWork w: work.getAllWork()) { - if (w instanceof MapWork) { - convertMapWork((MapWork) w, true); - } else if (w instanceof ReduceWork) { - // We are only vectorizing Reduce under Tez. 
- if (HiveConf.getBoolVar(hiveConf, - HiveConf.ConfVars.HIVE_VECTORIZATION_REDUCE_ENABLED)) { - convertReduceWork((ReduceWork) w, true); - } - } - } - } else if (currTask instanceof SparkTask) { - SparkWork sparkWork = (SparkWork) currTask.getWork(); - for (BaseWork baseWork : sparkWork.getAllWork()) { - if (baseWork instanceof MapWork) { - convertMapWork((MapWork) baseWork, false); - } else if (baseWork instanceof ReduceWork - && HiveConf.getBoolVar(hiveConf, - HiveConf.ConfVars.HIVE_VECTORIZATION_REDUCE_ENABLED)) { - convertReduceWork((ReduceWork) baseWork, false); - } - } - } - return null; - } - - private void convertMapWork(MapWork mapWork, boolean isTez) throws SemanticException { - boolean ret = validateMapWork(mapWork, isTez); - if (ret) { - vectorizeMapWork(mapWork, isTez); - } - } - - private void addMapWorkRules(Map opRules, NodeProcessor np) { - opRules.put(new RuleRegExp("R1", TableScanOperator.getOperatorName() + ".*" - + FileSinkOperator.getOperatorName()), np); - opRules.put(new RuleRegExp("R2", TableScanOperator.getOperatorName() + ".*" - + ReduceSinkOperator.getOperatorName()), np); - } - - private boolean validateMapWork(MapWork mapWork, boolean isTez) throws SemanticException { - LOG.info("Validating MapWork..."); - - // Eliminate MR plans with more than one TableScanOperator. - LinkedHashMap> aliasToWork = mapWork.getAliasToWork(); - if ((aliasToWork == null) || (aliasToWork.size() == 0)) { - return false; - } - int tableScanCount = 0; - for (Operator op : aliasToWork.values()) { - if (op == null) { - LOG.warn("Map work has invalid aliases to work with. Fail validation!"); - return false; - } - if (op instanceof TableScanOperator) { - tableScanCount++; - } - } - if (tableScanCount > 1) { - LOG.warn("Map work has more than 1 TableScanOperator aliases to work with. 
Fail validation!"); - return false; - } - - // Validate the input format - for (String path : mapWork.getPathToPartitionInfo().keySet()) { - PartitionDesc pd = mapWork.getPathToPartitionInfo().get(path); - List> interfaceList = - Arrays.asList(pd.getInputFileFormatClass().getInterfaces()); - if (!interfaceList.contains(VectorizedInputFormatInterface.class)) { - LOG.info("Input format: " + pd.getInputFileFormatClassName() - + ", doesn't provide vectorized input"); - return false; - } - } - Map opRules = new LinkedHashMap(); - MapWorkValidationNodeProcessor vnp = new MapWorkValidationNodeProcessor(mapWork, isTez); - addMapWorkRules(opRules, vnp); - Dispatcher disp = new DefaultRuleDispatcher(vnp, opRules, null); - GraphWalker ogw = new DefaultGraphWalker(disp); - - // iterator the mapper operator tree - ArrayList topNodes = new ArrayList(); - topNodes.addAll(mapWork.getAliasToWork().values()); - HashMap nodeOutput = new HashMap(); - ogw.startWalking(topNodes, nodeOutput); - for (Node n : nodeOutput.keySet()) { - if (nodeOutput.get(n) != null) { - if (!((Boolean)nodeOutput.get(n)).booleanValue()) { - return false; - } - } - } - return true; - } - - private void vectorizeMapWork(MapWork mapWork, boolean isTez) throws SemanticException { - LOG.info("Vectorizing MapWork..."); - mapWork.setVectorMode(true); - Map opRules = new LinkedHashMap(); - MapWorkVectorizationNodeProcessor vnp = new MapWorkVectorizationNodeProcessor(mapWork, isTez); - addMapWorkRules(opRules, vnp); - Dispatcher disp = new DefaultRuleDispatcher(vnp, opRules, null); - GraphWalker ogw = new PreOrderOnceWalker(disp); - // iterator the mapper operator tree - ArrayList topNodes = new ArrayList(); - topNodes.addAll(mapWork.getAliasToWork().values()); - HashMap nodeOutput = new HashMap(); - ogw.startWalking(topNodes, nodeOutput); - - mapWork.setVectorColumnNameMap(vnp.getVectorColumnNameMap()); - mapWork.setVectorColumnTypeMap(vnp.getVectorColumnTypeMap()); - mapWork.setVectorScratchColumnTypeMap(vnp.getVectorScratchColumnTypeMap()); - - if (LOG.isDebugEnabled()) { - debugDisplayAllMaps(mapWork); - } - - return; - } - - private void convertReduceWork(ReduceWork reduceWork, boolean isTez) throws SemanticException { - boolean ret = validateReduceWork(reduceWork); - if (ret) { - vectorizeReduceWork(reduceWork, isTez); - } - } - - private boolean getOnlyStructObjectInspectors(ReduceWork reduceWork) throws SemanticException { - try { - // Check key ObjectInspector. - ObjectInspector keyObjectInspector = reduceWork.getKeyObjectInspector(); - if (keyObjectInspector == null || !(keyObjectInspector instanceof StructObjectInspector)) { - return false; - } - StructObjectInspector keyStructObjectInspector = (StructObjectInspector)keyObjectInspector; - List keyFields = keyStructObjectInspector.getAllStructFieldRefs(); - - // Tez doesn't use tagging... - if (reduceWork.getNeedsTagging()) { - return false; - } - - // Check value ObjectInspector. - ObjectInspector valueObjectInspector = reduceWork.getValueObjectInspector(); - if (valueObjectInspector == null || - !(valueObjectInspector instanceof StructObjectInspector)) { - return false; - } - StructObjectInspector valueStructObjectInspector = (StructObjectInspector)valueObjectInspector; - List valueFields = valueStructObjectInspector.getAllStructFieldRefs(); - - reduceColumnNames = new ArrayList(); - reduceTypeInfos = new ArrayList(); - - for (StructField field: keyFields) { - reduceColumnNames.add(Utilities.ReduceField.KEY.toString() + "." 
+ field.getFieldName()); - reduceTypeInfos.add(TypeInfoUtils.getTypeInfoFromTypeString(field.getFieldObjectInspector().getTypeName())); - } - for (StructField field: valueFields) { - reduceColumnNames.add(Utilities.ReduceField.VALUE.toString() + "." + field.getFieldName()); - reduceTypeInfos.add(TypeInfoUtils.getTypeInfoFromTypeString(field.getFieldObjectInspector().getTypeName())); - } - } catch (Exception e) { - throw new SemanticException(e); - } - return true; - } - - private void addReduceWorkRules(Map opRules, NodeProcessor np) { - opRules.put(new RuleRegExp("R1", GroupByOperator.getOperatorName() + ".*"), np); - opRules.put(new RuleRegExp("R2", SelectOperator.getOperatorName() + ".*"), np); - } - - private boolean validateReduceWork(ReduceWork reduceWork) throws SemanticException { - LOG.info("Validating ReduceWork..."); - - // Validate input to ReduceWork. - if (!getOnlyStructObjectInspectors(reduceWork)) { - return false; - } - // Now check the reduce operator tree. - Map opRules = new LinkedHashMap(); - ReduceWorkValidationNodeProcessor vnp = new ReduceWorkValidationNodeProcessor(); - addReduceWorkRules(opRules, vnp); - Dispatcher disp = new DefaultRuleDispatcher(vnp, opRules, null); - GraphWalker ogw = new DefaultGraphWalker(disp); - // iterator the reduce operator tree - ArrayList topNodes = new ArrayList(); - topNodes.add(reduceWork.getReducer()); - HashMap nodeOutput = new HashMap(); - ogw.startWalking(topNodes, nodeOutput); - for (Node n : nodeOutput.keySet()) { - if (nodeOutput.get(n) != null) { - if (!((Boolean)nodeOutput.get(n)).booleanValue()) { - return false; - } - } - } - return true; - } - - private void vectorizeReduceWork(ReduceWork reduceWork, boolean isTez) throws SemanticException { - LOG.info("Vectorizing ReduceWork..."); - reduceWork.setVectorMode(true); - - // For some reason, the DefaultGraphWalker does not descend down from the reducer Operator as - // expected. We need to descend down, otherwise it breaks our algorithm that determines - // VectorizationContext... Do we use PreOrderWalker instead of DefaultGraphWalker. - Map opRules = new LinkedHashMap(); - ReduceWorkVectorizationNodeProcessor vnp = - new ReduceWorkVectorizationNodeProcessor(reduceColumnNames, reduceTypeInfos, isTez); - addReduceWorkRules(opRules, vnp); - Dispatcher disp = new DefaultRuleDispatcher(vnp, opRules, null); - GraphWalker ogw = new PreOrderWalker(disp); - // iterator the reduce operator tree - ArrayList topNodes = new ArrayList(); - topNodes.add(reduceWork.getReducer()); - LOG.info("vectorizeReduceWork reducer Operator: " + - reduceWork.getReducer().getName() + "..."); - HashMap nodeOutput = new HashMap(); - ogw.startWalking(topNodes, nodeOutput); - - // Necessary since we are vectorizing the root operator in reduce. - reduceWork.setReducer(vnp.getRootVectorOp()); - - reduceWork.setVectorColumnNameMap(vnp.getVectorColumnNameMap()); - reduceWork.setVectorColumnTypeMap(vnp.getVectorColumnTypeMap()); - reduceWork.setVectorScratchColumnTypeMap(vnp.getVectorScratchColumnTypeMap()); - - if (LOG.isDebugEnabled()) { - debugDisplayAllMaps(reduceWork); - } - } - } - - class MapWorkValidationNodeProcessor implements NodeProcessor { - - private final MapWork mapWork; - private final boolean isTez; - - public MapWorkValidationNodeProcessor(MapWork mapWork, boolean isTez) { - this.mapWork = mapWork; - this.isTez = isTez; - } - - @Override - public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, - Object... 
nodeOutputs) throws SemanticException { - for (Node n : stack) { - Operator op = (Operator) n; - if (nonVectorizableChildOfGroupBy(op)) { - return new Boolean(true); - } - boolean ret = validateMapWorkOperator(op, mapWork, isTez); - if (!ret) { - LOG.info("MapWork Operator: " + op.getName() + " could not be vectorized."); - return new Boolean(false); - } - } - return new Boolean(true); - } - } - - class ReduceWorkValidationNodeProcessor implements NodeProcessor { - - @Override - public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, - Object... nodeOutputs) throws SemanticException { - for (Node n : stack) { - Operator op = (Operator) n; - if (nonVectorizableChildOfGroupBy(op)) { - return new Boolean(true); - } - boolean ret = validateReduceWorkOperator(op); - if (!ret) { - LOG.info("ReduceWork Operator: " + op.getName() + " could not be vectorized."); - return new Boolean(false); - } - } - return new Boolean(true); - } - } - - // This class has common code used by both MapWorkVectorizationNodeProcessor and - // ReduceWorkVectorizationNodeProcessor. - class VectorizationNodeProcessor implements NodeProcessor { - - // The vectorization context for the Map or Reduce task. - protected VectorizationContext taskVectorizationContext; - - // The input projection column type name map for the Map or Reduce task. - protected Map taskColumnTypeNameMap; - - VectorizationNodeProcessor() { - taskColumnTypeNameMap = new HashMap(); - } - - public Map getVectorColumnNameMap() { - return taskVectorizationContext.getProjectionColumnMap(); - } - - public Map getVectorColumnTypeMap() { - return taskColumnTypeNameMap; - } - - public Map getVectorScratchColumnTypeMap() { - return taskVectorizationContext.getScratchColumnTypeMap(); - } - - protected final Set> opsDone = - new HashSet>(); - - protected final Map, Operator> opToVectorOpMap = - new HashMap, Operator>(); - - public VectorizationContext walkStackToFindVectorizationContext(Stack stack, - Operator op) throws SemanticException { - VectorizationContext vContext = null; - if (stack.size() <= 1) { - throw new SemanticException( - String.format("Expected operator stack for operator %s to have at least 2 operators", - op.getName())); - } - // Walk down the stack of operators until we found one willing to give us a context. 
- // At the bottom will be the root operator, guaranteed to have a context - int i= stack.size()-2; - while (vContext == null) { - if (i < 0) { - return null; - } - Operator opParent = (Operator) stack.get(i); - Operator vectorOpParent = opToVectorOpMap.get(opParent); - if (vectorOpParent != null) { - if (vectorOpParent instanceof VectorizationContextRegion) { - VectorizationContextRegion vcRegion = (VectorizationContextRegion) vectorOpParent; - vContext = vcRegion.getOuputVectorizationContext(); - LOG.info("walkStackToFindVectorizationContext " + vectorOpParent.getName() + " has new vectorization context " + vContext.toString()); - } else { - LOG.info("walkStackToFindVectorizationContext " + vectorOpParent.getName() + " does not have new vectorization context"); - } - } else { - LOG.info("walkStackToFindVectorizationContext " + opParent.getName() + " is not vectorized"); - } - --i; - } - return vContext; - } - - public Operator doVectorize(Operator op, - VectorizationContext vContext, boolean isTez) throws SemanticException { - Operator vectorOp = op; - try { - if (!opsDone.contains(op)) { - vectorOp = vectorizeOperator(op, vContext, isTez); - opsDone.add(op); - if (vectorOp != op) { - opToVectorOpMap.put(op, vectorOp); - opsDone.add(vectorOp); - } - } - } catch (HiveException e) { - throw new SemanticException(e); - } - return vectorOp; - } - - @Override - public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, - Object... nodeOutputs) throws SemanticException { - throw new SemanticException("Must be overridden"); - } - } - - class MapWorkVectorizationNodeProcessor extends VectorizationNodeProcessor { - - private final boolean isTez; - - public MapWorkVectorizationNodeProcessor(MapWork mWork, boolean isTez) { - super(); - this.isTez = isTez; - } - - @Override - public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, - Object... nodeOutputs) throws SemanticException { - - Operator op = (Operator) nd; - - VectorizationContext vContext = null; - - if (op instanceof TableScanOperator) { - if (taskVectorizationContext == null) { - taskVectorizationContext = getVectorizationContext(op.getSchema(), op.getName(), - taskColumnTypeNameMap); - } - vContext = taskVectorizationContext; - } else { - LOG.info("MapWorkVectorizationNodeProcessor process going to walk the operator stack to get vectorization context for " + op.getName()); - vContext = walkStackToFindVectorizationContext(stack, op); - if (vContext == null) { - // No operator has "pushed" a new context -- so use the task vectorization context. - vContext = taskVectorizationContext; - } - } - - assert vContext != null; - LOG.info("MapWorkVectorizationNodeProcessor process operator " + op.getName() + " using vectorization context" + vContext.toString()); - - // When Vectorized GROUPBY outputs rows instead of vectorized row batchs, we don't - // vectorize the operators below it. 
- if (nonVectorizableChildOfGroupBy(op)) { - // No need to vectorize - if (!opsDone.contains(op)) { - opsDone.add(op); - } - return null; - } - - Operator vectorOp = doVectorize(op, vContext, isTez); - - if (LOG.isDebugEnabled()) { - if (vectorOp instanceof VectorizationContextRegion) { - VectorizationContextRegion vcRegion = (VectorizationContextRegion) vectorOp; - VectorizationContext vNewContext = vcRegion.getOuputVectorizationContext(); - LOG.debug("Vectorized MapWork operator " + vectorOp.getName() + " added vectorization context " + vNewContext.toString()); - } - } - - return null; - } - } - - class ReduceWorkVectorizationNodeProcessor extends VectorizationNodeProcessor { - - private final List reduceColumnNames; - private final List reduceTypeInfos; - - private final boolean isTez; - - private Operator rootVectorOp; - - public Operator getRootVectorOp() { - return rootVectorOp; - } - - public ReduceWorkVectorizationNodeProcessor(List reduceColumnNames, - List reduceTypeInfos, boolean isTez) { - super(); - this.reduceColumnNames = reduceColumnNames; - this.reduceTypeInfos = reduceTypeInfos; - rootVectorOp = null; - this.isTez = isTez; - } - - @Override - public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, - Object... nodeOutputs) throws SemanticException { - - Operator op = (Operator) nd; - - VectorizationContext vContext = null; - - boolean saveRootVectorOp = false; - - if (op.getParentOperators().size() == 0) { - LOG.info("ReduceWorkVectorizationNodeProcessor process reduceColumnNames " + reduceColumnNames.toString()); - - vContext = new VectorizationContext("__Reduce_Shuffle__", reduceColumnNames); - taskVectorizationContext = vContext; - int i = 0; - for (TypeInfo typeInfo : reduceTypeInfos) { - taskColumnTypeNameMap.put(i, typeInfo.getTypeName()); - i++; - } - saveRootVectorOp = true; - - if (LOG.isDebugEnabled()) { - LOG.debug("Vectorized ReduceWork reduce shuffle vectorization context " + vContext.toString()); - } - } else { - LOG.info("ReduceWorkVectorizationNodeProcessor process going to walk the operator stack to get vectorization context for " + op.getName()); - vContext = walkStackToFindVectorizationContext(stack, op); - if (vContext == null) { - // If we didn't find a context among the operators, assume the top -- reduce shuffle's - // vectorization context. - vContext = taskVectorizationContext; - } - } - - assert vContext != null; - LOG.info("ReduceWorkVectorizationNodeProcessor process operator " + op.getName() + " using vectorization context" + vContext.toString()); - - // When Vectorized GROUPBY outputs rows instead of vectorized row batchs, we don't - // vectorize the operators below it. 
- if (nonVectorizableChildOfGroupBy(op)) { - // No need to vectorize - if (!opsDone.contains(op)) { - opsDone.add(op); - } - return null; - } - - Operator vectorOp = doVectorize(op, vContext, isTez); - - if (LOG.isDebugEnabled()) { - if (vectorOp instanceof VectorizationContextRegion) { - VectorizationContextRegion vcRegion = (VectorizationContextRegion) vectorOp; - VectorizationContext vNewContext = vcRegion.getOuputVectorizationContext(); - LOG.debug("Vectorized ReduceWork operator " + vectorOp.getName() + " added vectorization context " + vNewContext.toString()); - } - } - if (saveRootVectorOp && op != vectorOp) { - rootVectorOp = vectorOp; - } - - return null; - } - } - - private static class ValidatorVectorizationContext extends VectorizationContext { - private ValidatorVectorizationContext() { - super("No Name"); - } - - @Override - protected int getInputColumnIndex(String name) { - return 0; - } - - @Override - protected int getInputColumnIndex(ExprNodeColumnDesc colExpr) { - return 0; - } - } - - @Override - public PhysicalContext resolve(PhysicalContext physicalContext) throws SemanticException { - hiveConf = physicalContext.getConf(); - - boolean vectorPath = HiveConf.getBoolVar(hiveConf, - HiveConf.ConfVars.HIVE_VECTORIZATION_ENABLED); - if (!vectorPath) { - LOG.info("Vectorization is disabled"); - return physicalContext; - } - // create dispatcher and graph walker - Dispatcher disp = new VectorizationDispatcher(physicalContext); - TaskGraphWalker ogw = new TaskGraphWalker(disp); - - // get all the tasks nodes from root task - ArrayList topNodes = new ArrayList(); - topNodes.addAll(physicalContext.getRootTasks()); - - // begin to walk through the task tree. - ogw.startWalking(topNodes, null); - return physicalContext; - } - - boolean validateMapWorkOperator(Operator op, MapWork mWork, boolean isTez) { - boolean ret = false; - switch (op.getType()) { - case MAPJOIN: - if (op instanceof MapJoinOperator) { - ret = validateMapJoinOperator((MapJoinOperator) op); - } else if (op instanceof SMBMapJoinOperator) { - ret = validateSMBMapJoinOperator((SMBMapJoinOperator) op); - } - break; - case GROUPBY: - ret = validateGroupByOperator((GroupByOperator) op, false, isTez); - break; - case FILTER: - ret = validateFilterOperator((FilterOperator) op); - break; - case SELECT: - ret = validateSelectOperator((SelectOperator) op); - break; - case REDUCESINK: - ret = validateReduceSinkOperator((ReduceSinkOperator) op); - break; - case TABLESCAN: - ret = validateTableScanOperator((TableScanOperator) op, mWork); - break; - case FILESINK: - case LIMIT: - case EVENT: - case SPARKPRUNINGSINK: - ret = true; - break; - case HASHTABLESINK: - ret = op instanceof SparkHashTableSinkOperator && - validateSparkHashTableSinkOperator((SparkHashTableSinkOperator) op); - break; - default: - ret = false; - break; - } - return ret; - } - - boolean validateReduceWorkOperator(Operator op) { - boolean ret = false; - switch (op.getType()) { - case MAPJOIN: - // Does MAPJOIN actually get planned in Reduce? 
- if (op instanceof MapJoinOperator) { - ret = validateMapJoinOperator((MapJoinOperator) op); - } else if (op instanceof SMBMapJoinOperator) { - ret = validateSMBMapJoinOperator((SMBMapJoinOperator) op); - } - break; - case GROUPBY: - if (HiveConf.getBoolVar(hiveConf, - HiveConf.ConfVars.HIVE_VECTORIZATION_REDUCE_GROUPBY_ENABLED)) { - ret = validateGroupByOperator((GroupByOperator) op, true, true); - } else { - ret = false; - } - break; - case FILTER: - ret = validateFilterOperator((FilterOperator) op); - break; - case SELECT: - ret = validateSelectOperator((SelectOperator) op); - break; - case REDUCESINK: - ret = validateReduceSinkOperator((ReduceSinkOperator) op); - break; - case FILESINK: - ret = validateFileSinkOperator((FileSinkOperator) op); - break; - case LIMIT: - case EVENT: - case SPARKPRUNINGSINK: - ret = true; - break; - case HASHTABLESINK: - ret = op instanceof SparkHashTableSinkOperator && - validateSparkHashTableSinkOperator((SparkHashTableSinkOperator) op); - break; - default: - ret = false; - break; - } - return ret; - } - - public Boolean nonVectorizableChildOfGroupBy(Operator op) { - Operator currentOp = op; - while (currentOp.getParentOperators().size() > 0) { - currentOp = currentOp.getParentOperators().get(0); - if (currentOp.getType().equals(OperatorType.GROUPBY)) { - GroupByDesc desc = (GroupByDesc)currentOp.getConf(); - boolean isVectorOutput = desc.getVectorDesc().isVectorOutput(); - if (isVectorOutput) { - // This GROUP BY does vectorize its output. - return false; - } - return true; - } - } - return false; - } - - private boolean validateSMBMapJoinOperator(SMBMapJoinOperator op) { - SMBJoinDesc desc = op.getConf(); - // Validation is the same as for map join, since the 'small' tables are not vectorized - return validateMapJoinDesc(desc); - } - - private boolean validateTableScanOperator(TableScanOperator op, MapWork mWork) { - TableScanDesc desc = op.getConf(); - if (desc.isGatherStats()) { - return false; - } - - String columns = ""; - String types = ""; - String partitionColumns = ""; - String partitionTypes = ""; - boolean haveInfo = false; - - // This over-reaches slightly, since we can have > 1 table-scan per map-work. - // It needs path to partition, path to alias, then check the alias == the same table-scan, to be accurate. - // That said, that is a TODO item to be fixed when we support >1 TableScans per vectorized pipeline later. - LinkedHashMap partitionDescs = mWork.getPathToPartitionInfo(); - - // For vectorization, compare each partition information for against the others. - // We assume the table information will be from one of the partitions, so it will - // work to focus on the partition information and not compare against the TableScanOperator - // columns (in the VectorizationContext).... - for (Map.Entry entry : partitionDescs.entrySet()) { - PartitionDesc partDesc = entry.getValue(); - if (partDesc.getPartSpec() == null || partDesc.getPartSpec().isEmpty()) { - // No partition information -- we match because we would default to using the table description. 
- continue; - } - Properties partProps = partDesc.getProperties(); - if (!haveInfo) { - columns = partProps.getProperty(hive_metastoreConstants.META_TABLE_COLUMNS); - types = partProps.getProperty(hive_metastoreConstants.META_TABLE_COLUMN_TYPES); - partitionColumns = partProps.getProperty(hive_metastoreConstants.META_TABLE_PARTITION_COLUMNS); - partitionTypes = partProps.getProperty(hive_metastoreConstants.META_TABLE_PARTITION_COLUMN_TYPES); - haveInfo = true; - } else { - String nextColumns = partProps.getProperty(hive_metastoreConstants.META_TABLE_COLUMNS); - String nextTypes = partProps.getProperty(hive_metastoreConstants.META_TABLE_COLUMN_TYPES); - String nextPartitionColumns = partProps.getProperty(hive_metastoreConstants.META_TABLE_PARTITION_COLUMNS); - String nextPartitionTypes = partProps.getProperty(hive_metastoreConstants.META_TABLE_PARTITION_COLUMN_TYPES); - if (!columns.equalsIgnoreCase(nextColumns)) { - LOG.info( - String.format("Could not vectorize partition %s. Its column names %s do not match the other column names %s", - entry.getKey(), nextColumns, columns)); - return false; - } - if (!types.equalsIgnoreCase(nextTypes)) { - LOG.info( - String.format("Could not vectorize partition %s. Its column types %s do not match the other column types %s", - entry.getKey(), nextTypes, types)); - return false; - } - if (!partitionColumns.equalsIgnoreCase(nextPartitionColumns)) { - LOG.info( - String.format("Could not vectorize partition %s. Its partition column names %s do not match the other partition column names %s", - entry.getKey(), nextPartitionColumns, partitionColumns)); - return false; - } - if (!partitionTypes.equalsIgnoreCase(nextPartitionTypes)) { - LOG.info( - String.format("Could not vectorize partition %s. Its partition column types %s do not match the other partition column types %s", - entry.getKey(), nextPartitionTypes, partitionTypes)); - return false; - } - } - } - return true; - } - - private boolean validateMapJoinOperator(MapJoinOperator op) { - MapJoinDesc desc = op.getConf(); - return validateMapJoinDesc(desc); - } - - private boolean validateMapJoinDesc(MapJoinDesc desc) { - byte posBigTable = (byte) desc.getPosBigTable(); - List filterExprs = desc.getFilters().get(posBigTable); - if (!validateExprNodeDesc(filterExprs, VectorExpressionDescriptor.Mode.FILTER)) { - LOG.info("Cannot vectorize map work filter expression"); - return false; - } - List keyExprs = desc.getKeys().get(posBigTable); - if (!validateExprNodeDesc(keyExprs)) { - LOG.info("Cannot vectorize map work key expression"); - return false; - } - List valueExprs = desc.getExprs().get(posBigTable); - if (!validateExprNodeDesc(valueExprs)) { - LOG.info("Cannot vectorize map work value expression"); - return false; - } - return true; - } - - private boolean validateSparkHashTableSinkOperator(SparkHashTableSinkOperator op) { - SparkHashTableSinkDesc desc = op.getConf(); - byte tag = desc.getTag(); - // it's essentially a MapJoinDesc - List filterExprs = desc.getFilters().get(tag); - List keyExprs = desc.getKeys().get(tag); - List valueExprs = desc.getExprs().get(tag); - return validateExprNodeDesc(filterExprs, VectorExpressionDescriptor.Mode.FILTER) && - validateExprNodeDesc(keyExprs) && validateExprNodeDesc(valueExprs); - } - - private boolean validateReduceSinkOperator(ReduceSinkOperator op) { - List keyDescs = op.getConf().getKeyCols(); - List partitionDescs = op.getConf().getPartitionCols(); - List valueDesc = op.getConf().getValueCols(); - return validateExprNodeDesc(keyDescs) && 
validateExprNodeDesc(partitionDescs) && - validateExprNodeDesc(valueDesc); - } - - private boolean validateSelectOperator(SelectOperator op) { - List descList = op.getConf().getColList(); - for (ExprNodeDesc desc : descList) { - boolean ret = validateExprNodeDesc(desc); - if (!ret) { - LOG.info("Cannot vectorize select expression: " + desc.toString()); - return false; - } - } - return true; - } - - private boolean validateFilterOperator(FilterOperator op) { - ExprNodeDesc desc = op.getConf().getPredicate(); - return validateExprNodeDesc(desc, VectorExpressionDescriptor.Mode.FILTER); - } - - private boolean validateGroupByOperator(GroupByOperator op, boolean isReduce, boolean isTez) { - GroupByDesc desc = op.getConf(); - VectorGroupByDesc vectorDesc = desc.getVectorDesc(); - - if (desc.isGroupingSetsPresent()) { - LOG.info("Grouping sets not supported in vector mode"); - return false; - } - if (desc.pruneGroupingSetId()) { - LOG.info("Pruning grouping set id not supported in vector mode"); - return false; - } - boolean ret = validateExprNodeDesc(desc.getKeys()); - if (!ret) { - LOG.info("Cannot vectorize groupby key expression"); - return false; - } - - if (!isReduce) { - - // MapWork - - ret = validateHashAggregationDesc(desc.getAggregators()); - if (!ret) { - return false; - } - } else { - - // ReduceWork - - boolean isComplete = desc.getMode() == GroupByDesc.Mode.COMPLETE; - if (desc.getMode() != GroupByDesc.Mode.HASH) { - - // Reduce Merge-Partial GROUP BY. - - // A merge-partial GROUP BY is fed by grouping by keys from reduce-shuffle. It is the - // first (or root) operator for its reduce task. - // TODO: Technically, we should also handle FINAL, PARTIAL1, PARTIAL2 and PARTIALS - // that are not hash or complete, but aren't merge-partial, somehow. - - if (desc.isDistinct()) { - LOG.info("Vectorized Reduce MergePartial GROUP BY does not support DISTINCT"); - return false; - } - - boolean hasKeys = (desc.getKeys().size() > 0); - - // Do we support merge-partial aggregation AND the output is primitive? - ret = validateReduceMergePartialAggregationDesc(desc.getAggregators(), hasKeys); - if (!ret) { - return false; - } - - if (hasKeys) { - if (op.getParentOperators().size() > 0 && !isComplete) { - LOG.info("Vectorized Reduce MergePartial GROUP BY keys can only handle a key group when it is fed by reduce-shuffle"); - return false; - } - - LOG.info("Vectorized Reduce MergePartial GROUP BY will process key groups"); - - // Primitive output validation above means we can output VectorizedRowBatch to the - // children operators. - vectorDesc.setVectorOutput(true); - } else { - LOG.info("Vectorized Reduce MergePartial GROUP BY will do global aggregation"); - } - if (!isComplete) { - vectorDesc.setIsReduceMergePartial(true); - } else { - vectorDesc.setIsReduceStreaming(true); - } - } else { - - // Reduce Hash GROUP BY or global aggregation. 
- - ret = validateHashAggregationDesc(desc.getAggregators()); - if (!ret) { - return false; - } - } - } - - return true; - } - - private boolean validateFileSinkOperator(FileSinkOperator op) { - return true; - } - - private boolean validateExprNodeDesc(List descs) { - return validateExprNodeDesc(descs, VectorExpressionDescriptor.Mode.PROJECTION); - } - - private boolean validateExprNodeDesc(List descs, - VectorExpressionDescriptor.Mode mode) { - for (ExprNodeDesc d : descs) { - boolean ret = validateExprNodeDesc(d, mode); - if (!ret) { - return false; - } - } - return true; - } - - - private boolean validateHashAggregationDesc(List descs) { - return validateAggregationDesc(descs, /* isReduceMergePartial */ false, false); - } - - private boolean validateReduceMergePartialAggregationDesc(List descs, boolean hasKeys) { - return validateAggregationDesc(descs, /* isReduceMergePartial */ true, hasKeys); - } - - private boolean validateAggregationDesc(List descs, boolean isReduceMergePartial, boolean hasKeys) { - for (AggregationDesc d : descs) { - boolean ret = validateAggregationDesc(d, isReduceMergePartial, hasKeys); - if (!ret) { - return false; - } - } - return true; - } - - private boolean validateExprNodeDescRecursive(ExprNodeDesc desc, VectorExpressionDescriptor.Mode mode) { - if (desc instanceof ExprNodeColumnDesc) { - ExprNodeColumnDesc c = (ExprNodeColumnDesc) desc; - // Currently, we do not support vectorized virtual columns (see HIVE-5570). - if (VirtualColumn.VIRTUAL_COLUMN_NAMES.contains(c.getColumn())) { - LOG.info("Cannot vectorize virtual column " + c.getColumn()); - return false; - } - } - String typeName = desc.getTypeInfo().getTypeName(); - boolean ret = validateDataType(typeName, mode); - if (!ret) { - LOG.info("Cannot vectorize " + desc.toString() + " of type " + typeName); - return false; - } - if (desc instanceof ExprNodeGenericFuncDesc) { - ExprNodeGenericFuncDesc d = (ExprNodeGenericFuncDesc) desc; - boolean r = validateGenericUdf(d); - if (!r) { - LOG.info("Cannot vectorize UDF " + d); - return false; - } - } - if (desc.getChildren() != null) { - for (ExprNodeDesc d: desc.getChildren()) { - // Don't restrict child expressions for projection. Always use looser FILTER mode. - boolean r = validateExprNodeDescRecursive(d, VectorExpressionDescriptor.Mode.FILTER); - if (!r) { - return false; - } - } - } - return true; - } - - private boolean validateExprNodeDesc(ExprNodeDesc desc) { - return validateExprNodeDesc(desc, VectorExpressionDescriptor.Mode.PROJECTION); - } - - boolean validateExprNodeDesc(ExprNodeDesc desc, VectorExpressionDescriptor.Mode mode) { - if (!validateExprNodeDescRecursive(desc, mode)) { - return false; - } - try { - VectorizationContext vc = new ValidatorVectorizationContext(); - if (vc.getVectorExpression(desc, mode) == null) { - // TODO: this cannot happen - VectorizationContext throws in such cases. 
- LOG.info("getVectorExpression returned null"); - return false; - } - } catch (Exception e) { - LOG.info("Failed to vectorize", e); - return false; - } - return true; - } - - private boolean validateGenericUdf(ExprNodeGenericFuncDesc genericUDFExpr) { - if (VectorizationContext.isCustomUDF(genericUDFExpr)) { - return true; - } - GenericUDF genericUDF = genericUDFExpr.getGenericUDF(); - if (genericUDF instanceof GenericUDFBridge) { - Class udf = ((GenericUDFBridge) genericUDF).getUdfClass(); - return supportedGenericUDFs.contains(udf); - } else { - return supportedGenericUDFs.contains(genericUDF.getClass()); - } - } - - private boolean validateAggregationIsPrimitive(VectorAggregateExpression vectorAggrExpr) { - ObjectInspector outputObjInspector = vectorAggrExpr.getOutputObjectInspector(); - return (outputObjInspector.getCategory() == ObjectInspector.Category.PRIMITIVE); - } - - private boolean validateAggregationDesc(AggregationDesc aggDesc, boolean isReduceMergePartial, - boolean hasKeys) { - - String udfName = aggDesc.getGenericUDAFName().toLowerCase(); - if (!supportedAggregationUdfs.contains(udfName)) { - LOG.info("Cannot vectorize groupby aggregate expression: UDF " + udfName + " not supported"); - return false; - } - if (aggDesc.getParameters() != null && !validateExprNodeDesc(aggDesc.getParameters())) { - LOG.info("Cannot vectorize groupby aggregate expression: UDF parameters not supported"); - return false; - } - - // See if we can vectorize the aggregation. - VectorizationContext vc = new ValidatorVectorizationContext(); - VectorAggregateExpression vectorAggrExpr; - try { - vectorAggrExpr = vc.getAggregatorExpression(aggDesc, isReduceMergePartial); - } catch (Exception e) { - // We should have already attempted to vectorize in validateAggregationDesc. - LOG.info("Vectorization of aggreation should have succeeded ", e); - return false; - } - - if (isReduceMergePartial && hasKeys && !validateAggregationIsPrimitive(vectorAggrExpr)) { - LOG.info("Vectorized Reduce MergePartial GROUP BY keys can only handle aggregate outputs that are primitive types"); - return false; - } - - return true; - } - - private boolean validateDataType(String type, VectorExpressionDescriptor.Mode mode) { - type = type.toLowerCase(); - boolean result = supportedDataTypesPattern.matcher(type).matches(); - if (result && mode == VectorExpressionDescriptor.Mode.PROJECTION && type.equals("void")) { - return false; - } - return result; - } - - private VectorizationContext getVectorizationContext(RowSchema rowSchema, String contextName, - Map typeNameMap) { - - VectorizationContext vContext = new VectorizationContext(contextName); - - // Add all non-virtual columns to make a vectorization context for - // the TableScan operator. - int i = 0; - for (ColumnInfo c : rowSchema.getSignature()) { - // Earlier, validation code should have eliminated virtual columns usage (HIVE-5560). 
- if (!isVirtualColumn(c)) { - vContext.addInitialColumn(c.getInternalName()); - typeNameMap.put(i, c.getTypeName()); - i++; - } - } - vContext.finishedAddingInitialColumns(); - - return vContext; - } - - private void fixupParentChildOperators(Operator op, - Operator vectorOp) { - if (op.getParentOperators() != null) { - vectorOp.setParentOperators(op.getParentOperators()); - for (Operator p : op.getParentOperators()) { - p.replaceChild(op, vectorOp); - } - } - if (op.getChildOperators() != null) { - vectorOp.setChildOperators(op.getChildOperators()); - for (Operator c : op.getChildOperators()) { - c.replaceParent(op, vectorOp); - } - } - } - - private boolean isBigTableOnlyResults(MapJoinDesc desc) { - Byte[] order = desc.getTagOrder(); - byte posBigTable = (byte) desc.getPosBigTable(); - Byte posSingleVectorMapJoinSmallTable = (order[0] == posBigTable ? order[1] : order[0]); - - int[] smallTableIndices; - int smallTableIndicesSize; - if (desc.getValueIndices() != null && desc.getValueIndices().get(posSingleVectorMapJoinSmallTable) != null) { - smallTableIndices = desc.getValueIndices().get(posSingleVectorMapJoinSmallTable); - LOG.info("Vectorizer isBigTableOnlyResults smallTableIndices " + Arrays.toString(smallTableIndices)); - smallTableIndicesSize = smallTableIndices.length; - } else { - smallTableIndices = null; - LOG.info("Vectorizer isBigTableOnlyResults smallTableIndices EMPTY"); - smallTableIndicesSize = 0; - } - - List smallTableRetainList = desc.getRetainList().get(posSingleVectorMapJoinSmallTable); - LOG.info("Vectorizer isBigTableOnlyResults smallTableRetainList " + smallTableRetainList); - int smallTableRetainSize = smallTableRetainList.size(); - - if (smallTableIndicesSize > 0) { - // Small table indices has priority over retain. - for (int i = 0; i < smallTableIndicesSize; i++) { - if (smallTableIndices[i] < 0) { - // Negative numbers indicate a column to be (deserialize) read from the small table's - // LazyBinary value row. - LOG.info("Vectorizer isBigTableOnlyResults smallTableIndices[i] < 0 returning false"); - return false; - } - } - } else if (smallTableRetainSize > 0) { - LOG.info("Vectorizer isBigTableOnlyResults smallTableRetainSize > 0 returning false"); - return false; - } - - LOG.info("Vectorizer isBigTableOnlyResults returning true"); - return true; - } - - Operator specializeMapJoinOperator(Operator op, - VectorizationContext vContext, MapJoinDesc desc) throws HiveException { - Operator vectorOp = null; - Class> opClass = null; - - VectorMapJoinDesc.HashTableImplementationType hashTableImplementationType = HashTableImplementationType.NONE; - VectorMapJoinDesc.HashTableKind hashTableKind = HashTableKind.NONE; - VectorMapJoinDesc.HashTableKeyType hashTableKeyType = HashTableKeyType.NONE; - - if (HiveConf.getBoolVar(hiveConf, - HiveConf.ConfVars.HIVE_VECTORIZATION_MAPJOIN_NATIVE_FAST_HASHTABLE_ENABLED)) { - hashTableImplementationType = HashTableImplementationType.FAST; - } else { - // Restrict to using BytesBytesMultiHashMap via MapJoinBytesTableContainer or - // HybridHashTableContainer. - hashTableImplementationType = HashTableImplementationType.OPTIMIZED; - } - - int joinType = desc.getConds()[0].getType(); - - boolean isInnerBigOnly = false; - if (joinType == JoinDesc.INNER_JOIN && isBigTableOnlyResults(desc)) { - isInnerBigOnly = true; - } - - // By default, we can always use the multi-key class. 
- hashTableKeyType = HashTableKeyType.MULTI_KEY; - - if (!HiveConf.getBoolVar(hiveConf, - HiveConf.ConfVars.HIVE_VECTORIZATION_MAPJOIN_NATIVE_MULTIKEY_ONLY_ENABLED)) { - - // Look for single column optimization. - byte posBigTable = (byte) desc.getPosBigTable(); - Map> keyExprs = desc.getKeys(); - List bigTableKeyExprs = keyExprs.get(posBigTable); - if (bigTableKeyExprs.size() == 1) { - String typeName = bigTableKeyExprs.get(0).getTypeString(); - LOG.info("Vectorizer vectorizeOperator map join typeName " + typeName); - if (typeName.equals("boolean")) { - hashTableKeyType = HashTableKeyType.BOOLEAN; - } else if (typeName.equals("tinyint")) { - hashTableKeyType = HashTableKeyType.BYTE; - } else if (typeName.equals("smallint")) { - hashTableKeyType = HashTableKeyType.SHORT; - } else if (typeName.equals("int")) { - hashTableKeyType = HashTableKeyType.INT; - } else if (typeName.equals("bigint") || typeName.equals("long")) { - hashTableKeyType = HashTableKeyType.LONG; - } else if (VectorizationContext.isStringFamily(typeName)) { - hashTableKeyType = HashTableKeyType.STRING; - } - } - } - - switch (joinType) { - case JoinDesc.INNER_JOIN: - if (!isInnerBigOnly) { - hashTableKind = HashTableKind.HASH_MAP; - } else { - hashTableKind = HashTableKind.HASH_MULTISET; - } - break; - case JoinDesc.LEFT_OUTER_JOIN: - case JoinDesc.RIGHT_OUTER_JOIN: - hashTableKind = HashTableKind.HASH_MAP; - break; - case JoinDesc.LEFT_SEMI_JOIN: - hashTableKind = HashTableKind.HASH_SET; - break; - default: - throw new HiveException("Unknown join type " + joinType); - } - - LOG.info("Vectorizer vectorizeOperator map join hashTableKind " + hashTableKind.name() + " hashTableKeyType " + hashTableKeyType.name()); - - switch (hashTableKeyType) { - case BOOLEAN: - case BYTE: - case SHORT: - case INT: - case LONG: - switch (joinType) { - case JoinDesc.INNER_JOIN: - if (!isInnerBigOnly) { - opClass = VectorMapJoinInnerLongOperator.class; - } else { - opClass = VectorMapJoinInnerBigOnlyLongOperator.class; - } - break; - case JoinDesc.LEFT_OUTER_JOIN: - case JoinDesc.RIGHT_OUTER_JOIN: - opClass = VectorMapJoinOuterLongOperator.class; - break; - case JoinDesc.LEFT_SEMI_JOIN: - opClass = VectorMapJoinLeftSemiLongOperator.class; - break; - default: - throw new HiveException("Unknown join type " + joinType); - } - break; - case STRING: - switch (joinType) { - case JoinDesc.INNER_JOIN: - if (!isInnerBigOnly) { - opClass = VectorMapJoinInnerStringOperator.class; - } else { - opClass = VectorMapJoinInnerBigOnlyStringOperator.class; - } - break; - case JoinDesc.LEFT_OUTER_JOIN: - case JoinDesc.RIGHT_OUTER_JOIN: - opClass = VectorMapJoinOuterStringOperator.class; - break; - case JoinDesc.LEFT_SEMI_JOIN: - opClass = VectorMapJoinLeftSemiStringOperator.class; - break; - default: - throw new HiveException("Unknown join type " + joinType); - } - break; - case MULTI_KEY: - switch (joinType) { - case JoinDesc.INNER_JOIN: - if (!isInnerBigOnly) { - opClass = VectorMapJoinInnerMultiKeyOperator.class; - } else { - opClass = VectorMapJoinInnerBigOnlyMultiKeyOperator.class; - } - break; - case JoinDesc.LEFT_OUTER_JOIN: - case JoinDesc.RIGHT_OUTER_JOIN: - opClass = VectorMapJoinOuterMultiKeyOperator.class; - break; - case JoinDesc.LEFT_SEMI_JOIN: - opClass = VectorMapJoinLeftSemiMultiKeyOperator.class; - break; - default: - throw new HiveException("Unknown join type " + joinType); - } - break; - } - - vectorOp = OperatorFactory.getVectorOperator(opClass, op.getConf(), vContext); - LOG.info("Vectorizer vectorizeOperator map join class " + 
vectorOp.getClass().getSimpleName()); - - boolean minMaxEnabled = HiveConf.getBoolVar(hiveConf, - HiveConf.ConfVars.HIVE_VECTORIZATION_MAPJOIN_NATIVE_MINMAX_ENABLED); - - VectorMapJoinDesc vectorDesc = desc.getVectorDesc(); - vectorDesc.setHashTableImplementationType(hashTableImplementationType); - vectorDesc.setHashTableKind(hashTableKind); - vectorDesc.setHashTableKeyType(hashTableKeyType); - vectorDesc.setMinMaxEnabled(minMaxEnabled); - return vectorOp; - } - - private boolean onExpressionHasNullSafes(MapJoinDesc desc) { - boolean[] nullSafes = desc.getNullSafes(); - for (boolean nullSafe : nullSafes) { - if (nullSafe) { - return true; - } - } - return false; - } - - private boolean canSpecializeMapJoin(Operator op, MapJoinDesc desc, - boolean isTez) { - - boolean specialize = false; - - if (op instanceof MapJoinOperator && - HiveConf.getBoolVar(hiveConf, - HiveConf.ConfVars.HIVE_VECTORIZATION_MAPJOIN_NATIVE_ENABLED)) { - - // Currently, only under Tez and non-N-way joins. - if (isTez && desc.getConds().length == 1 && !onExpressionHasNullSafes(desc)) { - - // Ok, all basic restrictions satisfied so far... - specialize = true; - - if (!HiveConf.getBoolVar(hiveConf, - HiveConf.ConfVars.HIVE_VECTORIZATION_MAPJOIN_NATIVE_FAST_HASHTABLE_ENABLED)) { - - // We are using the optimized hash table we have further - // restrictions (using optimized and key type). - - if (!HiveConf.getBoolVar(hiveConf, - HiveConf.ConfVars.HIVEMAPJOINUSEOPTIMIZEDTABLE)) { - specialize = false; - } else { - byte posBigTable = (byte) desc.getPosBigTable(); - Map> keyExprs = desc.getKeys(); - List bigTableKeyExprs = keyExprs.get(posBigTable); - for (ExprNodeDesc exprNodeDesc : bigTableKeyExprs) { - String typeName = exprNodeDesc.getTypeString(); - if (!MapJoinKey.isSupportedField(typeName)) { - specialize = false; - break; - } - } - } - } else { - - // With the fast hash table implementation, we currently do not support - // Hybrid Grace Hash Join. - - if (HiveConf.getBoolVar(hiveConf, - HiveConf.ConfVars.HIVEUSEHYBRIDGRACEHASHJOIN)) { - specialize = false; - } - } - } - } - return specialize; - } - - Operator vectorizeOperator(Operator op, - VectorizationContext vContext, boolean isTez) throws HiveException { - Operator vectorOp = null; - - switch (op.getType()) { - case MAPJOIN: - { - MapJoinDesc desc = (MapJoinDesc) op.getConf(); - boolean specialize = canSpecializeMapJoin(op, desc, isTez); - - if (!specialize) { - - Class> opClass = null; - if (op instanceof MapJoinOperator) { - - // *NON-NATIVE* vector map differences for LEFT OUTER JOIN and Filtered... - - List bigTableFilters = desc.getFilters().get((byte) desc.getPosBigTable()); - boolean isOuterAndFiltered = (!desc.isNoOuterJoin() && bigTableFilters.size() > 0); - if (!isOuterAndFiltered) { - opClass = VectorMapJoinOperator.class; - } else { - opClass = VectorMapJoinOuterFilteredOperator.class; - } - } else if (op instanceof SMBMapJoinOperator) { - opClass = VectorSMBMapJoinOperator.class; - } - - vectorOp = OperatorFactory.getVectorOperator(opClass, op.getConf(), vContext); - - } else { - - // TEMPORARY Until Native Vector Map Join with Hybrid passes tests... 
- // HiveConf.setBoolVar(physicalContext.getConf(), - // HiveConf.ConfVars.HIVEUSEHYBRIDGRACEHASHJOIN, false); - - vectorOp = specializeMapJoinOperator(op, vContext, desc); - } - } - break; - case GROUPBY: - case FILTER: - case SELECT: - case FILESINK: - case REDUCESINK: - case LIMIT: - case EXTRACT: - case EVENT: - case HASHTABLESINK: - vectorOp = OperatorFactory.getVectorOperator(op.getConf(), vContext); - break; - default: - vectorOp = op; - break; - } - - LOG.info("vectorizeOperator " + (vectorOp == null ? "NULL" : vectorOp.getClass().getName())); - LOG.info("vectorizeOperator " + (vectorOp == null || vectorOp.getConf() == null ? "NULL" : vectorOp.getConf().getClass().getName())); - - if (vectorOp != op) { - fixupParentChildOperators(op, vectorOp); - ((AbstractOperatorDesc) vectorOp.getConf()).setVectorMode(true); - } - return vectorOp; - } - - private boolean isVirtualColumn(ColumnInfo column) { - - // Not using method column.getIsVirtualCol() because partitioning columns are also - // treated as virtual columns in ColumnInfo. - if (VirtualColumn.VIRTUAL_COLUMN_NAMES.contains(column.getInternalName())) { - return true; - } - return false; - } - - public void debugDisplayAllMaps(BaseWork work) { - - Map columnNameMap = work.getVectorColumnNameMap(); - Map columnTypeMap = work.getVectorColumnTypeMap(); - Map scratchColumnTypeMap = work.getVectorScratchColumnTypeMap(); - - LOG.debug("debugDisplayAllMaps columnNameMap " + columnNameMap.toString()); - LOG.debug("debugDisplayAllMaps columnTypeMap " + columnTypeMap.toString()); - LOG.debug("debugDisplayAllMaps scratchColumnTypeMap " + scratchColumnTypeMap.toString()); - } -} diff --git ql/src/java/org/apache/hadoop/hive/ql/plan/BaseWork.java ql/src/java/org/apache/hadoop/hive/ql/plan/BaseWork.java index d574c5c..f6c5df5 100644 --- ql/src/java/org/apache/hadoop/hive/ql/plan/BaseWork.java +++ ql/src/java/org/apache/hadoop/hive/ql/plan/BaseWork.java @@ -30,8 +30,10 @@ import org.apache.commons.logging.LogFactory; import org.apache.hadoop.hive.ql.exec.HashTableDummyOperator; import org.apache.hadoop.hive.ql.exec.Operator; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatchCtx; import org.apache.hadoop.mapred.JobConf; import org.apache.hadoop.hive.ql.plan.Explain.Level; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; /** @@ -64,9 +66,7 @@ public BaseWork(String name) { // Vectorization. 
- protected Map vectorColumnNameMap; - protected Map vectorColumnTypeMap; - protected Map vectorScratchColumnTypeMap; + protected VectorizedRowBatchCtx vectorizedRowBatchCtx; public void setGatheringStats(boolean gatherStats) { this.gatheringStats = gatherStats; @@ -152,29 +152,17 @@ public void addDummyOp(HashTableDummyOperator dummyOp) { return returnSet; } - public Map getVectorColumnNameMap() { - return vectorColumnNameMap; - } - - public void setVectorColumnNameMap(Map vectorColumnNameMap) { - this.vectorColumnNameMap = vectorColumnNameMap; - } + // ----------------------------------------------------------------------------------------------- - public Map getVectorColumnTypeMap() { - return vectorColumnTypeMap; + public VectorizedRowBatchCtx getVectorizedRowBatchCtx() { + return vectorizedRowBatchCtx; } - public void setVectorColumnTypeMap(Map vectorColumnTypeMap) { - this.vectorColumnTypeMap = vectorColumnTypeMap; + public void setVectorizedRowBatchCtx(VectorizedRowBatchCtx vectorizedRowBatchCtx) { + this.vectorizedRowBatchCtx = vectorizedRowBatchCtx; } - public Map getVectorScratchColumnTypeMap() { - return vectorScratchColumnTypeMap; - } - - public void setVectorScratchColumnTypeMap(Map vectorScratchColumnTypeMap) { - this.vectorScratchColumnTypeMap = vectorScratchColumnTypeMap; - } + // ----------------------------------------------------------------------------------------------- /** * @return the mapredLocalWork diff --git ql/src/java/org/apache/hadoop/hive/ql/plan/PartitionDesc.java ql/src/java/org/apache/hadoop/hive/ql/plan/PartitionDesc.java index 864301c..b032349 100644 --- ql/src/java/org/apache/hadoop/hive/ql/plan/PartitionDesc.java +++ ql/src/java/org/apache/hadoop/hive/ql/plan/PartitionDesc.java @@ -68,11 +68,13 @@ private String baseFileName; + private VectorPartitionDesc vectorPartitionDesc; + public void setBaseFileName(String baseFileName) { this.baseFileName = baseFileName; } - public PartitionDesc() { + public PartitionDesc() { } public PartitionDesc(final TableDesc table, final LinkedHashMap partSpec) { @@ -271,6 +273,9 @@ public PartitionDesc clone() { ret.partSpec = new java.util.LinkedHashMap(); ret.partSpec.putAll(partSpec); } + if (vectorPartitionDesc != null) { + ret.vectorPartitionDesc = vectorPartitionDesc.clone(); + } return ret; } @@ -300,4 +305,12 @@ public void deriveBaseFileName(String path) { public void intern(Interner interner) { this.tableDesc = interner.intern(tableDesc); } + + public void setVectorPartitionDesc(VectorPartitionDesc vectorPartitionDesc) { + this.vectorPartitionDesc = vectorPartitionDesc; + } + + public VectorPartitionDesc getVectorPartitionDesc() { + return vectorPartitionDesc; + } } diff --git ql/src/java/org/apache/hadoop/hive/ql/plan/VectorPartitionConversion.java ql/src/java/org/apache/hadoop/hive/ql/plan/VectorPartitionConversion.java new file mode 100644 index 0000000..50153b8 --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/plan/VectorPartitionConversion.java @@ -0,0 +1,177 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.plan;
+
+import java.util.HashMap;
+import java.util.List;
+
+import org.apache.commons.lang.ArrayUtils;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category;
+import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory;
+import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
+
+/**
+ * VectorPartitionConversion.
+ *
+ */
+public class VectorPartitionConversion {
+
+  private static long serialVersionUID = 1L;
+
+  private boolean validConversion;
+  private boolean[] resultConversionFlags;
+
+  private TypeInfo invalidFromTypeInfo;
+  private TypeInfo invalidToTypeInfo;
+
+  public boolean getValidConversion() {
+    return validConversion;
+  }
+
+  public boolean[] getResultConversionFlags() {
+    return resultConversionFlags;
+  }
+
+  public TypeInfo getInvalidFromTypeInfo() {
+    return invalidFromTypeInfo;
+  }
+
+  public TypeInfo getInvalidToTypeInfo() {
+    return invalidToTypeInfo;
+  }
+
+  // Currently, we only support these no-precision-loss or promotion data type conversions:
+  //
+  //    Short -> Int        IMPLICIT WITH VECTORIZATION
+  //    Short -> BigInt     IMPLICIT WITH VECTORIZATION
+  //    Int --> BigInt      IMPLICIT WITH VECTORIZATION
+  //
+  // CONSIDER ADDING:
+  //    Float -> Double     IMPLICIT WITH VECTORIZATION
+  //    (Char | VarChar) -> String   IMPLICIT WITH VECTORIZATION
+  //
+  private static HashMap<PrimitiveCategory, PrimitiveCategory[]> validFromPrimitiveMap =
+      new HashMap<PrimitiveCategory, PrimitiveCategory[]>();
+  static {
+    validFromPrimitiveMap.put(
+        PrimitiveCategory.SHORT,
+        new PrimitiveCategory[] { PrimitiveCategory.INT, PrimitiveCategory.LONG });
+    validFromPrimitiveMap.put(
+        PrimitiveCategory.INT,
+        new PrimitiveCategory[] { PrimitiveCategory.LONG });
+/*
+    validFromPrimitiveMap.put(
+        PrimitiveCategory.FLOAT,
+        new PrimitiveCategory[] { PrimitiveCategory.DOUBLE } );
+    validFromPrimitiveMap.put(
+        PrimitiveCategory.CHAR,
+        new PrimitiveCategory[] { PrimitiveCategory.STRING } );
+    validFromPrimitiveMap.put(
+        PrimitiveCategory.VARCHAR,
+        new PrimitiveCategory[] { PrimitiveCategory.STRING } );
+*/
+  }
+
+  private boolean validateOne(TypeInfo fromTypeInfo, TypeInfo toTypeInfo) {
+
+    if (fromTypeInfo.equals(toTypeInfo)) {
+      return false;
+    }
+
+    if (fromTypeInfo.getCategory() == Category.PRIMITIVE &&
+        toTypeInfo.getCategory() == Category.PRIMITIVE) {
+
+      PrimitiveCategory fromPrimitiveCategory = ((PrimitiveTypeInfo) fromTypeInfo).getPrimitiveCategory();
+      PrimitiveCategory toPrimitiveCategory = ((PrimitiveTypeInfo) toTypeInfo).getPrimitiveCategory();
+
+      PrimitiveCategory[] toPrimitiveCategories =
+          validFromPrimitiveMap.get(fromPrimitiveCategory);
+      if (toPrimitiveCategories == null ||
+          !ArrayUtils.contains(toPrimitiveCategories, toPrimitiveCategory)) {
+        invalidFromTypeInfo = fromTypeInfo;
+        invalidToTypeInfo = toTypeInfo;
+
+        // Tell caller a bad one was found.
+        validConversion = false;
+        return false;
+      }
+    } else {
+      // Ignore checking complex types.  Assume they will not be included in the query.
+    }
+
+    return true;
+  }
+
+  public void validateConversion(List<TypeInfo> fromTypeInfoList,
+      List<TypeInfo> toTypeInfoList) {
+
+    final int columnCount = fromTypeInfoList.size();
+    resultConversionFlags = new boolean[columnCount];
+
+    // The method validateOne will turn this off when invalid conversion is found.
+    validConversion = true;
+
+    boolean atLeastOneConversion = false;
+    for (int i = 0; i < columnCount; i++) {
+      TypeInfo fromTypeInfo = fromTypeInfoList.get(i);
+      TypeInfo toTypeInfo = toTypeInfoList.get(i);
+
+      resultConversionFlags[i] = validateOne(fromTypeInfo, toTypeInfo);
+      if (!validConversion) {
+        return;
+      }
+      if (resultConversionFlags[i]) {
+        atLeastOneConversion = true;
+      }
+    }
+
+    if (atLeastOneConversion) {
+      // Leave resultConversionFlags set.
+    } else {
+      resultConversionFlags = null;
+    }
+  }
+
+  public void validateConversion(TypeInfo[] fromTypeInfos, TypeInfo[] toTypeInfos) {
+
+    final int columnCount = fromTypeInfos.length;
+    resultConversionFlags = new boolean[columnCount];
+
+    // The method validateOne will turn this off when invalid conversion is found.
+    validConversion = true;
+
+    boolean atLeastOneConversion = false;
+    for (int i = 0; i < columnCount; i++) {
+      TypeInfo fromTypeInfo = fromTypeInfos[i];
+      TypeInfo toTypeInfo = toTypeInfos[i];
+
+      resultConversionFlags[i] = validateOne(fromTypeInfo, toTypeInfo);
+      if (!validConversion) {
+        return;
+      }
+      if (resultConversionFlags[i]) {
+        atLeastOneConversion = true;
+      }
+    }
+
+    if (atLeastOneConversion) {
+      // Leave resultConversionFlags set.
+    } else {
+      resultConversionFlags = null;
+    }
+  }
+}
\ No newline at end of file
diff --git ql/src/java/org/apache/hadoop/hive/ql/plan/VectorPartitionDesc.java ql/src/java/org/apache/hadoop/hive/ql/plan/VectorPartitionDesc.java
new file mode 100644
index 0000000..c4c0f4e
--- /dev/null
+++ ql/src/java/org/apache/hadoop/hive/ql/plan/VectorPartitionDesc.java
@@ -0,0 +1,110 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.plan;
+
+import java.util.Arrays;
+import java.util.List;
+
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
+
+/**
+ * VectorPartitionDesc.
+ *
+ * Extra vector information just for the PartitionDesc.
+ *
+ */
+public class VectorPartitionDesc {
+
+  private static long serialVersionUID = 1L;
+
+  // Data Type Conversion Needed?
+  //
+  // VECTORIZED_INPUT_FILE_FORMAT:
+  //    No data type conversion check?  Assume ALTER TABLE prevented conversions that
+  //    VectorizedInputFileFormat cannot handle...
+ // + + public static enum VectorMapOperatorReadType { + NONE, + VECTORIZED_INPUT_FILE_FORMAT + } + + + private final VectorMapOperatorReadType vectorMapOperatorReadType; + + private final boolean needsDataTypeConversionCheck; + + private boolean[] conversionFlags; + + private TypeInfo[] typeInfos; + + private VectorPartitionDesc(VectorMapOperatorReadType vectorMapOperatorReadType, + boolean needsDataTypeConversionCheck) { + this.vectorMapOperatorReadType = vectorMapOperatorReadType; + this.needsDataTypeConversionCheck = needsDataTypeConversionCheck; + + conversionFlags = null; + typeInfos = null; + } + + public static VectorPartitionDesc VectorizedInputFileFormat() { + return new VectorPartitionDesc(VectorMapOperatorReadType.VECTORIZED_INPUT_FILE_FORMAT, true); + } + + + @Override + public VectorPartitionDesc clone() { + VectorPartitionDesc result = + new VectorPartitionDesc(vectorMapOperatorReadType, + needsDataTypeConversionCheck); + result.conversionFlags = + (conversionFlags == null ? null : + Arrays.copyOf(conversionFlags, conversionFlags.length)); + result.typeInfos = Arrays.copyOf(typeInfos, typeInfos.length); + return result; + } + + public VectorMapOperatorReadType getVectorMapOperatorReadType() { + return vectorMapOperatorReadType; + } + + public boolean getNeedsDataTypeConversionCheck() { + return needsDataTypeConversionCheck; + } + + public void setConversionFlags(boolean[] conversionFlags) { + this.conversionFlags = conversionFlags; + } + + public boolean[] getConversionFlags() { + return conversionFlags; + } + + public TypeInfo[] getTypeInfos() { + return typeInfos; + } + + public void setTypeInfos(List typeInfoList) { + typeInfos = typeInfoList.toArray(new TypeInfo[0]); + } + + public int getNonPartColumnCount() { + return typeInfos.length; + } +} diff --git ql/src/test/org/apache/hadoop/hive/ql/exec/vector/TestVectorRowObject.java ql/src/test/org/apache/hadoop/hive/ql/exec/vector/TestVectorRowObject.java index 0f8712e..c076e6c 100644 --- ql/src/test/org/apache/hadoop/hive/ql/exec/vector/TestVectorRowObject.java +++ ql/src/test/org/apache/hadoop/hive/ql/exec/vector/TestVectorRowObject.java @@ -24,6 +24,7 @@ import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; import junit.framework.TestCase; @@ -50,13 +51,13 @@ void examineBatch(VectorizedRowBatch batch, VectorExtractRowSameBatch vectorExtr void testVectorRowObject(int caseNum, Random r) throws HiveException { - Map emptyScratchMap = new HashMap(); + String[] emptyScratchTypeNames = new String[0]; RandomRowObjectSource source = new RandomRowObjectSource(); source.init(r); VectorizedRowBatchCtx batchContext = new VectorizedRowBatchCtx(); - batchContext.init(emptyScratchMap, source.rowStructObjectInspector()); + batchContext.init(source.rowStructObjectInspector(), emptyScratchTypeNames); VectorizedRowBatch batch = batchContext.createVectorizedRowBatch(); VectorAssignRowSameBatch vectorAssignRow = new VectorAssignRowSameBatch(); diff --git ql/src/test/org/apache/hadoop/hive/ql/exec/vector/TestVectorSerDeRow.java ql/src/test/org/apache/hadoop/hive/ql/exec/vector/TestVectorSerDeRow.java index 23e44f0..d3dc30d 100644 --- ql/src/test/org/apache/hadoop/hive/ql/exec/vector/TestVectorSerDeRow.java +++ ql/src/test/org/apache/hadoop/hive/ql/exec/vector/TestVectorSerDeRow.java @@ -71,6 +71,7 @@ import org.apache.hadoop.hive.serde2.typeinfo.CharTypeInfo; import 
org.apache.hadoop.hive.serde2.typeinfo.DecimalTypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.VarcharTypeInfo; import org.apache.hadoop.hive.serde2.fast.SerializeWrite; import org.apache.hadoop.io.BooleanWritable; @@ -331,13 +332,13 @@ void serializeBatch(VectorizedRowBatch batch, VectorSerializeRow vectorSerialize void testVectorSerializeRow(int caseNum, Random r, SerializationType serializationType) throws HiveException, IOException, SerDeException { - Map emptyScratchMap = new HashMap(); + String[] emptyScratchTypeNames = new String[0]; RandomRowObjectSource source = new RandomRowObjectSource(); source.init(r); VectorizedRowBatchCtx batchContext = new VectorizedRowBatchCtx(); - batchContext.init(emptyScratchMap, source.rowStructObjectInspector()); + batchContext.init(source.rowStructObjectInspector(), emptyScratchTypeNames); VectorizedRowBatch batch = batchContext.createVectorizedRowBatch(); VectorAssignRowSameBatch vectorAssignRow = new VectorAssignRowSameBatch(); @@ -563,13 +564,13 @@ private LazySerDeParameters getSerDeParams(StructObjectInspector rowObjectInspec void testVectorDeserializeRow(int caseNum, Random r, SerializationType serializationType) throws HiveException, IOException, SerDeException { - Map emptyScratchMap = new HashMap(); + String[] emptyScratchTypeNames = new String[0]; RandomRowObjectSource source = new RandomRowObjectSource(); source.init(r); VectorizedRowBatchCtx batchContext = new VectorizedRowBatchCtx(); - batchContext.init(emptyScratchMap, source.rowStructObjectInspector()); + batchContext.init(source.rowStructObjectInspector(), emptyScratchTypeNames); VectorizedRowBatch batch = batchContext.createVectorizedRowBatch(); int fieldCount = source.typeNames().size(); diff --git ql/src/test/org/apache/hadoop/hive/ql/exec/vector/TestVectorizedRowBatchCtx.java ql/src/test/org/apache/hadoop/hive/ql/exec/vector/TestVectorizedRowBatchCtx.java deleted file mode 100644 index 3321823..0000000 --- ql/src/test/org/apache/hadoop/hive/ql/exec/vector/TestVectorizedRowBatchCtx.java +++ /dev/null @@ -1,357 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.hadoop.hive.ql.exec.vector; - -import java.io.File; -import java.io.IOException; -import java.sql.Timestamp; -import java.util.Arrays; -import java.util.Calendar; -import java.util.List; -import java.util.Properties; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.hive.ql.io.RCFile; -import org.apache.hadoop.hive.ql.io.RCFileOutputFormat; -import org.apache.hadoop.hive.ql.metadata.HiveException; -import org.apache.hadoop.hive.serde.serdeConstants; -import org.apache.hadoop.hive.serde2.SerDeException; -import org.apache.hadoop.hive.serde2.SerDeUtils; -import org.apache.hadoop.hive.serde2.columnar.BytesRefArrayWritable; -import org.apache.hadoop.hive.serde2.columnar.BytesRefWritable; -import org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe; -import org.apache.hadoop.hive.serde2.io.ByteWritable; -import org.apache.hadoop.hive.serde2.io.DoubleWritable; -import org.apache.hadoop.hive.serde2.io.ShortWritable; -import org.apache.hadoop.hive.serde2.io.TimestampWritable; -import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category; -import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.StructField; -import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; -import org.apache.hadoop.io.BooleanWritable; -import org.apache.hadoop.io.BytesWritable; -import org.apache.hadoop.io.DataOutputBuffer; -import org.apache.hadoop.io.FloatWritable; -import org.apache.hadoop.io.IntWritable; -import org.apache.hadoop.io.LongWritable; -import org.apache.hadoop.io.ObjectWritable; -import org.apache.hadoop.io.Text; -import org.apache.hadoop.io.Writable; -import org.apache.hadoop.io.compress.DefaultCodec; -import org.junit.Assert; -import org.junit.Before; -import org.junit.Test; - -/** - * Class that tests the functionality of VectorizedRowBatchCtx. 
- */ -public class TestVectorizedRowBatchCtx { - - private Configuration conf; - private FileSystem fs; - private Path testFilePath; - private int colCount; - private ColumnarSerDe serDe; - private Properties tbl; - - @Before - public void openFileSystem() throws Exception { - conf = new Configuration(); - fs = FileSystem.getLocal(conf); - Path workDir = new Path(System.getProperty("test.tmp.dir", - "target" + File.separator + "test" + File.separator + "tmp")); - fs.setWorkingDirectory(workDir); - testFilePath = new Path("TestVectorizedRowBatchCtx.testDump.rc"); - fs.delete(testFilePath, false); - } - - private void initSerde() { - tbl = new Properties(); - - // Set the configuration parameters - tbl.setProperty(serdeConstants.SERIALIZATION_FORMAT, "6"); - tbl.setProperty("columns", - "ashort,aint,along,adouble,afloat,astring,abyte,aboolean,atimestamp"); - tbl.setProperty("columns.types", - "smallint:int:bigint:double:float:string:tinyint:boolean:timestamp"); - colCount = 9; - tbl.setProperty(serdeConstants.SERIALIZATION_NULL_FORMAT, "NULL"); - - try { - serDe = new ColumnarSerDe(); - SerDeUtils.initializeSerDe(serDe, conf, tbl, null); - } catch (SerDeException e) { - throw new RuntimeException(e); - } - } - - private void WriteRCFile(FileSystem fs, Path file, Configuration conf) - throws IOException, SerDeException { - fs.delete(file, true); - - RCFileOutputFormat.setColumnNumber(conf, colCount); - RCFile.Writer writer = - new RCFile.Writer(fs, conf, file, null, null, - new DefaultCodec()); - - for (int i = 0; i < 10; ++i) { - BytesRefArrayWritable bytes = new BytesRefArrayWritable(colCount); - BytesRefWritable cu; - - if (i % 3 != 0) { - //if (i < 100) { - cu = new BytesRefWritable((i + "").getBytes("UTF-8"), 0, (i + "").getBytes("UTF-8").length); - bytes.set(0, cu); - - cu = new BytesRefWritable((i + 100 + "").getBytes("UTF-8"), 0, - (i + 100 + "").getBytes("UTF-8").length); - bytes.set(1, cu); - - cu = new BytesRefWritable((i + 200 + "").getBytes("UTF-8"), 0, - (i + 200 + "").getBytes("UTF-8").length); - bytes.set(2, cu); - - cu = new BytesRefWritable((i + 1.23 + "").getBytes("UTF-8"), 0, - (i + 1.23 + "").getBytes("UTF-8").length); - bytes.set(3, cu); - - cu = new BytesRefWritable((i + 2.23 + "").getBytes("UTF-8"), 0, - (i + 2.23 + "").getBytes("UTF-8").length); - bytes.set(4, cu); - - cu = new BytesRefWritable(("Test string").getBytes("UTF-8"), 0, - ("Test string").getBytes("UTF-8").length); - bytes.set(5, cu); - - cu = new BytesRefWritable((1 + "").getBytes("UTF-8"), 0, - (1 + "").getBytes("UTF-8").length); - bytes.set(6, cu); - - cu = new BytesRefWritable(("true").getBytes("UTF-8"), 0, - ("true").getBytes("UTF-8").length); - bytes.set(7, cu); - - Timestamp t = new Timestamp(Calendar.getInstance().getTime().getTime()); - cu = new BytesRefWritable(t.toString().getBytes("UTF-8"), 0, - t.toString().getBytes("UTF-8").length); - bytes.set(8, cu); - - } else { - cu = new BytesRefWritable((i + "").getBytes("UTF-8"), 0, (i + "").getBytes("UTF-8").length); - bytes.set(0, cu); - - cu = new BytesRefWritable(new byte[0], 0, 0); - bytes.set(1, cu); - - cu = new BytesRefWritable(new byte[0], 0, 0); - bytes.set(2, cu); - - cu = new BytesRefWritable(new byte[0], 0, 0); - bytes.set(3, cu); - - cu = new BytesRefWritable(new byte[0], 0, 0); - bytes.set(4, cu); - - cu = new BytesRefWritable(("Test string").getBytes("UTF-8"), 0, - ("Test string").getBytes("UTF-8").length); - bytes.set(5, cu); - - cu = new BytesRefWritable(new byte[0], 0, 0); - bytes.set(6, cu); - - cu = new BytesRefWritable(new byte[0], 
0, 0); - bytes.set(7, cu); - -// cu = new BytesRefWritable(new byte[0], 0, 0); -// bytes.set(8, cu); - Timestamp t = new Timestamp(Calendar.getInstance().getTime().getTime()); - cu = new BytesRefWritable(t.toString().getBytes("UTF-8"), 0, - t.toString().getBytes("UTF-8").length); - bytes.set(8, cu); - } - writer.append(bytes); - } - writer.close(); - } - - private VectorizedRowBatch GetRowBatch() throws SerDeException, HiveException, IOException { - - RCFile.Reader reader = new RCFile.Reader(fs, this.testFilePath, conf); - DataOutputBuffer buffer = new DataOutputBuffer(); - - // Get object inspector - StructObjectInspector oi = (StructObjectInspector) serDe - .getObjectInspector(); - List fieldRefs = oi.getAllStructFieldRefs(); - - Assert.assertEquals("Field size should be 9", colCount, fieldRefs.size()); - - // Create the context - VectorizedRowBatchCtx ctx = new VectorizedRowBatchCtx(oi, oi, serDe, null, null); - VectorizedRowBatch batch = ctx.createVectorizedRowBatch(); - VectorizedBatchUtil.setNoNullFields(batch); - - // Iterate thru the rows and populate the batch - LongWritable rowID = new LongWritable(); - for (int i = 0; i < 10; i++) { - reader.next(rowID); - BytesRefArrayWritable cols = new BytesRefArrayWritable(); - reader.getCurrentRow(cols); - cols.resetValid(colCount); - ctx.addRowToBatch(i, cols, batch, buffer); - } - reader.close(); - batch.size = 10; - return batch; - } - - void ValidateRowBatch(VectorizedRowBatch batch) throws IOException, SerDeException { - - LongWritable rowID = new LongWritable(); - RCFile.Reader reader = new RCFile.Reader(fs, this.testFilePath, conf); - for (int i = 0; i < batch.size; i++) { - reader.next(rowID); - BytesRefArrayWritable cols = new BytesRefArrayWritable(); - reader.getCurrentRow(cols); - cols.resetValid(colCount); - Object row = serDe.deserialize(cols); - - StructObjectInspector oi = (StructObjectInspector) serDe - .getObjectInspector(); - List fieldRefs = oi.getAllStructFieldRefs(); - - for (int j = 0; j < fieldRefs.size(); j++) { - Object fieldData = oi.getStructFieldData(row, fieldRefs.get(j)); - ObjectInspector foi = fieldRefs.get(j).getFieldObjectInspector(); - - // Vectorization only supports PRIMITIVE data types. Assert the same - Assert.assertEquals(true, foi.getCategory() == Category.PRIMITIVE); - - PrimitiveObjectInspector poi = (PrimitiveObjectInspector) foi; - Object writableCol = poi.getPrimitiveWritableObject(fieldData); - if (writableCol != null) { - switch (poi.getPrimitiveCategory()) { - case BOOLEAN: { - LongColumnVector lcv = (LongColumnVector) batch.cols[j]; - Assert.assertEquals(true, lcv.vector[i] == (((BooleanWritable) writableCol).get() ? 
1 - : 0)); - } - break; - case BYTE: { - LongColumnVector lcv = (LongColumnVector) batch.cols[j]; - Assert.assertEquals(true, lcv.vector[i] == (long) ((ByteWritable) writableCol).get()); - } - break; - case SHORT: { - LongColumnVector lcv = (LongColumnVector) batch.cols[j]; - Assert.assertEquals(true, lcv.vector[i] == ((ShortWritable) writableCol).get()); - } - break; - case INT: { - LongColumnVector lcv = (LongColumnVector) batch.cols[j]; - Assert.assertEquals(true, lcv.vector[i] == ((IntWritable) writableCol).get()); - } - break; - case LONG: { - LongColumnVector lcv = (LongColumnVector) batch.cols[j]; - Assert.assertEquals(true, lcv.vector[i] == ((LongWritable) writableCol).get()); - } - break; - case FLOAT: { - DoubleColumnVector dcv = (DoubleColumnVector) batch.cols[j]; - Assert.assertEquals(true, dcv.vector[i] == ((FloatWritable) writableCol).get()); - } - break; - case DOUBLE: { - DoubleColumnVector dcv = (DoubleColumnVector) batch.cols[j]; - Assert.assertEquals(true, dcv.vector[i] == ((DoubleWritable) writableCol).get()); - } - break; - case BINARY: { - BytesColumnVector bcv = (BytesColumnVector) batch.cols[j]; - BytesWritable colBinary = (BytesWritable) writableCol; - BytesWritable batchBinary = new BytesWritable(); - batchBinary.set(bcv.vector[i], bcv.start[i], bcv.length[i]); - byte[] a = colBinary.getBytes(); - byte[] b = batchBinary.getBytes(); - Assert.assertEquals(true, Arrays.equals(a, b)); - } - break; - case STRING: { - BytesColumnVector bcv = (BytesColumnVector) batch.cols[j]; - Text colText = (Text) writableCol; - Text batchText = new Text(); - batchText.set(bcv.vector[i], bcv.start[i], bcv.length[i]); - String a = colText.toString(); - String b = batchText.toString(); - Assert.assertEquals(true, a.equals(b)); - } - break; - case TIMESTAMP: { - LongColumnVector tcv = (LongColumnVector) batch.cols[j]; - Timestamp t = ((TimestampWritable) writableCol).getTimestamp(); - long timeInNanoSec = (t.getTime() * 1000000) + (t.getNanos() % 1000000); - Assert.assertEquals(true, tcv.vector[i] == timeInNanoSec); - } - break; - default: - Assert.assertTrue("Unknown type", false); - } - } else { - Assert.assertEquals(true, batch.cols[j].isNull[i]); - } - } - - // Check repeating - Assert.assertEquals(false, batch.cols[0].isRepeating); - Assert.assertEquals(false, batch.cols[1].isRepeating); - Assert.assertEquals(false, batch.cols[2].isRepeating); - Assert.assertEquals(false, batch.cols[3].isRepeating); - Assert.assertEquals(false, batch.cols[4].isRepeating); - - // Check non null - Assert.assertEquals(true, batch.cols[0].noNulls); - Assert.assertEquals(false, batch.cols[1].noNulls); - Assert.assertEquals(false, batch.cols[2].noNulls); - Assert.assertEquals(false, batch.cols[3].noNulls); - Assert.assertEquals(false, batch.cols[4].noNulls); - } - reader.close(); - } - - @Test - public void TestCtx() throws Exception { - initSerde(); - WriteRCFile(this.fs, this.testFilePath, this.conf); - VectorizedRowBatch batch = GetRowBatch(); - ValidateRowBatch(batch); - - // Test VectorizedColumnarSerDe - VectorizedColumnarSerDe vcs = new VectorizedColumnarSerDe(); - SerDeUtils.initializeSerDe(vcs, this.conf, tbl, null); - Writable w = vcs.serializeVector(batch, (StructObjectInspector) serDe - .getObjectInspector()); - BytesRefArrayWritable[] refArray = (BytesRefArrayWritable[]) ((ObjectWritable) w).get(); - vcs.deserializeVector(refArray, 10, batch); - ValidateRowBatch(batch); - } -} diff --git ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java 
ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java index 69cb6ff..77d6254 100644 --- ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java +++ ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java @@ -60,6 +60,7 @@ import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector; import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector; import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatchCtx; import org.apache.hadoop.hive.ql.io.AcidInputFormat; import org.apache.hadoop.hive.ql.io.AcidOutputFormat; import org.apache.hadoop.hive.ql.io.CombineHiveInputFormat; @@ -71,6 +72,7 @@ import org.apache.hadoop.hive.ql.io.sarg.PredicateLeaf; import org.apache.hadoop.hive.ql.io.sarg.SearchArgument; import org.apache.hadoop.hive.ql.io.sarg.SearchArgumentFactory; +import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.plan.MapWork; import org.apache.hadoop.hive.ql.plan.PartitionDesc; import org.apache.hadoop.hive.ql.plan.TableDesc; @@ -1422,6 +1424,7 @@ public void testDefaultTypes() throws Exception { * @param isVectorized should run vectorized * @return a JobConf that contains the necessary information * @throws IOException + * @throws HiveException */ JobConf createMockExecutionEnvironment(Path workDir, Path warehouseDir, @@ -1429,7 +1432,7 @@ JobConf createMockExecutionEnvironment(Path workDir, ObjectInspector objectInspector, boolean isVectorized, int partitions - ) throws IOException { + ) throws IOException, HiveException { Utilities.clearWorkMap(); JobConf conf = new JobConf(); conf.set("hive.exec.plan", workDir.toString()); @@ -1484,6 +1487,11 @@ JobConf createMockExecutionEnvironment(Path workDir, MapWork mapWork = new MapWork(); mapWork.setVectorMode(isVectorized); + if (isVectorized) { + VectorizedRowBatchCtx vectorizedRowBatchCtx = new VectorizedRowBatchCtx(); + vectorizedRowBatchCtx.init(structOI, new String[0]); + mapWork.setVectorizedRowBatchCtx(vectorizedRowBatchCtx); + } mapWork.setUseBucketizedHiveInputFormat(false); LinkedHashMap> aliasMap = new LinkedHashMap>(); diff --git ql/src/test/queries/clientpositive/schema_evol_orc_acid_table.q ql/src/test/queries/clientpositive/schema_evol_orc_acid_table.q new file mode 100644 index 0000000..5784695 --- /dev/null +++ ql/src/test/queries/clientpositive/schema_evol_orc_acid_table.q @@ -0,0 +1,97 @@ +set hive.support.concurrency=true; +set hive.txn.manager=org.apache.hadoop.hive.ql.lockmgr.DbTxnManager; +set hive.enforce.bucketing=true; +set hive.cli.print.header=true; +set hive.fetch.task.conversion=none; + +-- ACID +-- ALTER TABLE ADD COLUMNS ... 
RESTRICT +CREATE TABLE acid_partitioned1(a INT, b STRING) PARTITIONED BY(part INT) CLUSTERED BY(a) INTO 2 BUCKETS STORED AS ORC TBLPROPERTIES ("transactional"="true"); + +insert into table acid_partitioned1 partition(part=1) values(1, 'original'),(2, 'original'), (3, 'original'),(4, 'original'); + +alter table acid_partitioned1 add columns(c int, d string); + +insert into table acid_partitioned1 partition(part=2) values(1, 'new', 10, 'ten'),(2, 'new', 20, 'twenty'), (3, 'new', 30, 'thirty'),(4, 'new', 40, 'forty'); + +insert into table acid_partitioned1 partition(part=1) values(5, 'new', 100, 'hundred'),(6, 'new', 200, 'two hundred'); + +explain +select part,a,b,c,d from acid_partitioned1 order by a; + +select part,a,b,c,d from acid_partitioned1 order by a; + +describe extended acid_partitioned1 partition(part=1); +describe extended acid_partitioned1 partition(part=2); + +alter table acid_partitioned1 partition(part=1) add columns(c int, d string); + +describe extended acid_partitioned1 partition(part=1); + +explain +select part,a,b,c,d from acid_partitioned1 order by a; + +select part,a,b,c,d from acid_partitioned1 order by a; + + +-- ACID +-- ALTER TABLE ADD COLUMNS ... CASCADE +CREATE TABLE acid_partitioned2(a INT, b STRING) PARTITIONED BY(part INT) CLUSTERED BY(a) INTO 2 BUCKETS STORED AS ORC TBLPROPERTIES ("transactional"="true"); + +insert into table acid_partitioned2 partition(part=1) values(1, 'original'),(2, 'original'), (3, 'original'),(4, 'original'); + +alter table acid_partitioned2 add columns(c int, d string) cascade; + +insert into table acid_partitioned2 partition(part=2) values(1, 'new', 10, 'ten'),(2, 'new', 20, 'twenty'), (3, 'new', 30, 'thirty'),(4, 'new', 40, 'forty'); + +insert into table acid_partitioned2 partition(part=1) values(5, 'new', 100, 'hundred'),(6, 'new', 200, 'two hundred'); + +select part,a,b,c,d from acid_partitioned2 order by a; + +describe extended acid_partitioned2 partition(part=1); +describe extended acid_partitioned2 partition(part=2); + + +-- ACID +-- ALTER TABLE CHANGE COLUMN ... RESTRICT +-- smallint = (2-byte signed integer, from -32,768 to 32,767) +-- Diabled now because of java.io.IOException: ORC does not support type conversion from INT to SHORT +-- CREATE TABLE acid_partitioned3(a smallint, b STRING) PARTITIONED BY(part INT) CLUSTERED BY(a) INTO 2 BUCKETS STORED AS ORC TBLPROPERTIES ("transactional"="true"); +-- +-- insert into table acid_partitioned3 partition(part=1) values(1000, 'original'),(6737, 'original'), ('3', 'original'),('4', 'original'); +-- +-- alter table acid_partitioned3 change column a a int; +-- +-- insert into table acid_partitioned3 partition(part=2) values(72909, 'new'),(200, 'new'), (32768, 'new'),(40000, 'new'); +-- +-- insert into table acid_partitioned3 partition(part=1) values(5000, 'new'),(90000, 'new'); +-- +-- select part,a,b from acid_partitioned3 order by a; +-- +-- describe extended acid_partitioned3 partition(part=1); +-- describe extended acid_partitioned3 partition(part=2); +-- +-- alter table acid_partitioned3 partition(part=1) change column value a int; +-- +-- describe extended acid_partitioned3 partition(part=1); +-- +-- select part,a,b from acid_partitioned3 order by a; +-- +-- +-- ACID +-- ALTER TABLE CHANGE COLUMN ... 
CASCADE +-- smallint = (2-byte signed integer, from -32,768 to 32,767) +CREATE TABLE acid_partitioned4(a smallint, b STRING) PARTITIONED BY(part INT) CLUSTERED BY(a) INTO 2 BUCKETS STORED AS ORC TBLPROPERTIES ("transactional"="true"); + +insert into table acid_partitioned4 partition(part=1) values(1000, 'original'),(6737, 'original'), ('3', 'original'),('4', 'original'); + +alter table acid_partitioned4 change column a a int cascade; + +insert into table acid_partitioned4 partition(part=2) values(72909, 'new'),(200, 'new'), (32768, 'new'),(40000, 'new'); + +insert into table acid_partitioned4 partition(part=1) values(5000, 'new'),(90000, 'new'); + +select part,a,b from acid_partitioned4 order by a; + +describe extended acid_partitioned4 partition(part=1); +describe extended acid_partitioned4 partition(part=2); \ No newline at end of file diff --git ql/src/test/queries/clientpositive/schema_evol_orc_nonvec_table.q ql/src/test/queries/clientpositive/schema_evol_orc_nonvec_table.q new file mode 100644 index 0000000..b9f4c69 --- /dev/null +++ ql/src/test/queries/clientpositive/schema_evol_orc_nonvec_table.q @@ -0,0 +1,94 @@ +set hive.cli.print.header=true; +set hive.fetch.task.conversion=none; + +-- Non-Vec +-- ALTER TABLE ADD COLUMNS ... RESTRICT +CREATE TABLE acid_partitioned1(a INT, b STRING) PARTITIONED BY(part INT) STORED AS ORC; + +insert into table acid_partitioned1 partition(part=1) values(1, 'original'),(2, 'original'), (3, 'original'),(4, 'original'); + +alter table acid_partitioned1 add columns(c int, d string); + +insert into table acid_partitioned1 partition(part=2) values(1, 'new', 10, 'ten'),(2, 'new', 20, 'twenty'), (3, 'new', 30, 'thirty'),(4, 'new', 40, 'forty'); + +insert into table acid_partitioned1 partition(part=1) values(5, 'new', 100, 'hundred'),(6, 'new', 200, 'two hundred'); + +explain +select part,a,b,c,d from acid_partitioned1 order by a; + +select part,a,b,c,d from acid_partitioned1 order by a; + +describe extended acid_partitioned1 partition(part=1); +describe extended acid_partitioned1 partition(part=2); + +alter table acid_partitioned1 partition(part=1) add columns(c int, d string); + +describe extended acid_partitioned1 partition(part=1); + +explain +select part,a,b,c,d from acid_partitioned1 order by a; + +select part,a,b,c,d from acid_partitioned1 order by a; + + +-- Non-Vec +-- ALTER TABLE ADD COLUMNS ... CASCADE +CREATE TABLE acid_partitioned2(a INT, b STRING) PARTITIONED BY(part INT) STORED AS ORC; + +insert into table acid_partitioned2 partition(part=1) values(1, 'original'),(2, 'original'), (3, 'original'),(4, 'original'); + +alter table acid_partitioned2 add columns(c int, d string) cascade; + +insert into table acid_partitioned2 partition(part=2) values(1, 'new', 10, 'ten'),(2, 'new', 20, 'twenty'), (3, 'new', 30, 'thirty'),(4, 'new', 40, 'forty'); + +insert into table acid_partitioned2 partition(part=1) values(5, 'new', 100, 'hundred'),(6, 'new', 200, 'two hundred'); + +select part,a,b,c,d from acid_partitioned2 order by a; + +describe extended acid_partitioned2 partition(part=1); +describe extended acid_partitioned2 partition(part=2); + + +-- Non-Vec +-- ALTER TABLE CHANGE COLUMN ... 
RESTRICT +-- smallint = (2-byte signed integer, from -32,768 to 32,767) +-- Diabled now because of java.io.IOException: ORC does not support type conversion from INT to SHORT +-- CREATE TABLE acid_partitioned3(a smallint, b STRING) PARTITIONED BY(part INT) STORED AS ORC; +-- +-- insert into table acid_partitioned3 partition(part=1) values(1000, 'original'),(6737, 'original'), ('3', 'original'),('4', 'original'); +-- +-- alter table acid_partitioned3 change column a a int; +-- +-- insert into table acid_partitioned3 partition(part=2) values(72909, 'new'),(200, 'new'), (32768, 'new'),(40000, 'new'); +-- +-- insert into table acid_partitioned3 partition(part=1) values(5000, 'new'),(90000, 'new'); +-- +-- select part,a,b from acid_partitioned3 order by a; +-- +-- describe extended acid_partitioned3 partition(part=1); +-- describe extended acid_partitioned3 partition(part=2); +-- +-- alter table acid_partitioned3 partition(part=1) change column value a int; +-- +-- describe extended acid_partitioned3 partition(part=1); +-- +-- select part,a,b from acid_partitioned3 order by a; +-- +-- +-- Non-Vec +-- ALTER TABLE CHANGE COLUMN ... CASCADE +-- smallint = (2-byte signed integer, from -32,768 to 32,767) +CREATE TABLE acid_partitioned4(a smallint, b STRING) PARTITIONED BY(part INT) STORED AS ORC; + +insert into table acid_partitioned4 partition(part=1) values(1000, 'original'),(6737, 'original'), ('3', 'original'),('4', 'original'); + +alter table acid_partitioned4 change column a a int cascade; + +insert into table acid_partitioned4 partition(part=2) values(72909, 'new'),(200, 'new'), (32768, 'new'),(40000, 'new'); + +insert into table acid_partitioned4 partition(part=1) values(5000, 'new'),(90000, 'new'); + +select part,a,b from acid_partitioned4 order by a; + +describe extended acid_partitioned4 partition(part=1); +describe extended acid_partitioned4 partition(part=2); \ No newline at end of file diff --git ql/src/test/queries/clientpositive/schema_evol_orc_vec_table.q ql/src/test/queries/clientpositive/schema_evol_orc_vec_table.q new file mode 100644 index 0000000..cad37f4 --- /dev/null +++ ql/src/test/queries/clientpositive/schema_evol_orc_vec_table.q @@ -0,0 +1,95 @@ +SET hive.vectorized.execution.enabled=true; +set hive.cli.print.header=true; +set hive.fetch.task.conversion=none; + +-- Non-Vec +-- ALTER TABLE ADD COLUMNS ... RESTRICT +CREATE TABLE acid_partitioned1(a INT, b STRING) PARTITIONED BY(part INT) STORED AS ORC; + +insert into table acid_partitioned1 partition(part=1) values(1, 'original'),(2, 'original'), (3, 'original'),(4, 'original'); + +alter table acid_partitioned1 add columns(c int, d string); + +insert into table acid_partitioned1 partition(part=2) values(1, 'new', 10, 'ten'),(2, 'new', 20, 'twenty'), (3, 'new', 30, 'thirty'),(4, 'new', 40, 'forty'); + +insert into table acid_partitioned1 partition(part=1) values(5, 'new', 100, 'hundred'),(6, 'new', 200, 'two hundred'); + +explain +select part,a,b,c,d from acid_partitioned1 order by a; + +select part,a,b,c,d from acid_partitioned1 order by a; + +describe extended acid_partitioned1 partition(part=1); +describe extended acid_partitioned1 partition(part=2); + +alter table acid_partitioned1 partition(part=1) add columns(c int, d string); + +describe extended acid_partitioned1 partition(part=1); + +explain +select part,a,b,c,d from acid_partitioned1 order by a; + +select part,a,b,c,d from acid_partitioned1 order by a; + + +-- Non-Vec +-- ALTER TABLE ADD COLUMNS ... 
CASCADE +CREATE TABLE acid_partitioned2(a INT, b STRING) PARTITIONED BY(part INT) STORED AS ORC; + +insert into table acid_partitioned2 partition(part=1) values(1, 'original'),(2, 'original'), (3, 'original'),(4, 'original'); + +alter table acid_partitioned2 add columns(c int, d string) cascade; + +insert into table acid_partitioned2 partition(part=2) values(1, 'new', 10, 'ten'),(2, 'new', 20, 'twenty'), (3, 'new', 30, 'thirty'),(4, 'new', 40, 'forty'); + +insert into table acid_partitioned2 partition(part=1) values(5, 'new', 100, 'hundred'),(6, 'new', 200, 'two hundred'); + +select part,a,b,c,d from acid_partitioned2 order by a; + +describe extended acid_partitioned2 partition(part=1); +describe extended acid_partitioned2 partition(part=2); + + +-- Non-Vec +-- ALTER TABLE CHANGE COLUMN ... RESTRICT +-- smallint = (2-byte signed integer, from -32,768 to 32,767) +-- Diabled now because of java.io.IOException: ORC does not support type conversion from INT to SHORT +-- CREATE TABLE acid_partitioned3(a smallint, b STRING) PARTITIONED BY(part INT) STORED AS ORC; +-- +-- insert into table acid_partitioned3 partition(part=1) values(1000, 'original'),(6737, 'original'), ('3', 'original'),('4', 'original'); +-- +-- alter table acid_partitioned3 change column a a int; +-- +-- insert into table acid_partitioned3 partition(part=2) values(72909, 'new'),(200, 'new'), (32768, 'new'),(40000, 'new'); +-- +-- insert into table acid_partitioned3 partition(part=1) values(5000, 'new'),(90000, 'new'); +-- +-- select part,a,b from acid_partitioned3 order by a; +-- +-- describe extended acid_partitioned3 partition(part=1); +-- describe extended acid_partitioned3 partition(part=2); +-- +-- alter table acid_partitioned3 partition(part=1) change column value a int; +-- +-- describe extended acid_partitioned3 partition(part=1); +-- +-- select part,a,b from acid_partitioned3 order by a; +-- +-- +-- Non-Vec +-- ALTER TABLE CHANGE COLUMN ... CASCADE +-- smallint = (2-byte signed integer, from -32,768 to 32,767) +CREATE TABLE acid_partitioned4(a smallint, b STRING) PARTITIONED BY(part INT) STORED AS ORC; + +insert into table acid_partitioned4 partition(part=1) values(1000, 'original'),(6737, 'original'), ('3', 'original'),('4', 'original'); + +alter table acid_partitioned4 change column a a int cascade; + +insert into table acid_partitioned4 partition(part=2) values(72909, 'new'),(200, 'new'), (32768, 'new'),(40000, 'new'); + +insert into table acid_partitioned4 partition(part=1) values(5000, 'new'),(90000, 'new'); + +select part,a,b from acid_partitioned4 order by a; + +describe extended acid_partitioned4 partition(part=1); +describe extended acid_partitioned4 partition(part=2); \ No newline at end of file diff --git ql/src/test/queries/clientpositive/schema_evol_text_nonvec_table.q ql/src/test/queries/clientpositive/schema_evol_text_nonvec_table.q new file mode 100644 index 0000000..8cc9f5f --- /dev/null +++ ql/src/test/queries/clientpositive/schema_evol_text_nonvec_table.q @@ -0,0 +1,94 @@ +set hive.cli.print.header=true; +set hive.fetch.task.conversion=none; + +-- Non-Vec +-- ALTER TABLE ADD COLUMNS ... 
RESTRICT +CREATE TABLE acid_partitioned1(a INT, b STRING) PARTITIONED BY(part INT); + +insert into table acid_partitioned1 partition(part=1) values(1, 'original'),(2, 'original'), (3, 'original'),(4, 'original'); + +alter table acid_partitioned1 add columns(c int, d string); + +insert into table acid_partitioned1 partition(part=2) values(1, 'new', 10, 'ten'),(2, 'new', 20, 'twenty'), (3, 'new', 30, 'thirty'),(4, 'new', 40, 'forty'); + +insert into table acid_partitioned1 partition(part=1) values(5, 'new', 100, 'hundred'),(6, 'new', 200, 'two hundred'); + +explain +select part,a,b,c,d from acid_partitioned1 order by a; + +select part,a,b,c,d from acid_partitioned1 order by a; + +describe extended acid_partitioned1 partition(part=1); +describe extended acid_partitioned1 partition(part=2); + +alter table acid_partitioned1 partition(part=1) add columns(c int, d string); + +describe extended acid_partitioned1 partition(part=1); + +explain +select part,a,b,c,d from acid_partitioned1 order by a; + +select part,a,b,c,d from acid_partitioned1 order by a; + + +-- Non-Vec +-- ALTER TABLE ADD COLUMNS ... CASCADE +CREATE TABLE acid_partitioned2(a INT, b STRING) PARTITIONED BY(part INT); + +insert into table acid_partitioned2 partition(part=1) values(1, 'original'),(2, 'original'), (3, 'original'),(4, 'original'); + +alter table acid_partitioned2 add columns(c int, d string) cascade; + +insert into table acid_partitioned2 partition(part=2) values(1, 'new', 10, 'ten'),(2, 'new', 20, 'twenty'), (3, 'new', 30, 'thirty'),(4, 'new', 40, 'forty'); + +insert into table acid_partitioned2 partition(part=1) values(5, 'new', 100, 'hundred'),(6, 'new', 200, 'two hundred'); + +select part,a,b,c,d from acid_partitioned2 order by a; + +describe extended acid_partitioned2 partition(part=1); +describe extended acid_partitioned2 partition(part=2); + + +-- Non-Vec +-- ALTER TABLE CHANGE COLUMN ... RESTRICT +-- smallint = (2-byte signed integer, from -32,768 to 32,767) +-- Diabled now because of java.io.IOException: ORC does not support type conversion from INT to SHORT +-- CREATE TABLE acid_partitioned3(a smallint, b STRING) PARTITIONED BY(part INT); +-- +-- insert into table acid_partitioned3 partition(part=1) values(1000, 'original'),(6737, 'original'), ('3', 'original'),('4', 'original'); +-- +-- alter table acid_partitioned3 change column a a int; +-- +-- insert into table acid_partitioned3 partition(part=2) values(72909, 'new'),(200, 'new'), (32768, 'new'),(40000, 'new'); +-- +-- insert into table acid_partitioned3 partition(part=1) values(5000, 'new'),(90000, 'new'); +-- +-- select part,a,b from acid_partitioned3 order by a; +-- +-- describe extended acid_partitioned3 partition(part=1); +-- describe extended acid_partitioned3 partition(part=2); +-- +-- alter table acid_partitioned3 partition(part=1) change column value a int; +-- +-- describe extended acid_partitioned3 partition(part=1); +-- +-- select part,a,b from acid_partitioned3 order by a; +-- +-- +-- Non-Vec +-- ALTER TABLE CHANGE COLUMN ... 
CASCADE +-- smallint = (2-byte signed integer, from -32,768 to 32,767) +CREATE TABLE acid_partitioned4(a smallint, b STRING) PARTITIONED BY(part INT); + +insert into table acid_partitioned4 partition(part=1) values(1000, 'original'),(6737, 'original'), ('3', 'original'),('4', 'original'); + +alter table acid_partitioned4 change column a a int cascade; + +insert into table acid_partitioned4 partition(part=2) values(72909, 'new'),(200, 'new'), (32768, 'new'),(40000, 'new'); + +insert into table acid_partitioned4 partition(part=1) values(5000, 'new'),(90000, 'new'); + +select part,a,b from acid_partitioned4 order by a; + +describe extended acid_partitioned4 partition(part=1); +describe extended acid_partitioned4 partition(part=2); \ No newline at end of file diff --git ql/src/test/results/clientpositive/schema_evol_orc_acid_table.q.out ql/src/test/results/clientpositive/schema_evol_orc_acid_table.q.out new file mode 100644 index 0000000..b6fae83 --- /dev/null +++ ql/src/test/results/clientpositive/schema_evol_orc_acid_table.q.out @@ -0,0 +1,545 @@ +PREHOOK: query: -- ACID +-- ALTER TABLE ADD COLUMNS ... RESTRICT +CREATE TABLE acid_partitioned1(a INT, b STRING) PARTITIONED BY(part INT) CLUSTERED BY(a) INTO 2 BUCKETS STORED AS ORC TBLPROPERTIES ("transactional"="true") +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@acid_partitioned1 +POSTHOOK: query: -- ACID +-- ALTER TABLE ADD COLUMNS ... RESTRICT +CREATE TABLE acid_partitioned1(a INT, b STRING) PARTITIONED BY(part INT) CLUSTERED BY(a) INTO 2 BUCKETS STORED AS ORC TBLPROPERTIES ("transactional"="true") +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@acid_partitioned1 +PREHOOK: query: insert into table acid_partitioned1 partition(part=1) values(1, 'original'),(2, 'original'), (3, 'original'),(4, 'original') +PREHOOK: type: QUERY +PREHOOK: Input: default@values__tmp__table__1 +PREHOOK: Output: default@acid_partitioned1@part=1 +POSTHOOK: query: insert into table acid_partitioned1 partition(part=1) values(1, 'original'),(2, 'original'), (3, 'original'),(4, 'original') +POSTHOOK: type: QUERY +POSTHOOK: Input: default@values__tmp__table__1 +POSTHOOK: Output: default@acid_partitioned1@part=1 +POSTHOOK: Lineage: acid_partitioned1 PARTITION(part=1).a EXPRESSION [(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col1, type:string, comment:), ] +POSTHOOK: Lineage: acid_partitioned1 PARTITION(part=1).b SIMPLE [(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col2, type:string, comment:), ] +_col0 _col1 +PREHOOK: query: alter table acid_partitioned1 add columns(c int, d string) +PREHOOK: type: ALTERTABLE_ADDCOLS +PREHOOK: Input: default@acid_partitioned1 +PREHOOK: Output: default@acid_partitioned1 +POSTHOOK: query: alter table acid_partitioned1 add columns(c int, d string) +POSTHOOK: type: ALTERTABLE_ADDCOLS +POSTHOOK: Input: default@acid_partitioned1 +POSTHOOK: Output: default@acid_partitioned1 +PREHOOK: query: insert into table acid_partitioned1 partition(part=2) values(1, 'new', 10, 'ten'),(2, 'new', 20, 'twenty'), (3, 'new', 30, 'thirty'),(4, 'new', 40, 'forty') +PREHOOK: type: QUERY +PREHOOK: Input: default@values__tmp__table__2 +PREHOOK: Output: default@acid_partitioned1@part=2 +POSTHOOK: query: insert into table acid_partitioned1 partition(part=2) values(1, 'new', 10, 'ten'),(2, 'new', 20, 'twenty'), (3, 'new', 30, 'thirty'),(4, 'new', 40, 'forty') +POSTHOOK: type: QUERY +POSTHOOK: Input: default@values__tmp__table__2 
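The DESCRIBE output further down in this golden file shows the RESTRICT/CASCADE difference directly: a table-level ADD COLUMNS leaves an existing partition's column list unchanged until that partition is altered explicitly, whereas the CASCADE form used for acid_partitioned2 updates every partition at once. A minimal sketch of the RESTRICT case, using an illustrative table name that is not part of this patch:

-- Illustrative sketch only; evol_sketch is not a table from this patch.
CREATE TABLE evol_sketch(a INT, b STRING) PARTITIONED BY(part INT) STORED AS ORC;
INSERT INTO TABLE evol_sketch PARTITION(part=1) VALUES (1, 'original');
ALTER TABLE evol_sketch ADD COLUMNS(c INT, d STRING);                     -- RESTRICT is the default
DESCRIBE evol_sketch PARTITION(part=1);                                   -- still a, b, part
ALTER TABLE evol_sketch PARTITION(part=1) ADD COLUMNS(c INT, d STRING);
DESCRIBE evol_sketch PARTITION(part=1);                                   -- now a, b, c, d, part
SELECT part, a, b, c, d FROM evol_sketch ORDER BY a;                      -- pre-ALTER rows read back with NULL for c and d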
+POSTHOOK: Output: default@acid_partitioned1@part=2 +POSTHOOK: Lineage: acid_partitioned1 PARTITION(part=2).a EXPRESSION [(values__tmp__table__2)values__tmp__table__2.FieldSchema(name:tmp_values_col1, type:string, comment:), ] +POSTHOOK: Lineage: acid_partitioned1 PARTITION(part=2).b SIMPLE [(values__tmp__table__2)values__tmp__table__2.FieldSchema(name:tmp_values_col2, type:string, comment:), ] +POSTHOOK: Lineage: acid_partitioned1 PARTITION(part=2).c EXPRESSION [(values__tmp__table__2)values__tmp__table__2.FieldSchema(name:tmp_values_col3, type:string, comment:), ] +POSTHOOK: Lineage: acid_partitioned1 PARTITION(part=2).d SIMPLE [(values__tmp__table__2)values__tmp__table__2.FieldSchema(name:tmp_values_col4, type:string, comment:), ] +_col0 _col1 _col2 _col3 +PREHOOK: query: insert into table acid_partitioned1 partition(part=1) values(5, 'new', 100, 'hundred'),(6, 'new', 200, 'two hundred') +PREHOOK: type: QUERY +PREHOOK: Input: default@values__tmp__table__3 +PREHOOK: Output: default@acid_partitioned1@part=1 +POSTHOOK: query: insert into table acid_partitioned1 partition(part=1) values(5, 'new', 100, 'hundred'),(6, 'new', 200, 'two hundred') +POSTHOOK: type: QUERY +POSTHOOK: Input: default@values__tmp__table__3 +POSTHOOK: Output: default@acid_partitioned1@part=1 +POSTHOOK: Lineage: acid_partitioned1 PARTITION(part=1).a EXPRESSION [(values__tmp__table__3)values__tmp__table__3.FieldSchema(name:tmp_values_col1, type:string, comment:), ] +POSTHOOK: Lineage: acid_partitioned1 PARTITION(part=1).b SIMPLE [(values__tmp__table__3)values__tmp__table__3.FieldSchema(name:tmp_values_col2, type:string, comment:), ] +POSTHOOK: Lineage: acid_partitioned1 PARTITION(part=1).c EXPRESSION [(values__tmp__table__3)values__tmp__table__3.FieldSchema(name:tmp_values_col3, type:string, comment:), ] +POSTHOOK: Lineage: acid_partitioned1 PARTITION(part=1).d SIMPLE [(values__tmp__table__3)values__tmp__table__3.FieldSchema(name:tmp_values_col4, type:string, comment:), ] +_col0 _col1 _col2 _col3 +PREHOOK: query: explain +select part,a,b,c,d from acid_partitioned1 order by a +PREHOOK: type: QUERY +POSTHOOK: query: explain +select part,a,b,c,d from acid_partitioned1 order by a +POSTHOOK: type: QUERY +Explain +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: acid_partitioned1 + Statistics: Num rows: 10 Data size: 4531 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: part (type: int), a (type: int), b (type: string), c (type: int), d (type: string) + outputColumnNames: _col0, _col1, _col2, _col3, _col4 + Statistics: Num rows: 10 Data size: 4531 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col1 (type: int) + sort order: + + Statistics: Num rows: 10 Data size: 4531 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: int), _col2 (type: string), _col3 (type: int), _col4 (type: string) + Reduce Operator Tree: + Select Operator + expressions: VALUE._col0 (type: int), KEY.reducesinkkey0 (type: int), VALUE._col1 (type: string), VALUE._col2 (type: int), VALUE._col3 (type: string) + outputColumnNames: _col0, _col1, _col2, _col3, _col4 + Statistics: Num rows: 10 Data size: 4531 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 10 Data size: 4531 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: 
org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select part,a,b,c,d from acid_partitioned1 order by a +PREHOOK: type: QUERY +PREHOOK: Input: default@acid_partitioned1 +PREHOOK: Input: default@acid_partitioned1@part=1 +PREHOOK: Input: default@acid_partitioned1@part=2 +#### A masked pattern was here #### +POSTHOOK: query: select part,a,b,c,d from acid_partitioned1 order by a +POSTHOOK: type: QUERY +POSTHOOK: Input: default@acid_partitioned1 +POSTHOOK: Input: default@acid_partitioned1@part=1 +POSTHOOK: Input: default@acid_partitioned1@part=2 +#### A masked pattern was here #### +part a b c d +2 1 new 10 ten +1 1 original NULL NULL +2 2 new 20 twenty +1 2 original NULL NULL +1 3 original NULL NULL +2 3 new 30 thirty +2 4 new 40 forty +1 4 original NULL NULL +1 5 new 100 hundred +1 6 new 200 two hundred +PREHOOK: query: describe extended acid_partitioned1 partition(part=1) +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@acid_partitioned1 +POSTHOOK: query: describe extended acid_partitioned1 partition(part=1) +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@acid_partitioned1 +col_name data_type comment +a int +b string +part int + +# Partition Information +# col_name data_type comment + +part int + +#### A masked pattern was here #### +PREHOOK: query: describe extended acid_partitioned1 partition(part=2) +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@acid_partitioned1 +POSTHOOK: query: describe extended acid_partitioned1 partition(part=2) +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@acid_partitioned1 +col_name data_type comment +a int +b string +c int +d string +part int + +# Partition Information +# col_name data_type comment + +part int + +#### A masked pattern was here #### +PREHOOK: query: alter table acid_partitioned1 partition(part=1) add columns(c int, d string) +PREHOOK: type: ALTERTABLE_ADDCOLS +PREHOOK: Input: default@acid_partitioned1 +PREHOOK: Output: default@acid_partitioned1@part=1 +POSTHOOK: query: alter table acid_partitioned1 partition(part=1) add columns(c int, d string) +POSTHOOK: type: ALTERTABLE_ADDCOLS +POSTHOOK: Input: default@acid_partitioned1 +POSTHOOK: Input: default@acid_partitioned1@part=1 +POSTHOOK: Output: default@acid_partitioned1@part=1 +PREHOOK: query: describe extended acid_partitioned1 partition(part=1) +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@acid_partitioned1 +POSTHOOK: query: describe extended acid_partitioned1 partition(part=1) +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@acid_partitioned1 +col_name data_type comment +a int +b string +c int +d string +part int + +# Partition Information +# col_name data_type comment + +part int + +#### A masked pattern was here #### +PREHOOK: query: explain +select part,a,b,c,d from acid_partitioned1 order by a +PREHOOK: type: QUERY +POSTHOOK: query: explain +select part,a,b,c,d from acid_partitioned1 order by a +POSTHOOK: type: QUERY +Explain +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: acid_partitioned1 + Statistics: Num rows: 10 Data size: 4531 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: part (type: int), a (type: int), b (type: string), c (type: int), d (type: string) + outputColumnNames: _col0, _col1, _col2, _col3, _col4 + Statistics: Num rows: 10 Data size: 4531 
Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col1 (type: int) + sort order: + + Statistics: Num rows: 10 Data size: 4531 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: int), _col2 (type: string), _col3 (type: int), _col4 (type: string) + Reduce Operator Tree: + Select Operator + expressions: VALUE._col0 (type: int), KEY.reducesinkkey0 (type: int), VALUE._col1 (type: string), VALUE._col2 (type: int), VALUE._col3 (type: string) + outputColumnNames: _col0, _col1, _col2, _col3, _col4 + Statistics: Num rows: 10 Data size: 4531 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 10 Data size: 4531 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select part,a,b,c,d from acid_partitioned1 order by a +PREHOOK: type: QUERY +PREHOOK: Input: default@acid_partitioned1 +PREHOOK: Input: default@acid_partitioned1@part=1 +PREHOOK: Input: default@acid_partitioned1@part=2 +#### A masked pattern was here #### +POSTHOOK: query: select part,a,b,c,d from acid_partitioned1 order by a +POSTHOOK: type: QUERY +POSTHOOK: Input: default@acid_partitioned1 +POSTHOOK: Input: default@acid_partitioned1@part=1 +POSTHOOK: Input: default@acid_partitioned1@part=2 +#### A masked pattern was here #### +part a b c d +2 1 new 10 ten +1 1 original NULL NULL +2 2 new 20 twenty +1 2 original NULL NULL +1 3 original NULL NULL +2 3 new 30 thirty +2 4 new 40 forty +1 4 original NULL NULL +1 5 new 100 hundred +1 6 new 200 two hundred +PREHOOK: query: -- ACID +-- ALTER TABLE ADD COLUMNS ... CASCADE +CREATE TABLE acid_partitioned2(a INT, b STRING) PARTITIONED BY(part INT) CLUSTERED BY(a) INTO 2 BUCKETS STORED AS ORC TBLPROPERTIES ("transactional"="true") +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@acid_partitioned2 +POSTHOOK: query: -- ACID +-- ALTER TABLE ADD COLUMNS ... 
CASCADE +CREATE TABLE acid_partitioned2(a INT, b STRING) PARTITIONED BY(part INT) CLUSTERED BY(a) INTO 2 BUCKETS STORED AS ORC TBLPROPERTIES ("transactional"="true") +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@acid_partitioned2 +PREHOOK: query: insert into table acid_partitioned2 partition(part=1) values(1, 'original'),(2, 'original'), (3, 'original'),(4, 'original') +PREHOOK: type: QUERY +PREHOOK: Input: default@values__tmp__table__4 +PREHOOK: Output: default@acid_partitioned2@part=1 +POSTHOOK: query: insert into table acid_partitioned2 partition(part=1) values(1, 'original'),(2, 'original'), (3, 'original'),(4, 'original') +POSTHOOK: type: QUERY +POSTHOOK: Input: default@values__tmp__table__4 +POSTHOOK: Output: default@acid_partitioned2@part=1 +POSTHOOK: Lineage: acid_partitioned2 PARTITION(part=1).a EXPRESSION [(values__tmp__table__4)values__tmp__table__4.FieldSchema(name:tmp_values_col1, type:string, comment:), ] +POSTHOOK: Lineage: acid_partitioned2 PARTITION(part=1).b SIMPLE [(values__tmp__table__4)values__tmp__table__4.FieldSchema(name:tmp_values_col2, type:string, comment:), ] +_col0 _col1 +PREHOOK: query: alter table acid_partitioned2 add columns(c int, d string) cascade +PREHOOK: type: ALTERTABLE_ADDCOLS +PREHOOK: Input: default@acid_partitioned2 +PREHOOK: Output: default@acid_partitioned2 +PREHOOK: Output: default@acid_partitioned2@part=1 +POSTHOOK: query: alter table acid_partitioned2 add columns(c int, d string) cascade +POSTHOOK: type: ALTERTABLE_ADDCOLS +POSTHOOK: Input: default@acid_partitioned2 +POSTHOOK: Output: default@acid_partitioned2 +POSTHOOK: Output: default@acid_partitioned2@part=1 +PREHOOK: query: insert into table acid_partitioned2 partition(part=2) values(1, 'new', 10, 'ten'),(2, 'new', 20, 'twenty'), (3, 'new', 30, 'thirty'),(4, 'new', 40, 'forty') +PREHOOK: type: QUERY +PREHOOK: Input: default@values__tmp__table__5 +PREHOOK: Output: default@acid_partitioned2@part=2 +POSTHOOK: query: insert into table acid_partitioned2 partition(part=2) values(1, 'new', 10, 'ten'),(2, 'new', 20, 'twenty'), (3, 'new', 30, 'thirty'),(4, 'new', 40, 'forty') +POSTHOOK: type: QUERY +POSTHOOK: Input: default@values__tmp__table__5 +POSTHOOK: Output: default@acid_partitioned2@part=2 +POSTHOOK: Lineage: acid_partitioned2 PARTITION(part=2).a EXPRESSION [(values__tmp__table__5)values__tmp__table__5.FieldSchema(name:tmp_values_col1, type:string, comment:), ] +POSTHOOK: Lineage: acid_partitioned2 PARTITION(part=2).b SIMPLE [(values__tmp__table__5)values__tmp__table__5.FieldSchema(name:tmp_values_col2, type:string, comment:), ] +POSTHOOK: Lineage: acid_partitioned2 PARTITION(part=2).c EXPRESSION [(values__tmp__table__5)values__tmp__table__5.FieldSchema(name:tmp_values_col3, type:string, comment:), ] +POSTHOOK: Lineage: acid_partitioned2 PARTITION(part=2).d SIMPLE [(values__tmp__table__5)values__tmp__table__5.FieldSchema(name:tmp_values_col4, type:string, comment:), ] +_col0 _col1 _col2 _col3 +PREHOOK: query: insert into table acid_partitioned2 partition(part=1) values(5, 'new', 100, 'hundred'),(6, 'new', 200, 'two hundred') +PREHOOK: type: QUERY +PREHOOK: Input: default@values__tmp__table__6 +PREHOOK: Output: default@acid_partitioned2@part=1 +POSTHOOK: query: insert into table acid_partitioned2 partition(part=1) values(5, 'new', 100, 'hundred'),(6, 'new', 200, 'two hundred') +POSTHOOK: type: QUERY +POSTHOOK: Input: default@values__tmp__table__6 +POSTHOOK: Output: default@acid_partitioned2@part=1 +POSTHOOK: Lineage: acid_partitioned2 
PARTITION(part=1).a EXPRESSION [(values__tmp__table__6)values__tmp__table__6.FieldSchema(name:tmp_values_col1, type:string, comment:), ] +POSTHOOK: Lineage: acid_partitioned2 PARTITION(part=1).b SIMPLE [(values__tmp__table__6)values__tmp__table__6.FieldSchema(name:tmp_values_col2, type:string, comment:), ] +POSTHOOK: Lineage: acid_partitioned2 PARTITION(part=1).c EXPRESSION [(values__tmp__table__6)values__tmp__table__6.FieldSchema(name:tmp_values_col3, type:string, comment:), ] +POSTHOOK: Lineage: acid_partitioned2 PARTITION(part=1).d SIMPLE [(values__tmp__table__6)values__tmp__table__6.FieldSchema(name:tmp_values_col4, type:string, comment:), ] +_col0 _col1 _col2 _col3 +PREHOOK: query: select part,a,b,c,d from acid_partitioned2 order by a +PREHOOK: type: QUERY +PREHOOK: Input: default@acid_partitioned2 +PREHOOK: Input: default@acid_partitioned2@part=1 +PREHOOK: Input: default@acid_partitioned2@part=2 +#### A masked pattern was here #### +POSTHOOK: query: select part,a,b,c,d from acid_partitioned2 order by a +POSTHOOK: type: QUERY +POSTHOOK: Input: default@acid_partitioned2 +POSTHOOK: Input: default@acid_partitioned2@part=1 +POSTHOOK: Input: default@acid_partitioned2@part=2 +#### A masked pattern was here #### +part a b c d +2 1 new 10 ten +1 1 original NULL NULL +2 2 new 20 twenty +1 2 original NULL NULL +1 3 original NULL NULL +2 3 new 30 thirty +2 4 new 40 forty +1 4 original NULL NULL +1 5 new 100 hundred +1 6 new 200 two hundred +PREHOOK: query: describe extended acid_partitioned2 partition(part=1) +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@acid_partitioned2 +POSTHOOK: query: describe extended acid_partitioned2 partition(part=1) +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@acid_partitioned2 +col_name data_type comment +a int +b string +c int +d string +part int + +# Partition Information +# col_name data_type comment + +part int + +#### A masked pattern was here #### +PREHOOK: query: describe extended acid_partitioned2 partition(part=2) +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@acid_partitioned2 +POSTHOOK: query: describe extended acid_partitioned2 partition(part=2) +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@acid_partitioned2 +col_name data_type comment +a int +b string +c int +d string +part int + +# Partition Information +# col_name data_type comment + +part int + +#### A masked pattern was here #### +PREHOOK: query: -- ACID +-- ALTER TABLE CHANGE COLUMN ... 
RESTRICT +-- smallint = (2-byte signed integer, from -32,768 to 32,767) +-- Diabled now because of java.io.IOException: ORC does not support type conversion from INT to SHORT +-- CREATE TABLE acid_partitioned3(a smallint, b STRING) PARTITIONED BY(part INT) CLUSTERED BY(a) INTO 2 BUCKETS STORED AS ORC TBLPROPERTIES ("transactional"="true"); +-- +-- insert into table acid_partitioned3 partition(part=1) values(1000, 'original'),(6737, 'original'), ('3', 'original'),('4', 'original'); +-- +-- alter table acid_partitioned3 change column a a int; +-- +-- insert into table acid_partitioned3 partition(part=2) values(72909, 'new'),(200, 'new'), (32768, 'new'),(40000, 'new'); +-- +-- insert into table acid_partitioned3 partition(part=1) values(5000, 'new'),(90000, 'new'); +-- +-- select part,a,b from acid_partitioned3 order by a; +-- +-- describe extended acid_partitioned3 partition(part=1); +-- describe extended acid_partitioned3 partition(part=2); +-- +-- alter table acid_partitioned3 partition(part=1) change column value a int; +-- +-- describe extended acid_partitioned3 partition(part=1); +-- +-- select part,a,b from acid_partitioned3 order by a; +-- +-- +-- ACID +-- ALTER TABLE CHANGE COLUMN ... CASCADE +-- smallint = (2-byte signed integer, from -32,768 to 32,767) +CREATE TABLE acid_partitioned4(a smallint, b STRING) PARTITIONED BY(part INT) CLUSTERED BY(a) INTO 2 BUCKETS STORED AS ORC TBLPROPERTIES ("transactional"="true") +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@acid_partitioned4 +POSTHOOK: query: -- ACID +-- ALTER TABLE CHANGE COLUMN ... RESTRICT +-- smallint = (2-byte signed integer, from -32,768 to 32,767) +-- Diabled now because of java.io.IOException: ORC does not support type conversion from INT to SHORT +-- CREATE TABLE acid_partitioned3(a smallint, b STRING) PARTITIONED BY(part INT) CLUSTERED BY(a) INTO 2 BUCKETS STORED AS ORC TBLPROPERTIES ("transactional"="true"); +-- +-- insert into table acid_partitioned3 partition(part=1) values(1000, 'original'),(6737, 'original'), ('3', 'original'),('4', 'original'); +-- +-- alter table acid_partitioned3 change column a a int; +-- +-- insert into table acid_partitioned3 partition(part=2) values(72909, 'new'),(200, 'new'), (32768, 'new'),(40000, 'new'); +-- +-- insert into table acid_partitioned3 partition(part=1) values(5000, 'new'),(90000, 'new'); +-- +-- select part,a,b from acid_partitioned3 order by a; +-- +-- describe extended acid_partitioned3 partition(part=1); +-- describe extended acid_partitioned3 partition(part=2); +-- +-- alter table acid_partitioned3 partition(part=1) change column value a int; +-- +-- describe extended acid_partitioned3 partition(part=1); +-- +-- select part,a,b from acid_partitioned3 order by a; +-- +-- +-- ACID +-- ALTER TABLE CHANGE COLUMN ... 
CASCADE +-- smallint = (2-byte signed integer, from -32,768 to 32,767) +CREATE TABLE acid_partitioned4(a smallint, b STRING) PARTITIONED BY(part INT) CLUSTERED BY(a) INTO 2 BUCKETS STORED AS ORC TBLPROPERTIES ("transactional"="true") +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@acid_partitioned4 +PREHOOK: query: insert into table acid_partitioned4 partition(part=1) values(1000, 'original'),(6737, 'original'), ('3', 'original'),('4', 'original') +PREHOOK: type: QUERY +PREHOOK: Input: default@values__tmp__table__7 +PREHOOK: Output: default@acid_partitioned4@part=1 +POSTHOOK: query: insert into table acid_partitioned4 partition(part=1) values(1000, 'original'),(6737, 'original'), ('3', 'original'),('4', 'original') +POSTHOOK: type: QUERY +POSTHOOK: Input: default@values__tmp__table__7 +POSTHOOK: Output: default@acid_partitioned4@part=1 +POSTHOOK: Lineage: acid_partitioned4 PARTITION(part=1).a EXPRESSION [(values__tmp__table__7)values__tmp__table__7.FieldSchema(name:tmp_values_col1, type:string, comment:), ] +POSTHOOK: Lineage: acid_partitioned4 PARTITION(part=1).b SIMPLE [(values__tmp__table__7)values__tmp__table__7.FieldSchema(name:tmp_values_col2, type:string, comment:), ] +_col0 _col1 +PREHOOK: query: alter table acid_partitioned4 change column a a int cascade +PREHOOK: type: ALTERTABLE_RENAMECOL +PREHOOK: Input: default@acid_partitioned4 +PREHOOK: Output: default@acid_partitioned4 +PREHOOK: Output: default@acid_partitioned4@part=1 +POSTHOOK: query: alter table acid_partitioned4 change column a a int cascade +POSTHOOK: type: ALTERTABLE_RENAMECOL +POSTHOOK: Input: default@acid_partitioned4 +POSTHOOK: Output: default@acid_partitioned4 +POSTHOOK: Output: default@acid_partitioned4@part=1 +PREHOOK: query: insert into table acid_partitioned4 partition(part=2) values(72909, 'new'),(200, 'new'), (32768, 'new'),(40000, 'new') +PREHOOK: type: QUERY +PREHOOK: Input: default@values__tmp__table__8 +PREHOOK: Output: default@acid_partitioned4@part=2 +POSTHOOK: query: insert into table acid_partitioned4 partition(part=2) values(72909, 'new'),(200, 'new'), (32768, 'new'),(40000, 'new') +POSTHOOK: type: QUERY +POSTHOOK: Input: default@values__tmp__table__8 +POSTHOOK: Output: default@acid_partitioned4@part=2 +POSTHOOK: Lineage: acid_partitioned4 PARTITION(part=2).a EXPRESSION [(values__tmp__table__8)values__tmp__table__8.FieldSchema(name:tmp_values_col1, type:string, comment:), ] +POSTHOOK: Lineage: acid_partitioned4 PARTITION(part=2).b SIMPLE [(values__tmp__table__8)values__tmp__table__8.FieldSchema(name:tmp_values_col2, type:string, comment:), ] +_col0 _col1 +PREHOOK: query: insert into table acid_partitioned4 partition(part=1) values(5000, 'new'),(90000, 'new') +PREHOOK: type: QUERY +PREHOOK: Input: default@values__tmp__table__9 +PREHOOK: Output: default@acid_partitioned4@part=1 +POSTHOOK: query: insert into table acid_partitioned4 partition(part=1) values(5000, 'new'),(90000, 'new') +POSTHOOK: type: QUERY +POSTHOOK: Input: default@values__tmp__table__9 +POSTHOOK: Output: default@acid_partitioned4@part=1 +POSTHOOK: Lineage: acid_partitioned4 PARTITION(part=1).a EXPRESSION [(values__tmp__table__9)values__tmp__table__9.FieldSchema(name:tmp_values_col1, type:string, comment:), ] +POSTHOOK: Lineage: acid_partitioned4 PARTITION(part=1).b SIMPLE [(values__tmp__table__9)values__tmp__table__9.FieldSchema(name:tmp_values_col2, type:string, comment:), ] +_col0 _col1 +PREHOOK: query: select part,a,b from acid_partitioned4 order by a +PREHOOK: type: QUERY +PREHOOK: 
Input: default@acid_partitioned4 +PREHOOK: Input: default@acid_partitioned4@part=1 +PREHOOK: Input: default@acid_partitioned4@part=2 +#### A masked pattern was here #### +POSTHOOK: query: select part,a,b from acid_partitioned4 order by a +POSTHOOK: type: QUERY +POSTHOOK: Input: default@acid_partitioned4 +POSTHOOK: Input: default@acid_partitioned4@part=1 +POSTHOOK: Input: default@acid_partitioned4@part=2 +#### A masked pattern was here #### +part a b +1 3 original +1 4 original +2 200 new +1 1000 original +1 5000 new +1 6737 original +2 32768 new +2 40000 new +2 72909 new +1 90000 new +PREHOOK: query: describe extended acid_partitioned4 partition(part=1) +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@acid_partitioned4 +POSTHOOK: query: describe extended acid_partitioned4 partition(part=1) +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@acid_partitioned4 +col_name data_type comment +a int +b string +part int + +# Partition Information +# col_name data_type comment + +part int + +#### A masked pattern was here #### +PREHOOK: query: describe extended acid_partitioned4 partition(part=2) +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@acid_partitioned4 +POSTHOOK: query: describe extended acid_partitioned4 partition(part=2) +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@acid_partitioned4 +col_name data_type comment +a int +b string +part int + +# Partition Information +# col_name data_type comment + +part int + +#### A masked pattern was here #### diff --git ql/src/test/results/clientpositive/schema_evol_orc_nonvec_table.q.out ql/src/test/results/clientpositive/schema_evol_orc_nonvec_table.q.out new file mode 100644 index 0000000..bb8481e --- /dev/null +++ ql/src/test/results/clientpositive/schema_evol_orc_nonvec_table.q.out @@ -0,0 +1,545 @@ +PREHOOK: query: -- Non-Vec +-- ALTER TABLE ADD COLUMNS ... RESTRICT +CREATE TABLE acid_partitioned1(a INT, b STRING) PARTITIONED BY(part INT) STORED AS ORC +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@acid_partitioned1 +POSTHOOK: query: -- Non-Vec +-- ALTER TABLE ADD COLUMNS ... 
RESTRICT +CREATE TABLE acid_partitioned1(a INT, b STRING) PARTITIONED BY(part INT) STORED AS ORC +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@acid_partitioned1 +PREHOOK: query: insert into table acid_partitioned1 partition(part=1) values(1, 'original'),(2, 'original'), (3, 'original'),(4, 'original') +PREHOOK: type: QUERY +PREHOOK: Input: default@values__tmp__table__1 +PREHOOK: Output: default@acid_partitioned1@part=1 +POSTHOOK: query: insert into table acid_partitioned1 partition(part=1) values(1, 'original'),(2, 'original'), (3, 'original'),(4, 'original') +POSTHOOK: type: QUERY +POSTHOOK: Input: default@values__tmp__table__1 +POSTHOOK: Output: default@acid_partitioned1@part=1 +POSTHOOK: Lineage: acid_partitioned1 PARTITION(part=1).a EXPRESSION [(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col1, type:string, comment:), ] +POSTHOOK: Lineage: acid_partitioned1 PARTITION(part=1).b SIMPLE [(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col2, type:string, comment:), ] +_col0 _col1 +PREHOOK: query: alter table acid_partitioned1 add columns(c int, d string) +PREHOOK: type: ALTERTABLE_ADDCOLS +PREHOOK: Input: default@acid_partitioned1 +PREHOOK: Output: default@acid_partitioned1 +POSTHOOK: query: alter table acid_partitioned1 add columns(c int, d string) +POSTHOOK: type: ALTERTABLE_ADDCOLS +POSTHOOK: Input: default@acid_partitioned1 +POSTHOOK: Output: default@acid_partitioned1 +PREHOOK: query: insert into table acid_partitioned1 partition(part=2) values(1, 'new', 10, 'ten'),(2, 'new', 20, 'twenty'), (3, 'new', 30, 'thirty'),(4, 'new', 40, 'forty') +PREHOOK: type: QUERY +PREHOOK: Input: default@values__tmp__table__2 +PREHOOK: Output: default@acid_partitioned1@part=2 +POSTHOOK: query: insert into table acid_partitioned1 partition(part=2) values(1, 'new', 10, 'ten'),(2, 'new', 20, 'twenty'), (3, 'new', 30, 'thirty'),(4, 'new', 40, 'forty') +POSTHOOK: type: QUERY +POSTHOOK: Input: default@values__tmp__table__2 +POSTHOOK: Output: default@acid_partitioned1@part=2 +POSTHOOK: Lineage: acid_partitioned1 PARTITION(part=2).a EXPRESSION [(values__tmp__table__2)values__tmp__table__2.FieldSchema(name:tmp_values_col1, type:string, comment:), ] +POSTHOOK: Lineage: acid_partitioned1 PARTITION(part=2).b SIMPLE [(values__tmp__table__2)values__tmp__table__2.FieldSchema(name:tmp_values_col2, type:string, comment:), ] +POSTHOOK: Lineage: acid_partitioned1 PARTITION(part=2).c EXPRESSION [(values__tmp__table__2)values__tmp__table__2.FieldSchema(name:tmp_values_col3, type:string, comment:), ] +POSTHOOK: Lineage: acid_partitioned1 PARTITION(part=2).d SIMPLE [(values__tmp__table__2)values__tmp__table__2.FieldSchema(name:tmp_values_col4, type:string, comment:), ] +_col0 _col1 _col2 _col3 +PREHOOK: query: insert into table acid_partitioned1 partition(part=1) values(5, 'new', 100, 'hundred'),(6, 'new', 200, 'two hundred') +PREHOOK: type: QUERY +PREHOOK: Input: default@values__tmp__table__3 +PREHOOK: Output: default@acid_partitioned1@part=1 +POSTHOOK: query: insert into table acid_partitioned1 partition(part=1) values(5, 'new', 100, 'hundred'),(6, 'new', 200, 'two hundred') +POSTHOOK: type: QUERY +POSTHOOK: Input: default@values__tmp__table__3 +POSTHOOK: Output: default@acid_partitioned1@part=1 +POSTHOOK: Lineage: acid_partitioned1 PARTITION(part=1).a EXPRESSION [(values__tmp__table__3)values__tmp__table__3.FieldSchema(name:tmp_values_col1, type:string, comment:), ] +POSTHOOK: Lineage: acid_partitioned1 PARTITION(part=1).b 
SIMPLE [(values__tmp__table__3)values__tmp__table__3.FieldSchema(name:tmp_values_col2, type:string, comment:), ] +POSTHOOK: Lineage: acid_partitioned1 PARTITION(part=1).c EXPRESSION [(values__tmp__table__3)values__tmp__table__3.FieldSchema(name:tmp_values_col3, type:string, comment:), ] +POSTHOOK: Lineage: acid_partitioned1 PARTITION(part=1).d SIMPLE [(values__tmp__table__3)values__tmp__table__3.FieldSchema(name:tmp_values_col4, type:string, comment:), ] +_col0 _col1 _col2 _col3 +PREHOOK: query: explain +select part,a,b,c,d from acid_partitioned1 order by a +PREHOOK: type: QUERY +POSTHOOK: query: explain +select part,a,b,c,d from acid_partitioned1 order by a +POSTHOOK: type: QUERY +Explain +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: acid_partitioned1 + Statistics: Num rows: 10 Data size: 1496 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: part (type: int), a (type: int), b (type: string), c (type: int), d (type: string) + outputColumnNames: _col0, _col1, _col2, _col3, _col4 + Statistics: Num rows: 10 Data size: 1496 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col1 (type: int) + sort order: + + Statistics: Num rows: 10 Data size: 1496 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: int), _col2 (type: string), _col3 (type: int), _col4 (type: string) + Reduce Operator Tree: + Select Operator + expressions: VALUE._col0 (type: int), KEY.reducesinkkey0 (type: int), VALUE._col1 (type: string), VALUE._col2 (type: int), VALUE._col3 (type: string) + outputColumnNames: _col0, _col1, _col2, _col3, _col4 + Statistics: Num rows: 10 Data size: 1496 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 10 Data size: 1496 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select part,a,b,c,d from acid_partitioned1 order by a +PREHOOK: type: QUERY +PREHOOK: Input: default@acid_partitioned1 +PREHOOK: Input: default@acid_partitioned1@part=1 +PREHOOK: Input: default@acid_partitioned1@part=2 +#### A masked pattern was here #### +POSTHOOK: query: select part,a,b,c,d from acid_partitioned1 order by a +POSTHOOK: type: QUERY +POSTHOOK: Input: default@acid_partitioned1 +POSTHOOK: Input: default@acid_partitioned1@part=1 +POSTHOOK: Input: default@acid_partitioned1@part=2 +#### A masked pattern was here #### +part a b c d +2 1 new 10 ten +1 1 original NULL NULL +2 2 new 20 twenty +1 2 original NULL NULL +2 3 new 30 thirty +1 3 original NULL NULL +2 4 new 40 forty +1 4 original NULL NULL +1 5 new 100 hundred +1 6 new 200 two hundred +PREHOOK: query: describe extended acid_partitioned1 partition(part=1) +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@acid_partitioned1 +POSTHOOK: query: describe extended acid_partitioned1 partition(part=1) +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@acid_partitioned1 +col_name data_type comment +a int +b string +part int + +# Partition Information +# col_name data_type comment + +part int + +#### A masked pattern was here #### +PREHOOK: query: describe extended acid_partitioned1 partition(part=2) +PREHOOK: type: DESCTABLE 
+PREHOOK: Input: default@acid_partitioned1 +POSTHOOK: query: describe extended acid_partitioned1 partition(part=2) +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@acid_partitioned1 +col_name data_type comment +a int +b string +c int +d string +part int + +# Partition Information +# col_name data_type comment + +part int + +#### A masked pattern was here #### +PREHOOK: query: alter table acid_partitioned1 partition(part=1) add columns(c int, d string) +PREHOOK: type: ALTERTABLE_ADDCOLS +PREHOOK: Input: default@acid_partitioned1 +PREHOOK: Output: default@acid_partitioned1@part=1 +POSTHOOK: query: alter table acid_partitioned1 partition(part=1) add columns(c int, d string) +POSTHOOK: type: ALTERTABLE_ADDCOLS +POSTHOOK: Input: default@acid_partitioned1 +POSTHOOK: Input: default@acid_partitioned1@part=1 +POSTHOOK: Output: default@acid_partitioned1@part=1 +PREHOOK: query: describe extended acid_partitioned1 partition(part=1) +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@acid_partitioned1 +POSTHOOK: query: describe extended acid_partitioned1 partition(part=1) +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@acid_partitioned1 +col_name data_type comment +a int +b string +c int +d string +part int + +# Partition Information +# col_name data_type comment + +part int + +#### A masked pattern was here #### +PREHOOK: query: explain +select part,a,b,c,d from acid_partitioned1 order by a +PREHOOK: type: QUERY +POSTHOOK: query: explain +select part,a,b,c,d from acid_partitioned1 order by a +POSTHOOK: type: QUERY +Explain +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: acid_partitioned1 + Statistics: Num rows: 10 Data size: 1496 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: part (type: int), a (type: int), b (type: string), c (type: int), d (type: string) + outputColumnNames: _col0, _col1, _col2, _col3, _col4 + Statistics: Num rows: 10 Data size: 1496 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col1 (type: int) + sort order: + + Statistics: Num rows: 10 Data size: 1496 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: int), _col2 (type: string), _col3 (type: int), _col4 (type: string) + Reduce Operator Tree: + Select Operator + expressions: VALUE._col0 (type: int), KEY.reducesinkkey0 (type: int), VALUE._col1 (type: string), VALUE._col2 (type: int), VALUE._col3 (type: string) + outputColumnNames: _col0, _col1, _col2, _col3, _col4 + Statistics: Num rows: 10 Data size: 1496 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 10 Data size: 1496 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select part,a,b,c,d from acid_partitioned1 order by a +PREHOOK: type: QUERY +PREHOOK: Input: default@acid_partitioned1 +PREHOOK: Input: default@acid_partitioned1@part=1 +PREHOOK: Input: default@acid_partitioned1@part=2 +#### A masked pattern was here #### +POSTHOOK: query: select part,a,b,c,d from acid_partitioned1 order by a +POSTHOOK: type: QUERY +POSTHOOK: Input: default@acid_partitioned1 +POSTHOOK: Input: default@acid_partitioned1@part=1 +POSTHOOK: 
Input: default@acid_partitioned1@part=2 +#### A masked pattern was here #### +part a b c d +2 1 new 10 ten +1 1 original NULL NULL +2 2 new 20 twenty +1 2 original NULL NULL +2 3 new 30 thirty +1 3 original NULL NULL +2 4 new 40 forty +1 4 original NULL NULL +1 5 new 100 hundred +1 6 new 200 two hundred +PREHOOK: query: -- Non-Vec +-- ALTER TABLE ADD COLUMNS ... CASCADE +CREATE TABLE acid_partitioned2(a INT, b STRING) PARTITIONED BY(part INT) STORED AS ORC +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@acid_partitioned2 +POSTHOOK: query: -- Non-Vec +-- ALTER TABLE ADD COLUMNS ... CASCADE +CREATE TABLE acid_partitioned2(a INT, b STRING) PARTITIONED BY(part INT) STORED AS ORC +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@acid_partitioned2 +PREHOOK: query: insert into table acid_partitioned2 partition(part=1) values(1, 'original'),(2, 'original'), (3, 'original'),(4, 'original') +PREHOOK: type: QUERY +PREHOOK: Input: default@values__tmp__table__4 +PREHOOK: Output: default@acid_partitioned2@part=1 +POSTHOOK: query: insert into table acid_partitioned2 partition(part=1) values(1, 'original'),(2, 'original'), (3, 'original'),(4, 'original') +POSTHOOK: type: QUERY +POSTHOOK: Input: default@values__tmp__table__4 +POSTHOOK: Output: default@acid_partitioned2@part=1 +POSTHOOK: Lineage: acid_partitioned2 PARTITION(part=1).a EXPRESSION [(values__tmp__table__4)values__tmp__table__4.FieldSchema(name:tmp_values_col1, type:string, comment:), ] +POSTHOOK: Lineage: acid_partitioned2 PARTITION(part=1).b SIMPLE [(values__tmp__table__4)values__tmp__table__4.FieldSchema(name:tmp_values_col2, type:string, comment:), ] +_col0 _col1 +PREHOOK: query: alter table acid_partitioned2 add columns(c int, d string) cascade +PREHOOK: type: ALTERTABLE_ADDCOLS +PREHOOK: Input: default@acid_partitioned2 +PREHOOK: Output: default@acid_partitioned2 +PREHOOK: Output: default@acid_partitioned2@part=1 +POSTHOOK: query: alter table acid_partitioned2 add columns(c int, d string) cascade +POSTHOOK: type: ALTERTABLE_ADDCOLS +POSTHOOK: Input: default@acid_partitioned2 +POSTHOOK: Output: default@acid_partitioned2 +POSTHOOK: Output: default@acid_partitioned2@part=1 +PREHOOK: query: insert into table acid_partitioned2 partition(part=2) values(1, 'new', 10, 'ten'),(2, 'new', 20, 'twenty'), (3, 'new', 30, 'thirty'),(4, 'new', 40, 'forty') +PREHOOK: type: QUERY +PREHOOK: Input: default@values__tmp__table__5 +PREHOOK: Output: default@acid_partitioned2@part=2 +POSTHOOK: query: insert into table acid_partitioned2 partition(part=2) values(1, 'new', 10, 'ten'),(2, 'new', 20, 'twenty'), (3, 'new', 30, 'thirty'),(4, 'new', 40, 'forty') +POSTHOOK: type: QUERY +POSTHOOK: Input: default@values__tmp__table__5 +POSTHOOK: Output: default@acid_partitioned2@part=2 +POSTHOOK: Lineage: acid_partitioned2 PARTITION(part=2).a EXPRESSION [(values__tmp__table__5)values__tmp__table__5.FieldSchema(name:tmp_values_col1, type:string, comment:), ] +POSTHOOK: Lineage: acid_partitioned2 PARTITION(part=2).b SIMPLE [(values__tmp__table__5)values__tmp__table__5.FieldSchema(name:tmp_values_col2, type:string, comment:), ] +POSTHOOK: Lineage: acid_partitioned2 PARTITION(part=2).c EXPRESSION [(values__tmp__table__5)values__tmp__table__5.FieldSchema(name:tmp_values_col3, type:string, comment:), ] +POSTHOOK: Lineage: acid_partitioned2 PARTITION(part=2).d SIMPLE [(values__tmp__table__5)values__tmp__table__5.FieldSchema(name:tmp_values_col4, type:string, comment:), ] +_col0 _col1 _col2 _col3 
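The CHANGE COLUMN blocks later in this file exercise type widening (SMALLINT to INT) with CASCADE, which applies the new column type to the table and to every existing partition. Reduced to a sketch, again with an illustrative table name that is not part of this patch:

-- Illustrative sketch only; widen_sketch is not a table from this patch.
CREATE TABLE widen_sketch(a SMALLINT, b STRING) PARTITIONED BY(part INT) STORED AS ORC;
INSERT INTO TABLE widen_sketch PARTITION(part=1) VALUES (1000, 'original');
ALTER TABLE widen_sketch CHANGE COLUMN a a INT CASCADE;   -- widened type reaches existing partitions as well
INSERT INTO TABLE widen_sketch PARTITION(part=2) VALUES (72909, 'new');
SELECT part, a, b FROM widen_sketch ORDER BY a;           -- rows written before and after the ALTER read back as INT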
+PREHOOK: query: insert into table acid_partitioned2 partition(part=1) values(5, 'new', 100, 'hundred'),(6, 'new', 200, 'two hundred') +PREHOOK: type: QUERY +PREHOOK: Input: default@values__tmp__table__6 +PREHOOK: Output: default@acid_partitioned2@part=1 +POSTHOOK: query: insert into table acid_partitioned2 partition(part=1) values(5, 'new', 100, 'hundred'),(6, 'new', 200, 'two hundred') +POSTHOOK: type: QUERY +POSTHOOK: Input: default@values__tmp__table__6 +POSTHOOK: Output: default@acid_partitioned2@part=1 +POSTHOOK: Lineage: acid_partitioned2 PARTITION(part=1).a EXPRESSION [(values__tmp__table__6)values__tmp__table__6.FieldSchema(name:tmp_values_col1, type:string, comment:), ] +POSTHOOK: Lineage: acid_partitioned2 PARTITION(part=1).b SIMPLE [(values__tmp__table__6)values__tmp__table__6.FieldSchema(name:tmp_values_col2, type:string, comment:), ] +POSTHOOK: Lineage: acid_partitioned2 PARTITION(part=1).c EXPRESSION [(values__tmp__table__6)values__tmp__table__6.FieldSchema(name:tmp_values_col3, type:string, comment:), ] +POSTHOOK: Lineage: acid_partitioned2 PARTITION(part=1).d SIMPLE [(values__tmp__table__6)values__tmp__table__6.FieldSchema(name:tmp_values_col4, type:string, comment:), ] +_col0 _col1 _col2 _col3 +PREHOOK: query: select part,a,b,c,d from acid_partitioned2 order by a +PREHOOK: type: QUERY +PREHOOK: Input: default@acid_partitioned2 +PREHOOK: Input: default@acid_partitioned2@part=1 +PREHOOK: Input: default@acid_partitioned2@part=2 +#### A masked pattern was here #### +POSTHOOK: query: select part,a,b,c,d from acid_partitioned2 order by a +POSTHOOK: type: QUERY +POSTHOOK: Input: default@acid_partitioned2 +POSTHOOK: Input: default@acid_partitioned2@part=1 +POSTHOOK: Input: default@acid_partitioned2@part=2 +#### A masked pattern was here #### +part a b c d +2 1 new 10 ten +1 1 original NULL NULL +2 2 new 20 twenty +1 2 original NULL NULL +2 3 new 30 thirty +1 3 original NULL NULL +2 4 new 40 forty +1 4 original NULL NULL +1 5 new 100 hundred +1 6 new 200 two hundred +PREHOOK: query: describe extended acid_partitioned2 partition(part=1) +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@acid_partitioned2 +POSTHOOK: query: describe extended acid_partitioned2 partition(part=1) +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@acid_partitioned2 +col_name data_type comment +a int +b string +c int +d string +part int + +# Partition Information +# col_name data_type comment + +part int + +#### A masked pattern was here #### +PREHOOK: query: describe extended acid_partitioned2 partition(part=2) +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@acid_partitioned2 +POSTHOOK: query: describe extended acid_partitioned2 partition(part=2) +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@acid_partitioned2 +col_name data_type comment +a int +b string +c int +d string +part int + +# Partition Information +# col_name data_type comment + +part int + +#### A masked pattern was here #### +PREHOOK: query: -- Non-Vec +-- ALTER TABLE CHANGE COLUMN ... 
RESTRICT +-- smallint = (2-byte signed integer, from -32,768 to 32,767) +-- Diabled now because of java.io.IOException: ORC does not support type conversion from INT to SHORT +-- CREATE TABLE acid_partitioned3(a smallint, b STRING) PARTITIONED BY(part INT) STORED AS ORC; +-- +-- insert into table acid_partitioned3 partition(part=1) values(1000, 'original'),(6737, 'original'), ('3', 'original'),('4', 'original'); +-- +-- alter table acid_partitioned3 change column a a int; +-- +-- insert into table acid_partitioned3 partition(part=2) values(72909, 'new'),(200, 'new'), (32768, 'new'),(40000, 'new'); +-- +-- insert into table acid_partitioned3 partition(part=1) values(5000, 'new'),(90000, 'new'); +-- +-- select part,a,b from acid_partitioned3 order by a; +-- +-- describe extended acid_partitioned3 partition(part=1); +-- describe extended acid_partitioned3 partition(part=2); +-- +-- alter table acid_partitioned3 partition(part=1) change column value a int; +-- +-- describe extended acid_partitioned3 partition(part=1); +-- +-- select part,a,b from acid_partitioned3 order by a; +-- +-- +-- Non-Vec +-- ALTER TABLE CHANGE COLUMN ... CASCADE +-- smallint = (2-byte signed integer, from -32,768 to 32,767) +CREATE TABLE acid_partitioned4(a smallint, b STRING) PARTITIONED BY(part INT) STORED AS ORC +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@acid_partitioned4 +POSTHOOK: query: -- Non-Vec +-- ALTER TABLE CHANGE COLUMN ... RESTRICT +-- smallint = (2-byte signed integer, from -32,768 to 32,767) +-- Diabled now because of java.io.IOException: ORC does not support type conversion from INT to SHORT +-- CREATE TABLE acid_partitioned3(a smallint, b STRING) PARTITIONED BY(part INT) STORED AS ORC; +-- +-- insert into table acid_partitioned3 partition(part=1) values(1000, 'original'),(6737, 'original'), ('3', 'original'),('4', 'original'); +-- +-- alter table acid_partitioned3 change column a a int; +-- +-- insert into table acid_partitioned3 partition(part=2) values(72909, 'new'),(200, 'new'), (32768, 'new'),(40000, 'new'); +-- +-- insert into table acid_partitioned3 partition(part=1) values(5000, 'new'),(90000, 'new'); +-- +-- select part,a,b from acid_partitioned3 order by a; +-- +-- describe extended acid_partitioned3 partition(part=1); +-- describe extended acid_partitioned3 partition(part=2); +-- +-- alter table acid_partitioned3 partition(part=1) change column value a int; +-- +-- describe extended acid_partitioned3 partition(part=1); +-- +-- select part,a,b from acid_partitioned3 order by a; +-- +-- +-- Non-Vec +-- ALTER TABLE CHANGE COLUMN ... 
CASCADE +-- smallint = (2-byte signed integer, from -32,768 to 32,767) +CREATE TABLE acid_partitioned4(a smallint, b STRING) PARTITIONED BY(part INT) STORED AS ORC +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@acid_partitioned4 +PREHOOK: query: insert into table acid_partitioned4 partition(part=1) values(1000, 'original'),(6737, 'original'), ('3', 'original'),('4', 'original') +PREHOOK: type: QUERY +PREHOOK: Input: default@values__tmp__table__7 +PREHOOK: Output: default@acid_partitioned4@part=1 +POSTHOOK: query: insert into table acid_partitioned4 partition(part=1) values(1000, 'original'),(6737, 'original'), ('3', 'original'),('4', 'original') +POSTHOOK: type: QUERY +POSTHOOK: Input: default@values__tmp__table__7 +POSTHOOK: Output: default@acid_partitioned4@part=1 +POSTHOOK: Lineage: acid_partitioned4 PARTITION(part=1).a EXPRESSION [(values__tmp__table__7)values__tmp__table__7.FieldSchema(name:tmp_values_col1, type:string, comment:), ] +POSTHOOK: Lineage: acid_partitioned4 PARTITION(part=1).b SIMPLE [(values__tmp__table__7)values__tmp__table__7.FieldSchema(name:tmp_values_col2, type:string, comment:), ] +_col0 _col1 +PREHOOK: query: alter table acid_partitioned4 change column a a int cascade +PREHOOK: type: ALTERTABLE_RENAMECOL +PREHOOK: Input: default@acid_partitioned4 +PREHOOK: Output: default@acid_partitioned4 +PREHOOK: Output: default@acid_partitioned4@part=1 +POSTHOOK: query: alter table acid_partitioned4 change column a a int cascade +POSTHOOK: type: ALTERTABLE_RENAMECOL +POSTHOOK: Input: default@acid_partitioned4 +POSTHOOK: Output: default@acid_partitioned4 +POSTHOOK: Output: default@acid_partitioned4@part=1 +PREHOOK: query: insert into table acid_partitioned4 partition(part=2) values(72909, 'new'),(200, 'new'), (32768, 'new'),(40000, 'new') +PREHOOK: type: QUERY +PREHOOK: Input: default@values__tmp__table__8 +PREHOOK: Output: default@acid_partitioned4@part=2 +POSTHOOK: query: insert into table acid_partitioned4 partition(part=2) values(72909, 'new'),(200, 'new'), (32768, 'new'),(40000, 'new') +POSTHOOK: type: QUERY +POSTHOOK: Input: default@values__tmp__table__8 +POSTHOOK: Output: default@acid_partitioned4@part=2 +POSTHOOK: Lineage: acid_partitioned4 PARTITION(part=2).a EXPRESSION [(values__tmp__table__8)values__tmp__table__8.FieldSchema(name:tmp_values_col1, type:string, comment:), ] +POSTHOOK: Lineage: acid_partitioned4 PARTITION(part=2).b SIMPLE [(values__tmp__table__8)values__tmp__table__8.FieldSchema(name:tmp_values_col2, type:string, comment:), ] +_col0 _col1 +PREHOOK: query: insert into table acid_partitioned4 partition(part=1) values(5000, 'new'),(90000, 'new') +PREHOOK: type: QUERY +PREHOOK: Input: default@values__tmp__table__9 +PREHOOK: Output: default@acid_partitioned4@part=1 +POSTHOOK: query: insert into table acid_partitioned4 partition(part=1) values(5000, 'new'),(90000, 'new') +POSTHOOK: type: QUERY +POSTHOOK: Input: default@values__tmp__table__9 +POSTHOOK: Output: default@acid_partitioned4@part=1 +POSTHOOK: Lineage: acid_partitioned4 PARTITION(part=1).a EXPRESSION [(values__tmp__table__9)values__tmp__table__9.FieldSchema(name:tmp_values_col1, type:string, comment:), ] +POSTHOOK: Lineage: acid_partitioned4 PARTITION(part=1).b SIMPLE [(values__tmp__table__9)values__tmp__table__9.FieldSchema(name:tmp_values_col2, type:string, comment:), ] +_col0 _col1 +PREHOOK: query: select part,a,b from acid_partitioned4 order by a +PREHOOK: type: QUERY +PREHOOK: Input: default@acid_partitioned4 +PREHOOK: Input: 
default@acid_partitioned4@part=1 +PREHOOK: Input: default@acid_partitioned4@part=2 +#### A masked pattern was here #### +POSTHOOK: query: select part,a,b from acid_partitioned4 order by a +POSTHOOK: type: QUERY +POSTHOOK: Input: default@acid_partitioned4 +POSTHOOK: Input: default@acid_partitioned4@part=1 +POSTHOOK: Input: default@acid_partitioned4@part=2 +#### A masked pattern was here #### +part a b +1 3 original +1 4 original +2 200 new +1 1000 original +1 5000 new +1 6737 original +2 32768 new +2 40000 new +2 72909 new +1 90000 new +PREHOOK: query: describe extended acid_partitioned4 partition(part=1) +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@acid_partitioned4 +POSTHOOK: query: describe extended acid_partitioned4 partition(part=1) +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@acid_partitioned4 +col_name data_type comment +a int +b string +part int + +# Partition Information +# col_name data_type comment + +part int + +#### A masked pattern was here #### +PREHOOK: query: describe extended acid_partitioned4 partition(part=2) +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@acid_partitioned4 +POSTHOOK: query: describe extended acid_partitioned4 partition(part=2) +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@acid_partitioned4 +col_name data_type comment +a int +b string +part int + +# Partition Information +# col_name data_type comment + +part int + +#### A masked pattern was here #### diff --git ql/src/test/results/clientpositive/schema_evol_orc_vec_table.q.out ql/src/test/results/clientpositive/schema_evol_orc_vec_table.q.out new file mode 100644 index 0000000..6d6af86 --- /dev/null +++ ql/src/test/results/clientpositive/schema_evol_orc_vec_table.q.out @@ -0,0 +1,546 @@ +PREHOOK: query: -- Non-Vec +-- ALTER TABLE ADD COLUMNS ... RESTRICT +CREATE TABLE acid_partitioned1(a INT, b STRING) PARTITIONED BY(part INT) STORED AS ORC +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@acid_partitioned1 +POSTHOOK: query: -- Non-Vec +-- ALTER TABLE ADD COLUMNS ... 
RESTRICT +CREATE TABLE acid_partitioned1(a INT, b STRING) PARTITIONED BY(part INT) STORED AS ORC +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@acid_partitioned1 +PREHOOK: query: insert into table acid_partitioned1 partition(part=1) values(1, 'original'),(2, 'original'), (3, 'original'),(4, 'original') +PREHOOK: type: QUERY +PREHOOK: Input: default@values__tmp__table__1 +PREHOOK: Output: default@acid_partitioned1@part=1 +POSTHOOK: query: insert into table acid_partitioned1 partition(part=1) values(1, 'original'),(2, 'original'), (3, 'original'),(4, 'original') +POSTHOOK: type: QUERY +POSTHOOK: Input: default@values__tmp__table__1 +POSTHOOK: Output: default@acid_partitioned1@part=1 +POSTHOOK: Lineage: acid_partitioned1 PARTITION(part=1).a EXPRESSION [(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col1, type:string, comment:), ] +POSTHOOK: Lineage: acid_partitioned1 PARTITION(part=1).b SIMPLE [(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col2, type:string, comment:), ] +_col0 _col1 +PREHOOK: query: alter table acid_partitioned1 add columns(c int, d string) +PREHOOK: type: ALTERTABLE_ADDCOLS +PREHOOK: Input: default@acid_partitioned1 +PREHOOK: Output: default@acid_partitioned1 +POSTHOOK: query: alter table acid_partitioned1 add columns(c int, d string) +POSTHOOK: type: ALTERTABLE_ADDCOLS +POSTHOOK: Input: default@acid_partitioned1 +POSTHOOK: Output: default@acid_partitioned1 +PREHOOK: query: insert into table acid_partitioned1 partition(part=2) values(1, 'new', 10, 'ten'),(2, 'new', 20, 'twenty'), (3, 'new', 30, 'thirty'),(4, 'new', 40, 'forty') +PREHOOK: type: QUERY +PREHOOK: Input: default@values__tmp__table__2 +PREHOOK: Output: default@acid_partitioned1@part=2 +POSTHOOK: query: insert into table acid_partitioned1 partition(part=2) values(1, 'new', 10, 'ten'),(2, 'new', 20, 'twenty'), (3, 'new', 30, 'thirty'),(4, 'new', 40, 'forty') +POSTHOOK: type: QUERY +POSTHOOK: Input: default@values__tmp__table__2 +POSTHOOK: Output: default@acid_partitioned1@part=2 +POSTHOOK: Lineage: acid_partitioned1 PARTITION(part=2).a EXPRESSION [(values__tmp__table__2)values__tmp__table__2.FieldSchema(name:tmp_values_col1, type:string, comment:), ] +POSTHOOK: Lineage: acid_partitioned1 PARTITION(part=2).b SIMPLE [(values__tmp__table__2)values__tmp__table__2.FieldSchema(name:tmp_values_col2, type:string, comment:), ] +POSTHOOK: Lineage: acid_partitioned1 PARTITION(part=2).c EXPRESSION [(values__tmp__table__2)values__tmp__table__2.FieldSchema(name:tmp_values_col3, type:string, comment:), ] +POSTHOOK: Lineage: acid_partitioned1 PARTITION(part=2).d SIMPLE [(values__tmp__table__2)values__tmp__table__2.FieldSchema(name:tmp_values_col4, type:string, comment:), ] +_col0 _col1 _col2 _col3 +PREHOOK: query: insert into table acid_partitioned1 partition(part=1) values(5, 'new', 100, 'hundred'),(6, 'new', 200, 'two hundred') +PREHOOK: type: QUERY +PREHOOK: Input: default@values__tmp__table__3 +PREHOOK: Output: default@acid_partitioned1@part=1 +POSTHOOK: query: insert into table acid_partitioned1 partition(part=1) values(5, 'new', 100, 'hundred'),(6, 'new', 200, 'two hundred') +POSTHOOK: type: QUERY +POSTHOOK: Input: default@values__tmp__table__3 +POSTHOOK: Output: default@acid_partitioned1@part=1 +POSTHOOK: Lineage: acid_partitioned1 PARTITION(part=1).a EXPRESSION [(values__tmp__table__3)values__tmp__table__3.FieldSchema(name:tmp_values_col1, type:string, comment:), ] +POSTHOOK: Lineage: acid_partitioned1 PARTITION(part=1).b 
SIMPLE [(values__tmp__table__3)values__tmp__table__3.FieldSchema(name:tmp_values_col2, type:string, comment:), ] +POSTHOOK: Lineage: acid_partitioned1 PARTITION(part=1).c EXPRESSION [(values__tmp__table__3)values__tmp__table__3.FieldSchema(name:tmp_values_col3, type:string, comment:), ] +POSTHOOK: Lineage: acid_partitioned1 PARTITION(part=1).d SIMPLE [(values__tmp__table__3)values__tmp__table__3.FieldSchema(name:tmp_values_col4, type:string, comment:), ] +_col0 _col1 _col2 _col3 +PREHOOK: query: explain +select part,a,b,c,d from acid_partitioned1 order by a +PREHOOK: type: QUERY +POSTHOOK: query: explain +select part,a,b,c,d from acid_partitioned1 order by a +POSTHOOK: type: QUERY +Explain +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: acid_partitioned1 + Statistics: Num rows: 10 Data size: 1496 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: part (type: int), a (type: int), b (type: string), c (type: int), d (type: string) + outputColumnNames: _col0, _col1, _col2, _col3, _col4 + Statistics: Num rows: 10 Data size: 1496 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col1 (type: int) + sort order: + + Statistics: Num rows: 10 Data size: 1496 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: int), _col2 (type: string), _col3 (type: int), _col4 (type: string) + Reduce Operator Tree: + Select Operator + expressions: VALUE._col0 (type: int), KEY.reducesinkkey0 (type: int), VALUE._col1 (type: string), VALUE._col2 (type: int), VALUE._col3 (type: string) + outputColumnNames: _col0, _col1, _col2, _col3, _col4 + Statistics: Num rows: 10 Data size: 1496 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 10 Data size: 1496 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select part,a,b,c,d from acid_partitioned1 order by a +PREHOOK: type: QUERY +PREHOOK: Input: default@acid_partitioned1 +PREHOOK: Input: default@acid_partitioned1@part=1 +PREHOOK: Input: default@acid_partitioned1@part=2 +#### A masked pattern was here #### +POSTHOOK: query: select part,a,b,c,d from acid_partitioned1 order by a +POSTHOOK: type: QUERY +POSTHOOK: Input: default@acid_partitioned1 +POSTHOOK: Input: default@acid_partitioned1@part=1 +POSTHOOK: Input: default@acid_partitioned1@part=2 +#### A masked pattern was here #### +part a b c d +2 1 new 10 ten +1 1 original NULL NULL +2 2 new 20 twenty +1 2 original NULL NULL +2 3 new 30 thirty +1 3 original NULL NULL +2 4 new 40 forty +1 4 original NULL NULL +1 5 new 100 hundred +1 6 new 200 two hundred +PREHOOK: query: describe extended acid_partitioned1 partition(part=1) +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@acid_partitioned1 +POSTHOOK: query: describe extended acid_partitioned1 partition(part=1) +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@acid_partitioned1 +col_name data_type comment +a int +b string +part int + +# Partition Information +# col_name data_type comment + +part int + +#### A masked pattern was here #### +PREHOOK: query: describe extended acid_partitioned1 partition(part=2) +PREHOOK: type: DESCTABLE 
+PREHOOK: Input: default@acid_partitioned1 +POSTHOOK: query: describe extended acid_partitioned1 partition(part=2) +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@acid_partitioned1 +col_name data_type comment +a int +b string +c int +d string +part int + +# Partition Information +# col_name data_type comment + +part int + +#### A masked pattern was here #### +PREHOOK: query: alter table acid_partitioned1 partition(part=1) add columns(c int, d string) +PREHOOK: type: ALTERTABLE_ADDCOLS +PREHOOK: Input: default@acid_partitioned1 +PREHOOK: Output: default@acid_partitioned1@part=1 +POSTHOOK: query: alter table acid_partitioned1 partition(part=1) add columns(c int, d string) +POSTHOOK: type: ALTERTABLE_ADDCOLS +POSTHOOK: Input: default@acid_partitioned1 +POSTHOOK: Input: default@acid_partitioned1@part=1 +POSTHOOK: Output: default@acid_partitioned1@part=1 +PREHOOK: query: describe extended acid_partitioned1 partition(part=1) +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@acid_partitioned1 +POSTHOOK: query: describe extended acid_partitioned1 partition(part=1) +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@acid_partitioned1 +col_name data_type comment +a int +b string +c int +d string +part int + +# Partition Information +# col_name data_type comment + +part int + +#### A masked pattern was here #### +PREHOOK: query: explain +select part,a,b,c,d from acid_partitioned1 order by a +PREHOOK: type: QUERY +POSTHOOK: query: explain +select part,a,b,c,d from acid_partitioned1 order by a +POSTHOOK: type: QUERY +Explain +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: acid_partitioned1 + Statistics: Num rows: 10 Data size: 1496 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: part (type: int), a (type: int), b (type: string), c (type: int), d (type: string) + outputColumnNames: _col0, _col1, _col2, _col3, _col4 + Statistics: Num rows: 10 Data size: 1496 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col1 (type: int) + sort order: + + Statistics: Num rows: 10 Data size: 1496 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: int), _col2 (type: string), _col3 (type: int), _col4 (type: string) + Execution mode: vectorized + Reduce Operator Tree: + Select Operator + expressions: VALUE._col0 (type: int), KEY.reducesinkkey0 (type: int), VALUE._col1 (type: string), VALUE._col2 (type: int), VALUE._col3 (type: string) + outputColumnNames: _col0, _col1, _col2, _col3, _col4 + Statistics: Num rows: 10 Data size: 1496 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 10 Data size: 1496 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select part,a,b,c,d from acid_partitioned1 order by a +PREHOOK: type: QUERY +PREHOOK: Input: default@acid_partitioned1 +PREHOOK: Input: default@acid_partitioned1@part=1 +PREHOOK: Input: default@acid_partitioned1@part=2 +#### A masked pattern was here #### +POSTHOOK: query: select part,a,b,c,d from acid_partitioned1 order by a +POSTHOOK: type: QUERY +POSTHOOK: Input: default@acid_partitioned1 +POSTHOOK: Input: 
default@acid_partitioned1@part=1 +POSTHOOK: Input: default@acid_partitioned1@part=2 +#### A masked pattern was here #### +part a b c d +2 1 new 10 ten +1 1 original NULL NULL +2 2 new 20 twenty +1 2 original NULL NULL +2 3 new 30 thirty +1 3 original NULL NULL +2 4 new 40 forty +1 4 original NULL NULL +1 5 new 100 hundred +1 6 new 200 two hundred +PREHOOK: query: -- Non-Vec +-- ALTER TABLE ADD COLUMNS ... CASCADE +CREATE TABLE acid_partitioned2(a INT, b STRING) PARTITIONED BY(part INT) STORED AS ORC +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@acid_partitioned2 +POSTHOOK: query: -- Non-Vec +-- ALTER TABLE ADD COLUMNS ... CASCADE +CREATE TABLE acid_partitioned2(a INT, b STRING) PARTITIONED BY(part INT) STORED AS ORC +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@acid_partitioned2 +PREHOOK: query: insert into table acid_partitioned2 partition(part=1) values(1, 'original'),(2, 'original'), (3, 'original'),(4, 'original') +PREHOOK: type: QUERY +PREHOOK: Input: default@values__tmp__table__4 +PREHOOK: Output: default@acid_partitioned2@part=1 +POSTHOOK: query: insert into table acid_partitioned2 partition(part=1) values(1, 'original'),(2, 'original'), (3, 'original'),(4, 'original') +POSTHOOK: type: QUERY +POSTHOOK: Input: default@values__tmp__table__4 +POSTHOOK: Output: default@acid_partitioned2@part=1 +POSTHOOK: Lineage: acid_partitioned2 PARTITION(part=1).a EXPRESSION [(values__tmp__table__4)values__tmp__table__4.FieldSchema(name:tmp_values_col1, type:string, comment:), ] +POSTHOOK: Lineage: acid_partitioned2 PARTITION(part=1).b SIMPLE [(values__tmp__table__4)values__tmp__table__4.FieldSchema(name:tmp_values_col2, type:string, comment:), ] +_col0 _col1 +PREHOOK: query: alter table acid_partitioned2 add columns(c int, d string) cascade +PREHOOK: type: ALTERTABLE_ADDCOLS +PREHOOK: Input: default@acid_partitioned2 +PREHOOK: Output: default@acid_partitioned2 +PREHOOK: Output: default@acid_partitioned2@part=1 +POSTHOOK: query: alter table acid_partitioned2 add columns(c int, d string) cascade +POSTHOOK: type: ALTERTABLE_ADDCOLS +POSTHOOK: Input: default@acid_partitioned2 +POSTHOOK: Output: default@acid_partitioned2 +POSTHOOK: Output: default@acid_partitioned2@part=1 +PREHOOK: query: insert into table acid_partitioned2 partition(part=2) values(1, 'new', 10, 'ten'),(2, 'new', 20, 'twenty'), (3, 'new', 30, 'thirty'),(4, 'new', 40, 'forty') +PREHOOK: type: QUERY +PREHOOK: Input: default@values__tmp__table__5 +PREHOOK: Output: default@acid_partitioned2@part=2 +POSTHOOK: query: insert into table acid_partitioned2 partition(part=2) values(1, 'new', 10, 'ten'),(2, 'new', 20, 'twenty'), (3, 'new', 30, 'thirty'),(4, 'new', 40, 'forty') +POSTHOOK: type: QUERY +POSTHOOK: Input: default@values__tmp__table__5 +POSTHOOK: Output: default@acid_partitioned2@part=2 +POSTHOOK: Lineage: acid_partitioned2 PARTITION(part=2).a EXPRESSION [(values__tmp__table__5)values__tmp__table__5.FieldSchema(name:tmp_values_col1, type:string, comment:), ] +POSTHOOK: Lineage: acid_partitioned2 PARTITION(part=2).b SIMPLE [(values__tmp__table__5)values__tmp__table__5.FieldSchema(name:tmp_values_col2, type:string, comment:), ] +POSTHOOK: Lineage: acid_partitioned2 PARTITION(part=2).c EXPRESSION [(values__tmp__table__5)values__tmp__table__5.FieldSchema(name:tmp_values_col3, type:string, comment:), ] +POSTHOOK: Lineage: acid_partitioned2 PARTITION(part=2).d SIMPLE [(values__tmp__table__5)values__tmp__table__5.FieldSchema(name:tmp_values_col4, 
type:string, comment:), ] +_col0 _col1 _col2 _col3 +PREHOOK: query: insert into table acid_partitioned2 partition(part=1) values(5, 'new', 100, 'hundred'),(6, 'new', 200, 'two hundred') +PREHOOK: type: QUERY +PREHOOK: Input: default@values__tmp__table__6 +PREHOOK: Output: default@acid_partitioned2@part=1 +POSTHOOK: query: insert into table acid_partitioned2 partition(part=1) values(5, 'new', 100, 'hundred'),(6, 'new', 200, 'two hundred') +POSTHOOK: type: QUERY +POSTHOOK: Input: default@values__tmp__table__6 +POSTHOOK: Output: default@acid_partitioned2@part=1 +POSTHOOK: Lineage: acid_partitioned2 PARTITION(part=1).a EXPRESSION [(values__tmp__table__6)values__tmp__table__6.FieldSchema(name:tmp_values_col1, type:string, comment:), ] +POSTHOOK: Lineage: acid_partitioned2 PARTITION(part=1).b SIMPLE [(values__tmp__table__6)values__tmp__table__6.FieldSchema(name:tmp_values_col2, type:string, comment:), ] +POSTHOOK: Lineage: acid_partitioned2 PARTITION(part=1).c EXPRESSION [(values__tmp__table__6)values__tmp__table__6.FieldSchema(name:tmp_values_col3, type:string, comment:), ] +POSTHOOK: Lineage: acid_partitioned2 PARTITION(part=1).d SIMPLE [(values__tmp__table__6)values__tmp__table__6.FieldSchema(name:tmp_values_col4, type:string, comment:), ] +_col0 _col1 _col2 _col3 +PREHOOK: query: select part,a,b,c,d from acid_partitioned2 order by a +PREHOOK: type: QUERY +PREHOOK: Input: default@acid_partitioned2 +PREHOOK: Input: default@acid_partitioned2@part=1 +PREHOOK: Input: default@acid_partitioned2@part=2 +#### A masked pattern was here #### +POSTHOOK: query: select part,a,b,c,d from acid_partitioned2 order by a +POSTHOOK: type: QUERY +POSTHOOK: Input: default@acid_partitioned2 +POSTHOOK: Input: default@acid_partitioned2@part=1 +POSTHOOK: Input: default@acid_partitioned2@part=2 +#### A masked pattern was here #### +part a b c d +2 1 new 10 ten +1 1 original NULL NULL +2 2 new 20 twenty +1 2 original NULL NULL +2 3 new 30 thirty +1 3 original NULL NULL +2 4 new 40 forty +1 4 original NULL NULL +1 5 new 100 hundred +1 6 new 200 two hundred +PREHOOK: query: describe extended acid_partitioned2 partition(part=1) +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@acid_partitioned2 +POSTHOOK: query: describe extended acid_partitioned2 partition(part=1) +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@acid_partitioned2 +col_name data_type comment +a int +b string +c int +d string +part int + +# Partition Information +# col_name data_type comment + +part int + +#### A masked pattern was here #### +PREHOOK: query: describe extended acid_partitioned2 partition(part=2) +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@acid_partitioned2 +POSTHOOK: query: describe extended acid_partitioned2 partition(part=2) +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@acid_partitioned2 +col_name data_type comment +a int +b string +c int +d string +part int + +# Partition Information +# col_name data_type comment + +part int + +#### A masked pattern was here #### +PREHOOK: query: -- Non-Vec +-- ALTER TABLE CHANGE COLUMN ... 
RESTRICT +-- smallint = (2-byte signed integer, from -32,768 to 32,767) +-- Diabled now because of java.io.IOException: ORC does not support type conversion from INT to SHORT +-- CREATE TABLE acid_partitioned3(a smallint, b STRING) PARTITIONED BY(part INT) STORED AS ORC; +-- +-- insert into table acid_partitioned3 partition(part=1) values(1000, 'original'),(6737, 'original'), ('3', 'original'),('4', 'original'); +-- +-- alter table acid_partitioned3 change column a a int; +-- +-- insert into table acid_partitioned3 partition(part=2) values(72909, 'new'),(200, 'new'), (32768, 'new'),(40000, 'new'); +-- +-- insert into table acid_partitioned3 partition(part=1) values(5000, 'new'),(90000, 'new'); +-- +-- select part,a,b from acid_partitioned3 order by a; +-- +-- describe extended acid_partitioned3 partition(part=1); +-- describe extended acid_partitioned3 partition(part=2); +-- +-- alter table acid_partitioned3 partition(part=1) change column value a int; +-- +-- describe extended acid_partitioned3 partition(part=1); +-- +-- select part,a,b from acid_partitioned3 order by a; +-- +-- +-- Non-Vec +-- ALTER TABLE CHANGE COLUMN ... CASCADE +-- smallint = (2-byte signed integer, from -32,768 to 32,767) +CREATE TABLE acid_partitioned4(a smallint, b STRING) PARTITIONED BY(part INT) STORED AS ORC +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@acid_partitioned4 +POSTHOOK: query: -- Non-Vec +-- ALTER TABLE CHANGE COLUMN ... RESTRICT +-- smallint = (2-byte signed integer, from -32,768 to 32,767) +-- Diabled now because of java.io.IOException: ORC does not support type conversion from INT to SHORT +-- CREATE TABLE acid_partitioned3(a smallint, b STRING) PARTITIONED BY(part INT) STORED AS ORC; +-- +-- insert into table acid_partitioned3 partition(part=1) values(1000, 'original'),(6737, 'original'), ('3', 'original'),('4', 'original'); +-- +-- alter table acid_partitioned3 change column a a int; +-- +-- insert into table acid_partitioned3 partition(part=2) values(72909, 'new'),(200, 'new'), (32768, 'new'),(40000, 'new'); +-- +-- insert into table acid_partitioned3 partition(part=1) values(5000, 'new'),(90000, 'new'); +-- +-- select part,a,b from acid_partitioned3 order by a; +-- +-- describe extended acid_partitioned3 partition(part=1); +-- describe extended acid_partitioned3 partition(part=2); +-- +-- alter table acid_partitioned3 partition(part=1) change column value a int; +-- +-- describe extended acid_partitioned3 partition(part=1); +-- +-- select part,a,b from acid_partitioned3 order by a; +-- +-- +-- Non-Vec +-- ALTER TABLE CHANGE COLUMN ... 
CASCADE +-- smallint = (2-byte signed integer, from -32,768 to 32,767) +CREATE TABLE acid_partitioned4(a smallint, b STRING) PARTITIONED BY(part INT) STORED AS ORC +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@acid_partitioned4 +PREHOOK: query: insert into table acid_partitioned4 partition(part=1) values(1000, 'original'),(6737, 'original'), ('3', 'original'),('4', 'original') +PREHOOK: type: QUERY +PREHOOK: Input: default@values__tmp__table__7 +PREHOOK: Output: default@acid_partitioned4@part=1 +POSTHOOK: query: insert into table acid_partitioned4 partition(part=1) values(1000, 'original'),(6737, 'original'), ('3', 'original'),('4', 'original') +POSTHOOK: type: QUERY +POSTHOOK: Input: default@values__tmp__table__7 +POSTHOOK: Output: default@acid_partitioned4@part=1 +POSTHOOK: Lineage: acid_partitioned4 PARTITION(part=1).a EXPRESSION [(values__tmp__table__7)values__tmp__table__7.FieldSchema(name:tmp_values_col1, type:string, comment:), ] +POSTHOOK: Lineage: acid_partitioned4 PARTITION(part=1).b SIMPLE [(values__tmp__table__7)values__tmp__table__7.FieldSchema(name:tmp_values_col2, type:string, comment:), ] +_col0 _col1 +PREHOOK: query: alter table acid_partitioned4 change column a a int cascade +PREHOOK: type: ALTERTABLE_RENAMECOL +PREHOOK: Input: default@acid_partitioned4 +PREHOOK: Output: default@acid_partitioned4 +PREHOOK: Output: default@acid_partitioned4@part=1 +POSTHOOK: query: alter table acid_partitioned4 change column a a int cascade +POSTHOOK: type: ALTERTABLE_RENAMECOL +POSTHOOK: Input: default@acid_partitioned4 +POSTHOOK: Output: default@acid_partitioned4 +POSTHOOK: Output: default@acid_partitioned4@part=1 +PREHOOK: query: insert into table acid_partitioned4 partition(part=2) values(72909, 'new'),(200, 'new'), (32768, 'new'),(40000, 'new') +PREHOOK: type: QUERY +PREHOOK: Input: default@values__tmp__table__8 +PREHOOK: Output: default@acid_partitioned4@part=2 +POSTHOOK: query: insert into table acid_partitioned4 partition(part=2) values(72909, 'new'),(200, 'new'), (32768, 'new'),(40000, 'new') +POSTHOOK: type: QUERY +POSTHOOK: Input: default@values__tmp__table__8 +POSTHOOK: Output: default@acid_partitioned4@part=2 +POSTHOOK: Lineage: acid_partitioned4 PARTITION(part=2).a EXPRESSION [(values__tmp__table__8)values__tmp__table__8.FieldSchema(name:tmp_values_col1, type:string, comment:), ] +POSTHOOK: Lineage: acid_partitioned4 PARTITION(part=2).b SIMPLE [(values__tmp__table__8)values__tmp__table__8.FieldSchema(name:tmp_values_col2, type:string, comment:), ] +_col0 _col1 +PREHOOK: query: insert into table acid_partitioned4 partition(part=1) values(5000, 'new'),(90000, 'new') +PREHOOK: type: QUERY +PREHOOK: Input: default@values__tmp__table__9 +PREHOOK: Output: default@acid_partitioned4@part=1 +POSTHOOK: query: insert into table acid_partitioned4 partition(part=1) values(5000, 'new'),(90000, 'new') +POSTHOOK: type: QUERY +POSTHOOK: Input: default@values__tmp__table__9 +POSTHOOK: Output: default@acid_partitioned4@part=1 +POSTHOOK: Lineage: acid_partitioned4 PARTITION(part=1).a EXPRESSION [(values__tmp__table__9)values__tmp__table__9.FieldSchema(name:tmp_values_col1, type:string, comment:), ] +POSTHOOK: Lineage: acid_partitioned4 PARTITION(part=1).b SIMPLE [(values__tmp__table__9)values__tmp__table__9.FieldSchema(name:tmp_values_col2, type:string, comment:), ] +_col0 _col1 +PREHOOK: query: select part,a,b from acid_partitioned4 order by a +PREHOOK: type: QUERY +PREHOOK: Input: default@acid_partitioned4 +PREHOOK: Input: 
default@acid_partitioned4@part=1 +PREHOOK: Input: default@acid_partitioned4@part=2 +#### A masked pattern was here #### +POSTHOOK: query: select part,a,b from acid_partitioned4 order by a +POSTHOOK: type: QUERY +POSTHOOK: Input: default@acid_partitioned4 +POSTHOOK: Input: default@acid_partitioned4@part=1 +POSTHOOK: Input: default@acid_partitioned4@part=2 +#### A masked pattern was here #### +part a b +1 3 original +1 4 original +2 200 new +1 1000 original +1 5000 new +1 6737 original +2 32768 new +2 40000 new +2 72909 new +1 90000 new +PREHOOK: query: describe extended acid_partitioned4 partition(part=1) +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@acid_partitioned4 +POSTHOOK: query: describe extended acid_partitioned4 partition(part=1) +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@acid_partitioned4 +col_name data_type comment +a int +b string +part int + +# Partition Information +# col_name data_type comment + +part int + +#### A masked pattern was here #### +PREHOOK: query: describe extended acid_partitioned4 partition(part=2) +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@acid_partitioned4 +POSTHOOK: query: describe extended acid_partitioned4 partition(part=2) +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@acid_partitioned4 +col_name data_type comment +a int +b string +part int + +# Partition Information +# col_name data_type comment + +part int + +#### A masked pattern was here #### diff --git ql/src/test/results/clientpositive/schema_evol_text_nonvec_table.q.out ql/src/test/results/clientpositive/schema_evol_text_nonvec_table.q.out new file mode 100644 index 0000000..d0d862f --- /dev/null +++ ql/src/test/results/clientpositive/schema_evol_text_nonvec_table.q.out @@ -0,0 +1,545 @@ +PREHOOK: query: -- Non-Vec +-- ALTER TABLE ADD COLUMNS ... RESTRICT +CREATE TABLE acid_partitioned1(a INT, b STRING) PARTITIONED BY(part INT) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@acid_partitioned1 +POSTHOOK: query: -- Non-Vec +-- ALTER TABLE ADD COLUMNS ... 
RESTRICT +CREATE TABLE acid_partitioned1(a INT, b STRING) PARTITIONED BY(part INT) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@acid_partitioned1 +PREHOOK: query: insert into table acid_partitioned1 partition(part=1) values(1, 'original'),(2, 'original'), (3, 'original'),(4, 'original') +PREHOOK: type: QUERY +PREHOOK: Input: default@values__tmp__table__1 +PREHOOK: Output: default@acid_partitioned1@part=1 +POSTHOOK: query: insert into table acid_partitioned1 partition(part=1) values(1, 'original'),(2, 'original'), (3, 'original'),(4, 'original') +POSTHOOK: type: QUERY +POSTHOOK: Input: default@values__tmp__table__1 +POSTHOOK: Output: default@acid_partitioned1@part=1 +POSTHOOK: Lineage: acid_partitioned1 PARTITION(part=1).a EXPRESSION [(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col1, type:string, comment:), ] +POSTHOOK: Lineage: acid_partitioned1 PARTITION(part=1).b SIMPLE [(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col2, type:string, comment:), ] +_col0 _col1 +PREHOOK: query: alter table acid_partitioned1 add columns(c int, d string) +PREHOOK: type: ALTERTABLE_ADDCOLS +PREHOOK: Input: default@acid_partitioned1 +PREHOOK: Output: default@acid_partitioned1 +POSTHOOK: query: alter table acid_partitioned1 add columns(c int, d string) +POSTHOOK: type: ALTERTABLE_ADDCOLS +POSTHOOK: Input: default@acid_partitioned1 +POSTHOOK: Output: default@acid_partitioned1 +PREHOOK: query: insert into table acid_partitioned1 partition(part=2) values(1, 'new', 10, 'ten'),(2, 'new', 20, 'twenty'), (3, 'new', 30, 'thirty'),(4, 'new', 40, 'forty') +PREHOOK: type: QUERY +PREHOOK: Input: default@values__tmp__table__2 +PREHOOK: Output: default@acid_partitioned1@part=2 +POSTHOOK: query: insert into table acid_partitioned1 partition(part=2) values(1, 'new', 10, 'ten'),(2, 'new', 20, 'twenty'), (3, 'new', 30, 'thirty'),(4, 'new', 40, 'forty') +POSTHOOK: type: QUERY +POSTHOOK: Input: default@values__tmp__table__2 +POSTHOOK: Output: default@acid_partitioned1@part=2 +POSTHOOK: Lineage: acid_partitioned1 PARTITION(part=2).a EXPRESSION [(values__tmp__table__2)values__tmp__table__2.FieldSchema(name:tmp_values_col1, type:string, comment:), ] +POSTHOOK: Lineage: acid_partitioned1 PARTITION(part=2).b SIMPLE [(values__tmp__table__2)values__tmp__table__2.FieldSchema(name:tmp_values_col2, type:string, comment:), ] +POSTHOOK: Lineage: acid_partitioned1 PARTITION(part=2).c EXPRESSION [(values__tmp__table__2)values__tmp__table__2.FieldSchema(name:tmp_values_col3, type:string, comment:), ] +POSTHOOK: Lineage: acid_partitioned1 PARTITION(part=2).d SIMPLE [(values__tmp__table__2)values__tmp__table__2.FieldSchema(name:tmp_values_col4, type:string, comment:), ] +_col0 _col1 _col2 _col3 +PREHOOK: query: insert into table acid_partitioned1 partition(part=1) values(5, 'new', 100, 'hundred'),(6, 'new', 200, 'two hundred') +PREHOOK: type: QUERY +PREHOOK: Input: default@values__tmp__table__3 +PREHOOK: Output: default@acid_partitioned1@part=1 +POSTHOOK: query: insert into table acid_partitioned1 partition(part=1) values(5, 'new', 100, 'hundred'),(6, 'new', 200, 'two hundred') +POSTHOOK: type: QUERY +POSTHOOK: Input: default@values__tmp__table__3 +POSTHOOK: Output: default@acid_partitioned1@part=1 +POSTHOOK: Lineage: acid_partitioned1 PARTITION(part=1).a EXPRESSION [(values__tmp__table__3)values__tmp__table__3.FieldSchema(name:tmp_values_col1, type:string, comment:), ] +POSTHOOK: Lineage: acid_partitioned1 PARTITION(part=1).b SIMPLE 
[(values__tmp__table__3)values__tmp__table__3.FieldSchema(name:tmp_values_col2, type:string, comment:), ] +POSTHOOK: Lineage: acid_partitioned1 PARTITION(part=1).c EXPRESSION [(values__tmp__table__3)values__tmp__table__3.FieldSchema(name:tmp_values_col3, type:string, comment:), ] +POSTHOOK: Lineage: acid_partitioned1 PARTITION(part=1).d SIMPLE [(values__tmp__table__3)values__tmp__table__3.FieldSchema(name:tmp_values_col4, type:string, comment:), ] +_col0 _col1 _col2 _col3 +PREHOOK: query: explain +select part,a,b,c,d from acid_partitioned1 order by a +PREHOOK: type: QUERY +POSTHOOK: query: explain +select part,a,b,c,d from acid_partitioned1 order by a +POSTHOOK: type: QUERY +Explain +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: acid_partitioned1 + Statistics: Num rows: 10 Data size: 134 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: part (type: int), a (type: int), b (type: string), c (type: int), d (type: string) + outputColumnNames: _col0, _col1, _col2, _col3, _col4 + Statistics: Num rows: 10 Data size: 134 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col1 (type: int) + sort order: + + Statistics: Num rows: 10 Data size: 134 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: int), _col2 (type: string), _col3 (type: int), _col4 (type: string) + Reduce Operator Tree: + Select Operator + expressions: VALUE._col0 (type: int), KEY.reducesinkkey0 (type: int), VALUE._col1 (type: string), VALUE._col2 (type: int), VALUE._col3 (type: string) + outputColumnNames: _col0, _col1, _col2, _col3, _col4 + Statistics: Num rows: 10 Data size: 134 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 10 Data size: 134 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select part,a,b,c,d from acid_partitioned1 order by a +PREHOOK: type: QUERY +PREHOOK: Input: default@acid_partitioned1 +PREHOOK: Input: default@acid_partitioned1@part=1 +PREHOOK: Input: default@acid_partitioned1@part=2 +#### A masked pattern was here #### +POSTHOOK: query: select part,a,b,c,d from acid_partitioned1 order by a +POSTHOOK: type: QUERY +POSTHOOK: Input: default@acid_partitioned1 +POSTHOOK: Input: default@acid_partitioned1@part=1 +POSTHOOK: Input: default@acid_partitioned1@part=2 +#### A masked pattern was here #### +part a b c d +2 1 new 10 ten +1 1 original NULL NULL +2 2 new 20 twenty +1 2 original NULL NULL +2 3 new 30 thirty +1 3 original NULL NULL +2 4 new 40 forty +1 4 original NULL NULL +1 5 new NULL NULL +1 6 new NULL NULL +PREHOOK: query: describe extended acid_partitioned1 partition(part=1) +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@acid_partitioned1 +POSTHOOK: query: describe extended acid_partitioned1 partition(part=1) +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@acid_partitioned1 +col_name data_type comment +a int +b string +part int + +# Partition Information +# col_name data_type comment + +part int + +#### A masked pattern was here #### +PREHOOK: query: describe extended acid_partitioned1 partition(part=2) +PREHOOK: type: DESCTABLE +PREHOOK: Input: 
default@acid_partitioned1 +POSTHOOK: query: describe extended acid_partitioned1 partition(part=2) +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@acid_partitioned1 +col_name data_type comment +a int +b string +c int +d string +part int + +# Partition Information +# col_name data_type comment + +part int + +#### A masked pattern was here #### +PREHOOK: query: alter table acid_partitioned1 partition(part=1) add columns(c int, d string) +PREHOOK: type: ALTERTABLE_ADDCOLS +PREHOOK: Input: default@acid_partitioned1 +PREHOOK: Output: default@acid_partitioned1@part=1 +POSTHOOK: query: alter table acid_partitioned1 partition(part=1) add columns(c int, d string) +POSTHOOK: type: ALTERTABLE_ADDCOLS +POSTHOOK: Input: default@acid_partitioned1 +POSTHOOK: Input: default@acid_partitioned1@part=1 +POSTHOOK: Output: default@acid_partitioned1@part=1 +PREHOOK: query: describe extended acid_partitioned1 partition(part=1) +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@acid_partitioned1 +POSTHOOK: query: describe extended acid_partitioned1 partition(part=1) +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@acid_partitioned1 +col_name data_type comment +a int +b string +c int +d string +part int + +# Partition Information +# col_name data_type comment + +part int + +#### A masked pattern was here #### +PREHOOK: query: explain +select part,a,b,c,d from acid_partitioned1 order by a +PREHOOK: type: QUERY +POSTHOOK: query: explain +select part,a,b,c,d from acid_partitioned1 order by a +POSTHOOK: type: QUERY +Explain +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: acid_partitioned1 + Statistics: Num rows: 10 Data size: 134 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: part (type: int), a (type: int), b (type: string), c (type: int), d (type: string) + outputColumnNames: _col0, _col1, _col2, _col3, _col4 + Statistics: Num rows: 10 Data size: 134 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col1 (type: int) + sort order: + + Statistics: Num rows: 10 Data size: 134 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: int), _col2 (type: string), _col3 (type: int), _col4 (type: string) + Reduce Operator Tree: + Select Operator + expressions: VALUE._col0 (type: int), KEY.reducesinkkey0 (type: int), VALUE._col1 (type: string), VALUE._col2 (type: int), VALUE._col3 (type: string) + outputColumnNames: _col0, _col1, _col2, _col3, _col4 + Statistics: Num rows: 10 Data size: 134 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 10 Data size: 134 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select part,a,b,c,d from acid_partitioned1 order by a +PREHOOK: type: QUERY +PREHOOK: Input: default@acid_partitioned1 +PREHOOK: Input: default@acid_partitioned1@part=1 +PREHOOK: Input: default@acid_partitioned1@part=2 +#### A masked pattern was here #### +POSTHOOK: query: select part,a,b,c,d from acid_partitioned1 order by a +POSTHOOK: type: QUERY +POSTHOOK: Input: default@acid_partitioned1 +POSTHOOK: Input: default@acid_partitioned1@part=1 +POSTHOOK: Input: 
default@acid_partitioned1@part=2 +#### A masked pattern was here #### +part a b c d +2 1 new 10 ten +1 1 original NULL NULL +2 2 new 20 twenty +1 2 original NULL NULL +2 3 new 30 thirty +1 3 original NULL NULL +2 4 new 40 forty +1 4 original NULL NULL +1 5 new 100 hundred +1 6 new 200 two hundred +PREHOOK: query: -- Non-Vec +-- ALTER TABLE ADD COLUMNS ... CASCADE +CREATE TABLE acid_partitioned2(a INT, b STRING) PARTITIONED BY(part INT) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@acid_partitioned2 +POSTHOOK: query: -- Non-Vec +-- ALTER TABLE ADD COLUMNS ... CASCADE +CREATE TABLE acid_partitioned2(a INT, b STRING) PARTITIONED BY(part INT) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@acid_partitioned2 +PREHOOK: query: insert into table acid_partitioned2 partition(part=1) values(1, 'original'),(2, 'original'), (3, 'original'),(4, 'original') +PREHOOK: type: QUERY +PREHOOK: Input: default@values__tmp__table__4 +PREHOOK: Output: default@acid_partitioned2@part=1 +POSTHOOK: query: insert into table acid_partitioned2 partition(part=1) values(1, 'original'),(2, 'original'), (3, 'original'),(4, 'original') +POSTHOOK: type: QUERY +POSTHOOK: Input: default@values__tmp__table__4 +POSTHOOK: Output: default@acid_partitioned2@part=1 +POSTHOOK: Lineage: acid_partitioned2 PARTITION(part=1).a EXPRESSION [(values__tmp__table__4)values__tmp__table__4.FieldSchema(name:tmp_values_col1, type:string, comment:), ] +POSTHOOK: Lineage: acid_partitioned2 PARTITION(part=1).b SIMPLE [(values__tmp__table__4)values__tmp__table__4.FieldSchema(name:tmp_values_col2, type:string, comment:), ] +_col0 _col1 +PREHOOK: query: alter table acid_partitioned2 add columns(c int, d string) cascade +PREHOOK: type: ALTERTABLE_ADDCOLS +PREHOOK: Input: default@acid_partitioned2 +PREHOOK: Output: default@acid_partitioned2 +PREHOOK: Output: default@acid_partitioned2@part=1 +POSTHOOK: query: alter table acid_partitioned2 add columns(c int, d string) cascade +POSTHOOK: type: ALTERTABLE_ADDCOLS +POSTHOOK: Input: default@acid_partitioned2 +POSTHOOK: Output: default@acid_partitioned2 +POSTHOOK: Output: default@acid_partitioned2@part=1 +PREHOOK: query: insert into table acid_partitioned2 partition(part=2) values(1, 'new', 10, 'ten'),(2, 'new', 20, 'twenty'), (3, 'new', 30, 'thirty'),(4, 'new', 40, 'forty') +PREHOOK: type: QUERY +PREHOOK: Input: default@values__tmp__table__5 +PREHOOK: Output: default@acid_partitioned2@part=2 +POSTHOOK: query: insert into table acid_partitioned2 partition(part=2) values(1, 'new', 10, 'ten'),(2, 'new', 20, 'twenty'), (3, 'new', 30, 'thirty'),(4, 'new', 40, 'forty') +POSTHOOK: type: QUERY +POSTHOOK: Input: default@values__tmp__table__5 +POSTHOOK: Output: default@acid_partitioned2@part=2 +POSTHOOK: Lineage: acid_partitioned2 PARTITION(part=2).a EXPRESSION [(values__tmp__table__5)values__tmp__table__5.FieldSchema(name:tmp_values_col1, type:string, comment:), ] +POSTHOOK: Lineage: acid_partitioned2 PARTITION(part=2).b SIMPLE [(values__tmp__table__5)values__tmp__table__5.FieldSchema(name:tmp_values_col2, type:string, comment:), ] +POSTHOOK: Lineage: acid_partitioned2 PARTITION(part=2).c EXPRESSION [(values__tmp__table__5)values__tmp__table__5.FieldSchema(name:tmp_values_col3, type:string, comment:), ] +POSTHOOK: Lineage: acid_partitioned2 PARTITION(part=2).d SIMPLE [(values__tmp__table__5)values__tmp__table__5.FieldSchema(name:tmp_values_col4, type:string, comment:), ] +_col0 _col1 _col2 _col3 +PREHOOK: query: insert into table 
acid_partitioned2 partition(part=1) values(5, 'new', 100, 'hundred'),(6, 'new', 200, 'two hundred') +PREHOOK: type: QUERY +PREHOOK: Input: default@values__tmp__table__6 +PREHOOK: Output: default@acid_partitioned2@part=1 +POSTHOOK: query: insert into table acid_partitioned2 partition(part=1) values(5, 'new', 100, 'hundred'),(6, 'new', 200, 'two hundred') +POSTHOOK: type: QUERY +POSTHOOK: Input: default@values__tmp__table__6 +POSTHOOK: Output: default@acid_partitioned2@part=1 +POSTHOOK: Lineage: acid_partitioned2 PARTITION(part=1).a EXPRESSION [(values__tmp__table__6)values__tmp__table__6.FieldSchema(name:tmp_values_col1, type:string, comment:), ] +POSTHOOK: Lineage: acid_partitioned2 PARTITION(part=1).b SIMPLE [(values__tmp__table__6)values__tmp__table__6.FieldSchema(name:tmp_values_col2, type:string, comment:), ] +POSTHOOK: Lineage: acid_partitioned2 PARTITION(part=1).c EXPRESSION [(values__tmp__table__6)values__tmp__table__6.FieldSchema(name:tmp_values_col3, type:string, comment:), ] +POSTHOOK: Lineage: acid_partitioned2 PARTITION(part=1).d SIMPLE [(values__tmp__table__6)values__tmp__table__6.FieldSchema(name:tmp_values_col4, type:string, comment:), ] +_col0 _col1 _col2 _col3 +PREHOOK: query: select part,a,b,c,d from acid_partitioned2 order by a +PREHOOK: type: QUERY +PREHOOK: Input: default@acid_partitioned2 +PREHOOK: Input: default@acid_partitioned2@part=1 +PREHOOK: Input: default@acid_partitioned2@part=2 +#### A masked pattern was here #### +POSTHOOK: query: select part,a,b,c,d from acid_partitioned2 order by a +POSTHOOK: type: QUERY +POSTHOOK: Input: default@acid_partitioned2 +POSTHOOK: Input: default@acid_partitioned2@part=1 +POSTHOOK: Input: default@acid_partitioned2@part=2 +#### A masked pattern was here #### +part a b c d +2 1 new 10 ten +1 1 original NULL NULL +2 2 new 20 twenty +1 2 original NULL NULL +2 3 new 30 thirty +1 3 original NULL NULL +2 4 new 40 forty +1 4 original NULL NULL +1 5 new 100 hundred +1 6 new 200 two hundred +PREHOOK: query: describe extended acid_partitioned2 partition(part=1) +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@acid_partitioned2 +POSTHOOK: query: describe extended acid_partitioned2 partition(part=1) +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@acid_partitioned2 +col_name data_type comment +a int +b string +c int +d string +part int + +# Partition Information +# col_name data_type comment + +part int + +#### A masked pattern was here #### +PREHOOK: query: describe extended acid_partitioned2 partition(part=2) +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@acid_partitioned2 +POSTHOOK: query: describe extended acid_partitioned2 partition(part=2) +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@acid_partitioned2 +col_name data_type comment +a int +b string +c int +d string +part int + +# Partition Information +# col_name data_type comment + +part int + +#### A masked pattern was here #### +PREHOOK: query: -- Non-Vec +-- ALTER TABLE CHANGE COLUMN ... 
RESTRICT +-- smallint = (2-byte signed integer, from -32,768 to 32,767) +-- Diabled now because of java.io.IOException: ORC does not support type conversion from INT to SHORT +-- CREATE TABLE acid_partitioned3(a smallint, b STRING) PARTITIONED BY(part INT); +-- +-- insert into table acid_partitioned3 partition(part=1) values(1000, 'original'),(6737, 'original'), ('3', 'original'),('4', 'original'); +-- +-- alter table acid_partitioned3 change column a a int; +-- +-- insert into table acid_partitioned3 partition(part=2) values(72909, 'new'),(200, 'new'), (32768, 'new'),(40000, 'new'); +-- +-- insert into table acid_partitioned3 partition(part=1) values(5000, 'new'),(90000, 'new'); +-- +-- select part,a,b from acid_partitioned3 order by a; +-- +-- describe extended acid_partitioned3 partition(part=1); +-- describe extended acid_partitioned3 partition(part=2); +-- +-- alter table acid_partitioned3 partition(part=1) change column value a int; +-- +-- describe extended acid_partitioned3 partition(part=1); +-- +-- select part,a,b from acid_partitioned3 order by a; +-- +-- +-- Non-Vec +-- ALTER TABLE CHANGE COLUMN ... CASCADE +-- smallint = (2-byte signed integer, from -32,768 to 32,767) +CREATE TABLE acid_partitioned4(a smallint, b STRING) PARTITIONED BY(part INT) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@acid_partitioned4 +POSTHOOK: query: -- Non-Vec +-- ALTER TABLE CHANGE COLUMN ... RESTRICT +-- smallint = (2-byte signed integer, from -32,768 to 32,767) +-- Diabled now because of java.io.IOException: ORC does not support type conversion from INT to SHORT +-- CREATE TABLE acid_partitioned3(a smallint, b STRING) PARTITIONED BY(part INT); +-- +-- insert into table acid_partitioned3 partition(part=1) values(1000, 'original'),(6737, 'original'), ('3', 'original'),('4', 'original'); +-- +-- alter table acid_partitioned3 change column a a int; +-- +-- insert into table acid_partitioned3 partition(part=2) values(72909, 'new'),(200, 'new'), (32768, 'new'),(40000, 'new'); +-- +-- insert into table acid_partitioned3 partition(part=1) values(5000, 'new'),(90000, 'new'); +-- +-- select part,a,b from acid_partitioned3 order by a; +-- +-- describe extended acid_partitioned3 partition(part=1); +-- describe extended acid_partitioned3 partition(part=2); +-- +-- alter table acid_partitioned3 partition(part=1) change column value a int; +-- +-- describe extended acid_partitioned3 partition(part=1); +-- +-- select part,a,b from acid_partitioned3 order by a; +-- +-- +-- Non-Vec +-- ALTER TABLE CHANGE COLUMN ... 
CASCADE +-- smallint = (2-byte signed integer, from -32,768 to 32,767) +CREATE TABLE acid_partitioned4(a smallint, b STRING) PARTITIONED BY(part INT) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@acid_partitioned4 +PREHOOK: query: insert into table acid_partitioned4 partition(part=1) values(1000, 'original'),(6737, 'original'), ('3', 'original'),('4', 'original') +PREHOOK: type: QUERY +PREHOOK: Input: default@values__tmp__table__7 +PREHOOK: Output: default@acid_partitioned4@part=1 +POSTHOOK: query: insert into table acid_partitioned4 partition(part=1) values(1000, 'original'),(6737, 'original'), ('3', 'original'),('4', 'original') +POSTHOOK: type: QUERY +POSTHOOK: Input: default@values__tmp__table__7 +POSTHOOK: Output: default@acid_partitioned4@part=1 +POSTHOOK: Lineage: acid_partitioned4 PARTITION(part=1).a EXPRESSION [(values__tmp__table__7)values__tmp__table__7.FieldSchema(name:tmp_values_col1, type:string, comment:), ] +POSTHOOK: Lineage: acid_partitioned4 PARTITION(part=1).b SIMPLE [(values__tmp__table__7)values__tmp__table__7.FieldSchema(name:tmp_values_col2, type:string, comment:), ] +_col0 _col1 +PREHOOK: query: alter table acid_partitioned4 change column a a int cascade +PREHOOK: type: ALTERTABLE_RENAMECOL +PREHOOK: Input: default@acid_partitioned4 +PREHOOK: Output: default@acid_partitioned4 +PREHOOK: Output: default@acid_partitioned4@part=1 +POSTHOOK: query: alter table acid_partitioned4 change column a a int cascade +POSTHOOK: type: ALTERTABLE_RENAMECOL +POSTHOOK: Input: default@acid_partitioned4 +POSTHOOK: Output: default@acid_partitioned4 +POSTHOOK: Output: default@acid_partitioned4@part=1 +PREHOOK: query: insert into table acid_partitioned4 partition(part=2) values(72909, 'new'),(200, 'new'), (32768, 'new'),(40000, 'new') +PREHOOK: type: QUERY +PREHOOK: Input: default@values__tmp__table__8 +PREHOOK: Output: default@acid_partitioned4@part=2 +POSTHOOK: query: insert into table acid_partitioned4 partition(part=2) values(72909, 'new'),(200, 'new'), (32768, 'new'),(40000, 'new') +POSTHOOK: type: QUERY +POSTHOOK: Input: default@values__tmp__table__8 +POSTHOOK: Output: default@acid_partitioned4@part=2 +POSTHOOK: Lineage: acid_partitioned4 PARTITION(part=2).a EXPRESSION [(values__tmp__table__8)values__tmp__table__8.FieldSchema(name:tmp_values_col1, type:string, comment:), ] +POSTHOOK: Lineage: acid_partitioned4 PARTITION(part=2).b SIMPLE [(values__tmp__table__8)values__tmp__table__8.FieldSchema(name:tmp_values_col2, type:string, comment:), ] +_col0 _col1 +PREHOOK: query: insert into table acid_partitioned4 partition(part=1) values(5000, 'new'),(90000, 'new') +PREHOOK: type: QUERY +PREHOOK: Input: default@values__tmp__table__9 +PREHOOK: Output: default@acid_partitioned4@part=1 +POSTHOOK: query: insert into table acid_partitioned4 partition(part=1) values(5000, 'new'),(90000, 'new') +POSTHOOK: type: QUERY +POSTHOOK: Input: default@values__tmp__table__9 +POSTHOOK: Output: default@acid_partitioned4@part=1 +POSTHOOK: Lineage: acid_partitioned4 PARTITION(part=1).a EXPRESSION [(values__tmp__table__9)values__tmp__table__9.FieldSchema(name:tmp_values_col1, type:string, comment:), ] +POSTHOOK: Lineage: acid_partitioned4 PARTITION(part=1).b SIMPLE [(values__tmp__table__9)values__tmp__table__9.FieldSchema(name:tmp_values_col2, type:string, comment:), ] +_col0 _col1 +PREHOOK: query: select part,a,b from acid_partitioned4 order by a +PREHOOK: type: QUERY +PREHOOK: Input: default@acid_partitioned4 +PREHOOK: Input: default@acid_partitioned4@part=1 
+PREHOOK: Input: default@acid_partitioned4@part=2 +#### A masked pattern was here #### +POSTHOOK: query: select part,a,b from acid_partitioned4 order by a +POSTHOOK: type: QUERY +POSTHOOK: Input: default@acid_partitioned4 +POSTHOOK: Input: default@acid_partitioned4@part=1 +POSTHOOK: Input: default@acid_partitioned4@part=2 +#### A masked pattern was here #### +part a b +1 3 original +1 4 original +2 200 new +1 1000 original +1 5000 new +1 6737 original +2 32768 new +2 40000 new +2 72909 new +1 90000 new +PREHOOK: query: describe extended acid_partitioned4 partition(part=1) +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@acid_partitioned4 +POSTHOOK: query: describe extended acid_partitioned4 partition(part=1) +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@acid_partitioned4 +col_name data_type comment +a int +b string +part int + +# Partition Information +# col_name data_type comment + +part int + +#### A masked pattern was here #### +PREHOOK: query: describe extended acid_partitioned4 partition(part=2) +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@acid_partitioned4 +POSTHOOK: query: describe extended acid_partitioned4 partition(part=2) +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@acid_partitioned4 +col_name data_type comment +a int +b string +part int + +# Partition Information +# col_name data_type comment + +part int + +#### A masked pattern was here #### diff --git storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizedRowBatch.java storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizedRowBatch.java index 7c18da6..01dacf8 100644 --- storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizedRowBatch.java +++ storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizedRowBatch.java @@ -39,6 +39,10 @@ public int[] projectedColumns; public int projectionSize; + private int nonPartitionColumnCount; + private int partitionColumnCount; + + /* * If no filtering has been applied yet, selectedInUse is false, * meaning that all rows qualify. If it is true, then the selected[] array @@ -86,6 +90,22 @@ public VectorizedRowBatch(int numCols, int size) { for (int i = 0; i < numCols; i++) { projectedColumns[i] = i; } + + nonPartitionColumnCount = -1; + partitionColumnCount = -1; + } + + public void setPartitionInfo(int nonPartitionColumnCount, int partitionColumnCount) { + this.nonPartitionColumnCount = nonPartitionColumnCount; + this.partitionColumnCount = partitionColumnCount; + } + + public int getNonPartitionColumnCount() { + return nonPartitionColumnCount; + } + + public int getPartitionColumnCount() { + return partitionColumnCount; } /**
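
For reference, a minimal sketch of how a reader could drive the new partition-column metadata added to VectorizedRowBatch above (setPartitionInfo, getNonPartitionColumnCount, getPartitionColumnCount). The class name PartitionInfoSketch, the two-data-column/one-partition-column layout, and the use of LongColumnVector for every column are illustrative assumptions, not part of this patch.

import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;

public class PartitionInfoSketch {
  public static void main(String[] args) {
    // Hypothetical layout: two data columns read from the file, followed by
    // one partition column appended by the reader.
    int dataColumnCount = 2;
    int partitionColumnCount = 1;
    int totalColumns = dataColumnCount + partitionColumnCount;
    int batchSize = 1024;

    VectorizedRowBatch batch = new VectorizedRowBatch(totalColumns, batchSize);
    for (int i = 0; i < totalColumns; i++) {
      batch.cols[i] = new LongColumnVector(batchSize);
    }

    // Record which trailing columns carry partition values rather than file data,
    // using the accessors introduced in this patch.
    batch.setPartitionInfo(dataColumnCount, partitionColumnCount);

    // A partition column holds a single constant value per split, so it can be
    // filled once as a repeating vector.
    LongColumnVector partCol = (LongColumnVector) batch.cols[dataColumnCount];
    partCol.isRepeating = true;
    partCol.vector[0] = 1L; // e.g. part=1

    System.out.println("non-partition columns: " + batch.getNonPartitionColumnCount());
    System.out.println("partition columns: " + batch.getPartitionColumnCount());
  }
}

The counts default to -1 in the constructor, so downstream code can distinguish a batch whose producer never called setPartitionInfo from one that genuinely has zero partition columns.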