diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java
index 0a1a25f5f2..5023f2f4d4 100644
--- ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java
+++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java
@@ -249,6 +249,8 @@
   private static final Pattern supportedDataTypesPattern;
 
+  private static final TypeInfo[] EMPTY_TYPEINFO_ARRAY = new TypeInfo[0];
+
   static {
     StringBuilder patternBuilder = new StringBuilder();
     patternBuilder.append("int");
@@ -1372,10 +1374,16 @@ private boolean verifyAndSetVectorPartDesc(
       Set<String> inputFileFormatClassNameSet,
       Map<VectorPartitionDesc, VectorPartitionDesc> vectorPartitionDescMap,
       Set<String> enabledConditionsMetSet, ArrayList<String> enabledConditionsNotMetList,
-      Set<Support> newSupportSet) {
+      Set<Support> newSupportSet, List<TypeInfo> dataTypeInfoList) {
 
     Class<? extends InputFormat> inputFileFormatClass = pd.getInputFileFormatClass();
     String inputFileFormatClassName = inputFileFormatClass.getName();
 
+    final TypeInfo[] dataTypeInfos;
+    if (dataTypeInfoList == null) {
+      dataTypeInfos = EMPTY_TYPEINFO_ARRAY;
+    } else {
+      dataTypeInfos = dataTypeInfoList.toArray(new TypeInfo[dataTypeInfoList.size()]);
+    }
+
     // Always collect input file formats.
     inputFileFormatClassNameSet.add(inputFileFormatClassName);
@@ -1401,7 +1409,9 @@ private boolean verifyAndSetVectorPartDesc(
       addVectorPartitionDesc(
           pd,
           VectorPartitionDesc.createVectorizedInputFileFormat(
-              inputFileFormatClassName, Utilities.isInputFileFormatSelfDescribing(pd)),
+              inputFileFormatClassName,
+              Utilities.isInputFileFormatSelfDescribing(pd),
+              dataTypeInfos),
           vectorPartitionDescMap);
 
       enabledConditionsMetSet.add(HiveConf.ConfVars.HIVE_VECTORIZATION_USE_VECTORIZED_INPUT_FILE_FORMAT.varname);
@@ -1427,7 +1437,9 @@ private boolean verifyAndSetVectorPartDesc(
       addVectorPartitionDesc(
           pd,
           VectorPartitionDesc.createVectorizedInputFileFormat(
-              inputFileFormatClassName, Utilities.isInputFileFormatSelfDescribing(pd)),
+              inputFileFormatClassName,
+              Utilities.isInputFileFormatSelfDescribing(pd),
+              dataTypeInfos),
           vectorPartitionDescMap);
 
       enabledConditionsMetSet.add(
@@ -1495,7 +1507,7 @@ private boolean verifyAndSetVectorPartDesc(
         addVectorPartitionDesc(
             pd,
             VectorPartitionDesc.createVectorDeserialize(
-                inputFileFormatClassName, VectorDeserializeType.LAZY_SIMPLE),
+                inputFileFormatClassName, VectorDeserializeType.LAZY_SIMPLE, dataTypeInfos),
             vectorPartitionDescMap);
 
         enabledConditionsMetSet.add(HiveConf.ConfVars.HIVE_VECTORIZATION_USE_VECTOR_DESERIALIZE.varname);
@@ -1506,7 +1518,7 @@ private boolean verifyAndSetVectorPartDesc(
         addVectorPartitionDesc(
             pd,
             VectorPartitionDesc.createVectorDeserialize(
-                inputFileFormatClassName, VectorDeserializeType.LAZY_BINARY),
+                inputFileFormatClassName, VectorDeserializeType.LAZY_BINARY, dataTypeInfos),
             vectorPartitionDescMap);
 
         enabledConditionsMetSet.add(HiveConf.ConfVars.HIVE_VECTORIZATION_USE_VECTOR_DESERIALIZE.varname);
@@ -1527,7 +1539,8 @@ private boolean verifyAndSetVectorPartDesc(
             VectorPartitionDesc.createRowDeserialize(
                 inputFileFormatClassName,
                 Utilities.isInputFileFormatSelfDescribing(pd),
-                deserializerClassName),
+                deserializerClassName,
+                dataTypeInfos),
             vectorPartitionDescMap);
 
         enabledConditionsMetSet.add(HiveConf.ConfVars.HIVE_VECTORIZATION_USE_ROW_DESERIALIZE.varname);
@@ -1728,30 +1741,17 @@ private void setValidateInputFormatAndSchemaEvolutionExplain(MapWork mapWork,
           continue;
         }
         Set<Support> newSupportSet = new TreeSet<Support>();
-        final boolean isVerifiedVectorPartDesc =
-            verifyAndSetVectorPartDesc(
-                partDesc, isFullAcidTable,
-                allTypeInfoList,
-                inputFileFormatClassNameSet,
-                vectorPartitionDescMap,
-                enabledConditionsMetSet, enabledConditionsNotMetList,
-                newSupportSet);
-
-        if (!isVerifiedVectorPartDesc) {
-
-          // Always set these so EXPLAIN can see.
-          setValidateInputFormatAndSchemaEvolutionExplain(
-              mapWork, inputFileFormatClassNameSet, vectorPartitionDescMap,
-              enabledConditionsMetSet, enabledConditionsNotMetList);
-
-          // We consider this an enable issue, not a not vectorized issue.
-          return new ImmutablePair<Boolean, Boolean>(false, true);
+        final List<TypeInfo> nextDataTypeInfoList;
+
+        final Deserializer deserializer;
+        final StructObjectInspector partObjectInspector;
+
+        try {
+          deserializer = partDesc.getDeserializer(hiveConf);
+          partObjectInspector = (StructObjectInspector) deserializer.getObjectInspector();
+        } catch (Exception e) {
+          throw new SemanticException(e);
         }
-
-        handleSupport(isFirstPartition, inputFormatSupportSet, newSupportSet);
-
-        VectorPartitionDesc vectorPartDesc = partDesc.getVectorPartitionDesc();
-
         if (isFirst) {
 
           /*
@@ -1778,17 +1778,55 @@ private void setValidateInputFormatAndSchemaEvolutionExplain(MapWork mapWork,
           isFirst = false;
         }
 
+        if (Utilities.isInputFileFormatSelfDescribing(partDesc)) {
+
+          /*
+           * Self-Describing Input Format will convert its data to the table schema. So, there
+           * will be no VectorMapOperator conversion needed.
+           */
+          nextDataTypeInfoList = tableDataTypeInfoList;
+        } else {
+          String nextDataTypesString = ObjectInspectorUtils.getFieldTypes(partObjectInspector);
+
+          /*
+           * We convert to an array of TypeInfo using a library routine since it parses the
+           * information and can handle use of different separators, etc. We cannot use the
+           * raw type string for comparison in the map because of the different separators used.
+           */
+          nextDataTypeInfoList =
+              TypeInfoUtils.getTypeInfosFromTypeString(nextDataTypesString);
+        }
+
+        // HIVE-20419: Vectorization: Prevent mutation of VectorPartitionDesc after being used in a
+        // hashmap key
+        final boolean isVerifiedVectorPartDesc =
+            verifyAndSetVectorPartDesc(
+                partDesc, isFullAcidTable,
+                allTypeInfoList,
+                inputFileFormatClassNameSet,
+                vectorPartitionDescMap,
+                enabledConditionsMetSet, enabledConditionsNotMetList,
+                newSupportSet,
+                nextDataTypeInfoList);
+
+        final VectorPartitionDesc vectorPartDesc = partDesc.getVectorPartitionDesc();
+
+        if (!isVerifiedVectorPartDesc) {
+
+          // Always set these so EXPLAIN can see.
+          setValidateInputFormatAndSchemaEvolutionExplain(
+              mapWork, inputFileFormatClassNameSet, vectorPartitionDescMap,
+              enabledConditionsMetSet, enabledConditionsNotMetList);
+
+          // We consider this an enable issue, not a not vectorized issue.
+          return new ImmutablePair<Boolean, Boolean>(false, true);
+        }
+
+        handleSupport(isFirstPartition, inputFormatSupportSet, newSupportSet);
+
         // We need to get the partition's column names from the partition serde.
         // (e.g. Avro provides the table schema and ignores the partition schema..).
         //
-        Deserializer deserializer;
-        StructObjectInspector partObjectInspector;
-        try {
-          deserializer = partDesc.getDeserializer(hiveConf);
-          partObjectInspector = (StructObjectInspector) deserializer.getObjectInspector();
-        } catch (Exception e) {
-          throw new SemanticException(e);
-        }
         String nextDataColumnsString = ObjectInspectorUtils.getFieldNames(partObjectInspector);
         String[] nextDataColumns = nextDataColumnsString.split(",");
         List<String> nextDataColumnList = Arrays.asList(nextDataColumns);
@@ -1833,26 +1871,8 @@ private void setValidateInputFormatAndSchemaEvolutionExplain(MapWork mapWork,
         }
 
         boolean isPartitionRowConversion = false;
-        List<TypeInfo> nextDataTypeInfoList;
-        if (vectorPartDesc.getIsInputFileFormatSelfDescribing()) {
-
-          /*
-           * Self-Describing Input Format will convert its data to the table schema. So, there
-           * will be no VectorMapOperator conversion needed.
-           */
-          nextDataTypeInfoList = tableDataTypeInfoList;
-
-        } else {
-          String nextDataTypesString = ObjectInspectorUtils.getFieldTypes(partObjectInspector);
-
-          /*
-           * We convert to an array of TypeInfo using a library routine since it parses the
-           * information and can handle use of different separators, etc. We cannot use the
-           * raw type string for comparison in the map because of the different separators used.
-           */
-          nextDataTypeInfoList =
-              TypeInfoUtils.getTypeInfosFromTypeString(nextDataTypesString);
+        if (!vectorPartDesc.getIsInputFileFormatSelfDescribing()) {
 
           final int nextDataTypeInfoSize = nextDataTypeInfoList.size();
           if (nextDataTypeInfoSize > tableDataTypeInfoList.size()) {
            enabledConditionsNotMetList.add(
@@ -1891,8 +1911,6 @@ private void setValidateInputFormatAndSchemaEvolutionExplain(MapWork mapWork,
                 enabledConditionsMetSet, enabledConditionsNotMetList);
             return new ImmutablePair<Boolean, Boolean>(false, true);
           }
-
-          vectorPartDesc.setDataTypeInfos(nextDataTypeInfoList);
         }
 
         // For now, we don't know which virtual columns are going to be included.  We'll add them
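
Review note: the hunks above are the heart of HIVE-20419 ("Vectorization: Prevent mutation of VectorPartitionDesc after being used in a hashmap key"). Previously the descriptor was registered in vectorPartitionDescMap first and only completed later via vectorPartDesc.setDataTypeInfos(...); mutating state that a key's equals()/hashCode() may read can strand the entry in the wrong hash bucket. A minimal, self-contained sketch of that failure mode (the Key class below is hypothetical, not Hive code):

    import java.util.Arrays;
    import java.util.HashMap;
    import java.util.Map;

    public class MutableKeyDemo {
      // Hypothetical key whose equals()/hashCode() read mutable state,
      // mirroring a descriptor that is filled in after insertion.
      static final class Key {
        String[] types;
        Key(String[] types) { this.types = types; }
        @Override public int hashCode() { return Arrays.hashCode(types); }
        @Override public boolean equals(Object o) {
          return o instanceof Key && Arrays.equals(types, ((Key) o).types);
        }
      }

      public static void main(String[] args) {
        Map<Key, String> map = new HashMap<>();
        Key key = new Key(null);
        map.put(key, "descriptor");          // hashed into the bucket for null types
        key.types = new String[] { "int" };  // mutation changes the hash code
        System.out.println(map.containsKey(key)); // typically false: entry is stranded
      }
    }

Computing nextDataTypeInfoList before calling verifyAndSetVectorPartDesc() lets the descriptor be built complete, so the map never sees a key that changes afterwards.
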
diff --git ql/src/java/org/apache/hadoop/hive/ql/plan/VectorPartitionDesc.java ql/src/java/org/apache/hadoop/hive/ql/plan/VectorPartitionDesc.java
index 2c8904dad8..dd597fbe84 100644
--- ql/src/java/org/apache/hadoop/hive/ql/plan/VectorPartitionDesc.java
+++ ql/src/java/org/apache/hadoop/hive/ql/plan/VectorPartitionDesc.java
@@ -77,13 +77,14 @@
   private TypeInfo[] dataTypeInfos;
 
   private VectorPartitionDesc(String inputFileFormatClassName,
-      boolean isInputFileFormatSelfDescribing, VectorMapOperatorReadType vectorMapOperatorReadType) {
+      boolean isInputFileFormatSelfDescribing, VectorMapOperatorReadType vectorMapOperatorReadType,
+      TypeInfo[] dataTypeInfos) {
     this.vectorMapOperatorReadType = vectorMapOperatorReadType;
     this.vectorDeserializeType = VectorDeserializeType.NONE;
     this.inputFileFormatClassName = inputFileFormatClassName;
     rowDeserializerClassName = null;
     this.isInputFileFormatSelfDescribing = isInputFileFormatSelfDescribing;
-    dataTypeInfos = null;
+    this.dataTypeInfos = dataTypeInfos;
   }
 
   /**
@@ -93,13 +94,13 @@ private VectorPartitionDesc(String inputFileFormatClassName,
    * @param needsDataTypeConversionCheck
    */
   private VectorPartitionDesc(String inputFileFormatClassName,
-      VectorDeserializeType vectorDeserializeType) {
+      VectorDeserializeType vectorDeserializeType, TypeInfo[] dataTypeInfos) {
     this.vectorMapOperatorReadType = VectorMapOperatorReadType.VECTOR_DESERIALIZE;
     this.vectorDeserializeType = vectorDeserializeType;
     this.inputFileFormatClassName = inputFileFormatClassName;
     rowDeserializerClassName = null;
     isInputFileFormatSelfDescribing = false;
-    dataTypeInfos = null;
+    this.dataTypeInfos = dataTypeInfos;
   }
 
   /**
@@ -108,32 +109,35 @@ private VectorPartitionDesc(String inputFileFormatClassName,
    * @param inputFileFormatClassName
    */
   private VectorPartitionDesc(String inputFileFormatClassName,
-      boolean isInputFileFormatSelfDescribing, String rowDeserializerClassName) {
+      boolean isInputFileFormatSelfDescribing, String rowDeserializerClassName,
+      TypeInfo[] dataTypeInfos) {
     this.vectorMapOperatorReadType = VectorMapOperatorReadType.ROW_DESERIALIZE;
     this.vectorDeserializeType = VectorDeserializeType.NONE;
     this.inputFileFormatClassName = inputFileFormatClassName;
     this.rowDeserializerClassName = rowDeserializerClassName;
     this.isInputFileFormatSelfDescribing = isInputFileFormatSelfDescribing;
-    dataTypeInfos = null;
+    this.dataTypeInfos = dataTypeInfos;
   }
 
   public static VectorPartitionDesc createVectorizedInputFileFormat(String inputFileFormatClassName,
-      boolean isInputFileFormatSelfDescribing) {
+      boolean isInputFileFormatSelfDescribing, TypeInfo[] dataTypeInfos) {
     return new VectorPartitionDesc(
         inputFileFormatClassName,
         isInputFileFormatSelfDescribing,
-        VectorMapOperatorReadType.VECTORIZED_INPUT_FILE_FORMAT);
+        VectorMapOperatorReadType.VECTORIZED_INPUT_FILE_FORMAT,
+        dataTypeInfos);
   }
 
   public static VectorPartitionDesc createVectorDeserialize(String inputFileFormatClassName,
-      VectorDeserializeType vectorDeserializeType) {
-    return new VectorPartitionDesc(inputFileFormatClassName, vectorDeserializeType);
+      VectorDeserializeType vectorDeserializeType, TypeInfo[] dataTypeInfos) {
+    return new VectorPartitionDesc(inputFileFormatClassName, vectorDeserializeType, dataTypeInfos);
   }
 
   public static VectorPartitionDesc createRowDeserialize(String inputFileFormatClassName,
-      boolean isInputFileFormatSelfDescribing, String rowDeserializerClassName) {
+      boolean isInputFileFormatSelfDescribing, String rowDeserializerClassName,
+      TypeInfo[] dataTypeInfos) {
     return new VectorPartitionDesc(rowDeserializerClassName, isInputFileFormatSelfDescribing,
-        inputFileFormatClassName);
+        inputFileFormatClassName, dataTypeInfos);
   }
 
   @Override
@@ -142,14 +146,14 @@ public VectorPartitionDesc clone() {
     switch (vectorMapOperatorReadType) {
     case VECTORIZED_INPUT_FILE_FORMAT:
       result = new VectorPartitionDesc(inputFileFormatClassName, isInputFileFormatSelfDescribing,
-          vectorMapOperatorReadType);
+          vectorMapOperatorReadType, dataTypeInfos);
       break;
     case VECTOR_DESERIALIZE:
-      result = new VectorPartitionDesc(inputFileFormatClassName, vectorDeserializeType);
+      result = new VectorPartitionDesc(inputFileFormatClassName, vectorDeserializeType, dataTypeInfos);
       break;
     case ROW_DESERIALIZE:
       result = new VectorPartitionDesc(inputFileFormatClassName, isInputFileFormatSelfDescribing,
-          rowDeserializerClassName);
+          rowDeserializerClassName, dataTypeInfos);
       break;
     default:
       throw new RuntimeException("Unexpected vector map operator read type " + vectorMapOperatorReadType.name());
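
With dataTypeInfos threaded through the private constructors, every factory method now yields a fully populated descriptor and the setDataTypeInfos() path disappears. A usage sketch against the post-patch signatures above; the class names are the real Hive ones touched by this patch, but the type string and input-format name are made-up example values:

    import java.util.List;

    import org.apache.hadoop.hive.ql.plan.VectorPartitionDesc;
    import org.apache.hadoop.hive.ql.plan.VectorPartitionDesc.VectorDeserializeType;
    import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
    import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;

    public class VectorPartitionDescSketch {
      public static void main(String[] args) {
        // Parse the partition's type string once, up front (example types; a
        // real caller derives these from the partition's ObjectInspector).
        List<TypeInfo> typeInfoList =
            TypeInfoUtils.getTypeInfosFromTypeString("int:string:double");
        TypeInfo[] dataTypeInfos = typeInfoList.toArray(new TypeInfo[0]);

        // The descriptor is complete at construction time; nothing mutates it later.
        VectorPartitionDesc vpd =
            VectorPartitionDesc.createVectorDeserialize(
                "org.apache.hadoop.mapred.TextInputFormat", // example input format
                VectorDeserializeType.LAZY_SIMPLE,
                dataTypeInfos);
        System.out.println(vpd);
      }
    }

Because clone() now copies dataTypeInfos through the same constructors, a cloned descriptor carries the same state as the original rather than a null array that would be filled in later.
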
diff --git ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java
index 3a83408430..91458ea87a 100644
--- ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java
+++ ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java
@@ -2208,7 +2208,7 @@ JobConf createMockExecutionEnvironment(Path workDir,
       PartitionDesc part = new PartitionDesc(tbl, partSpec);
       if (isVectorized) {
         part.setVectorPartitionDesc(
-            VectorPartitionDesc.createVectorizedInputFileFormat("MockInputFileFormatClassName", false));
+            VectorPartitionDesc.createVectorizedInputFileFormat("MockInputFileFormatClassName", false, null));
       }
       partMap.put(path, part);
     }
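
A side note on the test change: passing null for dataTypeInfos exercises the new EMPTY_TYPEINFO_ARRAY fallback in Vectorizer.verifyAndSetVectorPartDesc(), which normalizes a missing list to a shared zero-length array instead of allocating a fresh one per call. The same pattern in isolation (a generic sketch, not Hive code):

    import java.util.Arrays;
    import java.util.List;

    public class EmptyArraySketch {
      // Zero-length arrays are effectively immutable, so one instance can be shared.
      private static final String[] EMPTY_STRING_ARRAY = new String[0];

      static String[] toArrayOrEmpty(List<String> list) {
        // list.toArray(new String[0]) is the idiomatic conversion; the JVM
        // sizes the result to list.size() internally.
        return list == null ? EMPTY_STRING_ARRAY : list.toArray(new String[0]);
      }

      public static void main(String[] args) {
        System.out.println(Arrays.toString(toArrayOrEmpty(null)));               // []
        System.out.println(Arrays.toString(toArrayOrEmpty(Arrays.asList("a")))); // [a]
      }
    }
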