diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorDeserializeRow.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorDeserializeRow.java index fc82cf7..c97b5b9 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorDeserializeRow.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorDeserializeRow.java @@ -20,6 +20,7 @@ import java.io.EOFException; import java.io.IOException; +import java.util.ArrayList; import java.util.Arrays; import java.util.List; @@ -42,8 +43,12 @@ import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category; import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory; import org.apache.hadoop.hive.serde2.typeinfo.CharTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.MapTypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.UnionTypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.VarcharTypeInfo; import org.apache.hadoop.io.BooleanWritable; import org.apache.hadoop.io.BytesWritable; @@ -91,6 +96,82 @@ public VectorDeserializeRow(T deserializeRead) { private VectorDeserializeRow() { } + private static class Field { + + private Category category; + + private PrimitiveCategory primitiveCategory; + // The data type primitive category of the column being deserialized. + + private int maxLength; + // For the CHAR and VARCHAR data types, the maximum character length of + // the column. Otherwise, 0. + + private boolean isConvert; + + /* + * This member has information for data type conversion. + * Not defined if there is no conversion. + */ + private Writable conversionWritable; + // Conversion requires the source to be placed in a writable so we can call upon + // VectorAssignRow to convert and assign the row column. + + private ComplexTypeHelper complexTypeHelper; + // For a complex type, a helper object that describes elements, key/value pairs, + // or fields. + + public Field(PrimitiveCategory primitiveCategory, int maxLength) { + this.category = Category.PRIMITIVE; + this.primitiveCategory = primitiveCategory; + this.maxLength = maxLength; + this.isConvert = false; + this.conversionWritable = null; + this.complexTypeHelper = null; + } + + public Field(Category category, ComplexTypeHelper complexTypeHelper) { + this.category = category; + this.primitiveCategory = null; + this.maxLength = 0; + this.isConvert = false; + this.conversionWritable = null; + this.complexTypeHelper = complexTypeHelper; + } + + public Category getCategory() { + return category; + } + + public PrimitiveCategory getPrimitiveCategory() { + return primitiveCategory; + } + + public int getMaxLength() { + return maxLength; + } + + public void setIsConvert(boolean isConvert) { + this.isConvert = isConvert; + } + + public boolean getIsConvert() { + return isConvert; + } + + public void setConversionWritable(Writable conversionWritable) { + this.conversionWritable = conversionWritable; + } + + public Writable getConversionWritable() { + return conversionWritable; + } + + public ComplexTypeHelper getComplexHelper() { + return complexTypeHelper; + } + } + /* * These members have information for deserializing a row into the VectorizedRowBatch * columns.
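The Field class above replaces the old flat per-column arrays with a tree that is allocated recursively from the source TypeInfo. A minimal standalone sketch of how that recursion follows a nested type (not part of the patch; the class name FieldTreeDemo is made up, and only Hive's serde2 typeinfo classes are assumed on the classpath):

import org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.MapTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;

public class FieldTreeDemo {

  // Walk a TypeInfo tree the same way allocateField recurses, printing one
  // line per node so the shape of the resulting Field tree is visible.
  static void walk(TypeInfo typeInfo, String indent) {
    System.out.println(indent + typeInfo.getCategory());
    switch (typeInfo.getCategory()) {
    case LIST:
      walk(((ListTypeInfo) typeInfo).getListElementTypeInfo(), indent + "  ");
      break;
    case MAP:
      walk(((MapTypeInfo) typeInfo).getMapKeyTypeInfo(), indent + "  ");
      walk(((MapTypeInfo) typeInfo).getMapValueTypeInfo(), indent + "  ");
      break;
    case STRUCT:
      for (TypeInfo fieldTypeInfo :
          ((StructTypeInfo) typeInfo).getAllStructFieldTypeInfos()) {
        walk(fieldTypeInfo, indent + "  ");
      }
      break;
    default:
      // PRIMITIVE (and UNION, omitted for brevity) are leaves in this sketch.
      break;
    }
  }

  public static void main(String[] args) {
    // map<int,array<string>> prints MAP, then PRIMITIVE and LIST -> PRIMITIVE.
    walk(TypeInfoUtils.getTypeInfoFromTypeString("map<int,array<string>>"), "");
  }
}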
@@ -105,30 +186,11 @@ private VectorDeserializeRow() { private int[] readFieldLogicalIndices; // The logical indices for reading with readField. - private boolean[] isConvert; - // For each column, are we converting the row column? - private int[] projectionColumnNums; // Assigning can be a subset of columns, so this is the projection -- // the batch column numbers. - private Category[] sourceCategories; - // The data type category of each column being deserialized. - - private PrimitiveCategory[] sourcePrimitiveCategories; - //The data type primitive category of each column being deserialized. - - private int[] maxLengths; - // For the CHAR and VARCHAR data types, the maximum character length of - // the columns. Otherwise, 0. - - /* - * These members have information for data type conversion. - * Not defined if there is no conversion. - */ - Writable[] convertSourceWritables; - // Conversion requires source be placed in writable so we can call upon - // VectorAssignRow to convert and assign the row column. + private Field[] topLevelFields; VectorAssignRow convertVectorAssignRow; // Use its conversion ability. @@ -137,62 +199,117 @@ private VectorDeserializeRow() { * Allocate the source deserialization related arrays. */ private void allocateArrays(int count) { - isConvert = new boolean[count]; projectionColumnNums = new int[count]; Arrays.fill(projectionColumnNums, -1); - sourceCategories = new Category[count]; - sourcePrimitiveCategories = new PrimitiveCategory[count]; - maxLengths = new int[count]; + topLevelFields = new Field[count]; } - /* - * Allocate the conversion related arrays (optional). - */ - private void allocateConvertArrays(int count) { - convertSourceWritables = new Writable[count]; + private Field allocatePrimitiveField(TypeInfo sourceTypeInfo) { + PrimitiveTypeInfo sourcePrimitiveTypeInfo = (PrimitiveTypeInfo) sourceTypeInfo; + PrimitiveCategory sourcePrimitiveCategory = sourcePrimitiveTypeInfo.getPrimitiveCategory(); + int maxLength; + switch (sourcePrimitiveCategory) { + case CHAR: + maxLength = ((CharTypeInfo) sourcePrimitiveTypeInfo).getLength(); + break; + case VARCHAR: + maxLength = ((VarcharTypeInfo) sourcePrimitiveTypeInfo).getLength(); + break; + default: + // No additional data type specific setting.
+ maxLength = 0; + break; + } + return new Field(sourcePrimitiveCategory, maxLength); + } + + private Field allocateComplexField(TypeInfo sourceTypeInfo) { + Category category = sourceTypeInfo.getCategory(); + switch (category) { + case LIST: + { + ListTypeInfo listTypeInfo = (ListTypeInfo) sourceTypeInfo; + ListComplexTypeHelper listHelper = + new ListComplexTypeHelper( + allocateField(listTypeInfo.getListElementTypeInfo())); + return new Field(category, listHelper); + } + case MAP: + { + MapTypeInfo mapTypeInfo = (MapTypeInfo) sourceTypeInfo; + MapComplexTypeHelper mapHelper = + new MapComplexTypeHelper( + allocateField(mapTypeInfo.getMapKeyTypeInfo()), + allocateField(mapTypeInfo.getMapValueTypeInfo())); + return new Field(category, mapHelper); + } + case STRUCT: + { + StructTypeInfo structTypeInfo = (StructTypeInfo) sourceTypeInfo; + ArrayList<TypeInfo> fieldTypeInfoList = structTypeInfo.getAllStructFieldTypeInfos(); + final int count = fieldTypeInfoList.size(); + Field[] fields = new Field[count]; + for (int i = 0; i < count; i++) { + fields[i] = allocateField(fieldTypeInfoList.get(i)); + } + StructComplexTypeHelper structHelper = + new StructComplexTypeHelper(fields); + return new Field(category, structHelper); + } + case UNION: + { + UnionTypeInfo unionTypeInfo = (UnionTypeInfo) sourceTypeInfo; + List<TypeInfo> fieldTypeInfoList = unionTypeInfo.getAllUnionObjectTypeInfos(); + final int count = fieldTypeInfoList.size(); + Field[] fields = new Field[count]; + for (int i = 0; i < count; i++) { + fields[i] = allocateField(fieldTypeInfoList.get(i)); + } + UnionComplexTypeHelper unionHelper = + new UnionComplexTypeHelper(fields); + return new Field(category, unionHelper); + } + default: + throw new RuntimeException("Category " + category + " not supported"); + } + } + + private Field allocateField(TypeInfo sourceTypeInfo) { + switch (sourceTypeInfo.getCategory()) { + case PRIMITIVE: + return allocatePrimitiveField(sourceTypeInfo); + case LIST: + case MAP: + case STRUCT: + case UNION: + return allocateComplexField(sourceTypeInfo); + default: + throw new RuntimeException("Category " + sourceTypeInfo.getCategory() + " not supported"); + } } /* - * Initialize one column's source deserializtion related arrays. + * Initialize one column's source deserialization information. */ - private void initSourceEntry(int logicalColumnIndex, int projectionColumnNum, TypeInfo sourceTypeInfo) { - isConvert[logicalColumnIndex] = false; + private void initTopLevelField(int logicalColumnIndex, int projectionColumnNum, TypeInfo sourceTypeInfo) { + projectionColumnNums[logicalColumnIndex] = projectionColumnNum; - Category sourceCategory = sourceTypeInfo.getCategory(); - sourceCategories[logicalColumnIndex] = sourceCategory; - if (sourceCategory == Category.PRIMITIVE) { - PrimitiveTypeInfo sourcePrimitiveTypeInfo = (PrimitiveTypeInfo) sourceTypeInfo; - PrimitiveCategory sourcePrimitiveCategory = sourcePrimitiveTypeInfo.getPrimitiveCategory(); - sourcePrimitiveCategories[logicalColumnIndex] = sourcePrimitiveCategory; - switch (sourcePrimitiveCategory) { - case CHAR: - maxLengths[logicalColumnIndex] = ((CharTypeInfo) sourcePrimitiveTypeInfo).getLength(); - break; - case VARCHAR: - maxLengths[logicalColumnIndex] = ((VarcharTypeInfo) sourcePrimitiveTypeInfo).getLength(); - break; - default: - // No additional data type specific setting. - break; - } - } else { - // We don't currently support complex types.
- Preconditions.checkState(false); - } + + topLevelFields[logicalColumnIndex] = allocateField(sourceTypeInfo); } /* - * Initialize the conversion related arrays. Assumes initSourceEntry has already been called. + * Initialize the conversion information. Assumes initTopLevelField has already been called. */ - private void initConvertTargetEntry(int logicalColumnIndex) { - isConvert[logicalColumnIndex] = true; + private void addTopLevelConversion(int logicalColumnIndex) { + Field field = topLevelFields[logicalColumnIndex]; - if (sourceCategories[logicalColumnIndex] == Category.PRIMITIVE) { - convertSourceWritables[logicalColumnIndex] = - VectorizedBatchUtil.getPrimitiveWritable(sourcePrimitiveCategories[logicalColumnIndex]); - } else { - // We don't currently support complex types. - Preconditions.checkState(false); + field.setIsConvert(true); + + if (field.getCategory() == Category.PRIMITIVE) { + + field.setConversionWritable( + VectorizedBatchUtil.getPrimitiveWritable(field.getPrimitiveCategory())); } } @@ -206,7 +323,7 @@ public void init(int[] outputColumns) throws HiveException { for (int i = 0; i < count; i++) { int outputColumn = outputColumns[i]; - initSourceEntry(i, outputColumn, sourceTypeInfos[i]); + initTopLevelField(i, outputColumn, sourceTypeInfos[i]); } } @@ -220,7 +337,7 @@ public void init(List<Integer> outputColumns) throws HiveException { for (int i = 0; i < count; i++) { int outputColumn = outputColumns.get(i); - initSourceEntry(i, outputColumn, sourceTypeInfos[i]); + initTopLevelField(i, outputColumn, sourceTypeInfos[i]); } } @@ -234,7 +351,7 @@ public void init(int startColumn) throws HiveException { for (int i = 0; i < count; i++) { int outputColumn = startColumn + i; - initSourceEntry(i, outputColumn, sourceTypeInfos[i]); + initTopLevelField(i, outputColumn, sourceTypeInfos[i]); } } @@ -260,7 +377,7 @@ public void init(boolean[] columnsToIncludeTruncated) throws HiveException { } else { - initSourceEntry(i, i, sourceTypeInfos[i]); + initTopLevelField(i, i, sourceTypeInfos[i]); includedIndices[includedCount++] = i; } } @@ -298,7 +415,6 @@ public void initConversion(TypeInfo[] targetTypeInfos, final int columnCount = sourceTypeInfos.length; allocateArrays(columnCount); - allocateConvertArrays(columnCount); int includedCount = 0; int[] includedIndices = new int[columnCount]; @@ -320,20 +436,22 @@ if (VectorPartitionConversion.isImplicitVectorColumnConversion(sourceTypeInfo, targetTypeInfo)) { // Do implicit conversion from source type to target type. - initSourceEntry(i, i, sourceTypeInfo); + initTopLevelField(i, i, sourceTypeInfo); } else { // Do formal conversion... - initSourceEntry(i, i, sourceTypeInfo); - initConvertTargetEntry(i); + initTopLevelField(i, i, sourceTypeInfo); + + // UNDONE: No for List and Map; Yes for Struct and Union when field count different... + addTopLevelConversion(i); atLeastOneConvert = true; } } else { // No conversion. - initSourceEntry(i, i, sourceTypeInfo); + initTopLevelField(i, i, sourceTypeInfo); } @@ -360,6 +478,405 @@ public void init() throws HiveException { init(0); } + private void storePrimitiveRowColumn(ColumnVector colVector, + Field field, int batchIndex, + boolean canRetainByteRef) throws IOException { + switch (field.getPrimitiveCategory()) { + case VOID: + VectorizedBatchUtil.setNullColIsNullValue(colVector, batchIndex); + return; + case BOOLEAN: + ((LongColumnVector) colVector).vector[batchIndex] = + (deserializeRead.currentBoolean ?
1 : 0); + break; + case BYTE: + ((LongColumnVector) colVector).vector[batchIndex] = + deserializeRead.currentByte; + break; + case SHORT: + ((LongColumnVector) colVector).vector[batchIndex] = + deserializeRead.currentShort; + break; + case INT: + ((LongColumnVector) colVector).vector[batchIndex] = + deserializeRead.currentInt; + break; + case LONG: + ((LongColumnVector) colVector).vector[batchIndex] = + deserializeRead.currentLong; + break; + case TIMESTAMP: + ((TimestampColumnVector) colVector).set( + batchIndex, deserializeRead.currentTimestampWritable.getTimestamp()); + break; + case DATE: + ((LongColumnVector) colVector).vector[batchIndex] = + deserializeRead.currentDateWritable.getDays(); + break; + case FLOAT: + ((DoubleColumnVector) colVector).vector[batchIndex] = + deserializeRead.currentFloat; + break; + case DOUBLE: + ((DoubleColumnVector) colVector).vector[batchIndex] = + deserializeRead.currentDouble; + break; + case BINARY: + case STRING: + { + BytesColumnVector bytesColVec = ((BytesColumnVector) colVector); + if (deserializeRead.currentExternalBufferNeeded) { + bytesColVec.ensureValPreallocated(deserializeRead.currentExternalBufferNeededLen); + deserializeRead.copyToExternalBuffer( + bytesColVec.getValPreallocatedBytes(), bytesColVec.getValPreallocatedStart()); + bytesColVec.setValPreallocated( + batchIndex, + deserializeRead.currentExternalBufferNeededLen); + } else if (canRetainByteRef && inputBytes == deserializeRead.currentBytes) { + bytesColVec.setRef( + batchIndex, + deserializeRead.currentBytes, + deserializeRead.currentBytesStart, + deserializeRead.currentBytesLength); + } else { + bytesColVec.setVal( + batchIndex, + deserializeRead.currentBytes, + deserializeRead.currentBytesStart, + deserializeRead.currentBytesLength); + } + } + break; + case VARCHAR: + { + // Use the basic STRING bytes read to get access, then use our optimal truncate/trim method + // that does not use Java String objects. + BytesColumnVector bytesColVec = ((BytesColumnVector) colVector); + if (deserializeRead.currentExternalBufferNeeded) { + // Write directly into our BytesColumnVector value buffer. + bytesColVec.ensureValPreallocated(deserializeRead.currentExternalBufferNeededLen); + byte[] convertBuffer = bytesColVec.getValPreallocatedBytes(); + int convertBufferStart = bytesColVec.getValPreallocatedStart(); + deserializeRead.copyToExternalBuffer( + convertBuffer, + convertBufferStart); + bytesColVec.setValPreallocated( + batchIndex, + StringExpr.truncate( + convertBuffer, + convertBufferStart, + deserializeRead.currentExternalBufferNeededLen, + field.getMaxLength())); + } else if (canRetainByteRef && inputBytes == deserializeRead.currentBytes) { + bytesColVec.setRef( + batchIndex, + deserializeRead.currentBytes, + deserializeRead.currentBytesStart, + StringExpr.truncate( + deserializeRead.currentBytes, + deserializeRead.currentBytesStart, + deserializeRead.currentBytesLength, + field.getMaxLength())); + } else { + bytesColVec.setVal( + batchIndex, + deserializeRead.currentBytes, + deserializeRead.currentBytesStart, + StringExpr.truncate( + deserializeRead.currentBytes, + deserializeRead.currentBytesStart, + deserializeRead.currentBytesLength, + field.getMaxLength())); + } + } + break; + case CHAR: + { + // Use the basic STRING bytes read to get access, then use our optimal truncate/trim method + // that does not use Java String objects. 
+ BytesColumnVector bytesColVec = ((BytesColumnVector) colVector); + if (deserializeRead.currentExternalBufferNeeded) { + // Write directly into our BytesColumnVector value buffer. + bytesColVec.ensureValPreallocated(deserializeRead.currentExternalBufferNeededLen); + byte[] convertBuffer = bytesColVec.getValPreallocatedBytes(); + int convertBufferStart = bytesColVec.getValPreallocatedStart(); + deserializeRead.copyToExternalBuffer( + convertBuffer, + convertBufferStart); + bytesColVec.setValPreallocated( + batchIndex, + StringExpr.rightTrimAndTruncate( + convertBuffer, + convertBufferStart, + deserializeRead.currentExternalBufferNeededLen, + field.getMaxLength())); + } else if (canRetainByteRef && inputBytes == deserializeRead.currentBytes) { + bytesColVec.setRef( + batchIndex, + deserializeRead.currentBytes, + deserializeRead.currentBytesStart, + StringExpr.rightTrimAndTruncate( + deserializeRead.currentBytes, + deserializeRead.currentBytesStart, + deserializeRead.currentBytesLength, + field.getMaxLength())); + } else { + bytesColVec.setVal( + batchIndex, + deserializeRead.currentBytes, + deserializeRead.currentBytesStart, + StringExpr.rightTrimAndTruncate( + deserializeRead.currentBytes, + deserializeRead.currentBytesStart, + deserializeRead.currentBytesLength, + field.getMaxLength())); + } + } + break; + case DECIMAL: + // The DecimalColumnVector set method will quickly copy the deserialized decimal writable fields. + ((DecimalColumnVector) colVector).set( + batchIndex, deserializeRead.currentHiveDecimalWritable); + break; + case INTERVAL_YEAR_MONTH: + ((LongColumnVector) colVector).vector[batchIndex] = + deserializeRead.currentHiveIntervalYearMonthWritable.getHiveIntervalYearMonth().getTotalMonths(); + break; + case INTERVAL_DAY_TIME: + ((IntervalDayTimeColumnVector) colVector).set( + batchIndex, deserializeRead.currentHiveIntervalDayTimeWritable.getHiveIntervalDayTime()); + break; + default: + throw new RuntimeException("Primitive category " + field.getPrimitiveCategory() + + " not supported"); + } + } + + private static class ComplexTypeHelper { + } + + private static class ListComplexTypeHelper extends ComplexTypeHelper { + + private Field elementField; + + public ListComplexTypeHelper(Field elementField) { + this.elementField = elementField; + } + + public Field getElementField() { + return elementField; + } + } + + private static class MapComplexTypeHelper extends ComplexTypeHelper { + + private Field keyField; + private Field valueField; + + public MapComplexTypeHelper(Field keyField, Field valueField) { + this.keyField = keyField; + this.valueField = valueField; + } + + public Field getKeyField() { + return keyField; + } + + public Field getValueField() { + return valueField; + } + } + + private static class FieldsComplexTypeHelper extends ComplexTypeHelper { + + private Field[] fields; + + public FieldsComplexTypeHelper(Field[] fields) { + this.fields = fields; + } + + public Field[] getFields() { + return fields; + } + } + + private static class StructComplexTypeHelper extends FieldsComplexTypeHelper { + + public StructComplexTypeHelper(Field[] fields) { + super(fields); + } + } + + private static class UnionComplexTypeHelper extends FieldsComplexTypeHelper { + + public UnionComplexTypeHelper(Field[] fields) { + super(fields); + } + } + + // UNDONE: Presumption of *append* + + private void storeComplexFieldRowColumn(ColumnVector fieldColVector, + Field field, int batchIndex, boolean canRetainByteRef) throws IOException { + + if (!deserializeRead.readComplexField()) { + 
fieldColVector.isNull[batchIndex] = true; + fieldColVector.noNulls = false; + return; + } + + switch (field.getCategory()) { + case PRIMITIVE: + storePrimitiveRowColumn( + fieldColVector, + field, + batchIndex, + canRetainByteRef); + break; + case LIST: + storeListRowColumn( + fieldColVector, + field, + batchIndex, + canRetainByteRef); + break; + case MAP: + storeMapRowColumn( + fieldColVector, + field, + batchIndex, + canRetainByteRef); + break; + case STRUCT: + storeStructRowColumn( + fieldColVector, + field, + batchIndex, + canRetainByteRef); + break; + case UNION: + storeUnionRowColumn( + fieldColVector, + field, + batchIndex, + canRetainByteRef); + break; + default: + throw new RuntimeException("Category " + field.getCategory() + " not supported"); + } + } + + private void storeListRowColumn(ColumnVector colVector, + Field field, int batchIndex, boolean canRetainByteRef) throws IOException { + + // The read field of the List gives us its length. + final int listLength = deserializeRead.currentInt; + + ListColumnVector listColVector = (ListColumnVector) colVector; + listColVector.isNull[batchIndex] = false; + int offset = listColVector.childCount; + listColVector.offsets[batchIndex] = offset; + listColVector.childCount += listLength; + listColVector.lengths[batchIndex] = listLength; + + ColumnVector elementColVector = listColVector.child; + + ListComplexTypeHelper listHelper = (ListComplexTypeHelper) field.getComplexHelper(); + + for (int i = 0; i < listLength; i++) { + storeComplexFieldRowColumn( + elementColVector, + listHelper.getElementField(), + offset, + canRetainByteRef); + offset++; + } + } + + private void storeMapRowColumn(ColumnVector colVector, + Field field, int batchIndex, boolean canRetainByteRef) throws IOException { + + // The read field of the Map gives us its key/value count. + final int keyValueCount = deserializeRead.currentInt; + + MapColumnVector mapColVector = (MapColumnVector) colVector; + + mapColVector.isNull[batchIndex] = false; + int offset = mapColVector.childCount; + mapColVector.offsets[batchIndex] = offset; + mapColVector.childCount += keyValueCount; + mapColVector.lengths[batchIndex] = keyValueCount; + + ColumnVector keysColVector = mapColVector.keys; + ColumnVector valuesColVector = mapColVector.values; + + MapComplexTypeHelper mapHelper = (MapComplexTypeHelper) field.getComplexHelper(); + + for (int i = 0; i < keyValueCount; i++) { + + // Key. + storeComplexFieldRowColumn( + keysColVector, + mapHelper.getKeyField(), + offset, + canRetainByteRef); + + // Value. + storeComplexFieldRowColumn( + valuesColVector, + mapHelper.getValueField(), + offset, + canRetainByteRef); + + offset++; + } + } + + private void storeStructRowColumn(ColumnVector colVector, + Field field, int batchIndex, boolean canRetainByteRef) throws IOException { + + StructColumnVector structColVector = (StructColumnVector) colVector; + + structColVector.isNull[batchIndex] = false; + + ColumnVector[] colVectorFields = structColVector.fields; + + StructComplexTypeHelper structHelper = (StructComplexTypeHelper) field.getComplexHelper(); + + Field[] fields = structHelper.getFields(); + int i = 0; + for (ColumnVector colVectorField : colVectorFields) { + storeComplexFieldRowColumn( + colVectorField, + fields[i], + batchIndex, + canRetainByteRef); + i++; + } + } + + private void storeUnionRowColumn(ColumnVector colVector, + Field field, int batchIndex, boolean canRetainByteRef) throws IOException { + + // The read field of the Union gives us its tag. 
+ final int tag = deserializeRead.currentInt; + + UnionColumnVector unionColVector = (UnionColumnVector) colVector; + + unionColVector.isNull[batchIndex] = false; + + ColumnVector[] colVectorFields = unionColVector.fields; + unionColVector.tags[batchIndex] = tag; + + UnionComplexTypeHelper unionHelper = (UnionComplexTypeHelper) field.getComplexHelper(); + + storeComplexFieldRowColumn( + colVectorFields[tag], + unionHelper.getFields()[tag], + batchIndex, + canRetainByteRef); + } + /** * Store one row column value that is the current value in deserializeRead. * @@ -374,186 +891,49 @@ public void init() throws HiveException { * @throws IOException */ private void storeRowColumn(VectorizedRowBatch batch, int batchIndex, - int logicalColumnIndex, boolean canRetainByteRef) throws IOException { + Field field, int logicalColumnIndex, boolean canRetainByteRef) throws IOException { final int projectionColumnNum = projectionColumnNums[logicalColumnIndex]; - switch (sourceCategories[logicalColumnIndex]) { + ColumnVector colVector = batch.cols[projectionColumnNum]; + + switch (field.getCategory()) { case PRIMITIVE: - { - PrimitiveCategory sourcePrimitiveCategory = sourcePrimitiveCategories[logicalColumnIndex]; - switch (sourcePrimitiveCategory) { - case VOID: - VectorizedBatchUtil.setNullColIsNullValue(batch.cols[projectionColumnNum], batchIndex); - return; - case BOOLEAN: - ((LongColumnVector) batch.cols[projectionColumnNum]).vector[batchIndex] = - (deserializeRead.currentBoolean ? 1 : 0); - break; - case BYTE: - ((LongColumnVector) batch.cols[projectionColumnNum]).vector[batchIndex] = - deserializeRead.currentByte; - break; - case SHORT: - ((LongColumnVector) batch.cols[projectionColumnNum]).vector[batchIndex] = - deserializeRead.currentShort; - break; - case INT: - ((LongColumnVector) batch.cols[projectionColumnNum]).vector[batchIndex] = - deserializeRead.currentInt; - break; - case LONG: - ((LongColumnVector) batch.cols[projectionColumnNum]).vector[batchIndex] = - deserializeRead.currentLong; - break; - case TIMESTAMP: - ((TimestampColumnVector) batch.cols[projectionColumnNum]).set( - batchIndex, deserializeRead.currentTimestampWritable.getTimestamp()); - break; - case DATE: - ((LongColumnVector) batch.cols[projectionColumnNum]).vector[batchIndex] = - deserializeRead.currentDateWritable.getDays(); - break; - case FLOAT: - ((DoubleColumnVector) batch.cols[projectionColumnNum]).vector[batchIndex] = - deserializeRead.currentFloat; - break; - case DOUBLE: - ((DoubleColumnVector) batch.cols[projectionColumnNum]).vector[batchIndex] = - deserializeRead.currentDouble; - break; - case BINARY: - case STRING: - { - BytesColumnVector bytesColVec = ((BytesColumnVector) batch.cols[projectionColumnNum]); - if (deserializeRead.currentExternalBufferNeeded) { - bytesColVec.ensureValPreallocated(deserializeRead.currentExternalBufferNeededLen); - deserializeRead.copyToExternalBuffer( - bytesColVec.getValPreallocatedBytes(), bytesColVec.getValPreallocatedStart()); - bytesColVec.setValPreallocated( - batchIndex, - deserializeRead.currentExternalBufferNeededLen); - } else if (canRetainByteRef && inputBytes == deserializeRead.currentBytes) { - bytesColVec.setRef( - batchIndex, - deserializeRead.currentBytes, - deserializeRead.currentBytesStart, - deserializeRead.currentBytesLength); - } else { - bytesColVec.setVal( - batchIndex, - deserializeRead.currentBytes, - deserializeRead.currentBytesStart, - deserializeRead.currentBytesLength); - } - } - break; - case VARCHAR: - { - // Use the basic STRING bytes read to get access, 
then use our optimal truncate/trim method - // that does not use Java String objects. - BytesColumnVector bytesColVec = ((BytesColumnVector) batch.cols[projectionColumnNum]); - if (deserializeRead.currentExternalBufferNeeded) { - // Write directly into our BytesColumnVector value buffer. - bytesColVec.ensureValPreallocated(deserializeRead.currentExternalBufferNeededLen); - byte[] convertBuffer = bytesColVec.getValPreallocatedBytes(); - int convertBufferStart = bytesColVec.getValPreallocatedStart(); - deserializeRead.copyToExternalBuffer( - convertBuffer, - convertBufferStart); - bytesColVec.setValPreallocated( - batchIndex, - StringExpr.truncate( - convertBuffer, - convertBufferStart, - deserializeRead.currentExternalBufferNeededLen, - maxLengths[logicalColumnIndex])); - } else if (canRetainByteRef && inputBytes == deserializeRead.currentBytes) { - bytesColVec.setRef( - batchIndex, - deserializeRead.currentBytes, - deserializeRead.currentBytesStart, - StringExpr.truncate( - deserializeRead.currentBytes, - deserializeRead.currentBytesStart, - deserializeRead.currentBytesLength, - maxLengths[logicalColumnIndex])); - } else { - bytesColVec.setVal( - batchIndex, - deserializeRead.currentBytes, - deserializeRead.currentBytesStart, - StringExpr.truncate( - deserializeRead.currentBytes, - deserializeRead.currentBytesStart, - deserializeRead.currentBytesLength, - maxLengths[logicalColumnIndex])); - } - } - break; - case CHAR: - { - // Use the basic STRING bytes read to get access, then use our optimal truncate/trim method - // that does not use Java String objects. - BytesColumnVector bytesColVec = ((BytesColumnVector) batch.cols[projectionColumnNum]); - if (deserializeRead.currentExternalBufferNeeded) { - // Write directly into our BytesColumnVector value buffer. - bytesColVec.ensureValPreallocated(deserializeRead.currentExternalBufferNeededLen); - byte[] convertBuffer = bytesColVec.getValPreallocatedBytes(); - int convertBufferStart = bytesColVec.getValPreallocatedStart(); - deserializeRead.copyToExternalBuffer( - convertBuffer, - convertBufferStart); - bytesColVec.setValPreallocated( - batchIndex, - StringExpr.rightTrimAndTruncate( - convertBuffer, - convertBufferStart, - deserializeRead.currentExternalBufferNeededLen, - maxLengths[logicalColumnIndex])); - } else if (canRetainByteRef && inputBytes == deserializeRead.currentBytes) { - bytesColVec.setRef( - batchIndex, - deserializeRead.currentBytes, - deserializeRead.currentBytesStart, - StringExpr.rightTrimAndTruncate( - deserializeRead.currentBytes, - deserializeRead.currentBytesStart, - deserializeRead.currentBytesLength, - maxLengths[logicalColumnIndex])); - } else { - bytesColVec.setVal( - batchIndex, - deserializeRead.currentBytes, - deserializeRead.currentBytesStart, - StringExpr.rightTrimAndTruncate( - deserializeRead.currentBytes, - deserializeRead.currentBytesStart, - deserializeRead.currentBytesLength, - maxLengths[logicalColumnIndex])); - } - } - break; - case DECIMAL: - // The DecimalColumnVector set method will quickly copy the deserialized decimal writable fields. 
- ((DecimalColumnVector) batch.cols[projectionColumnNum]).set( - batchIndex, deserializeRead.currentHiveDecimalWritable); - break; - case INTERVAL_YEAR_MONTH: - ((LongColumnVector) batch.cols[projectionColumnNum]).vector[batchIndex] = - deserializeRead.currentHiveIntervalYearMonthWritable.getHiveIntervalYearMonth().getTotalMonths(); - break; - case INTERVAL_DAY_TIME: - ((IntervalDayTimeColumnVector) batch.cols[projectionColumnNum]).set( - batchIndex, deserializeRead.currentHiveIntervalDayTimeWritable.getHiveIntervalDayTime()); - break; - default: - throw new RuntimeException("Primitive category " + sourcePrimitiveCategory.name() + - " not supported"); - } - } + storePrimitiveRowColumn( + colVector, + field, + batchIndex, + canRetainByteRef); + break; + case LIST: + storeListRowColumn( + colVector, + field, + batchIndex, + canRetainByteRef); + break; + case MAP: + storeMapRowColumn( + colVector, + field, + batchIndex, + canRetainByteRef); + break; + case STRUCT: + storeStructRowColumn( + colVector, + field, + batchIndex, + canRetainByteRef); + break; + case UNION: + storeUnionRowColumn( + colVector, + field, + batchIndex, + canRetainByteRef); break; default: - throw new RuntimeException("Category " + sourceCategories[logicalColumnIndex] + " not supported"); + throw new RuntimeException("Category " + field.getCategory() + " not supported"); } // We always set the null flag to false when there is a value. @@ -572,13 +952,13 @@ private void storeRowColumn(VectorizedRowBatch batch, int batchIndex, * @throws IOException */ private void convertRowColumn(VectorizedRowBatch batch, int batchIndex, - int logicalColumnIndex) throws IOException { - final int projectionColumnNum = projectionColumnNums[logicalColumnIndex]; - Writable convertSourceWritable = convertSourceWritables[logicalColumnIndex]; - switch (sourceCategories[logicalColumnIndex]) { + Field field, int logicalColumnIndex) throws IOException { + + Writable convertSourceWritable = field.getConversionWritable(); + switch (field.getCategory()) { case PRIMITIVE: { - switch (sourcePrimitiveCategories[logicalColumnIndex]) { + switch (field.getPrimitiveCategory()) { case VOID: convertSourceWritable = null; break; @@ -611,7 +991,9 @@ private void convertRowColumn(VectorizedRowBatch batch, int batchIndex, break; case BINARY: if (deserializeRead.currentBytes == null) { - LOG.info("null binary entry: batchIndex " + batchIndex + " projection column num " + projectionColumnNum); + LOG.info( + "null binary entry: batchIndex " + batchIndex + " projection column num " + + projectionColumnNums[logicalColumnIndex]); } ((BytesWritable) convertSourceWritable).set( @@ -622,7 +1004,8 @@ private void convertRowColumn(VectorizedRowBatch batch, int batchIndex, case STRING: if (deserializeRead.currentBytes == null) { throw new RuntimeException( - "null string entry: batchIndex " + batchIndex + " projection column num " + projectionColumnNum); + "null string entry: batchIndex " + batchIndex + " projection column num " + + projectionColumnNums[logicalColumnIndex]); } // Use org.apache.hadoop.io.Text as our helper to go from byte[] to String. @@ -637,14 +1020,15 @@ private void convertRowColumn(VectorizedRowBatch batch, int batchIndex, // that does not use Java String objects. 
if (deserializeRead.currentBytes == null) { throw new RuntimeException( - "null varchar entry: batchIndex " + batchIndex + " projection column num " + projectionColumnNum); + "null varchar entry: batchIndex " + batchIndex + " projection column num " + + projectionColumnNums[logicalColumnIndex]); } int adjustedLength = StringExpr.truncate( deserializeRead.currentBytes, deserializeRead.currentBytesStart, deserializeRead.currentBytesLength, - maxLengths[logicalColumnIndex]); + field.getMaxLength()); ((HiveVarcharWritable) convertSourceWritable).set( new String( @@ -661,14 +1045,15 @@ private void convertRowColumn(VectorizedRowBatch batch, int batchIndex, // that does not use Java String objects. if (deserializeRead.currentBytes == null) { throw new RuntimeException( - "null char entry: batchIndex " + batchIndex + " projection column num " + projectionColumnNum); + "null char entry: batchIndex " + batchIndex + " projection column num " + + projectionColumnNums[logicalColumnIndex]); } int adjustedLength = StringExpr.rightTrimAndTruncate( deserializeRead.currentBytes, deserializeRead.currentBytesStart, deserializeRead.currentBytesLength, - maxLengths[logicalColumnIndex]); + field.getMaxLength()); ((HiveCharWritable) convertSourceWritable).set( new String( @@ -691,13 +1076,26 @@ private void convertRowColumn(VectorizedRowBatch batch, int batchIndex, deserializeRead.currentHiveIntervalDayTimeWritable); break; default: - throw new RuntimeException("Primitive category " + sourcePrimitiveCategories[logicalColumnIndex] + + throw new RuntimeException("Primitive category " + field.getPrimitiveCategory() + " not supported"); } } break; + + case STRUCT: + case UNION: + // The only aspect of conversion to Struct / Union themselves is adding fields as NULL at the end + // (no removal from the end? which would mean skipping fields...) + + // UNDONE + break; + + case LIST: + case MAP: + // Conversion only happens below to List elements or Map key and/or values and not to the + // List or Map itself. default: - throw new RuntimeException("Category " + sourceCategories[logicalColumnIndex] + " not supported"); + throw new RuntimeException("Category " + field.getCategory() + " not supported"); } /* @@ -739,7 +1137,10 @@ public void deserialize(VectorizedRowBatch batch, int batchIndex) throws IOExcep // Pass false for canRetainByteRef since we will NOT be keeping byte references to the input // bytes with the BytesColumnVector.setRef method. - final int count = isConvert.length; + final int count = topLevelFields.length; + + Field field; + if (!useReadField) { for (int i = 0; i < count; i++) { final int projectionColumnNum = projectionColumnNums[i]; @@ -755,10 +1156,11 @@ public void deserialize(VectorizedRowBatch batch, int batchIndex) throws IOExcep continue; } // The current* members of deserializeRead have the field value. - if (isConvert[i]) { - convertRowColumn(batch, batchIndex, i); + field = topLevelFields[i]; + if (field.getIsConvert()) { + convertRowColumn(batch, batchIndex, field, i); } else { - storeRowColumn(batch, batchIndex, i, /* canRetainByteRef */ false); + storeRowColumn(batch, batchIndex, field, i, /* canRetainByteRef */ false); } } } else { @@ -773,10 +1175,11 @@ public void deserialize(VectorizedRowBatch batch, int batchIndex) throws IOExcep continue; } // The current* members of deserializeRead have the field value.
- if (isConvert[logicalIndex]) { - convertRowColumn(batch, batchIndex, logicalIndex); + field = topLevelFields[logicalIndex]; + if (field.getIsConvert()) { + convertRowColumn(batch, batchIndex, field, logicalIndex); } else { - storeRowColumn(batch, batchIndex, logicalIndex, /* canRetainByteRef */ false); + storeRowColumn(batch, batchIndex, field, logicalIndex, /* canRetainByteRef */ false); } } } @@ -803,7 +1206,11 @@ public void deserialize(VectorizedRowBatch batch, int batchIndex) throws IOExcep * @throws IOException */ public void deserializeByRef(VectorizedRowBatch batch, int batchIndex) throws IOException { - final int count = isConvert.length; + + final int count = topLevelFields.length; + + Field field; + if (!useReadField) { for (int i = 0; i < count; i++) { final int projectionColumnNum = projectionColumnNums[i]; @@ -819,10 +1226,11 @@ public void deserializeByRef(VectorizedRowBatch batch, int batchIndex) throws IO continue; } // The current* members of deserializeRead have the field value. - if (isConvert[i]) { - convertRowColumn(batch, batchIndex, i); + field = topLevelFields[i]; + if (field.getIsConvert()) { + convertRowColumn(batch, batchIndex, field, i); } else { - storeRowColumn(batch, batchIndex, i, /* canRetainByteRef */ true); + storeRowColumn(batch, batchIndex, field, i, /* canRetainByteRef */ true); } } } else { @@ -837,10 +1245,11 @@ public void deserializeByRef(VectorizedRowBatch batch, int batchIndex) throws IO continue; } // The current* members of deserializeRead have the field value. - if (isConvert[logicalIndex]) { - convertRowColumn(batch, batchIndex, logicalIndex); + field = topLevelFields[logicalIndex]; + if (field.getIsConvert()) { + convertRowColumn(batch, batchIndex, field, logicalIndex); } else { - storeRowColumn(batch, batchIndex, logicalIndex, /* canRetainByteRef */ true); + storeRowColumn(batch, batchIndex, field, logicalIndex, /* canRetainByteRef */ true); } } } diff --git ql/src/test/org/apache/hadoop/hive/ql/exec/vector/TestVectorRowObject.java ql/src/test/org/apache/hadoop/hive/ql/exec/vector/TestVectorRowObject.java index e9ce8e8..906950a 100644 --- ql/src/test/org/apache/hadoop/hive/ql/exec/vector/TestVectorRowObject.java +++ ql/src/test/org/apache/hadoop/hive/ql/exec/vector/TestVectorRowObject.java @@ -51,7 +51,9 @@ void testVectorRowObject(int caseNum, boolean sort, Random r) throws HiveExcepti String[] emptyScratchTypeNames = new String[0]; VectorRandomRowSource source = new VectorRandomRowSource(); - source.init(r); + + // UNDONE: Until we have full complex support -- disable. + source.init(r, false, 4); VectorizedRowBatchCtx batchContext = new VectorizedRowBatchCtx(); batchContext.init(source.rowStructObjectInspector(), emptyScratchTypeNames); diff --git ql/src/test/org/apache/hadoop/hive/ql/exec/vector/TestVectorSerDeRow.java ql/src/test/org/apache/hadoop/hive/ql/exec/vector/TestVectorSerDeRow.java index b29bb8b..f9e5ecd 100644 --- ql/src/test/org/apache/hadoop/hive/ql/exec/vector/TestVectorSerDeRow.java +++ ql/src/test/org/apache/hadoop/hive/ql/exec/vector/TestVectorSerDeRow.java @@ -315,7 +315,10 @@ void testVectorSerializeRow(Random r, SerializationType serializationType) String[] emptyScratchTypeNames = new String[0]; VectorRandomRowSource source = new VectorRandomRowSource(); - source.init(r); + + // UNDONE: Until we have full complex support -- disable. 
+ source.init(r, false, 4); + VectorizedRowBatchCtx batchContext = new VectorizedRowBatchCtx(); batchContext.init(source.rowStructObjectInspector(), emptyScratchTypeNames); @@ -339,12 +342,14 @@ void testVectorSerializeRow(Random r, SerializationType serializationType) case LAZY_SIMPLE: { StructObjectInspector rowObjectInspector = source.rowStructObjectInspector(); - LazySerDeParameters lazySerDeParams = getSerDeParams(rowObjectInspector); byte separator = (byte) '\t'; - deserializeRead = new LazySimpleDeserializeRead(source.primitiveTypeInfos(), /* useExternalBuffer */ false, - separator, lazySerDeParams); - serializeWrite = new LazySimpleSerializeWrite(fieldCount, - separator, lazySerDeParams); + LazySerDeParameters lazySerDeParams = getSerDeParams(rowObjectInspector, new byte[] { separator }); + deserializeRead = + new LazySimpleDeserializeRead( + source.primitiveTypeInfos(), + /* useExternalBuffer */ false, + lazySerDeParams); + serializeWrite = new LazySimpleSerializeWrite(fieldCount, lazySerDeParams); } break; default: @@ -531,18 +536,26 @@ private void addToProperties(Properties tbl, String fieldNames, String fieldType tbl.setProperty("columns", fieldNames); tbl.setProperty("columns.types", fieldTypes); + tbl.setProperty(serdeConstants.SERIALIZATION_NULL_FORMAT, "NULL"); } - private LazySerDeParameters getSerDeParams( StructObjectInspector rowObjectInspector) throws SerDeException { - return getSerDeParams(new Configuration(), new Properties(), rowObjectInspector); + private LazySerDeParameters getSerDeParams( StructObjectInspector rowObjectInspector, + byte[] separators) throws SerDeException { + return getSerDeParams(new Configuration(), new Properties(), rowObjectInspector, separators); } - private LazySerDeParameters getSerDeParams(Configuration conf, Properties tbl, StructObjectInspector rowObjectInspector) throws SerDeException { + private LazySerDeParameters getSerDeParams( + Configuration conf, Properties tbl, StructObjectInspector rowObjectInspector, + byte[] separators) throws SerDeException { String fieldNames = ObjectInspectorUtils.getFieldNames(rowObjectInspector); String fieldTypes = ObjectInspectorUtils.getFieldTypes(rowObjectInspector); addToProperties(tbl, fieldNames, fieldTypes); - return new LazySerDeParameters(conf, tbl, LazySimpleSerDe.class.getName()); + LazySerDeParameters lazySerDeParams = new LazySerDeParameters(conf, tbl, LazySimpleSerDe.class.getName()); + for (int i = 0; i < separators.length; i++) { + lazySerDeParams.setSeparator(i, separators[i]); + } + return lazySerDeParams; } void testVectorDeserializeRow(Random r, SerializationType serializationType, @@ -553,7 +566,9 @@ void testVectorDeserializeRow(Random r, SerializationType serializationType, String[] emptyScratchTypeNames = new String[0]; VectorRandomRowSource source = new VectorRandomRowSource(); - source.init(r); + + // UNDONE: Until we have full complex support -- disable. + source.init(r, false, 4); VectorizedRowBatchCtx batchContext = new VectorizedRowBatchCtx(); batchContext.init(source.rowStructObjectInspector(), emptyScratchTypeNames); @@ -623,7 +638,8 @@ void testVectorDeserializeRow(Random r, SerializationType serializationType, tbl.setProperty(serdeConstants.ESCAPE_CHAR, escapeString); } - LazySerDeParameters lazySerDeParams = getSerDeParams(conf, tbl, rowObjectInspector); + LazySerDeParameters lazySerDeParams = + getSerDeParams(conf, tbl, rowObjectInspector, new byte[] { separator }); if (useLazySimpleEscapes) { // LazySimple seems to throw away everything but \n and \r. 
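For reference, a minimal sketch of how the reworked test helper above drives LazySerDeParameters with an explicit separator (not part of the patch; the class name SerDeParamsDemo and its column names are made up; setSeparator is the accessor the patch itself calls, and the table property keys are the same ones the test sets):

import java.util.Properties;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.serde.serdeConstants;
import org.apache.hadoop.hive.serde2.SerDeException;
import org.apache.hadoop.hive.serde2.lazy.LazySerDeParameters;
import org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe;

public class SerDeParamsDemo {

  static LazySerDeParameters makeParams(byte[] separators) throws SerDeException {
    Properties tbl = new Properties();
    tbl.setProperty("columns", "a,b");
    tbl.setProperty("columns.types", "int,string");
    tbl.setProperty(serdeConstants.SERIALIZATION_NULL_FORMAT, "NULL");
    LazySerDeParameters params =
        new LazySerDeParameters(new Configuration(), tbl, LazySimpleSerDe.class.getName());
    // Override the default separators; level 0 is the top-level field separator.
    for (int i = 0; i < separators.length; i++) {
      params.setSeparator(i, separators[i]);
    }
    return params;
  }

  public static void main(String[] args) throws SerDeException {
    makeParams(new byte[] { (byte) '\t' });
  }
}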
@@ -645,10 +661,12 @@ void testVectorDeserializeRow(Random r, SerializationType serializationType, source.addEscapables(needsEscapeStr); } } - deserializeRead = new LazySimpleDeserializeRead(source.primitiveTypeInfos(), useExternalBuffer, - separator, lazySerDeParams); - serializeWrite = new LazySimpleSerializeWrite(fieldCount, - separator, lazySerDeParams); + deserializeRead = + new LazySimpleDeserializeRead( + source.primitiveTypeInfos(), + useExternalBuffer, + lazySerDeParams); + serializeWrite = new LazySimpleSerializeWrite(fieldCount, lazySerDeParams); } break; default: diff --git ql/src/test/org/apache/hadoop/hive/ql/exec/vector/VectorRandomRowSource.java ql/src/test/org/apache/hadoop/hive/ql/exec/vector/VectorRandomRowSource.java index cbde615..f531e1f 100644 --- ql/src/test/org/apache/hadoop/hive/ql/exec/vector/VectorRandomRowSource.java +++ ql/src/test/org/apache/hadoop/hive/ql/exec/vector/VectorRandomRowSource.java @@ -18,6 +18,7 @@ package org.apache.hadoop.hive.ql.exec.vector; +import java.io.IOException; import java.sql.Date; import java.sql.Timestamp; import java.util.ArrayList; @@ -34,12 +35,24 @@ import org.apache.hadoop.hive.common.type.HiveIntervalYearMonth; import org.apache.hadoop.hive.common.type.HiveVarchar; import org.apache.hadoop.hive.common.type.RandomTypeUtil; -import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.serde2.io.HiveCharWritable; +import org.apache.hadoop.hive.serde2.io.HiveVarcharWritable; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils; import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory; +import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.MapObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.SettableListObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.StandardListObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.StandardMapObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.StandardStructObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.StandardUnionObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.StandardUnionObjectInspector.StandardUnion; +import org.apache.hadoop.hive.serde2.objectinspector.StructField; import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.UnionObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableBooleanObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableByteObjectInspector; @@ -58,11 +71,19 @@ import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableTimestampObjectInspector; import org.apache.hadoop.hive.serde2.typeinfo.CharTypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.DecimalTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.MapTypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo; +import 
org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils; +import org.apache.hadoop.hive.serde2.typeinfo.UnionTypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.VarcharTypeInfo; import org.apache.hive.common.util.DateUtils; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.BytesWritable; +import com.google.common.base.Preconditions; import com.google.common.base.Charsets; /** @@ -76,6 +97,14 @@ private List<String> typeNames; + private Category[] categories; + + private TypeInfo[] typeInfos; + + private List<ObjectInspector> objectInspectorList; + + // Primitive. + private PrimitiveCategory[] primitiveCategories; private PrimitiveTypeInfo[] primitiveTypeInfos; @@ -93,6 +122,14 @@ return typeNames; } + public Category[] categories() { + return categories; + } + + public TypeInfo[] typeInfos() { + return typeInfos; + } + public PrimitiveCategory[] primitiveCategories() { return primitiveCategories; } @@ -106,30 +143,28 @@ public StructObjectInspector rowStructObjectInspector() { } public StructObjectInspector partialRowStructObjectInspector(int partialFieldCount) { - ArrayList<ObjectInspector> partialPrimitiveObjectInspectorList = + ArrayList<ObjectInspector> partialObjectInspectorList = new ArrayList<ObjectInspector>(partialFieldCount); List<String> columnNames = new ArrayList<String>(partialFieldCount); for (int i = 0; i < partialFieldCount; i++) { columnNames.add(String.format("partial%d", i)); - partialPrimitiveObjectInspectorList.add( - PrimitiveObjectInspectorFactory.getPrimitiveWritableObjectInspector( - primitiveTypeInfos[i])); + partialObjectInspectorList.add(getObjectInspector(typeInfos[i])); } return ObjectInspectorFactory.getStandardStructObjectInspector( - columnNames, primitiveObjectInspectorList); + columnNames, partialObjectInspectorList); } - public void init(Random r) { + public void init(Random r, boolean includeComplexTypes, int maxComplexDepth) { this.r = r; - chooseSchema(); + chooseSchema(includeComplexTypes, maxComplexDepth); } /* * For now, exclude CHAR until we determine why there is a difference (blank padding) * serializing with LazyBinarySerializeWrite and the regular SerDe...
*/ - private static String[] possibleHiveTypeNames = { + private static String[] possibleHivePrimitiveTypeNames = { "boolean", "tinyint", "smallint", @@ -149,7 +184,146 @@ public void init(Random r) { "decimal" }; - private void chooseSchema() { + private static String[] possibleHiveComplexTypeNames = { + "array", + "map", + "struct", + "uniontype" + }; + + private String getRandomTypeName(boolean includeComplexTypes) { + String typeName; + if (!includeComplexTypes || r.nextInt(10) != 0) { + typeName = possibleHivePrimitiveTypeNames[r.nextInt(possibleHivePrimitiveTypeNames.length)]; + } else { + typeName = possibleHiveComplexTypeNames[r.nextInt(possibleHiveComplexTypeNames.length)]; + } + return typeName; + } + + private String getDecoratedTypeName(String typeName, boolean includeComplexTypes, int depth, int maxDepth) { + depth++; + boolean includeChildrenComplexTypes = includeComplexTypes && depth < maxDepth; + if (typeName.equals("char")) { + int maxLength = 1 + r.nextInt(100); + typeName = String.format("char(%d)", maxLength); + } else if (typeName.equals("varchar")) { + int maxLength = 1 + r.nextInt(100); + typeName = String.format("varchar(%d)", maxLength); + } else if (typeName.equals("decimal")) { + typeName = String.format("decimal(%d,%d)", HiveDecimal.SYSTEM_DEFAULT_PRECISION, HiveDecimal.SYSTEM_DEFAULT_SCALE); + } else if (typeName.equals("array")) { + String elementTypeName = getRandomTypeName(includeChildrenComplexTypes); + elementTypeName = getDecoratedTypeName(elementTypeName, includeChildrenComplexTypes, depth, maxDepth); + typeName = String.format("array<%s>", elementTypeName); + } else if (typeName.equals("map")) { + String keyTypeName = getRandomTypeName(includeChildrenComplexTypes); + keyTypeName = getDecoratedTypeName(keyTypeName, includeChildrenComplexTypes, depth, maxDepth); + String valueTypeName = getRandomTypeName(includeChildrenComplexTypes); + valueTypeName = getDecoratedTypeName(valueTypeName, includeChildrenComplexTypes, depth, maxDepth); + typeName = String.format("map<%s,%s>", keyTypeName, valueTypeName); + } else if (typeName.equals("struct")) { + final int fieldCount = 1 + r.nextInt(10); + StringBuilder sb = new StringBuilder(); + for (int i = 0; i < fieldCount; i++) { + String fieldTypeName = getRandomTypeName(includeChildrenComplexTypes); + fieldTypeName = getDecoratedTypeName(fieldTypeName, includeChildrenComplexTypes, depth, maxDepth); + if (i > 0) { + sb.append(","); + } + sb.append("col"); + sb.append(i); + sb.append(":"); + sb.append(fieldTypeName); + } + typeName = String.format("struct<%s>", sb.toString()); + } else if (typeName.equals("uniontype")) { + final int fieldCount = 1 + r.nextInt(10); + StringBuilder sb = new StringBuilder(); + for (int i = 0; i < fieldCount; i++) { + String fieldTypeName = getRandomTypeName(includeChildrenComplexTypes); + fieldTypeName = getDecoratedTypeName(fieldTypeName, includeChildrenComplexTypes, depth, maxDepth); + if (i > 0) { + sb.append(","); + } + sb.append(fieldTypeName); + } + typeName = String.format("uniontype<%s>", sb.toString()); + } + return typeName; + } + + private ObjectInspector getObjectInspector(TypeInfo typeInfo) { + ObjectInspector objectInspector; + switch (typeInfo.getCategory()) { + case PRIMITIVE: + { + PrimitiveTypeInfo primitiveType = (PrimitiveTypeInfo) typeInfo; + objectInspector = + PrimitiveObjectInspectorFactory.
+ getPrimitiveWritableObjectInspector(primitiveType); + } + break; + case MAP: + { + MapTypeInfo mapType = (MapTypeInfo) typeInfo; + MapObjectInspector mapInspector = + ObjectInspectorFactory.getStandardMapObjectInspector( + getObjectInspector(mapType.getMapKeyTypeInfo()), + getObjectInspector(mapType.getMapValueTypeInfo())); + objectInspector = mapInspector; + } + break; + case LIST: + { + ListTypeInfo listType = (ListTypeInfo) typeInfo; + ListObjectInspector listInspector = + ObjectInspectorFactory.getStandardListObjectInspector( + getObjectInspector(listType.getListElementTypeInfo())); + objectInspector = listInspector; + } + break; + case STRUCT: + { + StructTypeInfo structType = (StructTypeInfo) typeInfo; + List<TypeInfo> fieldTypes = structType.getAllStructFieldTypeInfos(); + + List<ObjectInspector> fieldInspectors = new ArrayList<ObjectInspector>(); + for (TypeInfo fieldType : fieldTypes) { + fieldInspectors.add(getObjectInspector(fieldType)); + } + + StructObjectInspector structInspector = + ObjectInspectorFactory.getStandardStructObjectInspector( + structType.getAllStructFieldNames(), fieldInspectors); + objectInspector = structInspector; + } + break; + case UNION: + { + UnionTypeInfo unionType = (UnionTypeInfo) typeInfo; + List<TypeInfo> fieldTypes = unionType.getAllUnionObjectTypeInfos(); + + List<ObjectInspector> fieldInspectors = new ArrayList<ObjectInspector>(); + for (TypeInfo fieldType : fieldTypes) { + fieldInspectors.add(getObjectInspector(fieldType)); + } + + UnionObjectInspector unionInspector = + ObjectInspectorFactory.getStandardUnionObjectInspector( + fieldInspectors); + objectInspector = unionInspector; + } + break; + default: + throw new RuntimeException("Unexpected category " + typeInfo.getCategory()); + } + Preconditions.checkState(objectInspector != null); + return objectInspector; + } + + private void chooseSchema(boolean includeComplexTypes, int maxComplexDepth) { HashSet<Integer> hashSet = null; boolean allTypes; boolean onlyOne = (r.nextInt(100) == 7); @@ -160,13 +334,20 @@ private void chooseSchema() { allTypes = r.nextBoolean(); if (allTypes) { // One of each type.
- columnCount = possibleHiveTypeNames.length; + columnCount = possibleHivePrimitiveTypeNames.length; + if (includeComplexTypes) { + columnCount += possibleHiveComplexTypeNames.length; + } hashSet = new HashSet<Integer>(); } else { columnCount = 1 + r.nextInt(20); } } typeNames = new ArrayList<String>(columnCount); + categories = new Category[columnCount]; + typeInfos = new TypeInfo[columnCount]; + objectInspectorList = new ArrayList<ObjectInspector>(columnCount); + primitiveCategories = new PrimitiveCategory[columnCount]; primitiveTypeInfos = new PrimitiveTypeInfo[columnCount]; primitiveObjectInspectorList = new ArrayList<ObjectInspector>(columnCount); @@ -176,12 +357,18 @@ private void chooseSchema() { String typeName; if (onlyOne) { - typeName = possibleHiveTypeNames[r.nextInt(possibleHiveTypeNames.length)]; + typeName = getRandomTypeName(includeComplexTypes); } else { int typeNum; if (allTypes) { + int maxTypeNum = possibleHivePrimitiveTypeNames.length; + if (includeComplexTypes) { + maxTypeNum += possibleHiveComplexTypeNames.length; + } while (true) { - typeNum = r.nextInt(possibleHiveTypeNames.length); + + typeNum = r.nextInt(maxTypeNum); + Integer typeNumInteger = new Integer(typeNum); if (!hashSet.contains(typeNumInteger)) { hashSet.add(typeNumInteger); @@ -189,30 +376,92 @@ private void chooseSchema() { } } } else { - typeNum = r.nextInt(possibleHiveTypeNames.length); + if (!includeComplexTypes || r.nextInt(10) != 0) { + typeNum = r.nextInt(possibleHivePrimitiveTypeNames.length); + } else { + typeNum = possibleHivePrimitiveTypeNames.length + r.nextInt(possibleHiveComplexTypeNames.length); + } + } + if (typeNum < possibleHivePrimitiveTypeNames.length) { + typeName = possibleHivePrimitiveTypeNames[typeNum]; + } else { + typeName = possibleHiveComplexTypeNames[typeNum - possibleHivePrimitiveTypeNames.length]; + } + + } + + String decoratedTypeName = getDecoratedTypeName(typeName, includeComplexTypes, 0, maxComplexDepth); + + TypeInfo typeInfo; + try { + typeInfo = TypeInfoUtils.getTypeInfoFromTypeString(decoratedTypeName); + } catch (Exception e) { + throw new RuntimeException("Cannot convert type name " + decoratedTypeName + " to a type " + e); + } + + typeInfos[c] = typeInfo; + Category category = typeInfo.getCategory(); + categories[c] = category; + ObjectInspector objectInspector = getObjectInspector(typeInfo); + switch (category) { + case PRIMITIVE: + { + PrimitiveTypeInfo primitiveTypeInfo = (PrimitiveTypeInfo) typeInfo; + objectInspector = PrimitiveObjectInspectorFactory.getPrimitiveWritableObjectInspector(primitiveTypeInfo); + primitiveTypeInfos[c] = primitiveTypeInfo; + PrimitiveCategory primitiveCategory = primitiveTypeInfo.getPrimitiveCategory(); + primitiveCategories[c] = primitiveCategory; + primitiveObjectInspectorList.add(objectInspector); } - typeName = possibleHiveTypeNames[typeNum]; + break; + case LIST: + case MAP: + case STRUCT: + case UNION: + primitiveObjectInspectorList.add(null); + break; + default: + throw new RuntimeException("Unexpected category " + category); } - if (typeName.equals("char")) { - int maxLength = 1 + r.nextInt(100); - typeName = String.format("char(%d)", maxLength); - } else if (typeName.equals("varchar")) { - int maxLength = 1 + r.nextInt(100); - typeName = String.format("varchar(%d)", maxLength); - } else if (typeName.equals("decimal")) { - typeName = String.format("decimal(%d,%d)", HiveDecimal.SYSTEM_DEFAULT_PRECISION, HiveDecimal.SYSTEM_DEFAULT_SCALE); + objectInspectorList.add(objectInspector); + + if (category == Category.PRIMITIVE) { } - PrimitiveTypeInfo primitiveTypeInfo =
(PrimitiveTypeInfo) TypeInfoUtils.getTypeInfoFromTypeString(typeName); - primitiveTypeInfos[c] = primitiveTypeInfo; - PrimitiveCategory primitiveCategory = primitiveTypeInfo.getPrimitiveCategory(); - primitiveCategories[c] = primitiveCategory; - primitiveObjectInspectorList.add(PrimitiveObjectInspectorFactory.getPrimitiveWritableObjectInspector(primitiveTypeInfo)); - typeNames.add(typeName); + typeNames.add(decoratedTypeName); } - rowStructObjectInspector = ObjectInspectorFactory.getStandardStructObjectInspector(columnNames, primitiveObjectInspectorList); + rowStructObjectInspector = ObjectInspectorFactory.getStandardStructObjectInspector(columnNames, objectInspectorList); alphabets = new String[columnCount]; } + public Object[][] randomRows(int n) { + Object[][] result = new Object[n][]; + for (int i = 0; i < n; i++) { + result[i] = randomRow(); + } + return result; + } + + public Object[] randomRow() { + Object row[] = new Object[columnCount]; + for (int c = 0; c < columnCount; c++) { + row[c] = randomWritable(c); + } + return row; + } + + public Object[] randomPrimitiveRow(int columnCount) { + return randomPrimitiveRow(columnCount, r, primitiveTypeInfos); + } + + public static Object[] randomPrimitiveRow(int columnCount, Random r, + PrimitiveTypeInfo[] primitiveTypeInfos) { + Object row[] = new Object[columnCount]; + for (int c = 0; c < columnCount; c++) { + row[c] = randomPrimitiveObject(r, primitiveTypeInfos[c]); + } + return row; + } + public void addBinarySortableAlphabets() { for (int c = 0; c < columnCount; c++) { switch (primitiveCategories[c]) { @@ -241,52 +490,6 @@ public void addEscapables(String needsEscapeStr) { this.needsEscapeStr = needsEscapeStr; } - public Object[][] randomRows(int n) { - Object[][] result = new Object[n][]; - for (int i = 0; i < n; i++) { - result[i] = randomRow(); - } - return result; - } - - public Object[] randomRow() { - Object row[] = new Object[columnCount]; - for (int c = 0; c < columnCount; c++) { - Object object = randomObject(c); - if (object == null) { - throw new Error("Unexpected null for column " + c); - } - row[c] = getWritableObject(c, object); - if (row[c] == null) { - throw new Error("Unexpected null for writable for column " + c); - } - } - return row; - } - - public Object[] randomRow(int columnCount) { - return randomRow(columnCount, r, primitiveObjectInspectorList, primitiveCategories, - primitiveTypeInfos); - } - - public static Object[] randomRow(int columnCount, Random r, - List primitiveObjectInspectorList, PrimitiveCategory[] primitiveCategories, - PrimitiveTypeInfo[] primitiveTypeInfos) { - Object row[] = new Object[columnCount]; - for (int c = 0; c < columnCount; c++) { - Object object = randomObject(c, r, primitiveCategories, primitiveTypeInfos); - if (object == null) { - throw new Error("Unexpected null for column " + c); - } - row[c] = getWritableObject(c, object, primitiveObjectInspectorList, - primitiveCategories, primitiveTypeInfos); - if (row[c] == null) { - throw new Error("Unexpected null for writable for column " + c); - } - } - return row; - } - public static void sort(Object[][] rows, ObjectInspector oi) { for (int i = 0; i < rows.length; i++) { for (int j = i + 1; j < rows.length; j++) { @@ -303,18 +506,9 @@ public void sort(Object[][] rows) { VectorRandomRowSource.sort(rows, rowStructObjectInspector); } - public Object getWritableObject(int column, Object object) { - return getWritableObject(column, object, primitiveObjectInspectorList, - primitiveCategories, primitiveTypeInfos); - } - - public static Object 
getWritableObject(int column, Object object, - List primitiveObjectInspectorList, PrimitiveCategory[] primitiveCategories, - PrimitiveTypeInfo[] primitiveTypeInfos) { - ObjectInspector objectInspector = primitiveObjectInspectorList.get(column); - PrimitiveCategory primitiveCategory = primitiveCategories[column]; - PrimitiveTypeInfo primitiveTypeInfo = primitiveTypeInfos[column]; - switch (primitiveCategory) { + public Object getWritablePrimitiveObject(PrimitiveTypeInfo primitiveTypeInfo, + ObjectInspector objectInspector, Object object) { + switch (primitiveTypeInfo.getPrimitiveCategory()) { case BOOLEAN: return ((WritableBooleanObjectInspector) objectInspector).create((boolean) object); case BYTE: @@ -357,103 +551,206 @@ public static Object getWritableObject(int column, Object object, { WritableHiveDecimalObjectInspector writableDecimalObjectInspector = new WritableHiveDecimalObjectInspector((DecimalTypeInfo) primitiveTypeInfo); - HiveDecimalWritable result = (HiveDecimalWritable) writableDecimalObjectInspector.create((HiveDecimal) object); - return result; + return writableDecimalObjectInspector.create((HiveDecimal) object); } default: - throw new Error("Unknown primitive category " + primitiveCategory); + throw new Error("Unknown primitive category " + primitiveTypeInfo.getPrimitiveCategory()); } } - public Object randomObject(int column) { - return randomObject(column, r, primitiveCategories, primitiveTypeInfos, alphabets, addEscapables, needsEscapeStr); + public Object randomWritable(int column) { + return randomWritable(typeInfos[column], objectInspectorList.get(column)); } - public static Object randomObject(int column, Random r, PrimitiveCategory[] primitiveCategories, - PrimitiveTypeInfo[] primitiveTypeInfos) { - return randomObject(column, r, primitiveCategories, primitiveTypeInfos, null, false, ""); - } - - public static Object randomObject(int column, Random r, PrimitiveCategory[] primitiveCategories, - PrimitiveTypeInfo[] primitiveTypeInfos, String[] alphabets, boolean addEscapables, String needsEscapeStr) { - PrimitiveCategory primitiveCategory = primitiveCategories[column]; - PrimitiveTypeInfo primitiveTypeInfo = primitiveTypeInfos[column]; - try { - switch (primitiveCategory) { - case BOOLEAN: - return Boolean.valueOf(r.nextInt(1) == 1); - case BYTE: - return Byte.valueOf((byte) r.nextInt()); - case SHORT: - return Short.valueOf((short) r.nextInt()); - case INT: - return Integer.valueOf(r.nextInt()); - case LONG: - return Long.valueOf(r.nextLong()); - case DATE: - return RandomTypeUtil.getRandDate(r); - case FLOAT: - return Float.valueOf(r.nextFloat() * 10 - 5); - case DOUBLE: - return Double.valueOf(r.nextDouble() * 10 - 5); - case STRING: - case CHAR: - case VARCHAR: - { - String result; - if (alphabets != null && alphabets[column] != null) { - result = RandomTypeUtil.getRandString(r, alphabets[column], r.nextInt(10)); - } else { - result = RandomTypeUtil.getRandString(r); + public Object randomWritable(TypeInfo typeInfo, ObjectInspector objectInspector) { + switch (typeInfo.getCategory()) { + case PRIMITIVE: + { + Object object = randomPrimitiveObject(r, (PrimitiveTypeInfo) typeInfo); + return getWritablePrimitiveObject((PrimitiveTypeInfo) typeInfo, objectInspector, object); + } + case LIST: + { + if (r.nextInt(20) == 0) { + return null; + } + // Always generate a list with at least 1 value? 
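// A guess at the rationale: in the text (LazySimple) form an empty list is not reliably
// distinguishable from a null one, so the generator avoids zero-length lists; the checks
// below likewise fold a lone null or empty element back into a null list.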
+ final int elementCount = 1 + r.nextInt(100); + StandardListObjectInspector listObjectInspector = + (StandardListObjectInspector) objectInspector; + ObjectInspector elementObjectInspector = + listObjectInspector.getListElementObjectInspector(); + TypeInfo elementTypeInfo = + TypeInfoUtils.getTypeInfoFromObjectInspector( + elementObjectInspector); + boolean isStringFamily = false; + PrimitiveCategory primitiveCategory = null; + if (elementTypeInfo.getCategory() == Category.PRIMITIVE) { + primitiveCategory = ((PrimitiveTypeInfo) elementTypeInfo).getPrimitiveCategory(); + if (primitiveCategory == PrimitiveCategory.STRING || + primitiveCategory == PrimitiveCategory.BINARY || + primitiveCategory == PrimitiveCategory.CHAR || + primitiveCategory == PrimitiveCategory.VARCHAR) { + isStringFamily = true; } - if (addEscapables && result.length() > 0) { - int escapeCount = 1 + r.nextInt(2); - for (int i = 0; i < escapeCount; i++) { - int index = r.nextInt(result.length()); - String begin = result.substring(0, index); - String end = result.substring(index); - Character needsEscapeChar = needsEscapeStr.charAt(r.nextInt(needsEscapeStr.length())); - result = begin + needsEscapeChar + end; - } + } + Object listObj = listObjectInspector.create(elementCount); + for (int i = 0; i < elementCount; i++) { + Object ele = randomWritable(elementTypeInfo, elementObjectInspector); + // UNDONE: For now, a 1-element list with a null element is a null list... + if (ele == null && elementCount == 1) { + return null; } - switch (primitiveCategory) { - case STRING: - return result; - case CHAR: - return new HiveChar(result, ((CharTypeInfo) primitiveTypeInfo).getLength()); - case VARCHAR: - return new HiveVarchar(result, ((VarcharTypeInfo) primitiveTypeInfo).getLength()); - default: - throw new Error("Unknown primitive category " + primitiveCategory); + if (isStringFamily && elementCount == 1) { + switch (primitiveCategory) { + case STRING: + if (((Text) ele).getLength() == 0) { + return null; + } + break; + case BINARY: + if (((BytesWritable) ele).getLength() == 0) { + return null; + } + break; + case CHAR: + if (((HiveCharWritable) ele).getHiveChar().getStrippedValue().isEmpty()) { + return null; + } + break; + case VARCHAR: + if (((HiveVarcharWritable) ele).getHiveVarchar().getValue().isEmpty()) { + return null; + } + break; + default: + throw new RuntimeException("Unexpected primitive category " + primitiveCategory); + } } + listObjectInspector.set(listObj, i, ele); } - case BINARY: - return getRandBinary(r, 1 + r.nextInt(100)); - case TIMESTAMP: - return RandomTypeUtil.getRandTimestamp(r); - case INTERVAL_YEAR_MONTH: - return getRandIntervalYearMonth(r); - case INTERVAL_DAY_TIME: - return getRandIntervalDayTime(r); - case DECIMAL: - return getRandHiveDecimal(r, (DecimalTypeInfo) primitiveTypeInfo); - default: - throw new Error("Unknown primitive category " + primitiveCategory); + return listObj; + } + case MAP: + { + if (r.nextInt(20) == 0) { + return null; + } + final int keyPairCount = r.nextInt(100); + StandardMapObjectInspector mapObjectInspector = + (StandardMapObjectInspector) objectInspector; + ObjectInspector keyObjectInspector = + mapObjectInspector.getMapKeyObjectInspector(); + TypeInfo keyTypeInfo = + TypeInfoUtils.getTypeInfoFromObjectInspector( + keyObjectInspector); + ObjectInspector valueObjectInspector = + mapObjectInspector.getMapValueObjectInspector(); + TypeInfo valueTypeInfo = + TypeInfoUtils.getTypeInfoFromObjectInspector( + valueObjectInspector); + Object mapObj = mapObjectInspector.create(); + 
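// Note: randomly generated keys can collide, and put() overwrites the earlier entry,
// so the resulting map may contain fewer than keyPairCount pairs (possibly zero).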
for (int i = 0; i < keyPairCount; i++) { + Object key = randomWritable(keyTypeInfo, keyObjectInspector); + Object value = randomWritable(valueTypeInfo, valueObjectInspector); + mapObjectInspector.put(mapObj, key, value); + } + return mapObj; } - } catch (Exception e) { - throw new RuntimeException("randomObject failed on column " + column + " type " + primitiveCategory, e); + case STRUCT: + { + if (r.nextInt(20) == 0) { + return null; + } + StandardStructObjectInspector structObjectInspector = + (StandardStructObjectInspector) objectInspector; + List fieldRefsList = structObjectInspector.getAllStructFieldRefs(); + final int fieldCount = fieldRefsList.size(); + Object structObj = structObjectInspector.create(); + for (int i = 0; i < fieldCount; i++) { + StructField fieldRef = fieldRefsList.get(i); + ObjectInspector fieldObjectInspector = + fieldRef.getFieldObjectInspector(); + TypeInfo fieldTypeInfo = + TypeInfoUtils.getTypeInfoFromObjectInspector( + fieldObjectInspector); + Object fieldObj = randomWritable(fieldTypeInfo, fieldObjectInspector); + structObjectInspector.setStructFieldData(structObj, fieldRef, fieldObj); + } + return structObj; + } + case UNION: + { + StandardUnionObjectInspector unionObjectInspector = + (StandardUnionObjectInspector) objectInspector; + List objectInspectorList = unionObjectInspector.getObjectInspectors(); + final int unionCount = objectInspectorList.size(); + final byte tag = (byte) r.nextInt(unionCount); + Object unionObj = unionObjectInspector.create(); + ObjectInspector fieldObjectInspector = + objectInspectorList.get(tag); + TypeInfo fieldTypeInfo = + TypeInfoUtils.getTypeInfoFromObjectInspector( + fieldObjectInspector); + Object fieldObj = randomWritable(fieldTypeInfo, fieldObjectInspector); + return new StandardUnion(tag, fieldObj); + } + default: + throw new RuntimeException("Unexpected category " + typeInfo.getCategory()); } } - public static HiveChar getRandHiveChar(Random r, CharTypeInfo charTypeInfo, String alphabet) { - int maxLength = 1 + r.nextInt(charTypeInfo.getLength()); - String randomString = RandomTypeUtil.getRandString(r, alphabet, 100); - HiveChar hiveChar = new HiveChar(randomString, maxLength); - return hiveChar; + public Object randomPrimitiveObject(int column) { + return randomPrimitiveObject(r, primitiveTypeInfos[column]); + } + + public static Object randomPrimitiveObject(Random r, PrimitiveTypeInfo primitiveTypeInfo) { + switch (primitiveTypeInfo.getPrimitiveCategory()) { + case BOOLEAN: + return Boolean.valueOf(r.nextBoolean()); + case BYTE: + return Byte.valueOf((byte) r.nextInt()); + case SHORT: + return Short.valueOf((short) r.nextInt()); + case INT: + return Integer.valueOf(r.nextInt()); + case LONG: + return Long.valueOf(r.nextLong()); + case DATE: + return RandomTypeUtil.getRandDate(r); + case FLOAT: + return Float.valueOf(r.nextFloat() * 10 - 5); + case DOUBLE: + return Double.valueOf(r.nextDouble() * 10 - 5); + case STRING: + return RandomTypeUtil.getRandString(r); + case CHAR: + return getRandHiveChar(r, (CharTypeInfo) primitiveTypeInfo); + case VARCHAR: + return getRandHiveVarchar(r, (VarcharTypeInfo) primitiveTypeInfo); + case BINARY: + return getRandBinary(r, 1 + r.nextInt(100)); + case TIMESTAMP: + return RandomTypeUtil.getRandTimestamp(r); + case INTERVAL_YEAR_MONTH: + return getRandIntervalYearMonth(r); + case INTERVAL_DAY_TIME: + return getRandIntervalDayTime(r); + case DECIMAL: + { + HiveDecimal dec = getRandHiveDecimal(r, (DecimalTypeInfo) primitiveTypeInfo); + return dec; + } + default: + throw new 
Error("Unknown primitive category " + primitiveTypeInfo.getCategory()); + } } public static HiveChar getRandHiveChar(Random r, CharTypeInfo charTypeInfo) { - return getRandHiveChar(r, charTypeInfo, "abcdefghijklmnopqrstuvwxyz"); + int maxLength = 1 + r.nextInt(charTypeInfo.getLength()); + String randomString = RandomTypeUtil.getRandString(r, "abcdefghijklmnopqrstuvwxyz", 100); + HiveChar hiveChar = new HiveChar(randomString, maxLength); + return hiveChar; } public static HiveVarchar getRandHiveVarchar(Random r, VarcharTypeInfo varcharTypeInfo, String alphabet) { diff --git ql/src/test/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/TestVectorMapJoinFastRowHashMap.java ql/src/test/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/TestVectorMapJoinFastRowHashMap.java index 3f02eb3..a455c0c 100644 --- ql/src/test/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/TestVectorMapJoinFastRowHashMap.java +++ ql/src/test/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/TestVectorMapJoinFastRowHashMap.java @@ -109,8 +109,7 @@ private void addAndVerifyRows(VectorRandomRowSource valueSource, Object[][] rows byte[] key; if (random.nextBoolean() || verifyTable.getCount() == 0) { Object[] keyRow = - VectorRandomRowSource.randomRow(keyCount, random, keyPrimitiveObjectInspectorList, - keyPrimitiveCategories, keyPrimitiveTypeInfos); + VectorRandomRowSource.randomPrimitiveRow(keyCount, random, keyPrimitiveTypeInfos); Output keyOutput = new Output(); keySerializeWrite.set(keyOutput); @@ -152,7 +151,9 @@ public void testBigIntRows() throws Exception { VerifyFastRowHashMap verifyTable = new VerifyFastRowHashMap(); VectorRandomRowSource valueSource = new VectorRandomRowSource(); - valueSource.init(random); + + // UNDONE: Until we have full complex support -- disable. + valueSource.init(random, false, 4); int rowCount = 10000; Object[][] rows = valueSource.randomRows(rowCount); @@ -176,7 +177,9 @@ public void testIntRows() throws Exception { VerifyFastRowHashMap verifyTable = new VerifyFastRowHashMap(); VectorRandomRowSource valueSource = new VectorRandomRowSource(); - valueSource.init(random); + + // UNDONE: Until we have full complex support -- disable. + valueSource.init(random, false, 4); int rowCount = 10000; Object[][] rows = valueSource.randomRows(rowCount); @@ -200,7 +203,9 @@ public void testStringRows() throws Exception { VerifyFastRowHashMap verifyTable = new VerifyFastRowHashMap(); VectorRandomRowSource valueSource = new VectorRandomRowSource(); - valueSource.init(random); + + // UNDONE: Until we have full complex support -- disable. + valueSource.init(random, false, 4); int rowCount = 10000; Object[][] rows = valueSource.randomRows(rowCount); @@ -224,7 +229,9 @@ public void testMultiKeyRows1() throws Exception { VerifyFastRowHashMap verifyTable = new VerifyFastRowHashMap(); VectorRandomRowSource valueSource = new VectorRandomRowSource(); - valueSource.init(random); + + // UNDONE: Until we have full complex support -- disable. + valueSource.init(random, false, 4); int rowCount = 10000; Object[][] rows = valueSource.randomRows(rowCount); @@ -248,7 +255,9 @@ public void testMultiKeyRows2() throws Exception { VerifyFastRowHashMap verifyTable = new VerifyFastRowHashMap(); VectorRandomRowSource valueSource = new VectorRandomRowSource(); - valueSource.init(random); + + // UNDONE: Until we have full complex support -- disable. 
+ valueSource.init(random, false, 4); int rowCount = 10000; Object[][] rows = valueSource.randomRows(rowCount); @@ -272,7 +281,9 @@ public void testMultiKeyRows3() throws Exception { VerifyFastRowHashMap verifyTable = new VerifyFastRowHashMap(); VectorRandomRowSource valueSource = new VectorRandomRowSource(); - valueSource.init(random); + + // UNDONE: Until we have full complex support -- disable. + valueSource.init(random, false, 4); int rowCount = 10000; Object[][] rows = valueSource.randomRows(rowCount); @@ -296,7 +307,9 @@ public void testBigIntRowsClipped() throws Exception { VerifyFastRowHashMap verifyTable = new VerifyFastRowHashMap(); VectorRandomRowSource valueSource = new VectorRandomRowSource(); - valueSource.init(random); + + // UNDONE: Until we have full complex support -- disable. + valueSource.init(random, false, 4); int rowCount = 10000; Object[][] rows = valueSource.randomRows(rowCount); @@ -320,7 +333,9 @@ public void testIntRowsClipped() throws Exception { VerifyFastRowHashMap verifyTable = new VerifyFastRowHashMap(); VectorRandomRowSource valueSource = new VectorRandomRowSource(); - valueSource.init(random); + + // UNDONE: Until we have full complex support -- disable. + valueSource.init(random, false, 4); int rowCount = 10000; Object[][] rows = valueSource.randomRows(rowCount); @@ -344,7 +359,9 @@ public void testStringRowsClipped() throws Exception { VerifyFastRowHashMap verifyTable = new VerifyFastRowHashMap(); VectorRandomRowSource valueSource = new VectorRandomRowSource(); - valueSource.init(random); + + // UNDONE: Until we have full complex support -- disable. + valueSource.init(random, false, 4); int rowCount = 10000; Object[][] rows = valueSource.randomRows(rowCount); @@ -368,7 +385,9 @@ public void testMultiKeyRowsClipped1() throws Exception { VerifyFastRowHashMap verifyTable = new VerifyFastRowHashMap(); VectorRandomRowSource valueSource = new VectorRandomRowSource(); - valueSource.init(random); + + // UNDONE: Until we have full complex support -- disable. + valueSource.init(random, false, 4); int rowCount = 10000; Object[][] rows = valueSource.randomRows(rowCount); @@ -392,7 +411,9 @@ public void testMultiKeyRowsClipped2() throws Exception { VerifyFastRowHashMap verifyTable = new VerifyFastRowHashMap(); VectorRandomRowSource valueSource = new VectorRandomRowSource(); - valueSource.init(random); + + // UNDONE: Until we have full complex support -- disable. + valueSource.init(random, false, 4); int rowCount = 10000; Object[][] rows = valueSource.randomRows(rowCount); @@ -416,7 +437,9 @@ public void testMultiKeyRowsClipped3() throws Exception { VerifyFastRowHashMap verifyTable = new VerifyFastRowHashMap(); VectorRandomRowSource valueSource = new VectorRandomRowSource(); - valueSource.init(random); + + // UNDONE: Until we have full complex support -- disable. + valueSource.init(random, false, 4); int rowCount = 10000; Object[][] rows = valueSource.randomRows(rowCount); @@ -441,7 +464,9 @@ public void testBigIntRowsExact() throws Exception { VerifyFastRowHashMap verifyTable = new VerifyFastRowHashMap(); VectorRandomRowSource valueSource = new VectorRandomRowSource(); - valueSource.init(random); + + // UNDONE: Until we have full complex support -- disable. 
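// Every test in this file now passes (includeComplexTypes=false, maxComplexDepth=4).
// A minimal sketch of what the assumed three-argument overload forwards to, inferred
// from the chooseSchema(boolean, int) signature above (hypothetical body, not the
// committed one):
public void init(Random r, boolean includeComplexTypes, int maxComplexDepth) {
  this.r = r; // assumption: VectorRandomRowSource keeps its Random in field r
  chooseSchema(includeComplexTypes, maxComplexDepth);
}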
+ valueSource.init(random, false, 4); int rowCount = 10000; Object[][] rows = valueSource.randomRows(rowCount); @@ -465,7 +490,9 @@ public void testIntRowsExact() throws Exception { VerifyFastRowHashMap verifyTable = new VerifyFastRowHashMap(); VectorRandomRowSource valueSource = new VectorRandomRowSource(); - valueSource.init(random); + + // UNDONE: Until we have full complex support -- disable. + valueSource.init(random, false, 4); int rowCount = 10000; Object[][] rows = valueSource.randomRows(rowCount); @@ -489,7 +516,9 @@ public void testStringRowsExact() throws Exception { VerifyFastRowHashMap verifyTable = new VerifyFastRowHashMap(); VectorRandomRowSource valueSource = new VectorRandomRowSource(); - valueSource.init(random); + + // UNDONE: Until we have full complex support -- disable. + valueSource.init(random, false, 4); int rowCount = 10000; Object[][] rows = valueSource.randomRows(rowCount); @@ -513,7 +542,9 @@ public void testMultiKeyRowsExact1() throws Exception { VerifyFastRowHashMap verifyTable = new VerifyFastRowHashMap(); VectorRandomRowSource valueSource = new VectorRandomRowSource(); - valueSource.init(random); + + // UNDONE: Until we have full complex support -- disable. + valueSource.init(random, false, 4); int rowCount = 10000; Object[][] rows = valueSource.randomRows(rowCount); @@ -537,7 +568,9 @@ public void testMultiKeyRowsExact2() throws Exception { VerifyFastRowHashMap verifyTable = new VerifyFastRowHashMap(); VectorRandomRowSource valueSource = new VectorRandomRowSource(); - valueSource.init(random); + + // UNDONE: Until we have full complex support -- disable. + valueSource.init(random, false, 4); int rowCount = 10000; Object[][] rows = valueSource.randomRows(rowCount); @@ -561,7 +594,9 @@ public void testMultiKeyRowsExact3() throws Exception { VerifyFastRowHashMap verifyTable = new VerifyFastRowHashMap(); VectorRandomRowSource valueSource = new VectorRandomRowSource(); - valueSource.init(random); + + // UNDONE: Until we have full complex support -- disable. + valueSource.init(random, false, 4); int rowCount = 10000; Object[][] rows = valueSource.randomRows(rowCount); @@ -585,7 +620,9 @@ public void testBigIntRowsClippedExact() throws Exception { VerifyFastRowHashMap verifyTable = new VerifyFastRowHashMap(); VectorRandomRowSource valueSource = new VectorRandomRowSource(); - valueSource.init(random); + + // UNDONE: Until we have full complex support -- disable. + valueSource.init(random, false, 4); int rowCount = 10000; Object[][] rows = valueSource.randomRows(rowCount); @@ -609,7 +646,9 @@ public void testIntRowsClippedExact() throws Exception { VerifyFastRowHashMap verifyTable = new VerifyFastRowHashMap(); VectorRandomRowSource valueSource = new VectorRandomRowSource(); - valueSource.init(random); + + // UNDONE: Until we have full complex support -- disable. + valueSource.init(random, false, 4); int rowCount = 10000; Object[][] rows = valueSource.randomRows(rowCount); @@ -633,7 +672,9 @@ public void testStringRowsClippedExact() throws Exception { VerifyFastRowHashMap verifyTable = new VerifyFastRowHashMap(); VectorRandomRowSource valueSource = new VectorRandomRowSource(); - valueSource.init(random); + + // UNDONE: Until we have full complex support -- disable. 
+ valueSource.init(random, false, 4); int rowCount = 10000; Object[][] rows = valueSource.randomRows(rowCount); @@ -657,7 +698,9 @@ public void testMultiKeyRowsClippedExact1() throws Exception { VerifyFastRowHashMap verifyTable = new VerifyFastRowHashMap(); VectorRandomRowSource valueSource = new VectorRandomRowSource(); - valueSource.init(random); + + // UNDONE: Until we have full complex support -- disable. + valueSource.init(random, false, 4); int rowCount = 10000; Object[][] rows = valueSource.randomRows(rowCount); @@ -681,7 +724,9 @@ public void testMultiKeyRowsClippedExact2() throws Exception { VerifyFastRowHashMap verifyTable = new VerifyFastRowHashMap(); VectorRandomRowSource valueSource = new VectorRandomRowSource(); - valueSource.init(random); + + // UNDONE: Until we have full complex support -- disable. + valueSource.init(random, false, 4); int rowCount = 10000; Object[][] rows = valueSource.randomRows(rowCount); @@ -705,7 +750,9 @@ public void testMultiKeyRowsClippedExact3() throws Exception { VerifyFastRowHashMap verifyTable = new VerifyFastRowHashMap(); VectorRandomRowSource valueSource = new VectorRandomRowSource(); - valueSource.init(random); + + // UNDONE: Until we have full complex support -- disable. + valueSource.init(random, false, 4); int rowCount = 10000; Object[][] rows = valueSource.randomRows(rowCount); diff --git serde/src/java/org/apache/hadoop/hive/serde2/binarysortable/fast/BinarySortableSerializeWrite.java serde/src/java/org/apache/hadoop/hive/serde2/binarysortable/fast/BinarySortableSerializeWrite.java index a9ea7c0..f1a32ba 100644 --- serde/src/java/org/apache/hadoop/hive/serde2/binarysortable/fast/BinarySortableSerializeWrite.java +++ serde/src/java/org/apache/hadoop/hive/serde2/binarysortable/fast/BinarySortableSerializeWrite.java @@ -43,7 +43,7 @@ * * This is an alternative way to serialize than what is provided by BinarySortableSerDe. */ -public final class BinarySortableSerializeWrite implements SerializeWrite { +public final class BinarySortableSerializeWrite extends SerializeWrite { public static final Logger LOG = LoggerFactory.getLogger(BinarySortableSerializeWrite.class.getName()); private Output output; diff --git serde/src/java/org/apache/hadoop/hive/serde2/fast/DeserializeRead.java serde/src/java/org/apache/hadoop/hive/serde2/fast/DeserializeRead.java index ac931d6..6b8fe1d 100644 --- serde/src/java/org/apache/hadoop/hive/serde2/fast/DeserializeRead.java +++ serde/src/java/org/apache/hadoop/hive/serde2/fast/DeserializeRead.java @@ -19,6 +19,7 @@ package org.apache.hadoop.hive.serde2.fast; import java.io.IOException; + import org.apache.hadoop.hive.serde2.io.DateWritable; import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable; import org.apache.hadoop.hive.serde2.io.HiveIntervalDayTimeWritable; @@ -26,8 +27,12 @@ import org.apache.hadoop.hive.serde2.io.TimestampWritable; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category; import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory; +import org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.MapTypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.UnionTypeInfo; /* * Directly deserialize with the caller reading field-by-field a serialization format. 
@@ -52,6 +57,68 @@ protected Category[] categories; protected PrimitiveCategory[] primitiveCategories; + /* + * Allocates the current* writables needed to read one field at a time. Simple fields like + * long, double, and int are read into primitive current* members; non-simple field types + * like Date, Timestamp, etc. are read into a current writable object that this method + * allocates. + * + * Complex type fields are handled by recursing into their element, key/value, or field types. + */ + private void allocateCurrentWritable(TypeInfo typeInfo) { + switch (typeInfo.getCategory()) { + case PRIMITIVE: + switch (((PrimitiveTypeInfo) typeInfo).getPrimitiveCategory()) { + case DATE: + if (currentDateWritable == null) { + currentDateWritable = new DateWritable(); + } + break; + case TIMESTAMP: + if (currentTimestampWritable == null) { + currentTimestampWritable = new TimestampWritable(); + } + break; + case INTERVAL_YEAR_MONTH: + if (currentHiveIntervalYearMonthWritable == null) { + currentHiveIntervalYearMonthWritable = new HiveIntervalYearMonthWritable(); + } + break; + case INTERVAL_DAY_TIME: + if (currentHiveIntervalDayTimeWritable == null) { + currentHiveIntervalDayTimeWritable = new HiveIntervalDayTimeWritable(); + } + break; + case DECIMAL: + if (currentHiveDecimalWritable == null) { + currentHiveDecimalWritable = new HiveDecimalWritable(); + } + break; + default: + // No writable needed for this data type. + } + break; + case LIST: + allocateCurrentWritable(((ListTypeInfo) typeInfo).getListElementTypeInfo()); + break; + case MAP: + allocateCurrentWritable(((MapTypeInfo) typeInfo).getMapKeyTypeInfo()); + allocateCurrentWritable(((MapTypeInfo) typeInfo).getMapValueTypeInfo()); + break; + case STRUCT: + for (TypeInfo fieldTypeInfo : ((StructTypeInfo) typeInfo).getAllStructFieldTypeInfos()) { + allocateCurrentWritable(fieldTypeInfo); + } + break; + case UNION: + for (TypeInfo fieldTypeInfo : ((UnionTypeInfo) typeInfo).getAllUnionObjectTypeInfos()) { + allocateCurrentWritable(fieldTypeInfo); + } + break; + default: + throw new RuntimeException("Unexpected category " + typeInfo.getCategory()); + } + } + /** * Constructor. * @@ -85,37 +152,8 @@ public DeserializeRead(TypeInfo[] typeInfos, boolean useExternalBuffer) { PrimitiveTypeInfo primitiveTypeInfo = (PrimitiveTypeInfo) typeInfo; PrimitiveCategory primitiveCategory = primitiveTypeInfo.getPrimitiveCategory(); primitiveCategories[i] = primitiveCategory; - - switch (primitiveCategory) { - case DATE: - if (currentDateWritable == null) { - currentDateWritable = new DateWritable(); - } - break; - case TIMESTAMP: - if (currentTimestampWritable == null) { - currentTimestampWritable = new TimestampWritable(); - } - break; - case INTERVAL_YEAR_MONTH: - if (currentHiveIntervalYearMonthWritable == null) { - currentHiveIntervalYearMonthWritable = new HiveIntervalYearMonthWritable(); - } - break; - case INTERVAL_DAY_TIME: - if (currentHiveIntervalDayTimeWritable == null) { - currentHiveIntervalDayTimeWritable = new HiveIntervalDayTimeWritable(); - } - break; - case DECIMAL: - if (currentHiveDecimalWritable == null) { - currentHiveDecimalWritable = new HiveDecimalWritable(); - } - break; - default: - // No writable needed for this data type. - } } + allocateCurrentWritable(typeInfo); this.useExternalBuffer = useExternalBuffer; } @@ -178,6 +216,30 @@ public boolean readField(int fieldIndex) throws IOException { } /* * Tests whether there is another List element or another Map key/value pair.
+ */ + public boolean isNextComplexMultiValue() { + throw new RuntimeException("Not implemented"); + } + + /* + * Read a field that is under a complex type. It may be a primitive type or deeper complex type. + */ + public boolean readComplexField() throws IOException { + // UNDONE: Until all variations implement complex types... + throw new RuntimeException("Not implemented"); + } + + /* + * Used by Struct and Union complex type readers to indicate the (final) field has been fully + * read and the current complex type is finished. + */ + public void finishComplexVariableFieldsType() { + // UNDONE: Until all variations implement complex types... + throw new RuntimeException("Not implemented"); + } + + /* * Call this method may be called after all the all fields have been read to check * for unread fields. * diff --git serde/src/java/org/apache/hadoop/hive/serde2/fast/SerializeWrite.java serde/src/java/org/apache/hadoop/hive/serde2/fast/SerializeWrite.java index 17d2385..d4d7712 100644 --- serde/src/java/org/apache/hadoop/hive/serde2/fast/SerializeWrite.java +++ serde/src/java/org/apache/hadoop/hive/serde2/fast/SerializeWrite.java @@ -37,63 +37,63 @@ * (or calling writeNull if the field is a NULL). * */ -public interface SerializeWrite { +public abstract class SerializeWrite { /* * Set the buffer that will receive the serialized data. The output buffer will be reset. */ - void set(Output output); + public abstract void set(Output output); /* * Set the buffer that will receive the serialized data. The output buffer will NOT be reset. */ - void setAppend(Output output); + public abstract void setAppend(Output output); /* * Reset the previously supplied buffer that will receive the serialized data. */ - void reset(); + public abstract void reset(); /* * Write a NULL field. */ - void writeNull() throws IOException; + public abstract void writeNull() throws IOException; /* * BOOLEAN. */ - void writeBoolean(boolean v) throws IOException; + public abstract void writeBoolean(boolean v) throws IOException; /* * BYTE. */ - void writeByte(byte v) throws IOException; + public abstract void writeByte(byte v) throws IOException; /* * SHORT. */ - void writeShort(short v) throws IOException; + public abstract void writeShort(short v) throws IOException; /* * INT. */ - void writeInt(int v) throws IOException; + public abstract void writeInt(int v) throws IOException; /* * LONG. */ - void writeLong(long v) throws IOException; + public abstract void writeLong(long v) throws IOException; /* * FLOAT. */ - void writeFloat(float vf) throws IOException; + public abstract void writeFloat(float vf) throws IOException; /* * DOUBLE. */ - void writeDouble(double vd) throws IOException; + public abstract void writeDouble(double vd) throws IOException; /* * STRING. @@ -101,50 +101,50 @@ * Can be used to write CHAR and VARCHAR when the caller takes responsibility for * truncation/padding issues. */ - void writeString(byte[] v) throws IOException; - void writeString(byte[] v, int start, int length) throws IOException; + public abstract void writeString(byte[] v) throws IOException; + public abstract void writeString(byte[] v, int start, int length) throws IOException; /* * CHAR. */ - void writeHiveChar(HiveChar hiveChar) throws IOException; + public abstract void writeHiveChar(HiveChar hiveChar) throws IOException; /* * VARCHAR. */ - void writeHiveVarchar(HiveVarchar hiveVarchar) throws IOException; + public abstract void writeHiveVarchar(HiveVarchar hiveVarchar) throws IOException; /* * BINARY. 
*/ - void writeBinary(byte[] v) throws IOException; - void writeBinary(byte[] v, int start, int length) throws IOException; + public abstract void writeBinary(byte[] v) throws IOException; + public abstract void writeBinary(byte[] v, int start, int length) throws IOException; /* * DATE. */ - void writeDate(Date date) throws IOException; + public abstract void writeDate(Date date) throws IOException; // We provide a faster way to write a date without a Date object. - void writeDate(int dateAsDays) throws IOException; + public abstract void writeDate(int dateAsDays) throws IOException; /* * TIMESTAMP. */ - void writeTimestamp(Timestamp vt) throws IOException; + public abstract void writeTimestamp(Timestamp vt) throws IOException; /* * INTERVAL_YEAR_MONTH. */ - void writeHiveIntervalYearMonth(HiveIntervalYearMonth viyt) throws IOException; + public abstract void writeHiveIntervalYearMonth(HiveIntervalYearMonth viyt) throws IOException; // We provide a faster way to write a hive interval year month without a HiveIntervalYearMonth object. - void writeHiveIntervalYearMonth(int totalMonths) throws IOException; + public abstract void writeHiveIntervalYearMonth(int totalMonths) throws IOException; /* * INTERVAL_DAY_TIME. */ - void writeHiveIntervalDayTime(HiveIntervalDayTime vidt) throws IOException; + public abstract void writeHiveIntervalDayTime(HiveIntervalDayTime vidt) throws IOException; /* * DECIMAL. @@ -152,6 +152,68 @@ * NOTE: The scale parameter is for text serialization (e.g. HiveDecimal.toFormatString) that * creates trailing zeroes output decimals. */ - void writeHiveDecimal(HiveDecimal dec, int scale) throws IOException; - void writeHiveDecimal(HiveDecimalWritable decWritable, int scale) throws IOException; + public abstract void writeHiveDecimal(HiveDecimal dec, int scale) throws IOException; + public abstract void writeHiveDecimal(HiveDecimalWritable decWritable, int scale) throws IOException; + + /* + * LIST. + */ + public void beginList() { + // UNDONE: Until all variations implement complex types... + throw new RuntimeException("Not implemented"); + } + + public void finishList() { + // UNDONE: Until all variations implement complex types... + throw new RuntimeException("Not implemented"); + } + + /* + * MAP. + */ + public void beginMap() { + // UNDONE: Until all variations implement complex types... + throw new RuntimeException("Not implemented"); + } + + public void writeMapKeySeparator() { + // UNDONE: Until all variations implement complex types... + throw new RuntimeException("Not implemented"); + } + + public void writeMapKeyPairSeparator() { + // UNDONE: Until all variations implement complex types... + throw new RuntimeException("Not implemented"); + } + + public void finishMap() { + // UNDONE: Until all variations implement complex types... + throw new RuntimeException("Not implemented"); + } + + /* + * STRUCT. + */ + public void beginStruct() { + // UNDONE: Until all variations implement complex types... + throw new RuntimeException("Not implemented"); + } + + public void finishStruct() { + // UNDONE: Until all variations implement complex types... + throw new RuntimeException("Not implemented"); + } + + /* + * UNION. + */ + public void beginUnion(int tag) throws IOException { + // UNDONE: Until all variations implement complex types... + throw new RuntimeException("Not implemented"); + } + + public void finishUnion() { + // UNDONE: Until all variations implement complex types... 
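// For serializers that do implement these hooks, the intended call sequence for a map
// field would presumably be: beginMap(); then for each pair, writeMapKeyPairSeparator()
// between pairs, the key, writeMapKeySeparator(), then the value; finally finishMap().
// (Inferred from the method names; the contract is not spelled out in this patch.)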
+ throw new RuntimeException("Not implemented"); + } } diff --git serde/src/java/org/apache/hadoop/hive/serde2/lazy/VerifyLazy.java serde/src/java/org/apache/hadoop/hive/serde2/lazy/VerifyLazy.java new file mode 100644 index 0000000..0905f08 --- /dev/null +++ serde/src/java/org/apache/hadoop/hive/serde2/lazy/VerifyLazy.java @@ -0,0 +1,372 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.serde2.lazy; + +import java.io.IOException; +import java.sql.Date; +import java.sql.Timestamp; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import org.apache.hadoop.hive.common.type.HiveChar; +import org.apache.hadoop.hive.common.type.HiveDecimal; +import org.apache.hadoop.hive.common.type.HiveIntervalDayTime; +import org.apache.hadoop.hive.common.type.HiveIntervalYearMonth; +import org.apache.hadoop.hive.common.type.HiveVarchar; +import org.apache.hadoop.hive.serde2.fast.DeserializeRead; +import org.apache.hadoop.hive.serde2.lazy.fast.LazySimpleDeserializeRead; +import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory; +import org.apache.hadoop.hive.serde2.fast.SerializeWrite; +import org.apache.hadoop.hive.serde2.io.ByteWritable; +import org.apache.hadoop.hive.serde2.io.DateWritable; +import org.apache.hadoop.hive.serde2.io.DoubleWritable; +import org.apache.hadoop.hive.serde2.io.HiveCharWritable; +import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable; +import org.apache.hadoop.hive.serde2.io.HiveIntervalDayTimeWritable; +import org.apache.hadoop.hive.serde2.io.HiveIntervalYearMonthWritable; +import org.apache.hadoop.hive.serde2.io.HiveVarcharWritable; +import org.apache.hadoop.hive.serde2.io.ShortWritable; +import org.apache.hadoop.hive.serde2.io.TimestampWritable; +import org.apache.hadoop.hive.serde2.typeinfo.CharTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.DecimalTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.MapTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.VarcharTypeInfo; +import org.apache.hadoop.io.BooleanWritable; +import org.apache.hadoop.io.BytesWritable; +import org.apache.hadoop.io.FloatWritable; +import org.apache.hadoop.io.IntWritable; +import org.apache.hadoop.io.LongWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.Writable; + +/** + * VerifyLazy: compares lazy-deserialized objects against their expected writable values.
+ * + */ +public class VerifyLazy { + + public static boolean lazyCompareList(ListTypeInfo listTypeInfo, List list, List expectedList) { + TypeInfo elementTypeInfo = listTypeInfo.getListElementTypeInfo(); + final int size = list.size(); + for (int i = 0; i < size; i++) { + Object lazyEleObj = list.get(i); + Object expectedEleObj = expectedList.get(i); + if (!lazyCompare(elementTypeInfo, lazyEleObj, expectedEleObj)) { + throw new RuntimeException("List element deserialized value does not match elementTypeInfo " + elementTypeInfo.toString()); + } + } + return true; + } + + public static boolean lazyCompareMap(MapTypeInfo mapTypeInfo, Map map, Map expectedMap) { + TypeInfo keyTypeInfo = mapTypeInfo.getMapKeyTypeInfo(); + TypeInfo valueTypeInfo = mapTypeInfo.getMapValueTypeInfo(); + if (map.size() != expectedMap.size()) { + throw new RuntimeException("Map key/value deserialized map.size() " + map.size() + " map " + map.toString() + " expectedMap.size() " + expectedMap.size() + " expectedMap " + expectedMap.toString() + " does not match keyTypeInfo " + keyTypeInfo.toString() + " valueTypeInfo " + valueTypeInfo.toString()); + } + final int size = map.size(); + return true; // Compare MAP? + } + + public static boolean lazyCompareStruct(StructTypeInfo structTypeInfo, List fields, List expectedFields) { + ArrayList fieldTypeInfos = structTypeInfo.getAllStructFieldTypeInfos(); + final int size = fieldTypeInfos.size(); + for (int i = 0; i < size; i++) { + Object lazyEleObj = fields.get(i); + Object expectedEleObj = expectedFields.get(i); + if (!lazyCompare(fieldTypeInfos.get(i), lazyEleObj, expectedEleObj)) { + throw new RuntimeException("SerDe deserialized value does not match"); + } + } + return true; + } + + public static boolean lazyCompare(TypeInfo typeInfo, Object lazyObject, Object expectedObject) { + if (expectedObject == null) { + if (lazyObject != null) { + throw new RuntimeException("Expected object is null but object is not null " + lazyObject.toString() + " typeInfo " + typeInfo.toString()); + } + return true; + } else if (lazyObject == null) { + throw new RuntimeException("Expected object is not null \"" + expectedObject.toString() + "\" typeInfo " + typeInfo.toString() + " but object is null"); + } + if (lazyObject instanceof LazyPrimitive) { + Object primitiveObject = ((LazyPrimitive) lazyObject).getObject(); + PrimitiveTypeInfo primitiveTypeInfo = (PrimitiveTypeInfo) typeInfo; + switch (primitiveTypeInfo.getPrimitiveCategory()) { + case BOOLEAN: + { + if (!(primitiveObject instanceof LazyBoolean)) { + throw new RuntimeException("Expected LazyBoolean"); + } + boolean value = ((LazyBoolean) primitiveObject).getWritableObject().get(); + boolean expected = ((BooleanWritable) expectedObject).get(); + if (value != expected) { + throw new RuntimeException("Boolean field mismatch (expected " + expected + " found " + value + ")"); + } + } + break; + case BYTE: + { + if (!(primitiveObject instanceof LazyByte)) { + throw new RuntimeException("Expected LazyByte"); + } + byte value = ((LazyByte) primitiveObject).getWritableObject().get(); + byte expected = ((ByteWritable) expectedObject).get(); + if (value != expected) { + throw new RuntimeException("Byte field mismatch (expected " + (int) expected + " found " + (int) value + ")"); + } + } + break; + case SHORT: + { + if (!(primitiveObject instanceof LazyShort)) { + throw new RuntimeException("Expected LazyShort"); + } + short value = ((LazyShort) primitiveObject).getWritableObject().get(); + short expected = ((ShortWritable) 
expectedObject).get(); + if (value != expected) { + throw new RuntimeException("Short field mismatch (expected " + expected + " found " + value + ")"); + } + } + break; + case INT: + { + if (!(primitiveObject instanceof LazyInteger)) { + throw new RuntimeException("Expected LazyInteger"); + } + int value = ((LazyInteger) primitiveObject).getWritableObject().get(); + int expected = ((IntWritable) expectedObject).get(); + if (value != expected) { + throw new RuntimeException("Int field mismatch (expected " + expected + " found " + value + ")"); + } + } + break; + case LONG: + { + if (!(primitiveObject instanceof LazyLong)) { + throw new RuntimeException("Expected LazyLong"); + } + long value = ((LazyLong) primitiveObject).getWritableObject().get(); + long expected = ((LongWritable) expectedObject).get(); + if (value != expected) { + throw new RuntimeException("Long field mismatch (expected " + expected + " found " + value + ")"); + } + } + break; + case FLOAT: + { + if (!(primitiveObject instanceof LazyFloat)) { + throw new RuntimeException("Expected LazyFloat"); + } + float value = ((LazyFloat) primitiveObject).getWritableObject().get(); + float expected = ((FloatWritable) expectedObject).get(); + if (value != expected) { + throw new RuntimeException("Float field mismatch (expected " + expected + " found " + value + ")"); + } + } + break; + case DOUBLE: + { + if (!(primitiveObject instanceof LazyDouble)) { + throw new RuntimeException("Expected LazyDouble"); + } + double value = ((LazyDouble) primitiveObject).getWritableObject().get(); + double expected = ((DoubleWritable) expectedObject).get(); + if (value != expected) { + throw new RuntimeException("Double field mismatch (expected " + expected + " found " + value + ")"); + } + } + break; + case STRING: + { + if (!(primitiveObject instanceof LazyString)) { + throw new RuntimeException("Expected LazyString"); + } + Text value = ((LazyString) primitiveObject).getWritableObject(); + Text expected = ((Text) expectedObject); + if (!value.equals(expected)) { + throw new RuntimeException("String field mismatch (expected '" + expected + "' found '" + value + "')"); + } + } + break; + case CHAR: + { + if (!(primitiveObject instanceof LazyHiveChar)) { + throw new RuntimeException("Expected LazyHiveChar"); + } + HiveChar value = ((LazyHiveChar) primitiveObject).getWritableObject().getHiveChar(); + HiveChar expected = ((HiveCharWritable) expectedObject).getHiveChar(); + + if (!value.equals(expected)) { + throw new RuntimeException("HiveChar field mismatch (expected '" + expected + "' found '" + value + "')"); + } + } + break; + case VARCHAR: + { + if (!(primitiveObject instanceof LazyHiveVarchar)) { + throw new RuntimeException("Expected LazyHiveVarchar"); + } + HiveVarchar value = ((LazyHiveVarchar) primitiveObject).getWritableObject().getHiveVarchar(); + HiveVarchar expected = ((HiveVarcharWritable) expectedObject).getHiveVarchar(); + + if (!value.equals(expected)) { + throw new RuntimeException("HiveVarchar field mismatch (expected '" + expected + "' found '" + value + "')"); + } + } + break; + case DECIMAL: + { + if (!(primitiveObject instanceof LazyHiveDecimal)) { + throw new RuntimeException("Expected LazyHiveDecimal"); + } + HiveDecimal value = ((LazyHiveDecimal) primitiveObject).getWritableObject().getHiveDecimal(); + HiveDecimal expected = ((HiveDecimalWritable) expectedObject).getHiveDecimal(); + + if (!value.equals(expected)) { + DecimalTypeInfo decimalTypeInfo = (DecimalTypeInfo) primitiveTypeInfo; + int precision =
decimalTypeInfo.getPrecision(); + int scale = decimalTypeInfo.getScale(); + throw new RuntimeException("Decimal field mismatch (expected " + expected.toString() + " found " + value.toString() + ") precision " + precision + ", scale " + scale); + } + } + break; + case DATE: + { + if (!(primitiveObject instanceof LazyDate)) { + throw new RuntimeException("Expected LazyDate"); + } + Date value = ((LazyDate) primitiveObject).getWritableObject().get(); + Date expected = ((DateWritable) expectedObject).get(); + if (!value.equals(expected)) { + throw new RuntimeException("Date field mismatch (expected " + expected + " found " + value + ")"); + } + } + break; + case TIMESTAMP: + { + if (!(primitiveObject instanceof LazyTimestamp)) { + throw new RuntimeException("Expected LazyTimestamp"); + } + Timestamp value = ((LazyTimestamp) primitiveObject).getWritableObject().getTimestamp(); + Timestamp expected = ((TimestampWritable) expectedObject).getTimestamp(); + if (!value.equals(expected)) { + throw new RuntimeException("Timestamp field mismatch (expected " + expected + " found " + value + ")"); + } + } + break; + case INTERVAL_YEAR_MONTH: + { + if (!(primitiveObject instanceof LazyHiveIntervalYearMonth)) { + throw new RuntimeException("Expected LazyHiveIntervalYearMonth"); + } + HiveIntervalYearMonth value = ((LazyHiveIntervalYearMonth) primitiveObject).getWritableObject().getHiveIntervalYearMonth(); + HiveIntervalYearMonth expected = ((HiveIntervalYearMonthWritable) expectedObject).getHiveIntervalYearMonth(); + if (!value.equals(expected)) { + throw new RuntimeException("HiveIntervalYearMonth field mismatch (expected " + expected + " found " + value + ")"); + } + } + break; + case INTERVAL_DAY_TIME: + { + if (!(primitiveObject instanceof LazyHiveIntervalDayTime)) { + throw new RuntimeException("Expected LazyHiveIntervalDayTime"); + } + HiveIntervalDayTime value = ((LazyHiveIntervalDayTime) primitiveObject).getWritableObject().getHiveIntervalDayTime(); + HiveIntervalDayTime expected = ((HiveIntervalDayTimeWritable) expectedObject).getHiveIntervalDayTime(); + if (!value.equals(expected)) { + throw new RuntimeException("HiveIntervalDayTime field mismatch (expected " + expected + " found " + value + ")"); + } + } + break; + case BINARY: + { + if (!(primitiveObject instanceof LazyBinary)) { + throw new RuntimeException("Expected LazyBinary"); + } + BytesWritable bytesWritable = ((LazyBinary) primitiveObject).getWritableObject(); + byte[] value = Arrays.copyOfRange(bytesWritable.getBytes(), 0, bytesWritable.getLength()); + BytesWritable bytesWritableExpected = (BytesWritable) expectedObject; + byte[] expected = Arrays.copyOfRange(bytesWritableExpected.getBytes(), 0, bytesWritableExpected.getLength()); + if (value.length != expected.length) { + throw new RuntimeException("Byte Array field mismatch (expected " + Arrays.toString(expected) + + " found " + Arrays.toString(value) + ")"); + } + for (int b = 0; b < value.length; b++) { + if (value[b] != expected[b]) { + throw new RuntimeException("Byte Array field mismatch (expected " + Arrays.toString(expected) + + " found " + Arrays.toString(value) + ")"); + } + } + } + break; + default: + throw new Error("Unknown primitive category " + primitiveTypeInfo.getPrimitiveCategory()); + } + } else if (lazyObject instanceof LazyArray) { + LazyArray lazyArray = (LazyArray) lazyObject; + List list = lazyArray.getList(); + List expectedList = (List) expectedObject; + ListTypeInfo listTypeInfo = (ListTypeInfo) typeInfo; + if
(list.size() != expectedList.size()) { + throw new RuntimeException("SerDe deserialized list length does not match (list " + list.toString() + " list.size() " + list.size() + + " expectedList " + expectedList.toString() + " expectedList.size() " + expectedList.size() + ") elementTypeInfo " + listTypeInfo.getListElementTypeInfo().toString()); + } + return lazyCompareList((ListTypeInfo) typeInfo, list, expectedList); + } else if (typeInfo instanceof ListTypeInfo) { + List list = (List) lazyObject; + List expectedList = (List) expectedObject; + if (list.size() != expectedList.size()) { + throw new RuntimeException("SerDe deserialized list length does not match (list " + list.toString() + " list.size() " + list.size() + " expectedList " + expectedList.toString() + " expectedList.size() " + expectedList.size() + ")"); + } + return lazyCompareList((ListTypeInfo) typeInfo, list, expectedList); + } else if (lazyObject instanceof LazyMap) { + LazyMap lazyMap = (LazyMap) lazyObject; + Map map = lazyMap.getMap(); + Map expectedMap = (Map) expectedObject; + return lazyCompareMap((MapTypeInfo) typeInfo, map, expectedMap); + } else if (typeInfo instanceof MapTypeInfo) { + Map map = (Map) lazyObject; + Map expectedMap = (Map) expectedObject; + return lazyCompareMap((MapTypeInfo) typeInfo, map, expectedMap); + } else if (lazyObject instanceof LazyStruct) { + LazyStruct lazyStruct = (LazyStruct) lazyObject; + List fields = lazyStruct.getFieldsAsList(); + List expectedFields = (List) expectedObject; + StructTypeInfo structTypeInfo = (StructTypeInfo) typeInfo; + return lazyCompareStruct(structTypeInfo, fields, expectedFields); + } else if (typeInfo instanceof StructTypeInfo) { + ArrayList fields = (ArrayList) lazyObject; + List expectedFields = (List) expectedObject; + StructTypeInfo structTypeInfo = (StructTypeInfo) typeInfo; + return lazyCompareStruct(structTypeInfo, fields, expectedFields); + } else { + System.err.println("Not implemented " + typeInfo.getClass().getName()); + } + return true; + } +} \ No newline at end of file diff --git serde/src/java/org/apache/hadoop/hive/serde2/lazy/fast/LazySimpleDeserializeRead.java serde/src/java/org/apache/hadoop/hive/serde2/lazy/fast/LazySimpleDeserializeRead.java index a597fd7..b381e91 100644 --- serde/src/java/org/apache/hadoop/hive/serde2/lazy/fast/LazySimpleDeserializeRead.java +++ serde/src/java/org/apache/hadoop/hive/serde2/lazy/fast/LazySimpleDeserializeRead.java @@ -23,7 +23,9 @@ import java.nio.charset.CharacterCodingException; import java.nio.charset.StandardCharsets; import java.sql.Date; +import java.util.ArrayList; import java.util.Arrays; +import java.util.List; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -37,12 +39,21 @@ import org.apache.hadoop.hive.serde2.lazy.LazySerDeParameters; import org.apache.hadoop.hive.serde2.lazy.LazyShort; import org.apache.hadoop.hive.serde2.lazy.LazyUtils; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category; import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory; import org.apache.hadoop.hive.serde2.typeinfo.DecimalTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.MapTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; +import 
org.apache.hadoop.hive.serde2.typeinfo.UnionTypeInfo; import org.apache.hadoop.io.Text; import org.apache.hive.common.util.TimestampParser; +import com.google.common.base.Preconditions; + /* * Directly deserialize with the caller reading field-by-field the LazySimple (text) * serialization format. @@ -61,9 +72,124 @@ public final class LazySimpleDeserializeRead extends DeserializeRead { public static final Logger LOG = LoggerFactory.getLogger(LazySimpleDeserializeRead.class.getName()); - private int[] startPosition; + /* + * Information on a field. Made a class to allow readField to be agnostic to whether a top level + * or field within a complex type is being read + */ + private static class Field { + + // Optimize for most common case -- primitive. + public final boolean isPrimitive; + public final PrimitiveCategory primitiveCategory; + + public final Category complexCategory; + + public final TypeInfo typeInfo; + + public ComplexTypeHelper complexTypeHelper; + + public Field(TypeInfo typeInfo) { + Category category = typeInfo.getCategory(); + if (category == Category.PRIMITIVE) { + isPrimitive = true; + primitiveCategory = ((PrimitiveTypeInfo) typeInfo).getPrimitiveCategory(); + complexCategory = null; + } else { + isPrimitive = false; + primitiveCategory = null; + complexCategory = category; + } + + this.typeInfo = typeInfo; + + complexTypeHelper = null; + } + } + + /* + * Used to keep position/length for complex type fields. + * NOTE: The top level uses startPositions instead. + */ + private static class ComplexTypeHelper { + + public final Field complexField; + + public int complexFieldStart; + public int complexFieldLength; + public int complexFieldEnd; + + public int fieldPosition; + + public ComplexTypeHelper(Field complexField) { + this.complexField = complexField; + } + + public void setCurrentFieldInfo(int complexFieldStart, int complexFieldLength) { + this.complexFieldStart = complexFieldStart; + this.complexFieldLength = complexFieldLength; + complexFieldEnd = complexFieldStart + complexFieldLength; + fieldPosition = complexFieldStart; + } + } + + private static class ListComplexTypeHelper extends ComplexTypeHelper { + + public Field elementField; + + public ListComplexTypeHelper(Field complexField, Field elementField) { + super(complexField); + this.elementField = elementField; + } + + } + + private static class MapComplexTypeHelper extends ComplexTypeHelper { + + public Field keyField; + public Field valueField; + + public boolean fieldHaveParsedKey; + + public MapComplexTypeHelper(Field complexField, Field keyField, Field valueField) { + super(complexField); + this.keyField = keyField; + this.valueField = valueField; + fieldHaveParsedKey = false; + } + } + + private static class StructComplexTypeHelper extends ComplexTypeHelper { + + public Field[] fields; + + public int nextFieldIndex; - private final byte separator; + public StructComplexTypeHelper(Field complexField, Field[] fields) { + super(complexField); + this.fields = fields; + nextFieldIndex = 0; + } + } + + private static class UnionComplexTypeHelper extends ComplexTypeHelper { + + public Field tagField; + public Field[] fields; + + public boolean fieldHaveParsedTag; + public int fieldTag; + + public UnionComplexTypeHelper(Field complexField, Field[] fields) { + super(complexField); + this.tagField = new Field(TypeInfoFactory.intTypeInfo); + this.fields = fields; + fieldHaveParsedTag = false; + } + } + + private int[] startPositions; + + private final byte[] separators; private final boolean isEscaped; private 
final byte escapeChar; private final int[] escapeCounts; @@ -71,19 +197,25 @@ private final boolean isExtendedBooleanLiteral; private final int fieldCount; + private final Field[] fields; + private final int maxLevelDepth; private byte[] bytes; private int start; private int end; - private boolean parsed; + private boolean topLevelParsed; // Used by readNextField/skipNextField and not by readField. private int nextFieldIndex; // For getDetailedReadPositionString. - private int currentFieldIndex; + private int currentLevel; + private int currentTopLevelFieldIndex; private int currentFieldStart; private int currentFieldLength; + private int currentEscapeCount; + + private ComplexTypeHelper[] currentComplexTypeHelpers; // For string/char/varchar buffering when there are escapes. private int internalBufferLen; @@ -93,21 +225,112 @@ private boolean isEndOfInputReached; + private int addComplexFields(List fieldTypeInfoList, Field[] fields, int depth) { + Field field; + final int count = fieldTypeInfoList.size(); + for (int i = 0; i < count; i++) { + field = new Field(fieldTypeInfoList.get(i)); + if (!field.isPrimitive) { + depth = Math.max(depth, addComplexTypeHelper(field, depth)); + } + fields[i] = field; + } + return depth; + } + + private int addComplexTypeHelper(Field complexField, int depth) { + + // Assume one separator (depth) needed. + depth++; + + switch (complexField.complexCategory) { + case LIST: + { + ListTypeInfo listTypeInfo = (ListTypeInfo) complexField.typeInfo; + Field elementField = new Field(listTypeInfo.getListElementTypeInfo()); + if (!elementField.isPrimitive) { + depth = addComplexTypeHelper(elementField, depth); + } + ListComplexTypeHelper listHelper = + new ListComplexTypeHelper(complexField, elementField); + complexField.complexTypeHelper = listHelper; + } + break; + case MAP: + { + // Map needs two separators (key and key/value pair). 
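// Editorial sketch (not part of the patch): with the default LazySimple
// separators -- '\001' between top-level columns, '\002' at nesting level 1,
// '\003' at level 2 -- a map<string,int> column consumes two separator
// levels, one between key/value pairs and a deeper one inside each pair:
//
//   colA \001 k1 \003 1 \002 k2 \003 2 \001 colC
//             |----- map uses levels 1 and 2 ----|
//
// which is why the MAP case bumps depth a second time just below, while
// LIST needs only the single level already counted.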
+ depth++; + + MapTypeInfo mapTypeInfo = (MapTypeInfo) complexField.typeInfo; + Field keyField = new Field(mapTypeInfo.getMapKeyTypeInfo()); + if (!keyField.isPrimitive) { + depth = Math.max(depth, addComplexTypeHelper(keyField, depth)); + } + Field valueField = new Field(mapTypeInfo.getMapValueTypeInfo()); + if (!valueField.isPrimitive) { + depth = Math.max(depth, addComplexTypeHelper(valueField, depth)); + } + MapComplexTypeHelper mapHelper = + new MapComplexTypeHelper(complexField, keyField, valueField); + complexField.complexTypeHelper = mapHelper; + } + break; + case STRUCT: + { + StructTypeInfo structTypeInfo = (StructTypeInfo) complexField.typeInfo; + List fieldTypeInfoList = structTypeInfo.getAllStructFieldTypeInfos(); + Field[] fields = new Field[fieldTypeInfoList.size()]; + depth = addComplexFields(fieldTypeInfoList, fields, depth); + StructComplexTypeHelper structHelper = + new StructComplexTypeHelper(complexField, fields); + complexField.complexTypeHelper = structHelper; + } + break; + case UNION: + { + UnionTypeInfo unionTypeInfo = (UnionTypeInfo) complexField.typeInfo; + List fieldTypeInfoList = unionTypeInfo.getAllUnionObjectTypeInfos(); + Field[] fields = new Field[fieldTypeInfoList.size()]; + depth = addComplexFields(fieldTypeInfoList, fields, depth); + UnionComplexTypeHelper unionHelper = + new UnionComplexTypeHelper(complexField, fields); + complexField.complexTypeHelper = unionHelper; + } + break; + default: + throw new Error("Unexpected complex category " + complexField.complexCategory); + } + return depth; + } + public LazySimpleDeserializeRead(TypeInfo[] typeInfos, boolean useExternalBuffer, - byte separator, LazySerDeParameters lazyParams) { + LazySerDeParameters lazyParams) { super(typeInfos, useExternalBuffer); - fieldCount = typeInfos.length; + final int count = typeInfos.length; + fieldCount = count; + int depth = 0; + fields = new Field[count]; + Field field; + for (int i = 0; i < count; i++) { + field = new Field(typeInfos[i]); + if (!field.isPrimitive) { + depth = Math.max(depth, addComplexTypeHelper(field, 0)); + } + fields[i] = field; + } + maxLevelDepth = depth; + currentComplexTypeHelpers = new ComplexTypeHelper[depth]; // Field length is difference between positions hence one extra. - startPosition = new int[fieldCount + 1]; + startPositions = new int[count + 1]; - this.separator = separator; + this.separators = lazyParams.getSeparators(); isEscaped = lazyParams.isEscaped(); if (isEscaped) { escapeChar = lazyParams.getEscapeChar(); - escapeCounts = new int[fieldCount]; + escapeCounts = new int[count]; } else { escapeChar = (byte) 0; escapeCounts = null; @@ -123,11 +346,6 @@ public LazySimpleDeserializeRead(TypeInfo[] typeInfos, boolean useExternalBuffer internalBufferLen = -1; } - public LazySimpleDeserializeRead(TypeInfo[] typeInfos, boolean useExternalBuffer, - LazySerDeParameters lazyParams) { - this(typeInfos, useExternalBuffer, lazyParams.getSeparators()[0], lazyParams); - } - /* * Set the range of bytes to be deserialized. */ @@ -136,7 +354,8 @@ public void set(byte[] bytes, int offset, int length) { this.bytes = bytes; start = offset; end = offset + length; - parsed = false; + topLevelParsed = false; + currentLevel = 0; nextFieldIndex = -1; } @@ -157,14 +376,14 @@ public String getDetailedReadPositionString() { sb.append(" fields with types "); sb.append(Arrays.toString(typeInfos)); sb.append(". 
"); - if (!parsed) { + if (!topLevelParsed) { sb.append("Error during field separator parsing"); } else { sb.append("Read field #"); - sb.append(currentFieldIndex); + sb.append(currentTopLevelFieldIndex); sb.append(" at field start position "); - sb.append(startPosition[currentFieldIndex]); - int currentFieldLength = startPosition[currentFieldIndex + 1] - startPosition[currentFieldIndex] - 1; + sb.append(startPositions[currentTopLevelFieldIndex]); + int currentFieldLength = startPositions[currentTopLevelFieldIndex + 1] - startPositions[currentTopLevelFieldIndex] - 1; sb.append(" for field length "); sb.append(currentFieldLength); } @@ -178,15 +397,15 @@ public String getDetailedReadPositionString() { * This is an adapted version of the parse method in the LazyStruct class. * They should parse things the same way. */ - private void parse() { + private void topLevelParse() { int fieldId = 0; int fieldByteBegin = start; int fieldByteEnd = start; - final byte separator = this.separator; + final byte separator = this.separators[0]; final int fieldCount = this.fieldCount; - final int[] startPosition = this.startPosition; + final int[] startPositions = this.startPositions; final byte[] bytes = this.bytes; final int end = this.end; @@ -196,7 +415,7 @@ private void parse() { if (!isEscaped) { while (fieldByteEnd < end) { if (bytes[fieldByteEnd] == separator) { - startPosition[fieldId++] = fieldByteBegin; + startPositions[fieldId++] = fieldByteBegin; if (fieldId == fieldCount) { break; } @@ -207,7 +426,7 @@ private void parse() { } // End serves as final separator. if (fieldByteEnd == end && fieldId < fieldCount) { - startPosition[fieldId++] = fieldByteBegin; + startPositions[fieldId++] = fieldByteBegin; } } else { final byte escapeChar = this.escapeChar; @@ -219,7 +438,7 @@ private void parse() { if (bytes[fieldByteEnd] == separator) { escapeCounts[fieldId] = escapeCount; escapeCount = 0; - startPosition[fieldId++] = fieldByteBegin; + startPositions[fieldId++] = fieldByteBegin; if (fieldId == fieldCount) { break; } @@ -237,7 +456,7 @@ private void parse() { if (bytes[fieldByteEnd] == separator) { escapeCounts[fieldId] = escapeCount; escapeCount = 0; - startPosition[fieldId++] = fieldByteBegin; + startPositions[fieldId++] = fieldByteBegin; if (fieldId <= fieldCount) { fieldByteBegin = ++fieldByteEnd; } @@ -248,23 +467,64 @@ private void parse() { // End serves as final separator. if (fieldByteEnd == end && fieldId < fieldCount) { escapeCounts[fieldId] = escapeCount; - startPosition[fieldId++] = fieldByteBegin; + startPositions[fieldId++] = fieldByteBegin; } } if (fieldId == fieldCount || fieldByteEnd == end) { // All fields have been parsed, or bytes have been parsed. - // We need to set the startPosition of fields.length to ensure we + // We need to set the startPositions of fields.length to ensure we // can use the same formula to calculate the length of each field. // For missing fields, their starting positions will all be the same, // which will make their lengths to be -1 and uncheckedGetField will // return these fields as NULLs. 
- Arrays.fill(startPosition, fieldId, startPosition.length, fieldByteEnd + 1); + Arrays.fill(startPositions, fieldId, startPositions.length, fieldByteEnd + 1); } isEndOfInputReached = (fieldByteEnd == end); } + private int parseComplexField(int start, int end, int level) { + + final byte separator = separators[level]; + int fieldByteEnd = start; + + byte[] bytes = this.bytes; + + currentEscapeCount = 0; + if (!isEscaped) { + while (fieldByteEnd < end) { + if (bytes[fieldByteEnd] == separator) { + return fieldByteEnd; + } + fieldByteEnd++; + } + } else { + final byte escapeChar = this.escapeChar; + final int endLessOne = end - 1; + int escapeCount = 0; + // Process the bytes that can be escaped (the last one can't be). + while (fieldByteEnd < endLessOne) { + if (bytes[fieldByteEnd] == separator) { + currentEscapeCount = escapeCount; + return fieldByteEnd; + } else if (bytes[fieldByteEnd] == escapeChar) { + // Ignore the char after escape_char + fieldByteEnd += 2; + escapeCount++; + } else { + fieldByteEnd++; + } + } + // Process the last byte. + if (bytes[fieldByteEnd] != separator) { + fieldByteEnd++; + } + currentEscapeCount = escapeCount; + } + return fieldByteEnd; + } + /* * Reads the next field. * @@ -291,9 +551,9 @@ public boolean readNextField() throws IOException { * Designed for skipping columns that are not included. */ public void skipNextField() throws IOException { - if (!parsed) { - parse(); - parsed = true; + if (!topLevelParsed) { + topLevelParse(); + topLevelParsed = true; } if (nextFieldIndex + 1 >= fieldCount) { // No more. @@ -341,23 +601,45 @@ private boolean checkNull(byte[] bytes, int start, int len) { */ public boolean readField(int fieldIndex) throws IOException { - if (!parsed) { - parse(); - parsed = true; + Preconditions.checkState(currentLevel == 0); + + if (!topLevelParsed) { + topLevelParse(); + topLevelParsed = true; } - currentFieldIndex = fieldIndex; + // Top level. + currentTopLevelFieldIndex = fieldIndex; + + currentFieldStart = startPositions[fieldIndex]; + currentFieldLength = startPositions[fieldIndex + 1] - startPositions[fieldIndex] - 1; + currentEscapeCount = (isEscaped ? escapeCounts[fieldIndex] : 0); + + return doReadField(fields[fieldIndex]); + } + + private boolean doReadField(Field field) { - final int fieldStart = startPosition[fieldIndex]; - currentFieldStart = fieldStart; - final int fieldLength = startPosition[fieldIndex + 1] - startPosition[fieldIndex] - 1; - currentFieldLength = fieldLength; + final int fieldStart = currentFieldStart; + final int fieldLength = currentFieldLength; if (fieldLength < 0) { return false; } final byte[] bytes = this.bytes; + // UNDONE: Debug only + final int totalLength = end - start; + final char[] inputChars = new char[totalLength]; + for (int i = start; i < end; i++) { + inputChars[i - start] = (char) bytes[i]; + } + + final char[] fieldChars = new char[fieldLength]; + for (int i = fieldStart; i < fieldStart + fieldLength; i++) { + fieldChars[i - fieldStart] = (char) bytes[i]; + } + // Is the field the configured string representing NULL? if (nullSequenceBytes != null) { if (checkNull(bytes, fieldStart, fieldLength)) { @@ -369,221 +651,254 @@ public boolean readField(int fieldIndex) throws IOException { /* * We have a field and are positioned to it. Read it.
*/ - switch (primitiveCategories[fieldIndex]) { - case BOOLEAN: - { - int i = fieldStart; - if (fieldLength == 4) { - if ((bytes[i] == 'T' || bytes[i] == 't') && - (bytes[i + 1] == 'R' || bytes[i + 1] == 'r') && - (bytes[i + 2] == 'U' || bytes[i + 1] == 'u') && - (bytes[i + 3] == 'E' || bytes[i + 3] == 'e')) { - currentBoolean = true; - } else { - // No boolean value match for 4 char field. - return false; - } - } else if (fieldLength == 5) { - if ((bytes[i] == 'F' || bytes[i] == 'f') && - (bytes[i + 1] == 'A' || bytes[i + 1] == 'a') && - (bytes[i + 2] == 'L' || bytes[i + 2] == 'l') && - (bytes[i + 3] == 'S' || bytes[i + 3] == 's') && - (bytes[i + 4] == 'E' || bytes[i + 4] == 'e')) { - currentBoolean = false; - } else { - // No boolean value match for 5 char field. - return false; - } - } else if (isExtendedBooleanLiteral && fieldLength == 1) { - byte b = bytes[fieldStart]; - if (b == '1' || b == 't' || b == 'T') { - currentBoolean = true; - } else if (b == '0' || b == 'f' || b == 'F') { - currentBoolean = false; + if (field.isPrimitive) { + switch (field.primitiveCategory) { + case BOOLEAN: + { + int i = fieldStart; + if (fieldLength == 4) { + if ((bytes[i] == 'T' || bytes[i] == 't') && + (bytes[i + 1] == 'R' || bytes[i + 1] == 'r') && + (bytes[i + 2] == 'U' || bytes[i + 2] == 'u') && + (bytes[i + 3] == 'E' || bytes[i + 3] == 'e')) { + currentBoolean = true; + } else { + // No boolean value match for 4 char field. + return false; + } + } else if (fieldLength == 5) { + if ((bytes[i] == 'F' || bytes[i] == 'f') && + (bytes[i + 1] == 'A' || bytes[i + 1] == 'a') && + (bytes[i + 2] == 'L' || bytes[i + 2] == 'l') && + (bytes[i + 3] == 'S' || bytes[i + 3] == 's') && + (bytes[i + 4] == 'E' || bytes[i + 4] == 'e')) { + currentBoolean = false; + } else { + // No boolean value match for 5 char field. + return false; + } + } else if (isExtendedBooleanLiteral && fieldLength == 1) { + byte b = bytes[fieldStart]; + if (b == '1' || b == 't' || b == 'T') { + currentBoolean = true; + } else if (b == '0' || b == 'f' || b == 'F') { + currentBoolean = false; + } else { + // No boolean value match for extended 1 char field. + return false; + } } else { - // No boolean value match for extended 1 char field. + // No boolean value match for other lengths. return false; } - } else { - // No boolean value match for other lengths. 
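// Editorial note: the accepted boolean text literals are case-insensitive
// TRUE and FALSE; when isExtendedBooleanLiteral is set, the one-character
// forms '1', 't', 'T' (true) and '0', 'f', 'F' (false) are accepted as
// well. Any other content makes readField return false, i.e. a NULL field.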
+ } + return true; + case BYTE: + if (!LazyUtils.isNumberMaybe(bytes, fieldStart, fieldLength)) { return false; } - } - return true; - case BYTE: - if (!LazyUtils.isNumberMaybe(bytes, fieldStart, fieldLength)) { - return false; - } - currentByte = LazyByte.parseByte(bytes, fieldStart, fieldLength, 10); - return true; - case SHORT: - if (!LazyUtils.isNumberMaybe(bytes, fieldStart, fieldLength)) { - return false; - } - currentShort = LazyShort.parseShort(bytes, fieldStart, fieldLength, 10); - return true; - case INT: - if (!LazyUtils.isNumberMaybe(bytes, fieldStart, fieldLength)) { - return false; - } - currentInt = LazyInteger.parseInt(bytes, fieldStart, fieldLength, 10); - return true; - case LONG: - if (!LazyUtils.isNumberMaybe(bytes, fieldStart, fieldLength)) { - return false; - } - currentLong = LazyLong.parseLong(bytes, fieldStart, fieldLength, 10); - return true; - case FLOAT: - if (!LazyUtils.isNumberMaybe(bytes, fieldStart, fieldLength)) { - return false; - } - currentFloat = - Float.parseFloat( - new String(bytes, fieldStart, fieldLength, StandardCharsets.UTF_8)); - return true; - case DOUBLE: - if (!LazyUtils.isNumberMaybe(bytes, fieldStart, fieldLength)) { - return false; - } - currentDouble = - Double.parseDouble( - new String(bytes, fieldStart, fieldLength, StandardCharsets.UTF_8)); - return true; - case STRING: - case CHAR: - case VARCHAR: - { - if (isEscaped) { - if (escapeCounts[fieldIndex] == 0) { - // No escaping. + currentByte = LazyByte.parseByte(bytes, fieldStart, fieldLength, 10); + return true; + case SHORT: + if (!LazyUtils.isNumberMaybe(bytes, fieldStart, fieldLength)) { + return false; + } + currentShort = LazyShort.parseShort(bytes, fieldStart, fieldLength, 10); + return true; + case INT: + if (!LazyUtils.isNumberMaybe(bytes, fieldStart, fieldLength)) { + return false; + } + currentInt = LazyInteger.parseInt(bytes, fieldStart, fieldLength, 10); + return true; + case LONG: + if (!LazyUtils.isNumberMaybe(bytes, fieldStart, fieldLength)) { + return false; + } + currentLong = LazyLong.parseLong(bytes, fieldStart, fieldLength, 10); + return true; + case FLOAT: + if (!LazyUtils.isNumberMaybe(bytes, fieldStart, fieldLength)) { + return false; + } + currentFloat = + Float.parseFloat( + new String(bytes, fieldStart, fieldLength, StandardCharsets.UTF_8)); + return true; + case DOUBLE: + if (!LazyUtils.isNumberMaybe(bytes, fieldStart, fieldLength)) { + return false; + } + currentDouble = + Double.parseDouble( + new String(bytes, fieldStart, fieldLength, StandardCharsets.UTF_8)); + return true; + case STRING: + case CHAR: + case VARCHAR: + { + if (isEscaped) { + if (currentEscapeCount == 0) { + // No escaping. + currentExternalBufferNeeded = false; + currentBytes = bytes; + currentBytesStart = fieldStart; + currentBytesLength = fieldLength; + } else { + final int unescapedLength = fieldLength - currentEscapeCount; + if (useExternalBuffer) { + currentExternalBufferNeeded = true; + currentExternalBufferNeededLen = unescapedLength; + } else { + // The copyToBuffer will reposition and re-read the input buffer. + currentExternalBufferNeeded = false; + if (internalBufferLen < unescapedLength) { + internalBufferLen = unescapedLength; + internalBuffer = new byte[internalBufferLen]; + } + copyToBuffer(internalBuffer, 0, unescapedLength); + currentBytes = internalBuffer; + currentBytesStart = 0; + currentBytesLength = unescapedLength; + } + } + } else { + // If the data is not escaped, reference the data directly. 
currentExternalBufferNeeded = false; currentBytes = bytes; currentBytesStart = fieldStart; currentBytesLength = fieldLength; - } else { - final int unescapedLength = fieldLength - escapeCounts[fieldIndex]; - if (useExternalBuffer) { - currentExternalBufferNeeded = true; - currentExternalBufferNeededLen = unescapedLength; - } else { - // The copyToBuffer will reposition and re-read the input buffer. - currentExternalBufferNeeded = false; - if (internalBufferLen < unescapedLength) { - internalBufferLen = unescapedLength; - internalBuffer = new byte[internalBufferLen]; - } - copyToBuffer(internalBuffer, 0, unescapedLength); - currentBytes = internalBuffer; - currentBytesStart = 0; - currentBytesLength = unescapedLength; - } } - } else { - // If the data is not escaped, reference the data directly. - currentExternalBufferNeeded = false; - currentBytes = bytes; - currentBytesStart = fieldStart; - currentBytesLength = fieldLength; } - } - return true; - case BINARY: - { - byte[] recv = new byte[fieldLength]; - System.arraycopy(bytes, fieldStart, recv, 0, fieldLength); - byte[] decoded = LazyBinary.decodeIfNeeded(recv); - // use the original bytes in case decoding should fail - decoded = decoded.length > 0 ? decoded : recv; - currentBytes = decoded; - currentBytesStart = 0; - currentBytesLength = decoded.length; - } - return true; - case DATE: - if (!LazyUtils.isDateMaybe(bytes, fieldStart, fieldLength)) { - return false; - } - currentDateWritable.set( - Date.valueOf( - new String(bytes, fieldStart, fieldLength, StandardCharsets.UTF_8))); - return true; - case TIMESTAMP: - { + return true; + case BINARY: + { + byte[] recv = new byte[fieldLength]; + System.arraycopy(bytes, fieldStart, recv, 0, fieldLength); + byte[] decoded = LazyBinary.decodeIfNeeded(recv); + // use the original bytes in case decoding should fail + decoded = decoded.length > 0 ? 
decoded : recv; + currentBytes = decoded; + currentBytesStart = 0; + currentBytesLength = decoded.length; + } + return true; + case DATE: if (!LazyUtils.isDateMaybe(bytes, fieldStart, fieldLength)) { return false; } - String s = new String(bytes, fieldStart, fieldLength, StandardCharsets.US_ASCII); - if (s.compareTo("NULL") == 0) { - logExceptionMessage(bytes, fieldStart, fieldLength, "TIMESTAMP"); + currentDateWritable.set( + Date.valueOf( + new String(bytes, fieldStart, fieldLength, StandardCharsets.UTF_8))); + return true; + case TIMESTAMP: + { + if (!LazyUtils.isDateMaybe(bytes, fieldStart, fieldLength)) { + return false; + } + String s = new String(bytes, fieldStart, fieldLength, StandardCharsets.US_ASCII); + if (s.compareTo("NULL") == 0) { + logExceptionMessage(bytes, fieldStart, fieldLength, "TIMESTAMP"); + return false; + } + try { + currentTimestampWritable.set(timestampParser.parseTimestamp(s)); + } catch (IllegalArgumentException e) { + logExceptionMessage(bytes, fieldStart, fieldLength, "TIMESTAMP"); + return false; + } + } + return true; + case INTERVAL_YEAR_MONTH: + if (fieldLength == 0) { return false; } try { - currentTimestampWritable.set(timestampParser.parseTimestamp(s)); - } catch (IllegalArgumentException e) { - logExceptionMessage(bytes, fieldStart, fieldLength, "TIMESTAMP"); + String s = new String(bytes, fieldStart, fieldLength, StandardCharsets.UTF_8); + currentHiveIntervalYearMonthWritable.set(HiveIntervalYearMonth.valueOf(s)); + } catch (Exception e) { + logExceptionMessage(bytes, fieldStart, fieldLength, "INTERVAL_YEAR_MONTH"); return false; } - } - return true; - case INTERVAL_YEAR_MONTH: - if (fieldLength == 0) { - return false; - } - try { - String s = new String(bytes, fieldStart, fieldLength, StandardCharsets.UTF_8); - currentHiveIntervalYearMonthWritable.set(HiveIntervalYearMonth.valueOf(s)); - } catch (Exception e) { - logExceptionMessage(bytes, fieldStart, fieldLength, "INTERVAL_YEAR_MONTH"); - return false; - } - return true; - case INTERVAL_DAY_TIME: - if (fieldLength == 0) { - return false; - } - try { - String s = new String(bytes, fieldStart, fieldLength, StandardCharsets.UTF_8); - currentHiveIntervalDayTimeWritable.set(HiveIntervalDayTime.valueOf(s)); - } catch (Exception e) { - logExceptionMessage(bytes, fieldStart, fieldLength, "INTERVAL_DAY_TIME"); - return false; - } - return true; - case DECIMAL: - { - if (!LazyUtils.isNumberMaybe(bytes, fieldStart, fieldLength)) { + return true; + case INTERVAL_DAY_TIME: + if (fieldLength == 0) { + return false; + } + try { + String s = new String(bytes, fieldStart, fieldLength, StandardCharsets.UTF_8); + currentHiveIntervalDayTimeWritable.set(HiveIntervalDayTime.valueOf(s)); + } catch (Exception e) { + logExceptionMessage(bytes, fieldStart, fieldLength, "INTERVAL_DAY_TIME"); return false; } - // Trim blanks because OldHiveDecimal did... - currentHiveDecimalWritable.setFromBytes(bytes, fieldStart, fieldLength, /* trimBlanks */ true); - boolean decimalIsNull = !currentHiveDecimalWritable.isSet(); - if (!decimalIsNull) { - DecimalTypeInfo decimalTypeInfo = (DecimalTypeInfo) typeInfos[fieldIndex]; + return true; + case DECIMAL: + { + if (!LazyUtils.isNumberMaybe(bytes, fieldStart, fieldLength)) { + return false; + } + // Trim blanks because OldHiveDecimal did... 
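// Editorial note (semantics hedged): mutateEnforcePrecisionScale below
// rounds the parsed value to the declared scale and reports failure when
// the integer digits exceed precision - scale. For a decimal(5,2) column,
// "123.456" would round to 123.46 and be kept, while "12345.6" does not
// fit and the field reads as NULL.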
+ currentHiveDecimalWritable.setFromBytes(bytes, fieldStart, fieldLength, /* trimBlanks */ true); + boolean decimalIsNull = !currentHiveDecimalWritable.isSet(); + if (!decimalIsNull) { + DecimalTypeInfo decimalTypeInfo = (DecimalTypeInfo) field.typeInfo; - int precision = decimalTypeInfo.getPrecision(); - int scale = decimalTypeInfo.getScale(); + int precision = decimalTypeInfo.getPrecision(); + int scale = decimalTypeInfo.getScale(); - decimalIsNull = !currentHiveDecimalWritable.mutateEnforcePrecisionScale(precision, scale); + decimalIsNull = !currentHiveDecimalWritable.mutateEnforcePrecisionScale(precision, scale); + } + if (decimalIsNull) { + if (LOG.isDebugEnabled()) { + LOG.debug("Data not in the HiveDecimal data type range so converted to null. Given data is :" + + new String(bytes, fieldStart, fieldLength, StandardCharsets.UTF_8)); + } + return false; + } } - if (decimalIsNull) { - if (LOG.isDebugEnabled()) { - LOG.debug("Data not in the HiveDecimal data type range so converted to null. Given data is :" - + new String(bytes, fieldStart, fieldLength, StandardCharsets.UTF_8)); + return true; + default: + throw new Error("Unexpected primitive category " + field.primitiveCategory); + } + } else { + switch (field.complexCategory) { + case LIST: + case MAP: + case STRUCT: + case UNION: + { + if (currentLevel > 0) { + + // Check for Map which occupies 2 levels (key separator and key/value pair separator). + if (currentComplexTypeHelpers[currentLevel - 1] == null) { + Preconditions.checkState(currentLevel > 1); + Preconditions.checkState( + currentComplexTypeHelpers[currentLevel - 2] instanceof MapComplexTypeHelper); + currentLevel++; + } } - return false; + ComplexTypeHelper complexTypeHelper = field.complexTypeHelper; + currentComplexTypeHelpers[currentLevel++] = complexTypeHelper; + if (field.complexCategory == Category.MAP) { + currentComplexTypeHelpers[currentLevel] = null; + } + + // Set up context for readNextComplexField. + complexTypeHelper.setCurrentFieldInfo(currentFieldStart, currentFieldLength); } + return true; + default: + throw new Error("Unexpected complex category " + field.complexCategory); } - return true; - - default: - throw new Error("Unexpected primitive category " + primitiveCategories[fieldIndex].name()); } } catch (NumberFormatException nfe) { - // U+FFFD will throw this as well - logExceptionMessage(bytes, fieldStart, fieldLength, primitiveCategories[fieldIndex]); + logExceptionMessage(bytes, fieldStart, fieldLength, field.complexCategory, field.primitiveCategory); return false; - } + } catch (IllegalArgumentException iae) { + logExceptionMessage(bytes, fieldStart, fieldLength, field.complexCategory, field.primitiveCategory); + return false; + } } @Override @@ -614,6 +929,224 @@ private void copyToBuffer(byte[] buffer, int bufferStart, int bufferLength) { } } + @Override + public boolean isNextComplexMultiValue() { + Preconditions.checkState(currentLevel > 0); + + ComplexTypeHelper complexTypeHelper = currentComplexTypeHelpers[currentLevel - 1]; + Field complexField = complexTypeHelper.complexField; + final int fieldPosition = complexTypeHelper.fieldPosition; + final int complexFieldEnd = complexTypeHelper.complexFieldEnd; + switch (complexField.complexCategory) { + case LIST: + { + // Allow for empty string, etc. 
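// Worked example (editorial): for a list field "1^B2^B3" occupying
// positions [10, 15) with ^B the element separator at this level, three
// readComplexField calls consume the elements at 10, 12 and 14, advancing
// fieldPosition to 12, 14 and finally 16; since 16 > complexFieldEnd (15),
// the test just below then fails and the helper is popped. The <=
// comparison (rather than <) lets a zero-length field yield one empty
// element.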
+ final boolean isNext = (fieldPosition <= complexFieldEnd); + if (!isNext) { + popComplexType(); + } + return isNext; + } + case MAP: + { + final boolean isNext = (fieldPosition < complexFieldEnd); + if (!isNext) { + popComplexType(); + } + return isNext; + } + case STRUCT: + case UNION: + throw new Error("Complex category " + complexField.complexCategory + " not multi-value"); + default: + throw new Error("Unexpected complex category " + complexField.complexCategory); + } + } + + private void popComplexType() { + Preconditions.checkState(currentLevel > 0); + currentLevel--; + if (currentLevel > 0) { + + // Check for Map which occupies 2 levels (key separator and key/value pair separator). + if (currentComplexTypeHelpers[currentLevel - 1] == null) { + Preconditions.checkState(currentLevel > 1); + Preconditions.checkState( + currentComplexTypeHelpers[currentLevel - 2] instanceof MapComplexTypeHelper); + currentLevel--; + } + } + } + + /* + * NOTE: There is an expectation that all fields will be read-thru. + */ + @Override + public boolean readComplexField() throws IOException { + + Preconditions.checkState(currentLevel > 0); + + ComplexTypeHelper complexTypeHelper = currentComplexTypeHelpers[currentLevel - 1]; + Field complexField = complexTypeHelper.complexField; + switch (complexField.complexCategory) { + case LIST: + { + ListComplexTypeHelper listHelper = (ListComplexTypeHelper) complexTypeHelper; + final int fieldPosition = listHelper.fieldPosition; + final int complexFieldEnd = listHelper.complexFieldEnd; + Preconditions.checkState(fieldPosition <= complexFieldEnd); + + final int fieldEnd = parseComplexField(fieldPosition, complexFieldEnd, currentLevel); + listHelper.fieldPosition = fieldEnd + 1; // Move past separator. + + currentFieldStart = fieldPosition; + currentFieldLength = fieldEnd - fieldPosition; + + return doReadField(listHelper.elementField); + } + case MAP: + { + MapComplexTypeHelper mapHelper = (MapComplexTypeHelper) complexTypeHelper; + final int fieldPosition = mapHelper.fieldPosition; + final int complexFieldEnd = mapHelper.complexFieldEnd; + Preconditions.checkState(fieldPosition <= complexFieldEnd); + + currentFieldStart = fieldPosition; + + // ****UNDONE**** Need to bump current level an extra time if current one is Map... + int fieldEnd; + if (!mapHelper.fieldHaveParsedKey) { + + // Parse until key separator (currentLevel + 1). + fieldEnd = parseComplexField(fieldPosition, complexFieldEnd, currentLevel + 1); + + mapHelper.fieldPosition = fieldEnd + 1; // Move past key separator. + + currentFieldLength = fieldEnd - fieldPosition; + + mapHelper.fieldHaveParsedKey = true; + return doReadField(mapHelper.keyField); + } else { + + // Parse until pair separator (currentLevel). + fieldEnd = parseComplexField(fieldPosition, complexFieldEnd, currentLevel); + + mapHelper.fieldPosition = fieldEnd + 1; // Move past pair separator. 
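// Editorial note: the MAP case alternates between the two reserved levels.
// For a field "k1^Cv1^Bk2^Cv2" (^B the pair separator at this level, ^C the
// key/value separator one level deeper), fieldHaveParsedKey toggles so that
// successive calls return k1, v1, k2, v2 -- keys parse up to the key/value
// separator (currentLevel + 1), values up to the pair separator.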
+ + currentFieldLength = fieldEnd - fieldPosition; + + mapHelper.fieldHaveParsedKey = false; + return doReadField(mapHelper.valueField); + } + } + case STRUCT: + { + StructComplexTypeHelper structHelper = (StructComplexTypeHelper) complexTypeHelper; + final int fieldPosition = structHelper.fieldPosition; + final int complexFieldEnd = structHelper.complexFieldEnd; + Preconditions.checkState(fieldPosition < complexFieldEnd); + + currentFieldStart = fieldPosition; + + final int nextFieldIndex = structHelper.nextFieldIndex; + Field[] fields = structHelper.fields; + int fieldEnd; + if (nextFieldIndex != fields.length - 1) { + + // Parse until field separator (currentLevel). + fieldEnd = parseComplexField(fieldPosition, complexFieldEnd, currentLevel); + + structHelper.fieldPosition = fieldEnd + 1; // Move past field separator. + + currentFieldLength = fieldEnd - fieldPosition; + + return doReadField(fields[structHelper.nextFieldIndex++]); + } else { + + if (!isEscaped) { + + // No parsing necessary -- the end is the parent's end. + structHelper.fieldPosition = complexFieldEnd + 1; // Move past parent field separator. + currentEscapeCount = 0; + } else { + // We must parse to get the escape count. + fieldEnd = parseComplexField(fieldPosition, complexFieldEnd, currentLevel - 1); + } + + currentFieldLength = complexFieldEnd - fieldPosition; + + structHelper.nextFieldIndex = 0; + return doReadField(fields[fields.length - 1]); + } + } + case UNION: + { + UnionComplexTypeHelper unionHelper = (UnionComplexTypeHelper) complexTypeHelper; + final int fieldPosition = unionHelper.fieldPosition; + final int complexFieldEnd = unionHelper.complexFieldEnd; + Preconditions.checkState(fieldPosition <= complexFieldEnd); + + currentFieldStart = fieldPosition; + + int fieldEnd; + if (!unionHelper.fieldHaveParsedTag) { + + // Parse until union separator (currentLevel). + fieldEnd = parseComplexField(fieldPosition, complexFieldEnd, currentLevel); + + unionHelper.fieldPosition = fieldEnd + 1; // Move past union separator. + + currentFieldLength = fieldEnd - fieldPosition; + + unionHelper.fieldHaveParsedTag = true; + boolean successful = doReadField(unionHelper.tagField); + if (!successful) { + throw new IOException("Null union tag"); + } + unionHelper.fieldTag = currentInt; + return true; + } else { + + if (!isEscaped) { + + // No parsing necessary -- the end is the parent's end. + unionHelper.fieldPosition = complexFieldEnd + 1; // Move past parent field separator. + currentEscapeCount = 0; + } else { + // We must parse to get the escape count.
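// Editorial note: a union value is serialized as its integer tag, the
// level separator, then the value of the tagged type. For
// uniontype<int,string>, "1^Btext" selects field 1 and reads "text": the
// tag is parsed first via tagField (fieldHaveParsedTag), then used to
// index fields[] when reading the value.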
+ fieldEnd = parseComplexField(fieldPosition, complexFieldEnd, currentLevel - 1); + } + + currentFieldLength = complexFieldEnd - fieldPosition; + + unionHelper.fieldHaveParsedTag = false; + return doReadField(unionHelper.fields[unionHelper.fieldTag]); + } + } + default: + throw new Error("Unexpected complex category " + complexField.complexCategory); + } + } + + @Override + public void finishComplexVariableFieldsType() { + Preconditions.checkState(currentLevel > 0); + + ComplexTypeHelper complexTypeHelper = currentComplexTypeHelpers[currentLevel - 1]; + Field complexField = complexTypeHelper.complexField; + switch (complexField.complexCategory) { + case LIST: + case MAP: + throw new Error("Complex category " + complexField.complexCategory + " is not variable fields type"); + case STRUCT: + case UNION: + popComplexType(); + break; + default: + throw new Error("Unexpected complex category " + complexField.complexCategory); + } + } + /* * This method may be called after all fields have been read to check * for unread fields. @@ -630,21 +1163,34 @@ public boolean isEndOfInputReached() { } public void logExceptionMessage(byte[] bytes, int bytesStart, int bytesLength, - PrimitiveCategory dataCategory) { + Category dataComplexCategory, PrimitiveCategory dataPrimitiveCategory) { final String dataType; - switch (dataCategory) { - case BYTE: - dataType = "TINYINT"; - break; - case LONG: - dataType = "BIGINT"; - break; - case SHORT: - dataType = "SMALLINT"; - break; - default: - dataType = dataCategory.toString(); - break; + if (dataComplexCategory == null) { + switch (dataPrimitiveCategory) { + case BYTE: + dataType = "TINYINT"; + break; + case LONG: + dataType = "BIGINT"; + break; + case SHORT: + dataType = "SMALLINT"; + break; + default: + dataType = dataPrimitiveCategory.toString(); + break; + } + } else { + switch (dataComplexCategory) { + case LIST: + case MAP: + case STRUCT: + case UNION: + dataType = dataComplexCategory.toString(); + break; + default: + throw new Error("Unexpected complex category " + dataComplexCategory); + } } logExceptionMessage(bytes, bytesStart, bytesLength, dataType); } diff --git serde/src/java/org/apache/hadoop/hive/serde2/lazy/fast/LazySimpleSerializeWrite.java serde/src/java/org/apache/hadoop/hive/serde2/lazy/fast/LazySimpleSerializeWrite.java index 1401ac3..eac9c5e 100644 --- serde/src/java/org/apache/hadoop/hive/serde2/lazy/fast/LazySimpleSerializeWrite.java +++ serde/src/java/org/apache/hadoop/hive/serde2/lazy/fast/LazySimpleSerializeWrite.java @@ -55,12 +55,13 @@ * * This is an alternative way to serialize than what is provided by LazyBinarySerDe. */ -public final class LazySimpleSerializeWrite implements SerializeWrite { +public final class LazySimpleSerializeWrite extends SerializeWrite { public static final Logger LOG = LoggerFactory.getLogger(LazySimpleSerializeWrite.class.getName()); private LazySerDeParameters lazyParams; private byte separator; + private byte[] separators; private boolean[] needsEscape; private boolean isEscaped; private byte escapeChar; @@ -70,6 +71,8 @@ private int fieldCount; private int index; + private int currentLevel; + private int saveIndex; // For thread safety, we allocate private writable objects for our use only.
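// Illustrative usage sketch (editorial; the setup is hypothetical, the
// methods are the ones this patch adds to this writer further below):
//
//   LazySimpleSerializeWrite writer = new LazySimpleSerializeWrite(1, lazyParams);
//   writer.set(output);
//   writer.beginList();    // switch to the level-1 separator
//   writer.writeInt(1);
//   writer.writeInt(2);    // a level-1 separator is emitted between elements
//   writer.finishList();   // restore the level-0 separator
//
// For maps, beginMap() reserves two levels; writeMapKeySeparator() emits
// the deeper key/value separator and writeMapKeyPairSeparator() the pair
// separator between entries.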
private DateWritable dateWritable; @@ -80,14 +83,15 @@ private byte[] decimalScratchBuffer; public LazySimpleSerializeWrite(int fieldCount, - byte separator, LazySerDeParameters lazyParams) { + LazySerDeParameters lazyParams) { this(); this.fieldCount = fieldCount; - - this.separator = separator; + this.lazyParams = lazyParams; + separators = lazyParams.getSeparators(); + separator = separators[0]; isEscaped = lazyParams.isEscaped(); escapeChar = lazyParams.getEscapeChar(); needsEscape = lazyParams.getNeedsEscape(); @@ -106,6 +110,7 @@ public void set(Output output) { this.output = output; output.reset(); index = 0; + currentLevel = 0; } /* @@ -115,6 +120,7 @@ public void set(Output output) { public void setAppend(Output output) { this.output = output; index = 0; + currentLevel = 0; } /* @@ -124,6 +130,7 @@ public void setAppend(Output output) { public void reset() { output.reset(); index = 0; + currentLevel = 0; } /* @@ -508,4 +515,91 @@ public void writeHiveDecimal(HiveDecimalWritable decWritable, int scale) throws index++; } + + private void beginComplex() { + if (currentLevel == 0) { + if (index > 0) { + output.write(separator); + } + saveIndex = index; + } else { + // We are deep -- always write a separator. + output.write(separator); + } + // Always use index 0 so the write methods don't write a separator. + index = 0; + + // Set "global" separator member to next level. + separator = separators[++currentLevel]; + } + + private void finishComplex() { + --currentLevel; + if (currentLevel == 0) { + index = saveIndex; + index++; + } + separator = separators[currentLevel]; + } + + @Override + public void beginList() { + beginComplex(); + } + + @Override + public void finishList() { + finishComplex(); + } + + @Override + public void beginMap() { + beginComplex(); + + // MAP requires 2 levels: key separator and key-pair separator. + currentLevel++; + } + + @Override + public void writeMapKeySeparator() { + index = 0; + output.write(separators[currentLevel]); + } + + @Override + public void writeMapKeyPairSeparator() { + index = 0; + output.write(separators[currentLevel - 1]); + } + + @Override + public void finishMap() { + // Remove MAP extra level. + currentLevel--; + + finishComplex(); + } + + @Override + public void beginStruct() { + beginComplex(); + } + + @Override + public void finishStruct() { + finishComplex(); + } + + @Override + public void beginUnion(int tag) throws IOException { + beginComplex(); + writeInt(tag); + output.write(separators[currentLevel]); + index = 0; + } + + @Override + public void finishUnion() { + finishComplex(); + } } diff --git serde/src/java/org/apache/hadoop/hive/serde2/lazybinary/fast/LazyBinarySerializeWrite.java serde/src/java/org/apache/hadoop/hive/serde2/lazybinary/fast/LazyBinarySerializeWrite.java index 6bc4622..fdbef08 100644 --- serde/src/java/org/apache/hadoop/hive/serde2/lazybinary/fast/LazyBinarySerializeWrite.java +++ serde/src/java/org/apache/hadoop/hive/serde2/lazybinary/fast/LazyBinarySerializeWrite.java @@ -45,7 +45,7 @@ * * This is an alternative way to serialize than what is provided by LazyBinarySerDe. 
*/ -public class LazyBinarySerializeWrite implements SerializeWrite { +public class LazyBinarySerializeWrite extends SerializeWrite { public static final Logger LOG = LoggerFactory.getLogger(LazyBinarySerializeWrite.class.getName()); private Output output; diff --git serde/src/test/org/apache/hadoop/hive/serde2/SerdeRandomRowSource.java serde/src/test/org/apache/hadoop/hive/serde2/SerdeRandomRowSource.java index 301ee8b..6b3fc78 100644 --- serde/src/test/org/apache/hadoop/hive/serde2/SerdeRandomRowSource.java +++ serde/src/test/org/apache/hadoop/hive/serde2/SerdeRandomRowSource.java @@ -18,6 +18,7 @@ package org.apache.hadoop.hive.serde2; +import java.io.IOException; import java.sql.Date; import java.sql.Timestamp; import java.util.ArrayList; @@ -25,6 +26,7 @@ import java.util.List; import java.util.Random; +import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable; import org.apache.commons.lang.ArrayUtils; import org.apache.commons.lang.StringUtils; import org.apache.hadoop.hive.common.type.HiveChar; @@ -33,11 +35,24 @@ import org.apache.hadoop.hive.common.type.HiveIntervalYearMonth; import org.apache.hadoop.hive.common.type.HiveVarchar; import org.apache.hadoop.hive.common.type.RandomTypeUtil; +import org.apache.hadoop.hive.serde2.io.HiveCharWritable; +import org.apache.hadoop.hive.serde2.io.HiveVarcharWritable; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils; import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory; +import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.MapObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.SettableListObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.StandardListObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.StandardMapObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.StandardStructObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.StandardUnionObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.StandardUnionObjectInspector.StandardUnion; +import org.apache.hadoop.hive.serde2.objectinspector.StructField; import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.UnionObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableBooleanObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableByteObjectInspector; @@ -56,10 +71,20 @@ import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableTimestampObjectInspector; import org.apache.hadoop.hive.serde2.typeinfo.CharTypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.DecimalTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.MapTypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils; +import 
org.apache.hadoop.hive.serde2.typeinfo.UnionTypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.VarcharTypeInfo; import org.apache.hive.common.util.DateUtils; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.BytesWritable; + +import com.google.common.base.Preconditions; +import com.google.common.base.Charsets; /** * Generate object inspector and random row object[]. @@ -72,6 +97,14 @@ private List typeNames; + private Category[] categories; + + private TypeInfo[] typeInfos; + + private List objectInspectorList; + + // Primitive. + private PrimitiveCategory[] primitiveCategories; private PrimitiveTypeInfo[] primitiveTypeInfos; @@ -80,10 +113,23 @@ private StructObjectInspector rowStructObjectInspector; + private String[] alphabets; + + private boolean addEscapables; + private String needsEscapeStr; + public List typeNames() { return typeNames; } + public Category[] categories() { + return categories; + } + + public TypeInfo[] typeInfos() { + return typeInfos; + } + public PrimitiveCategory[] primitiveCategories() { return primitiveCategories; } @@ -97,30 +143,28 @@ public StructObjectInspector rowStructObjectInspector() { } public StructObjectInspector partialRowStructObjectInspector(int partialFieldCount) { - ArrayList partialPrimitiveObjectInspectorList = + ArrayList partialObjectInspectorList = new ArrayList(partialFieldCount); List columnNames = new ArrayList(partialFieldCount); for (int i = 0; i < partialFieldCount; i++) { columnNames.add(String.format("partial%d", i)); - partialPrimitiveObjectInspectorList.add( - PrimitiveObjectInspectorFactory.getPrimitiveWritableObjectInspector( - primitiveTypeInfos[i])); + partialObjectInspectorList.add(getObjectInspector(typeInfos[i])); } return ObjectInspectorFactory.getStandardStructObjectInspector( - columnNames, primitiveObjectInspectorList); + columnNames, objectInspectorList); } - public void init(Random r) { + public void init(Random r, boolean includeComplexTypes, int maxComplexDepth) { this.r = r; - chooseSchema(); + chooseSchema(includeComplexTypes, maxComplexDepth); } /* * For now, exclude CHAR until we determine why there is a difference (blank padding) * serializing with LazyBinarySerializeWrite and the regular SerDe... 
*/ - private static String[] possibleHiveTypeNames = { + private static String[] possibleHivePrimitiveTypeNames = { "boolean", "tinyint", "smallint", @@ -140,7 +184,146 @@ public void init(Random r) { "decimal" }; - private void chooseSchema() { + private static String[] possibleHiveComplexTypeNames = { + "array", + "map", + "struct", + "uniontype" + }; + + private String getRandomTypeName(boolean includeComplexTypes) { + String typeName; + if (!includeComplexTypes || r.nextInt(10) != 0) { + typeName = possibleHivePrimitiveTypeNames[r.nextInt(possibleHivePrimitiveTypeNames.length)]; + } else { + typeName = possibleHiveComplexTypeNames[r.nextInt(possibleHiveComplexTypeNames.length)]; + } + return typeName; + } + + private String getDecoratedTypeName(String typeName, boolean includeComplexTypes, int depth, int maxDepth) { + depth++; + boolean includeChildrenComplexTypes = includeComplexTypes && depth < maxDepth; + if (typeName.equals("char")) { + int maxLength = 1 + r.nextInt(100); + typeName = String.format("char(%d)", maxLength); + } else if (typeName.equals("varchar")) { + int maxLength = 1 + r.nextInt(100); + typeName = String.format("varchar(%d)", maxLength); + } else if (typeName.equals("decimal")) { + typeName = String.format("decimal(%d,%d)", HiveDecimal.SYSTEM_DEFAULT_PRECISION, HiveDecimal.SYSTEM_DEFAULT_SCALE); + } else if (typeName.equals("array")) { + String elementTypeName = getRandomTypeName(includeChildrenComplexTypes); + elementTypeName = getDecoratedTypeName(elementTypeName, includeChildrenComplexTypes, depth, maxDepth); + typeName = String.format("array<%s>", elementTypeName); + } else if (typeName.equals("map")) { + String keyTypeName = getRandomTypeName(includeChildrenComplexTypes); + keyTypeName = getDecoratedTypeName(keyTypeName, includeChildrenComplexTypes, depth, maxDepth); + String valueTypeName = getRandomTypeName(includeChildrenComplexTypes); + valueTypeName = getDecoratedTypeName(valueTypeName, includeChildrenComplexTypes, depth, maxDepth); + typeName = String.format("map<%s,%s>", keyTypeName, valueTypeName); + } else if (typeName.equals("struct")) { + final int fieldCount = 1 + r.nextInt(10); + StringBuilder sb = new StringBuilder(); + for (int i = 0; i < fieldCount; i++) { + String fieldTypeName = getRandomTypeName(includeChildrenComplexTypes); + fieldTypeName = getDecoratedTypeName(fieldTypeName, includeChildrenComplexTypes, depth, maxDepth); + if (i > 0) { + sb.append(","); + } + sb.append("col"); + sb.append(i); + sb.append(":"); + sb.append(fieldTypeName); + } + typeName = String.format("struct<%s>", sb.toString()); + } else if (typeName.equals("uniontype")) { + final int fieldCount = 1 + r.nextInt(10); + StringBuilder sb = new StringBuilder(); + for (int i = 0; i < fieldCount; i++) { + String fieldTypeName = getRandomTypeName(includeChildrenComplexTypes); + fieldTypeName = getDecoratedTypeName(fieldTypeName, includeChildrenComplexTypes, depth, maxDepth); + if (i > 0) { + sb.append(","); + } + sb.append(fieldTypeName); + } + typeName = String.format("uniontype<%s>", sb.toString()); + } + return typeName; + } + + private ObjectInspector getObjectInspector(TypeInfo typeInfo) { + ObjectInspector objectInspector; + switch (typeInfo.getCategory()) { + case PRIMITIVE: + { + PrimitiveTypeInfo primitiveType = (PrimitiveTypeInfo) typeInfo; + objectInspector = + PrimitiveObjectInspectorFactory.
+ getPrimitiveWritableObjectInspector(primitiveType); + } + break; + case MAP: + { + MapTypeInfo mapType = (MapTypeInfo) typeInfo; + MapObjectInspector mapInspector = + ObjectInspectorFactory.getStandardMapObjectInspector( + getObjectInspector(mapType.getMapKeyTypeInfo()), + getObjectInspector(mapType.getMapValueTypeInfo())); + objectInspector = mapInspector; + } + break; + case LIST: + { + ListTypeInfo listType = (ListTypeInfo) typeInfo; + ListObjectInspector listInspector = + ObjectInspectorFactory.getStandardListObjectInspector( + getObjectInspector(listType.getListElementTypeInfo())); + objectInspector = listInspector; + } + break; + case STRUCT: + { + StructTypeInfo structType = (StructTypeInfo) typeInfo; + List fieldTypes = structType.getAllStructFieldTypeInfos(); + + List fieldInspectors = new ArrayList(); + for (TypeInfo fieldType : fieldTypes) { + fieldInspectors.add(getObjectInspector(fieldType)); + } + + StructObjectInspector structInspector = + ObjectInspectorFactory.getStandardStructObjectInspector( + structType.getAllStructFieldNames(), fieldInspectors); + objectInspector = structInspector; + } + break; + case UNION: + { + UnionTypeInfo unionType = (UnionTypeInfo) typeInfo; + List fieldTypes = unionType.getAllUnionObjectTypeInfos(); + + List fieldInspectors = new ArrayList(); + for (TypeInfo fieldType : fieldTypes) { + fieldInspectors.add(getObjectInspector(fieldType)); + } + + UnionObjectInspector unionInspector = + ObjectInspectorFactory.getStandardUnionObjectInspector( + fieldInspectors); + objectInspector = unionInspector; + } + break; + default: + throw new RuntimeException("Unexpected category " + typeInfo.getCategory()); + } + Preconditions.checkState(objectInspector != null); + return objectInspector; + } + + private void chooseSchema(boolean includeComplexTypes, int maxComplexDepth) { HashSet hashSet = null; boolean allTypes; boolean onlyOne = (r.nextInt(100) == 7); @@ -151,13 +334,20 @@ private void chooseSchema() { allTypes = r.nextBoolean(); if (allTypes) { // One of each type. 
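// Editorial note: the one-of-each-type schema now draws from both the
// primitive and the complex name lists; getDecoratedTypeName (above) then
// expands the bare names into full type strings, recursing for children up
// to maxComplexDepth, e.g. (illustrative outputs): char(17), varchar(42),
// decimal(38,18), array<int>, map<varchar(10),array<double>>,
// struct<col0:int,col1:string>, uniontype<int,bigint>.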
- columnCount = possibleHiveTypeNames.length; + columnCount = possibleHivePrimitiveTypeNames.length; + if (includeComplexTypes) { + columnCount += possibleHiveComplexTypeNames.length; + } hashSet = new HashSet(); } else { columnCount = 1 + r.nextInt(20); } } typeNames = new ArrayList(columnCount); + categories = new Category[columnCount]; + typeInfos = new TypeInfo[columnCount]; + objectInspectorList = new ArrayList(columnCount); + primitiveCategories = new PrimitiveCategory[columnCount]; primitiveTypeInfos = new PrimitiveTypeInfo[columnCount]; primitiveObjectInspectorList = new ArrayList(columnCount); @@ -167,12 +357,18 @@ private void chooseSchema() { String typeName; if (onlyOne) { - typeName = possibleHiveTypeNames[r.nextInt(possibleHiveTypeNames.length)]; + typeName = getRandomTypeName(includeComplexTypes); } else { int typeNum; if (allTypes) { + int maxTypeNum = possibleHivePrimitiveTypeNames.length; + if (includeComplexTypes) { + maxTypeNum += possibleHiveComplexTypeNames.length; + } while (true) { - typeNum = r.nextInt(possibleHiveTypeNames.length); + + typeNum = r.nextInt(maxTypeNum); + Integer typeNumInteger = new Integer(typeNum); if (!hashSet.contains(typeNumInteger)) { hashSet.add(typeNumInteger); @@ -180,27 +376,61 @@ private void chooseSchema() { } } } else { - typeNum = r.nextInt(possibleHiveTypeNames.length); + if (!includeComplexTypes || r.nextInt(10) != 0) { + typeNum = r.nextInt(possibleHivePrimitiveTypeNames.length); + } else { + typeNum = possibleHivePrimitiveTypeNames.length + r.nextInt(possibleHiveComplexTypeNames.length); + } + } + if (typeNum < possibleHivePrimitiveTypeNames.length) { + typeName = possibleHivePrimitiveTypeNames[typeNum]; + } else { + typeName = possibleHiveComplexTypeNames[typeNum - possibleHivePrimitiveTypeNames.length]; } - typeName = possibleHiveTypeNames[typeNum]; + } - if (typeName.equals("char")) { - int maxLength = 1 + r.nextInt(100); - typeName = String.format("char(%d)", maxLength); - } else if (typeName.equals("varchar")) { - int maxLength = 1 + r.nextInt(100); - typeName = String.format("varchar(%d)", maxLength); - } else if (typeName.equals("decimal")) { - typeName = String.format("decimal(%d,%d)", HiveDecimal.SYSTEM_DEFAULT_PRECISION, HiveDecimal.SYSTEM_DEFAULT_SCALE); + + String decoratedTypeName = getDecoratedTypeName(typeName, includeComplexTypes, 0, maxComplexDepth); + + TypeInfo typeInfo; + try { + typeInfo = TypeInfoUtils.getTypeInfoFromTypeString(decoratedTypeName); + } catch (Exception e) { + throw new RuntimeException("Cannot convert type name " + decoratedTypeName + " to a type " + e); + } + + typeInfos[c] = typeInfo; + Category category = typeInfo.getCategory(); + categories[c] = category; + ObjectInspector objectInspector = getObjectInspector(typeInfo); + switch (category) { + case PRIMITIVE: + { + PrimitiveTypeInfo primitiveTypeInfo = (PrimitiveTypeInfo) typeInfo; + objectInspector = PrimitiveObjectInspectorFactory.getPrimitiveWritableObjectInspector(primitiveTypeInfo); + primitiveTypeInfos[c] = primitiveTypeInfo; + PrimitiveCategory primitiveCategory = primitiveTypeInfo.getPrimitiveCategory(); + primitiveCategories[c] = primitiveCategory; + primitiveObjectInspectorList.add(objectInspector); + } + break; + case LIST: + case MAP: + case STRUCT: + case UNION: + primitiveObjectInspectorList.add(null); + break; + default: + throw new RuntimeException("Unexpected category " + category); } - PrimitiveTypeInfo primitiveTypeInfo = (PrimitiveTypeInfo) TypeInfoUtils.getTypeInfoFromTypeString(typeName); - primitiveTypeInfos[c] =
primitiveTypeInfo; - PrimitiveCategory primitiveCategory = primitiveTypeInfo.getPrimitiveCategory(); - primitiveCategories[c] = primitiveCategory; - primitiveObjectInspectorList.add(PrimitiveObjectInspectorFactory.getPrimitiveWritableObjectInspector(primitiveTypeInfo)); - typeNames.add(typeName); + objectInspectorList.add(objectInspector); + typeNames.add(decoratedTypeName); } - rowStructObjectInspector = ObjectInspectorFactory.getStandardStructObjectInspector(columnNames, primitiveObjectInspectorList); + rowStructObjectInspector = ObjectInspectorFactory.getStandardStructObjectInspector(columnNames, objectInspectorList); + alphabets = new String[columnCount]; } public Object[][] randomRows(int n) { @@ -214,18 +444,52 @@ private void chooseSchema() { public Object[] randomRow() { Object row[] = new Object[columnCount]; for (int c = 0; c < columnCount; c++) { - Object object = randomObject(c); - if (object == null) { - throw new Error("Unexpected null for column " + c); - } - row[c] = getWritableObject(c, object); - if (row[c] == null) { - throw new Error("Unexpected null for writable for column " + c); - } + row[c] = randomWritable(c); + } + return row; + } + + public Object[] randomPrimitiveRow(int columnCount) { + return randomPrimitiveRow(columnCount, r, primitiveTypeInfos); + } + + public static Object[] randomPrimitiveRow(int columnCount, Random r, + PrimitiveTypeInfo[] primitiveTypeInfos) { + Object row[] = new Object[columnCount]; + for (int c = 0; c < columnCount; c++) { + row[c] = randomPrimitiveObject(r, primitiveTypeInfos[c]); } return row; } + public void addBinarySortableAlphabets() { + for (int c = 0; c < columnCount; c++) { + switch (primitiveCategories[c]) { + case STRING: + case CHAR: + case VARCHAR: + byte[] bytes = new byte[10 + r.nextInt(10)]; + for (int i = 0; i < bytes.length; i++) { + bytes[i] = (byte) (32 + r.nextInt(96)); + } + int alwaysIndex = r.nextInt(bytes.length); + bytes[alwaysIndex] = 0; // Must be escaped by BinarySortable. + int alwaysIndex2 = r.nextInt(bytes.length); + bytes[alwaysIndex2] = 1; // Must be escaped by BinarySortable. + alphabets[c] = new String(bytes, Charsets.UTF_8); + break; + default: + // No alphabet needed.
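// Editorial note (hedged): the alphabets deliberately include the bytes
// 0x00 and 0x01 because BinarySortable's order-preserving encoding must
// escape them (in BinarySortableSerDe, 0x00 is written as 0x01 0x01 and
// 0x01 as 0x01 0x02), so strings drawn from these alphabets exercise that
// escaping path.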
+ break; + } + } + } + + public void addEscapables(String needsEscapeStr) { + addEscapables = true; + this.needsEscapeStr = needsEscapeStr; + } + public static void sort(Object[][] rows, ObjectInspector oi) { for (int i = 0; i < rows.length; i++) { for (int j = i + 1; j < rows.length; j++) { @@ -242,11 +506,9 @@ public void sort(Object[][] rows) { SerdeRandomRowSource.sort(rows, rowStructObjectInspector); } - public Object getWritableObject(int column, Object object) { - ObjectInspector objectInspector = primitiveObjectInspectorList.get(column); - PrimitiveCategory primitiveCategory = primitiveCategories[column]; - PrimitiveTypeInfo primitiveTypeInfo = primitiveTypeInfos[column]; - switch (primitiveCategory) { + public Object getWritablePrimitiveObject(PrimitiveTypeInfo primitiveTypeInfo, + ObjectInspector objectInspector, Object object) { + switch (primitiveTypeInfo.getPrimitiveCategory()) { case BOOLEAN: return ((WritableBooleanObjectInspector) objectInspector).create((boolean) object); case BYTE: @@ -292,16 +554,160 @@ public Object getWritableObject(int column, Object object) { return writableDecimalObjectInspector.create((HiveDecimal) object); } default: - throw new Error("Unknown primitive category " + primitiveCategory); + throw new Error("Unknown primitive category " + primitiveTypeInfo.getPrimitiveCategory()); + } + } + + public Object randomWritable(int column) { + return randomWritable(typeInfos[column], objectInspectorList.get(column)); + } + + public Object randomWritable(TypeInfo typeInfo, ObjectInspector objectInspector) { + switch (typeInfo.getCategory()) { + case PRIMITIVE: + { + Object object = randomPrimitiveObject(r, (PrimitiveTypeInfo) typeInfo); + return getWritablePrimitiveObject((PrimitiveTypeInfo) typeInfo, objectInspector, object); + } + case LIST: + { + if (r.nextInt(20) == 0) { + return null; + } + // Always generate a list with at least 1 value? + final int elementCount = 1 + r.nextInt(100); + StandardListObjectInspector listObjectInspector = + (StandardListObjectInspector) objectInspector; + ObjectInspector elementObjectInspector = + listObjectInspector.getListElementObjectInspector(); + TypeInfo elementTypeInfo = + TypeInfoUtils.getTypeInfoFromObjectInspector( + elementObjectInspector); + boolean isStringFamily = false; + PrimitiveCategory primitiveCategory = null; + if (elementTypeInfo.getCategory() == Category.PRIMITIVE) { + primitiveCategory = ((PrimitiveTypeInfo) elementTypeInfo).getPrimitiveCategory(); + if (primitiveCategory == PrimitiveCategory.STRING || + primitiveCategory == PrimitiveCategory.BINARY || + primitiveCategory == PrimitiveCategory.CHAR || + primitiveCategory == PrimitiveCategory.VARCHAR) { + isStringFamily = true; + } + } + Object listObj = listObjectInspector.create(elementCount); + for (int i = 0; i < elementCount; i++) { + Object ele = randomWritable(elementTypeInfo, elementObjectInspector); + // UNDONE: For now, a 1-element list with a null element is a null list... 
+ if (ele == null && elementCount == 1) { + return null; + } + if (isStringFamily && elementCount == 1) { + switch (primitiveCategory) { + case STRING: + if (((Text) ele).getLength() == 0) { + return null; + } + break; + case BINARY: + if (((BytesWritable) ele).getLength() == 0) { + return null; + } + break; + case CHAR: + if (((HiveCharWritable) ele).getHiveChar().getStrippedValue().isEmpty()) { + return null; + } + break; + case VARCHAR: + if (((HiveVarcharWritable) ele).getHiveVarchar().getValue().isEmpty()) { + return null; + } + break; + default: + throw new RuntimeException("Unexpected primitive category " + primitiveCategory); + } + } + listObjectInspector.set(listObj, i, ele); + } + return listObj; + } + case MAP: + { + if (r.nextInt(20) == 0) { + return null; + } + final int keyPairCount = r.nextInt(100); + StandardMapObjectInspector mapObjectInspector = + (StandardMapObjectInspector) objectInspector; + ObjectInspector keyObjectInspector = + mapObjectInspector.getMapKeyObjectInspector(); + TypeInfo keyTypeInfo = + TypeInfoUtils.getTypeInfoFromObjectInspector( + keyObjectInspector); + ObjectInspector valueObjectInspector = + mapObjectInspector.getMapValueObjectInspector(); + TypeInfo valueTypeInfo = + TypeInfoUtils.getTypeInfoFromObjectInspector( + valueObjectInspector); + Object mapObj = mapObjectInspector.create(); + for (int i = 0; i < keyPairCount; i++) { + Object key = randomWritable(keyTypeInfo, keyObjectInspector); + Object value = randomWritable(valueTypeInfo, valueObjectInspector); + mapObjectInspector.put(mapObj, key, value); + } + return mapObj; + } + case STRUCT: + { + if (r.nextInt(20) == 0) { + return null; + } + StandardStructObjectInspector structObjectInspector = + (StandardStructObjectInspector) objectInspector; + List fieldRefsList = structObjectInspector.getAllStructFieldRefs(); + final int fieldCount = fieldRefsList.size(); + Object structObj = structObjectInspector.create(); + for (int i = 0; i < fieldCount; i++) { + StructField fieldRef = fieldRefsList.get(i); + ObjectInspector fieldObjectInspector = + fieldRef.getFieldObjectInspector(); + TypeInfo fieldTypeInfo = + TypeInfoUtils.getTypeInfoFromObjectInspector( + fieldObjectInspector); + Object fieldObj = randomWritable(fieldTypeInfo, fieldObjectInspector); + structObjectInspector.setStructFieldData(structObj, fieldRef, fieldObj); + } + return structObj; + } + case UNION: + { + StandardUnionObjectInspector unionObjectInspector = + (StandardUnionObjectInspector) objectInspector; + List objectInspectorList = unionObjectInspector.getObjectInspectors(); + final int unionCount = objectInspectorList.size(); + final byte tag = (byte) r.nextInt(unionCount); + Object unionObj = unionObjectInspector.create(); + ObjectInspector fieldObjectInspector = + objectInspectorList.get(tag); + TypeInfo fieldTypeInfo = + TypeInfoUtils.getTypeInfoFromObjectInspector( + fieldObjectInspector); + Object fieldObj = randomWritable(fieldTypeInfo, fieldObjectInspector); + return new StandardUnion(tag, fieldObj); + } + default: + throw new RuntimeException("Unexpected category " + typeInfo.getCategory()); } } - public Object randomObject(int column) { - PrimitiveCategory primitiveCategory = primitiveCategories[column]; - PrimitiveTypeInfo primitiveTypeInfo = primitiveTypeInfos[column]; - switch (primitiveCategory) { + public Object randomPrimitiveObject(int column) { + return randomPrimitiveObject(r, primitiveTypeInfos[column]); + } + + public static Object randomPrimitiveObject(Random r, PrimitiveTypeInfo primitiveTypeInfo) { + switch 
(primitiveTypeInfo.getPrimitiveCategory()) { case BOOLEAN: - return Boolean.valueOf(r.nextInt(1) == 1); + return Boolean.valueOf(r.nextBoolean()); case BYTE: return Byte.valueOf((byte) r.nextInt()); case SHORT: @@ -336,7 +742,7 @@ public Object randomObject(int column) { return dec; } default: - throw new Error("Unknown primitive category " + primitiveCategory); + throw new Error("Unknown primitive category " + primitiveTypeInfo.getPrimitiveCategory()); } } @@ -347,13 +753,17 @@ public static HiveChar getRandHiveChar(Random r, CharTypeInfo charTypeInfo) { return hiveChar; } - public static HiveVarchar getRandHiveVarchar(Random r, VarcharTypeInfo varcharTypeInfo) { + public static HiveVarchar getRandHiveVarchar(Random r, VarcharTypeInfo varcharTypeInfo, String alphabet) { int maxLength = 1 + r.nextInt(varcharTypeInfo.getLength()); - String randomString = RandomTypeUtil.getRandString(r, "abcdefghijklmnopqrstuvwxyz", 100); + String randomString = RandomTypeUtil.getRandString(r, alphabet, 100); HiveVarchar hiveVarchar = new HiveVarchar(randomString, maxLength); return hiveVarchar; } + public static HiveVarchar getRandHiveVarchar(Random r, VarcharTypeInfo varcharTypeInfo) { + return getRandHiveVarchar(r, varcharTypeInfo, "abcdefghijklmnopqrstuvwxyz"); + } + public static byte[] getRandBinary(Random r, int len){ byte[] bytes = new byte[len]; for (int j = 0; j < len; j++){ @@ -385,6 +795,7 @@ public static HiveDecimal getRandHiveDecimal(Random r, DecimalTypeInfo decimalTy sb.append("."); sb.append(RandomTypeUtil.getRandString(r, DECIMAL_CHARS, scale)); } + HiveDecimal dec = HiveDecimal.create(sb.toString()); return dec; diff --git serde/src/test/org/apache/hadoop/hive/serde2/VerifyFast.java serde/src/test/org/apache/hadoop/hive/serde2/VerifyFast.java index 19b04bb..030995e 100644 --- serde/src/test/org/apache/hadoop/hive/serde2/VerifyFast.java +++ serde/src/test/org/apache/hadoop/hive/serde2/VerifyFast.java @@ -18,9 +18,14 @@ package org.apache.hadoop.hive.serde2; import java.io.IOException; +import java.nio.charset.StandardCharsets; import java.sql.Date; import java.sql.Timestamp; +import java.util.ArrayList; import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.Map.Entry; import junit.framework.TestCase; @@ -30,7 +35,24 @@ import org.apache.hadoop.hive.common.type.HiveIntervalYearMonth; import org.apache.hadoop.hive.common.type.HiveVarchar; import org.apache.hadoop.hive.serde2.fast.DeserializeRead; +import org.apache.hadoop.hive.serde2.lazy.LazyBinary; +import org.apache.hadoop.hive.serde2.lazy.LazyByte; +import org.apache.hadoop.hive.serde2.lazy.LazyDate; +import org.apache.hadoop.hive.serde2.lazy.LazyDouble; +import org.apache.hadoop.hive.serde2.lazy.LazyFloat; +import org.apache.hadoop.hive.serde2.lazy.LazyHiveChar; +import org.apache.hadoop.hive.serde2.lazy.LazyHiveDecimal; +import org.apache.hadoop.hive.serde2.lazy.LazyHiveIntervalDayTime; +import org.apache.hadoop.hive.serde2.lazy.LazyHiveIntervalYearMonth; +import org.apache.hadoop.hive.serde2.lazy.LazyHiveVarchar; +import org.apache.hadoop.hive.serde2.lazy.LazyInteger; +import org.apache.hadoop.hive.serde2.lazy.LazyLong; +import org.apache.hadoop.hive.serde2.lazy.LazyShort; +import org.apache.hadoop.hive.serde2.lazy.LazyString; +import org.apache.hadoop.hive.serde2.lazy.LazyTimestamp; +import org.apache.hadoop.hive.serde2.lazy.fast.LazySimpleDeserializeRead; import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory; +import 
org.apache.hadoop.hive.serde2.objectinspector.StandardUnionObjectInspector.StandardUnion; import org.apache.hadoop.hive.serde2.fast.SerializeWrite; import org.apache.hadoop.hive.serde2.io.ByteWritable; import org.apache.hadoop.hive.serde2.io.DateWritable; @@ -44,7 +66,13 @@ import org.apache.hadoop.hive.serde2.io.TimestampWritable; import org.apache.hadoop.hive.serde2.typeinfo.CharTypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.DecimalTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.MapTypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; +import org.apache.hadoop.hive.serde2.typeinfo.UnionTypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.VarcharTypeInfo; import org.apache.hadoop.io.BooleanWritable; import org.apache.hadoop.io.BytesWritable; @@ -53,6 +81,7 @@ import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.Writable; +import org.apache.hadoop.io.FloatWritable; /** * TestBinarySortableSerDe. @@ -61,338 +90,623 @@ public class VerifyFast { public static void verifyDeserializeRead(DeserializeRead deserializeRead, - PrimitiveTypeInfo primitiveTypeInfo, Writable writable) throws IOException { + TypeInfo typeInfo, Object object) throws IOException { boolean isNull; isNull = !deserializeRead.readNextField(); + doVerifyDeserializeRead(deserializeRead, typeInfo, object, isNull); + } + + public static void doVerifyDeserializeRead(DeserializeRead deserializeRead, + TypeInfo typeInfo, Object object, boolean isNull) throws IOException { if (isNull) { - if (writable != null) { - TestCase.fail("Field reports null but object is not null (class " + writable.getClass().getName() + ", " + writable.toString() + ")"); + if (object != null) { + TestCase.fail("Field reports null but object is not null (class " + object.getClass().getName() + ", " + object.toString() + ")"); } return; - } else if (writable == null) { + } else if (object == null) { TestCase.fail("Field report not null but object is null"); } - switch (primitiveTypeInfo.getPrimitiveCategory()) { - case BOOLEAN: - { - boolean value = deserializeRead.currentBoolean; - if (!(writable instanceof BooleanWritable)) { - TestCase.fail("Boolean expected writable not Boolean"); - } - boolean expected = ((BooleanWritable) writable).get(); - if (value != expected) { - TestCase.fail("Boolean field mismatch (expected " + expected + " found " + value + ")"); - } - } - break; - case BYTE: - { - byte value = deserializeRead.currentByte; - if (!(writable instanceof ByteWritable)) { - TestCase.fail("Byte expected writable not Byte"); - } - byte expected = ((ByteWritable) writable).get(); - if (value != expected) { - TestCase.fail("Byte field mismatch (expected " + (int) expected + " found " + (int) value + ")"); - } - } - break; - case SHORT: - { - short value = deserializeRead.currentShort; - if (!(writable instanceof ShortWritable)) { - TestCase.fail("Short expected writable not Short"); - } - short expected = ((ShortWritable) writable).get(); - if (value != expected) { - TestCase.fail("Short field mismatch (expected " + expected + " found " + value + ")"); - } - } - break; - case INT: - { - int value = deserializeRead.currentInt; - if (!(writable instanceof IntWritable)) { - TestCase.fail("Integer expected writable not 
Integer"); - } - int expected = ((IntWritable) writable).get(); - if (value != expected) { - TestCase.fail("Int field mismatch (expected " + expected + " found " + value + ")"); - } - } - break; - case LONG: - { - long value = deserializeRead.currentLong; - if (!(writable instanceof LongWritable)) { - TestCase.fail("Long expected writable not Long"); - } - Long expected = ((LongWritable) writable).get(); - if (value != expected) { - TestCase.fail("Long field mismatch (expected " + expected + " found " + value + ")"); - } - } - break; - case FLOAT: - { - float value = deserializeRead.currentFloat; - if (!(writable instanceof FloatWritable)) { - TestCase.fail("Float expected writable not Float"); - } - float expected = ((FloatWritable) writable).get(); - if (value != expected) { - TestCase.fail("Float field mismatch (expected " + expected + " found " + value + ")"); - } - } - break; - case DOUBLE: - { - double value = deserializeRead.currentDouble; - if (!(writable instanceof DoubleWritable)) { - TestCase.fail("Double expected writable not Double"); - } - double expected = ((DoubleWritable) writable).get(); - if (value != expected) { - TestCase.fail("Double field mismatch (expected " + expected + " found " + value + ")"); - } - } - break; - case STRING: - { - byte[] stringBytes = Arrays.copyOfRange( - deserializeRead.currentBytes, - deserializeRead.currentBytesStart, - deserializeRead.currentBytesStart + deserializeRead.currentBytesLength); - Text text = new Text(stringBytes); - String string = text.toString(); - String expected = ((Text) writable).toString(); - if (!string.equals(expected)) { - TestCase.fail("String field mismatch (expected '" + expected + "' found '" + string + "')"); - } - } - break; - case CHAR: - { - byte[] stringBytes = Arrays.copyOfRange( - deserializeRead.currentBytes, - deserializeRead.currentBytesStart, - deserializeRead.currentBytesStart + deserializeRead.currentBytesLength); - Text text = new Text(stringBytes); - String string = text.toString(); - - HiveChar hiveChar = new HiveChar(string, ((CharTypeInfo) primitiveTypeInfo).getLength()); - - HiveChar expected = ((HiveCharWritable) writable).getHiveChar(); - if (!hiveChar.equals(expected)) { - TestCase.fail("Char field mismatch (expected '" + expected + "' found '" + hiveChar + "')"); + switch (typeInfo.getCategory()) { + case PRIMITIVE: + { + PrimitiveTypeInfo primitiveTypeInfo = (PrimitiveTypeInfo) typeInfo; + switch (primitiveTypeInfo.getPrimitiveCategory()) { + case BOOLEAN: + { + boolean value = deserializeRead.currentBoolean; + if (!(object instanceof BooleanWritable)) { + TestCase.fail("Boolean expected writable not Boolean"); + } + boolean expected = ((BooleanWritable) object).get(); + if (value != expected) { + TestCase.fail("Boolean field mismatch (expected " + expected + " found " + value + ")"); + } + } + break; + case BYTE: + { + byte value = deserializeRead.currentByte; + if (!(object instanceof ByteWritable)) { + TestCase.fail("Byte expected writable not Byte"); + } + byte expected = ((ByteWritable) object).get(); + if (value != expected) { + TestCase.fail("Byte field mismatch (expected " + (int) expected + " found " + (int) value + ")"); + } + } + break; + case SHORT: + { + short value = deserializeRead.currentShort; + if (!(object instanceof ShortWritable)) { + TestCase.fail("Short expected writable not Short"); + } + short expected = ((ShortWritable) object).get(); + if (value != expected) { + TestCase.fail("Short field mismatch (expected " + expected + " found " + value + ")"); + } + } + break; 
+ case INT: + { + int value = deserializeRead.currentInt; + if (!(object instanceof IntWritable)) { + TestCase.fail("Integer expected writable not Integer"); + } + int expected = ((IntWritable) object).get(); + if (value != expected) { + TestCase.fail("Int field mismatch (expected " + expected + " found " + value + ")"); + } + } + break; + case LONG: + { + long value = deserializeRead.currentLong; + if (!(object instanceof LongWritable)) { + TestCase.fail("Long expected writable not Long"); + } + Long expected = ((LongWritable) object).get(); + if (value != expected) { + TestCase.fail("Long field mismatch (expected " + expected + " found " + value + ")"); + } + } + break; + case FLOAT: + { + float value = deserializeRead.currentFloat; + if (!(object instanceof FloatWritable)) { + TestCase.fail("Float expected writable not Float"); + } + float expected = ((FloatWritable) object).get(); + if (value != expected) { + TestCase.fail("Float field mismatch (expected " + expected + " found " + value + ")"); + } + } + break; + case DOUBLE: + { + double value = deserializeRead.currentDouble; + if (!(object instanceof DoubleWritable)) { + TestCase.fail("Double expected writable not Double"); + } + double expected = ((DoubleWritable) object).get(); + if (value != expected) { + TestCase.fail("Double field mismatch (expected " + expected + " found " + value + ")"); + } + } + break; + case STRING: + { + byte[] stringBytes = Arrays.copyOfRange( + deserializeRead.currentBytes, + deserializeRead.currentBytesStart, + deserializeRead.currentBytesStart + deserializeRead.currentBytesLength); + Text text = new Text(stringBytes); + String string = text.toString(); + String expected = ((Text) object).toString(); + if (!string.equals(expected)) { + TestCase.fail("String field mismatch (expected '" + expected + "' found '" + string + "')"); + } + } + break; + case CHAR: + { + byte[] stringBytes = Arrays.copyOfRange( + deserializeRead.currentBytes, + deserializeRead.currentBytesStart, + deserializeRead.currentBytesStart + deserializeRead.currentBytesLength); + Text text = new Text(stringBytes); + String string = text.toString(); + + HiveChar hiveChar = new HiveChar(string, ((CharTypeInfo) primitiveTypeInfo).getLength()); + + HiveChar expected = ((HiveCharWritable) object).getHiveChar(); + if (!hiveChar.equals(expected)) { + TestCase.fail("Char field mismatch (expected '" + expected + "' found '" + hiveChar + "')"); + } + } + break; + case VARCHAR: + { + byte[] stringBytes = Arrays.copyOfRange( + deserializeRead.currentBytes, + deserializeRead.currentBytesStart, + deserializeRead.currentBytesStart + deserializeRead.currentBytesLength); + Text text = new Text(stringBytes); + String string = text.toString(); + + HiveVarchar hiveVarchar = new HiveVarchar(string, ((VarcharTypeInfo) primitiveTypeInfo).getLength()); + + HiveVarchar expected = ((HiveVarcharWritable) object).getHiveVarchar(); + if (!hiveVarchar.equals(expected)) { + TestCase.fail("Varchar field mismatch (expected '" + expected + "' found '" + hiveVarchar + "')"); + } + } + break; + case DECIMAL: + { + HiveDecimal value = deserializeRead.currentHiveDecimalWritable.getHiveDecimal(); + if (value == null) { + TestCase.fail("Decimal field evaluated to NULL"); + } + HiveDecimal expected = ((HiveDecimalWritable) object).getHiveDecimal(); + if (!value.equals(expected)) { + DecimalTypeInfo decimalTypeInfo = (DecimalTypeInfo) primitiveTypeInfo; + int precision = decimalTypeInfo.getPrecision(); + int scale = decimalTypeInfo.getScale(); + TestCase.fail("Decimal field 
mismatch (expected " + expected.toString() + " found " + value.toString() + ") precision " + precision + ", scale " + scale); + } + } + break; + case DATE: + { + Date value = deserializeRead.currentDateWritable.get(); + Date expected = ((DateWritable) object).get(); + if (!value.equals(expected)) { + TestCase.fail("Date field mismatch (expected " + expected.toString() + " found " + value.toString() + ")"); + } + } + break; + case TIMESTAMP: + { + Timestamp value = deserializeRead.currentTimestampWritable.getTimestamp(); + Timestamp expected = ((TimestampWritable) object).getTimestamp(); + if (!value.equals(expected)) { + TestCase.fail("Timestamp field mismatch (expected " + expected.toString() + " found " + value.toString() + ")"); + } + } + break; + case INTERVAL_YEAR_MONTH: + { + HiveIntervalYearMonth value = deserializeRead.currentHiveIntervalYearMonthWritable.getHiveIntervalYearMonth(); + HiveIntervalYearMonth expected = ((HiveIntervalYearMonthWritable) object).getHiveIntervalYearMonth(); + if (!value.equals(expected)) { + TestCase.fail("HiveIntervalYearMonth field mismatch (expected " + expected.toString() + " found " + value.toString() + ")"); + } + } + break; + case INTERVAL_DAY_TIME: + { + HiveIntervalDayTime value = deserializeRead.currentHiveIntervalDayTimeWritable.getHiveIntervalDayTime(); + HiveIntervalDayTime expected = ((HiveIntervalDayTimeWritable) object).getHiveIntervalDayTime(); + if (!value.equals(expected)) { + TestCase.fail("HiveIntervalDayTime field mismatch (expected " + expected.toString() + " found " + value.toString() + ")"); + } + } + break; + case BINARY: + { + byte[] byteArray = Arrays.copyOfRange( + deserializeRead.currentBytes, + deserializeRead.currentBytesStart, + deserializeRead.currentBytesStart + deserializeRead.currentBytesLength); + BytesWritable bytesWritable = (BytesWritable) object; + byte[] expected = Arrays.copyOfRange(bytesWritable.getBytes(), 0, bytesWritable.getLength()); + if (byteArray.length != expected.length){ + TestCase.fail("Byte Array field mismatch (expected " + Arrays.toString(expected) + + " found " + Arrays.toString(byteArray) + ")"); + } + for (int b = 0; b < byteArray.length; b++) { + if (byteArray[b] != expected[b]) { + TestCase.fail("Byte Array field mismatch (expected " + Arrays.toString(expected) + + " found " + Arrays.toString(byteArray) + ")"); + } + } + } + break; + default: + throw new Error("Unknown primitive category " + primitiveTypeInfo.getPrimitiveCategory()); } } break; - case VARCHAR: - { - byte[] stringBytes = Arrays.copyOfRange( - deserializeRead.currentBytes, - deserializeRead.currentBytesStart, - deserializeRead.currentBytesStart + deserializeRead.currentBytesLength); - Text text = new Text(stringBytes); - String string = text.toString(); - - HiveVarchar hiveVarchar = new HiveVarchar(string, ((VarcharTypeInfo) primitiveTypeInfo).getLength()); + case LIST: + case MAP: + case STRUCT: + case UNION: + throw new Error("Complex types need to be handled separately"); + default: + throw new Error("Unknown category " + typeInfo.getCategory()); + } + } - HiveVarchar expected = ((HiveVarcharWritable) writable).getHiveVarchar(); - if (!hiveVarchar.equals(expected)) { - TestCase.fail("Varchar field mismatch (expected '" + expected + "' found '" + hiveVarchar + "')"); - } - } - break; - case DECIMAL: - { - HiveDecimal value = deserializeRead.currentHiveDecimalWritable.getHiveDecimal(); - if (value == null) { - TestCase.fail("Decimal field evaluated to NULL"); - } - HiveDecimal expected = ((HiveDecimalWritable) 
writable).getHiveDecimal(); - if (!value.equals(expected)) { - DecimalTypeInfo decimalTypeInfo = (DecimalTypeInfo) primitiveTypeInfo; - int precision = decimalTypeInfo.getPrecision(); - int scale = decimalTypeInfo.getScale(); - TestCase.fail("Decimal field mismatch (expected " + expected.toString() + " found " + value.toString() + ") precision " + precision + ", scale " + scale); - } - } - break; - case DATE: - { - Date value = deserializeRead.currentDateWritable.get(); - Date expected = ((DateWritable) writable).get(); - if (!value.equals(expected)) { - TestCase.fail("Date field mismatch (expected " + expected.toString() + " found " + value.toString() + ")"); + public static void serializeWrite(SerializeWrite serializeWrite, + TypeInfo typeInfo, Object object) throws IOException { + if (object == null) { + serializeWrite.writeNull(); + return; + } + switch (typeInfo.getCategory()) { + case PRIMITIVE: + { + PrimitiveTypeInfo primitiveTypeInfo = (PrimitiveTypeInfo) typeInfo; + switch (primitiveTypeInfo.getPrimitiveCategory()) { + case BOOLEAN: + { + boolean value = ((BooleanWritable) object).get(); + serializeWrite.writeBoolean(value); + } + break; + case BYTE: + { + byte value = ((ByteWritable) object).get(); + serializeWrite.writeByte(value); + } + break; + case SHORT: + { + short value = ((ShortWritable) object).get(); + serializeWrite.writeShort(value); + } + break; + case INT: + { + int value = ((IntWritable) object).get(); + serializeWrite.writeInt(value); + } + break; + case LONG: + { + long value = ((LongWritable) object).get(); + serializeWrite.writeLong(value); + } + break; + case FLOAT: + { + float value = ((FloatWritable) object).get(); + serializeWrite.writeFloat(value); + } + break; + case DOUBLE: + { + double value = ((DoubleWritable) object).get(); + serializeWrite.writeDouble(value); + } + break; + case STRING: + { + Text value = (Text) object; + byte[] stringBytes = value.getBytes(); + int stringLength = stringBytes.length; + serializeWrite.writeString(stringBytes, 0, stringLength); + } + break; + case CHAR: + { + HiveChar value = ((HiveCharWritable) object).getHiveChar(); + serializeWrite.writeHiveChar(value); + } + break; + case VARCHAR: + { + HiveVarchar value = ((HiveVarcharWritable) object).getHiveVarchar(); + serializeWrite.writeHiveVarchar(value); + } + break; + case DECIMAL: + { + HiveDecimal value = ((HiveDecimalWritable) object).getHiveDecimal(); + DecimalTypeInfo decTypeInfo = (DecimalTypeInfo)primitiveTypeInfo; + serializeWrite.writeHiveDecimal(value, decTypeInfo.scale()); + } + break; + case DATE: + { + Date value = ((DateWritable) object).get(); + serializeWrite.writeDate(value); + } + break; + case TIMESTAMP: + { + Timestamp value = ((TimestampWritable) object).getTimestamp(); + serializeWrite.writeTimestamp(value); + } + break; + case INTERVAL_YEAR_MONTH: + { + HiveIntervalYearMonth value = ((HiveIntervalYearMonthWritable) object).getHiveIntervalYearMonth(); + serializeWrite.writeHiveIntervalYearMonth(value); + } + break; + case INTERVAL_DAY_TIME: + { + HiveIntervalDayTime value = ((HiveIntervalDayTimeWritable) object).getHiveIntervalDayTime(); + serializeWrite.writeHiveIntervalDayTime(value); + } + break; + case BINARY: + { + BytesWritable byteWritable = (BytesWritable) object; + byte[] binaryBytes = byteWritable.getBytes(); + int length = byteWritable.getLength(); + serializeWrite.writeBinary(binaryBytes, 0, length); + } + break; + default: + throw new Error("Unknown primitive category " + primitiveTypeInfo.getPrimitiveCategory().name()); } } break; - 
case TIMESTAMP: + case LIST: { - Timestamp value = deserializeRead.currentTimestampWritable.getTimestamp(); - Timestamp expected = ((TimestampWritable) writable).getTimestamp(); - if (!value.equals(expected)) { - TestCase.fail("Timestamp field mismatch (expected " + expected.toString() + " found " + value.toString() + ")"); + ListTypeInfo listTypeInfo = (ListTypeInfo) typeInfo; + TypeInfo elementTypeInfo = listTypeInfo.getListElementTypeInfo(); + serializeWrite.beginList(); + ArrayList elements = (ArrayList) object; + for (Object elementObject : elements) { + if (elementObject == null) { + serializeWrite.writeNull(); + } else { + serializeWrite(serializeWrite, elementTypeInfo, elementObject); + } } - } - break; - case INTERVAL_YEAR_MONTH: - { - HiveIntervalYearMonth value = deserializeRead.currentHiveIntervalYearMonthWritable.getHiveIntervalYearMonth(); - HiveIntervalYearMonth expected = ((HiveIntervalYearMonthWritable) writable).getHiveIntervalYearMonth(); - if (!value.equals(expected)) { - TestCase.fail("HiveIntervalYearMonth field mismatch (expected " + expected.toString() + " found " + value.toString() + ")"); + serializeWrite.finishList(); + } + break; + case MAP: + { + MapTypeInfo mapTypeInfo = (MapTypeInfo) typeInfo; + TypeInfo keyTypeInfo = mapTypeInfo.getMapKeyTypeInfo(); + TypeInfo valueTypeInfo = mapTypeInfo.getMapValueTypeInfo(); + serializeWrite.beginMap(); + HashMap hashMap = (HashMap) object; + boolean isFirst = true; + for (Entry entry : hashMap.entrySet()) { + if (isFirst) { + isFirst = false; + } else { + serializeWrite.writeMapKeyPairSeparator(); + } + if (entry.getKey() == null) { + serializeWrite.writeNull(); + } else { + serializeWrite(serializeWrite, keyTypeInfo, entry.getKey()); + } + serializeWrite.writeMapKeySeparator(); + if (entry.getValue() == null) { + serializeWrite.writeNull(); + } else { + serializeWrite(serializeWrite, valueTypeInfo, entry.getValue()); + } } + serializeWrite.finishMap(); } break; - case INTERVAL_DAY_TIME: + case STRUCT: { - HiveIntervalDayTime value = deserializeRead.currentHiveIntervalDayTimeWritable.getHiveIntervalDayTime(); - HiveIntervalDayTime expected = ((HiveIntervalDayTimeWritable) writable).getHiveIntervalDayTime(); - if (!value.equals(expected)) { - TestCase.fail("HiveIntervalDayTime field mismatch (expected " + expected.toString() + " found " + value.toString() + ")"); + StructTypeInfo structTypeInfo = (StructTypeInfo) typeInfo; + ArrayList fieldTypeInfos = structTypeInfo.getAllStructFieldTypeInfos(); + ArrayList fieldValues = (ArrayList) object; + final int size = fieldValues.size(); + serializeWrite.beginStruct(); + for (int i = 0; i < size; i++) { + serializeWrite(serializeWrite, fieldTypeInfos.get(i), fieldValues.get(i)); } + serializeWrite.finishStruct(); } break; - case BINARY: + case UNION: { - byte[] byteArray = Arrays.copyOfRange( - deserializeRead.currentBytes, - deserializeRead.currentBytesStart, - deserializeRead.currentBytesStart + deserializeRead.currentBytesLength); - BytesWritable bytesWritable = (BytesWritable) writable; - byte[] expected = Arrays.copyOfRange(bytesWritable.getBytes(), 0, bytesWritable.getLength()); - if (byteArray.length != expected.length){ - TestCase.fail("Byte Array field mismatch (expected " + Arrays.toString(expected) - + " found " + Arrays.toString(byteArray) + ")"); - } - for (int b = 0; b < byteArray.length; b++) { - if (byteArray[b] != expected[b]) { - TestCase.fail("Byte Array field mismatch (expected " + Arrays.toString(expected) - + " found " + Arrays.toString(byteArray) + ")"); - } 
- } + UnionTypeInfo unionTypeInfo = (UnionTypeInfo) typeInfo; + List fieldTypeInfos = unionTypeInfo.getAllUnionObjectTypeInfos(); + final int size = fieldTypeInfos.size(); + StandardUnion standardUnion = (StandardUnion) object; + byte tag = standardUnion.getTag(); + serializeWrite.beginUnion(tag); + serializeWrite(serializeWrite, fieldTypeInfos.get(tag), standardUnion.getObject()); + serializeWrite.finishUnion(); } break; default: - throw new Error("Unknown primitive category " + primitiveTypeInfo.getPrimitiveCategory()); + throw new Error("Unknown category " + typeInfo.getCategory().name()); } } - public static void serializeWrite(SerializeWrite serializeWrite, - PrimitiveTypeInfo primitiveTypeInfo, Writable writable) throws IOException { - if (writable == null) { - serializeWrite.writeNull(); - return; + public Object readComplexPrimitiveField(DeserializeRead deserializeRead, + PrimitiveTypeInfo primitiveTypeInfo) throws IOException { + boolean isNull = !deserializeRead.readComplexField(); + if (isNull) { + return null; + } else { + return doReadComplexPrimitiveField(deserializeRead, primitiveTypeInfo); } + } + + private static Object doReadComplexPrimitiveField(DeserializeRead deserializeRead, + PrimitiveTypeInfo primitiveTypeInfo) throws IOException { switch (primitiveTypeInfo.getPrimitiveCategory()) { - case BOOLEAN: - { - boolean value = ((BooleanWritable) writable).get(); - serializeWrite.writeBoolean(value); - } - break; + case BOOLEAN: + return new BooleanWritable(deserializeRead.currentBoolean); case BYTE: - { - byte value = ((ByteWritable) writable).get(); - serializeWrite.writeByte(value); - } - break; + return new ByteWritable(deserializeRead.currentByte); case SHORT: - { - short value = ((ShortWritable) writable).get(); - serializeWrite.writeShort(value); - } - break; + return new ShortWritable(deserializeRead.currentShort); case INT: - { - int value = ((IntWritable) writable).get(); - serializeWrite.writeInt(value); - } - break; + return new IntWritable(deserializeRead.currentInt); case LONG: - { - long value = ((LongWritable) writable).get(); - serializeWrite.writeLong(value); - } - break; + return new LongWritable(deserializeRead.currentLong); case FLOAT: - { - float value = ((FloatWritable) writable).get(); - serializeWrite.writeFloat(value); - } - break; + return new FloatWritable(deserializeRead.currentFloat); case DOUBLE: - { - double value = ((DoubleWritable) writable).get(); - serializeWrite.writeDouble(value); - } - break; + return new DoubleWritable(deserializeRead.currentDouble); case STRING: - { - Text value = (Text) writable; - byte[] stringBytes = value.getBytes(); - int stringLength = stringBytes.length; - serializeWrite.writeString(stringBytes, 0, stringLength); - } - break; + return new String( + deserializeRead.currentBytes, + deserializeRead.currentBytesStart, + deserializeRead.currentBytesLength, + StandardCharsets.UTF_8); case CHAR: - { - HiveChar value = ((HiveCharWritable) writable).getHiveChar(); - serializeWrite.writeHiveChar(value); - } - break; + return new HiveChar( + new String( + deserializeRead.currentBytes, + deserializeRead.currentBytesStart, + deserializeRead.currentBytesLength, + StandardCharsets.UTF_8), + ((CharTypeInfo) primitiveTypeInfo).getLength()); case VARCHAR: - { - HiveVarchar value = ((HiveVarcharWritable) writable).getHiveVarchar(); - serializeWrite.writeHiveVarchar(value); - } - break; + return new HiveVarchar( + new String( + deserializeRead.currentBytes, + deserializeRead.currentBytesStart, + 
deserializeRead.currentBytesLength, + StandardCharsets.UTF_8), + ((VarcharTypeInfo) primitiveTypeInfo).getLength()); case DECIMAL: - { - HiveDecimal value = ((HiveDecimalWritable) writable).getHiveDecimal(); - DecimalTypeInfo decTypeInfo = (DecimalTypeInfo)primitiveTypeInfo; - serializeWrite.writeHiveDecimal(value, decTypeInfo.scale()); - } - break; + return new HiveDecimalWritable(deserializeRead.currentHiveDecimalWritable); case DATE: - { - Date value = ((DateWritable) writable).get(); - serializeWrite.writeDate(value); - } - break; + return new DateWritable(deserializeRead.currentDateWritable); case TIMESTAMP: - { - Timestamp value = ((TimestampWritable) writable).getTimestamp(); - serializeWrite.writeTimestamp(value); - } - break; + return new TimestampWritable(deserializeRead.currentTimestampWritable); case INTERVAL_YEAR_MONTH: - { - HiveIntervalYearMonth value = ((HiveIntervalYearMonthWritable) writable).getHiveIntervalYearMonth(); - serializeWrite.writeHiveIntervalYearMonth(value); - } - break; + return new HiveIntervalYearMonthWritable(deserializeRead.currentHiveIntervalYearMonthWritable); case INTERVAL_DAY_TIME: - { - HiveIntervalDayTime value = ((HiveIntervalDayTimeWritable) writable).getHiveIntervalDayTime(); - serializeWrite.writeHiveIntervalDayTime(value); - } - break; + return new HiveIntervalDayTimeWritable(deserializeRead.currentHiveIntervalDayTimeWritable); case BINARY: - { - BytesWritable byteWritable = (BytesWritable) writable; - byte[] binaryBytes = byteWritable.getBytes(); - int length = byteWritable.getLength(); + return new BytesWritable( + Arrays.copyOfRange( + deserializeRead.currentBytes, + deserializeRead.currentBytesStart, + deserializeRead.currentBytesStart + deserializeRead.currentBytesLength)); + default: + throw new Error("Unknown primitive category " + primitiveTypeInfo.getPrimitiveCategory()); + } + } + + public static Object deserializeReadComplexType(DeserializeRead deserializeRead, + TypeInfo typeInfo) throws IOException { + + boolean isNull = !deserializeRead.readNextField(); + if (isNull) { + return null; + } + return getComplexField(deserializeRead, typeInfo); + } + + private static Object getComplexField(DeserializeRead deserializeRead, + TypeInfo typeInfo) throws IOException { + switch (typeInfo.getCategory()) { + case PRIMITIVE: + return doReadComplexPrimitiveField(deserializeRead, (PrimitiveTypeInfo) typeInfo); + case LIST: + { + ListTypeInfo listTypeInfo = (ListTypeInfo) typeInfo; + TypeInfo elementTypeInfo = listTypeInfo.getListElementTypeInfo(); + ArrayList list = new ArrayList(); + Object eleObj; + boolean isNull; + while (deserializeRead.isNextComplexMultiValue()) { + isNull = !deserializeRead.readComplexField(); + if (isNull) { + eleObj = null; + } else { + eleObj = getComplexField(deserializeRead, elementTypeInfo); + } + list.add(eleObj); + } + return list; + } + case MAP: + { + MapTypeInfo mapTypeInfo = (MapTypeInfo) typeInfo; + TypeInfo keyTypeInfo = mapTypeInfo.getMapKeyTypeInfo(); + TypeInfo valueTypeInfo = mapTypeInfo.getMapValueTypeInfo(); + HashMap hashMap = new HashMap(); + Object keyObj; + Object valueObj; + boolean isNull; + while (deserializeRead.isNextComplexMultiValue()) { + isNull = !deserializeRead.readComplexField(); + if (isNull) { + keyObj = null; + } else { + keyObj = getComplexField(deserializeRead, keyTypeInfo); + } + isNull = !deserializeRead.readComplexField(); + if (isNull) { + valueObj = 
null; + } else { + valueObj = getComplexField(deserializeRead, valueTypeInfo); + } + hashMap.put(keyObj, valueObj); + } + return hashMap; + } + case STRUCT: + { + StructTypeInfo structTypeInfo = (StructTypeInfo) typeInfo; + ArrayList fieldTypeInfos = structTypeInfo.getAllStructFieldTypeInfos(); + final int size = fieldTypeInfos.size(); + ArrayList fieldValues = new ArrayList(); + Object fieldObj; + boolean isNull; + for (int i = 0; i < size; i++) { + isNull = !deserializeRead.readComplexField(); + if (isNull) { + fieldObj = null; + } else { + fieldObj = getComplexField(deserializeRead, fieldTypeInfos.get(i)); + } + fieldValues.add(fieldObj); + } + deserializeRead.finishComplexVariableFieldsType(); + return fieldValues; + } + case UNION: + { + UnionTypeInfo unionTypeInfo = (UnionTypeInfo) typeInfo; + List unionTypeInfos = unionTypeInfo.getAllUnionObjectTypeInfos(); + final int size = unionTypeInfos.size(); + Object tagObj; + int tag; + Object unionObj; + boolean isNull = !deserializeRead.readComplexField(); + if (isNull) { + unionObj = null; + } else { + // Get the tag value. + tagObj = getComplexField(deserializeRead, TypeInfoFactory.intTypeInfo); + tag = ((IntWritable) tagObj).get(); + + isNull = !deserializeRead.readComplexField(); + if (isNull) { + unionObj = null; + } else { + // Get the union value. + unionObj = getComplexField(deserializeRead, unionTypeInfos.get(tag)); + } + } + + deserializeRead.finishComplexVariableFieldsType(); + return unionObj; } - break; default: - throw new Error("Unknown primitive category " + primitiveTypeInfo.getPrimitiveCategory().name()); + throw new Error("Unexpected category " + typeInfo.getCategory()); } } } \ No newline at end of file diff --git serde/src/test/org/apache/hadoop/hive/serde2/binarysortable/MyTestClass.java serde/src/test/org/apache/hadoop/hive/serde2/binarysortable/MyTestClass.java index df5e8db..77982a6 100644 --- serde/src/test/org/apache/hadoop/hive/serde2/binarysortable/MyTestClass.java +++ serde/src/test/org/apache/hadoop/hive/serde2/binarysortable/MyTestClass.java @@ -230,6 +230,9 @@ public static void nonRandomRowFill(Object[][] rows, PrimitiveCategory[] primiti for (int i = 0; i < minCount; i++) { Object[] row = rows[i]; for (int c = 0; c < primitiveCategories.length; c++) { + if (primitiveCategories[c] == null) { + continue; + } Object object = row[c]; // Current value. switch (primitiveCategories[c]) { case BOOLEAN: diff --git serde/src/test/org/apache/hadoop/hive/serde2/binarysortable/TestBinarySortableFast.java serde/src/test/org/apache/hadoop/hive/serde2/binarysortable/TestBinarySortableFast.java index 5f5b03a..ce1f7d0 100644 --- serde/src/test/org/apache/hadoop/hive/serde2/binarysortable/TestBinarySortableFast.java +++ serde/src/test/org/apache/hadoop/hive/serde2/binarysortable/TestBinarySortableFast.java @@ -307,7 +307,9 @@ private void testBinarySortableFastCase(int caseNum, boolean doNonRandomFill, Ra throws Throwable { SerdeRandomRowSource source = new SerdeRandomRowSource(); - source.init(r); + + // UNDONE: Until Fast BinarySortable supports complex types -- disable. 
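+ // (From the call sites in this patch, the second init argument appears to toggle complex-type generation -- false here per the UNDONE note, true in TestLazySimpleFast; the meaning of the third argument is not evident from the patch alone.)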
+ source.init(r, false, 4); int rowCount = 1000; Object[][] rows = source.randomRows(rowCount); diff --git serde/src/test/org/apache/hadoop/hive/serde2/lazy/TestLazySimpleFast.java serde/src/test/org/apache/hadoop/hive/serde2/lazy/TestLazySimpleFast.java index c857b42..2cc60fc 100644 --- serde/src/test/org/apache/hadoop/hive/serde2/lazy/TestLazySimpleFast.java +++ serde/src/test/org/apache/hadoop/hive/serde2/lazy/TestLazySimpleFast.java @@ -17,7 +17,12 @@ */ package org.apache.hadoop.hive.serde2.lazy; +import java.io.IOException; +import java.sql.Date; +import java.sql.Timestamp; +import java.util.ArrayList; import java.util.Arrays; +import java.util.List; import java.util.Properties; import java.util.Random; @@ -33,7 +38,13 @@ import org.apache.hadoop.hive.serde2.lazy.fast.LazySimpleSerializeWrite; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils; import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category; +import org.apache.hadoop.hive.serde2.typeinfo.CharTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.DecimalTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.VarcharTypeInfo; import org.apache.hadoop.io.BytesWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.Writable; @@ -46,12 +57,12 @@ private void testLazySimpleFast( SerdeRandomRowSource source, Object[][] rows, LazySimpleSerDe serde, StructObjectInspector rowOI, LazySimpleSerDe serde_fewer, StructObjectInspector writeRowOI, - byte separator, LazySerDeParameters serdeParams, LazySerDeParameters serdeParams_fewer, - PrimitiveTypeInfo[] primitiveTypeInfos, + LazySerDeParameters serdeParams, LazySerDeParameters serdeParams_fewer, + TypeInfo[] typeInfos, boolean useIncludeColumns, boolean doWriteFewerColumns, Random r) throws Throwable { int rowCount = rows.length; - int columnCount = primitiveTypeInfos.length; + int columnCount = typeInfos.length; boolean[] columnsToInclude = null; if (useIncludeColumns) { @@ -62,10 +73,10 @@ private void testLazySimpleFast( } int writeColumnCount = columnCount; - PrimitiveTypeInfo[] writePrimitiveTypeInfos = primitiveTypeInfos; + TypeInfo[] writeTypeInfos = typeInfos; if (doWriteFewerColumns) { writeColumnCount = writeRowOI.getAllStructFieldRefs().size(); - writePrimitiveTypeInfos = Arrays.copyOf(primitiveTypeInfos, writeColumnCount); + writeTypeInfos = Arrays.copyOf(typeInfos, writeColumnCount); } // Try to serialize @@ -75,16 +86,15 @@ private void testLazySimpleFast( Output output = new Output(); LazySimpleSerializeWrite lazySimpleSerializeWrite = - new LazySimpleSerializeWrite(columnCount, - separator, serdeParams); + new LazySimpleSerializeWrite(columnCount, serdeParams); lazySimpleSerializeWrite.set(output); for (int index = 0; index < columnCount; index++) { - Writable writable = (Writable) row[index]; + Object object = row[index]; - VerifyFast.serializeWrite(lazySimpleSerializeWrite, primitiveTypeInfos[index], writable); + VerifyFast.serializeWrite(lazySimpleSerializeWrite, typeInfos[index], object); } BytesWritable bytesWritable = new BytesWritable(); @@ -97,9 +107,9 @@ private void testLazySimpleFast( Object[] row = rows[i]; LazySimpleDeserializeRead lazySimpleDeserializeRead = new LazySimpleDeserializeRead( - writePrimitiveTypeInfos, + writeTypeInfos, /* 
useExternalBuffer */ false, - separator, serdeParams); + serdeParams); BytesWritable bytesWritable = serializeWriteBytes[i]; byte[] bytes = bytesWritable.getBytes(); @@ -116,10 +126,10 @@ private void testLazySimpleFast( lazySimpleDeserializeRead.skipNextField(); } else if (index >= writeColumnCount) { // Should come back a null. - VerifyFast.verifyDeserializeRead(lazySimpleDeserializeRead, primitiveTypeInfos[index], null); + verifyReadNull(lazySimpleDeserializeRead, typeInfos[index]); } else { - Writable writable = (Writable) row[index]; - VerifyFast.verifyDeserializeRead(lazySimpleDeserializeRead, primitiveTypeInfos[index], writable); + Object expectedObject = row[index]; + verifyRead(lazySimpleDeserializeRead, typeInfos[index], expectedObject); } } if (writeColumnCount == columnCount) { @@ -128,28 +138,22 @@ private void testLazySimpleFast( } // Try to deserialize using SerDe class our Writable row objects created by SerializeWrite. - for (int i = 0; i < rowCount; i++) { - BytesWritable bytesWritable = serializeWriteBytes[i]; + for (int rowIndex = 0; rowIndex < rowCount; rowIndex++) { + BytesWritable bytesWritable = serializeWriteBytes[rowIndex]; LazyStruct lazySimpleStruct = (LazyStruct) serde.deserialize(bytesWritable); - Object[] row = rows[i]; + Object[] row = rows[rowIndex]; for (int index = 0; index < columnCount; index++) { - PrimitiveTypeInfo primitiveTypeInfo = primitiveTypeInfos[index]; - Writable writable = (Writable) row[index]; - LazyPrimitive lazyPrimitive = (LazyPrimitive) lazySimpleStruct.getField(index); - Object object; - if (lazyPrimitive != null) { - object = lazyPrimitive.getWritableObject(); - } else { - object = null; - } - if (writable == null || object == null) { - if (writable != null || object != null) { + TypeInfo typeInfo = typeInfos[index]; + Object expectedObject = row[index]; + Object object = lazySimpleStruct.getField(index); + if (expectedObject == null || object == null) { + if (expectedObject != null || object != null) { fail("SerDe deserialized NULL column mismatch"); } } else { - if (!object.equals(writable)) { + if (!VerifyLazy.lazyCompare(typeInfo, object, expectedObject)) { fail("SerDe deserialized value does not match"); } } @@ -185,9 +189,9 @@ private void testLazySimpleFast( LazySimpleDeserializeRead lazySimpleDeserializeRead = new LazySimpleDeserializeRead( - writePrimitiveTypeInfos, + writeTypeInfos, /* useExternalBuffer */ false, - separator, serdeParams); + serdeParams); byte[] bytes = serdeBytes[i]; lazySimpleDeserializeRead.set(bytes, 0, bytes.length); @@ -197,10 +201,10 @@ private void testLazySimpleFast( lazySimpleDeserializeRead.skipNextField(); } else if (index >= writeColumnCount) { // Should come back a null. 
- VerifyFast.verifyDeserializeRead(lazySimpleDeserializeRead, primitiveTypeInfos[index], null); + verifyReadNull(lazySimpleDeserializeRead, typeInfos[index]); } else { - Writable writable = (Writable) row[index]; - VerifyFast.verifyDeserializeRead(lazySimpleDeserializeRead, primitiveTypeInfos[index], writable); + Object expectedObject = row[index]; + verifyRead(lazySimpleDeserializeRead, typeInfos[index], expectedObject); } } if (writeColumnCount == columnCount) { @@ -209,6 +213,44 @@ } } + private void verifyReadNull(LazySimpleDeserializeRead lazySimpleDeserializeRead, + TypeInfo typeInfo) throws IOException { + if (typeInfo.getCategory() == Category.PRIMITIVE) { + VerifyFast.verifyDeserializeRead(lazySimpleDeserializeRead, typeInfo, null); + } else { + Object complexFieldObj = VerifyFast.deserializeReadComplexType(lazySimpleDeserializeRead, typeInfo); + if (complexFieldObj != null) { + TestCase.fail("Field reports not null but object is null"); + } + } + } + + private void verifyRead(LazySimpleDeserializeRead lazySimpleDeserializeRead, + TypeInfo typeInfo, Object expectedObject) throws IOException { + if (typeInfo.getCategory() == Category.PRIMITIVE) { + VerifyFast.verifyDeserializeRead(lazySimpleDeserializeRead, typeInfo, expectedObject); + } else { + Object complexFieldObj = VerifyFast.deserializeReadComplexType(lazySimpleDeserializeRead, typeInfo); + if (expectedObject == null) { + if (complexFieldObj != null) { + TestCase.fail("Field reports not null but object is null (class " + complexFieldObj.getClass().getName() + ", " + complexFieldObj.toString() + ")"); + } + } else { + if (complexFieldObj == null) { + TestCase.fail("Field reports null but object is not null (class " + expectedObject.getClass().getName() + ", " + expectedObject.toString() + ")"); + } + } + if (!VerifyLazy.lazyCompare(typeInfo, complexFieldObj, expectedObject)) { + TestCase.fail("Comparison failed typeInfo " + typeInfo.toString()); + } + } + } + private byte[] copyBytes(Text serialized) { byte[] result = new byte[serialized.getLength()]; System.arraycopy(serialized.getBytes(), 0, result, 0, serialized.getLength()); @@ -238,17 +280,22 @@ private LazySimpleSerDe getSerDe(String fieldNames, String fieldTypes) throws Se return serDe; } - private LazySerDeParameters getSerDeParams(String fieldNames, String fieldTypes) throws SerDeException { + private LazySerDeParameters getSerDeParams(String fieldNames, String fieldTypes, + byte[] separators) throws SerDeException { Configuration conf = new Configuration(); Properties tbl = createProperties(fieldNames, fieldTypes); - return new LazySerDeParameters(conf, tbl, LazySimpleSerDe.class.getName()); + LazySerDeParameters lazySerDeParams = new LazySerDeParameters(conf, tbl, LazySimpleSerDe.class.getName()); + for (int i = 0; i < separators.length; i++) { + lazySerDeParams.setSeparator(i, separators[i]); + } + return lazySerDeParams; } public void testLazySimpleFastCase(int caseNum, boolean doNonRandomFill, Random r) throws Throwable { SerdeRandomRowSource source = new SerdeRandomRowSource(); - source.init(r); + source.init(r, true, 1); int rowCount = 1000; Object[][] rows = source.randomRows(rowCount); @@ -259,8 +306,8 @@ public void testLazySimpleFastCase(int caseNum, boolean doNonRandomFill, Random StructObjectInspector rowStructObjectInspector = source.rowStructObjectInspector(); - PrimitiveTypeInfo[] 
primitiveTypeInfos = source.primitiveTypeInfos(); - int columnCount = primitiveTypeInfos.length; + TypeInfo[] typeInfos = source.typeInfos(); + int columnCount = typeInfos.length; int writeColumnCount = columnCount; StructObjectInspector writeRowStructObjectInspector = rowStructObjectInspector; @@ -277,8 +324,11 @@ public void testLazySimpleFastCase(int caseNum, boolean doNonRandomFill, Random String fieldNames = ObjectInspectorUtils.getFieldNames(rowStructObjectInspector); String fieldTypes = ObjectInspectorUtils.getFieldTypes(rowStructObjectInspector); + // Use different separator values. + byte[] separators = new byte[] {(byte) 9, (byte) 2, (byte) 3, (byte) 4, (byte) 5, (byte) 6, (byte) 7, (byte) 8}; + LazySimpleSerDe serde = getSerDe(fieldNames, fieldTypes); - LazySerDeParameters serdeParams = getSerDeParams(fieldNames, fieldTypes); + LazySerDeParameters serdeParams = getSerDeParams(fieldNames, fieldTypes, separators); LazySimpleSerDe serde_fewer = null; LazySerDeParameters serdeParams_fewer = null; @@ -287,22 +337,22 @@ public void testLazySimpleFastCase(int caseNum, boolean doNonRandomFill, Random String partialFieldTypes = ObjectInspectorUtils.getFieldTypes(writeRowStructObjectInspector); serde_fewer = getSerDe(fieldNames, fieldTypes); - serdeParams_fewer = getSerDeParams(partialFieldNames, partialFieldTypes); + serdeParams_fewer = getSerDeParams(partialFieldNames, partialFieldTypes, separators); } - byte separator = (byte) '\t'; + testLazySimpleFast( source, rows, serde, rowStructObjectInspector, serde_fewer, writeRowStructObjectInspector, - separator, serdeParams, serdeParams_fewer, primitiveTypeInfos, + serdeParams, serdeParams_fewer, typeInfos, /* useIncludeColumns */ false, /* doWriteFewerColumns */ false, r); testLazySimpleFast( source, rows, serde, rowStructObjectInspector, serde_fewer, writeRowStructObjectInspector, - separator, serdeParams, serdeParams_fewer, primitiveTypeInfos, + serdeParams, serdeParams_fewer, typeInfos, /* useIncludeColumns */ true, /* doWriteFewerColumns */ false, r); if (doWriteFewerColumns) { @@ -310,14 +360,14 @@ public void testLazySimpleFastCase(int caseNum, boolean doNonRandomFill, Random source, rows, serde, rowStructObjectInspector, serde_fewer, writeRowStructObjectInspector, - separator, serdeParams, serdeParams_fewer, primitiveTypeInfos, + serdeParams, serdeParams_fewer, typeInfos, /* useIncludeColumns */ false, /* doWriteFewerColumns */ true, r); testLazySimpleFast( source, rows, serde, rowStructObjectInspector, serde_fewer, writeRowStructObjectInspector, - separator, serdeParams, serdeParams_fewer, primitiveTypeInfos, + serdeParams, serdeParams_fewer, typeInfos, /* useIncludeColumns */ true, /* doWriteFewerColumns */ true, r); } } @@ -325,10 +375,10 @@ public void testLazySimpleFastCase(int caseNum, boolean doNonRandomFill, Random public void testLazySimpleFast() throws Throwable { try { - Random r = new Random(35790); + Random r = new Random(8322); int caseNum = 0; - for (int i = 0; i < 10; i++) { + for (int i = 0; i < 20; i++) { testLazySimpleFastCase(caseNum, (i % 2 == 0), r); caseNum++; } diff --git serde/src/test/org/apache/hadoop/hive/serde2/lazybinary/TestLazyBinaryFast.java serde/src/test/org/apache/hadoop/hive/serde2/lazybinary/TestLazyBinaryFast.java index e62a80a..75d4e0b 100644 --- serde/src/test/org/apache/hadoop/hive/serde2/lazybinary/TestLazyBinaryFast.java +++ serde/src/test/org/apache/hadoop/hive/serde2/lazybinary/TestLazyBinaryFast.java @@ -213,7 +213,9 @@ private void testLazyBinaryFast( public void 
testLazyBinaryFastCase(int caseNum, boolean doNonRandomFill, Random r) throws Throwable { SerdeRandomRowSource source = new SerdeRandomRowSource(); - source.init(r); + + // UNDONE: Until Fast LazyBinary supports complex types -- disable. + source.init(r, false, 4); int rowCount = 1000; Object[][] rows = source.randomRows(rowCount); @@ -289,7 +291,7 @@ public void testLazyBinaryFastCase(int caseNum, boolean doNonRandomFill, Random public void testLazyBinaryFast() throws Throwable { try { - Random r = new Random(35790); + Random r = new Random(9983); int caseNum = 0; for (int i = 0; i < 10; i++) {