diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java index f2ba3c5..23c3578 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java @@ -454,9 +454,9 @@ private static BaseWork getBaseWork(Configuration conf, String name) { } } - public static Map<String, Map<Integer, String>> getMapWorkAllScratchColumnVectorTypeMaps(Configuration hiveConf) { + public static Map<Integer, String> getMapWorkVectorScratchColumnTypeMap(Configuration hiveConf) { MapWork mapWork = getMapWork(hiveConf); - return mapWork.getAllScratchColumnVectorTypeMaps(); + return mapWork.getVectorScratchColumnTypeMap(); } public static void setWorkflowAdjacencies(Configuration conf, QueryPlan plan) { diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/tez/ReduceRecordProcessor.java ql/src/java/org/apache/hadoop/hive/ql/exec/tez/ReduceRecordProcessor.java index 071b144..0531a0f 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/tez/ReduceRecordProcessor.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/tez/ReduceRecordProcessor.java @@ -109,7 +109,7 @@ public Object call() { sources[tag] = new ReduceRecordSource(); sources[tag].init(jconf, reducer, redWork.getVectorMode(), keyTableDesc, valueTableDesc, reader, tag == position, (byte) tag, - redWork.getAllScratchColumnVectorTypeMaps()); + redWork.getVectorScratchColumnTypeMap()); ois[tag] = sources[tag].getObjectInspector(); } diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/tez/ReduceRecordSource.java ql/src/java/org/apache/hadoop/hive/ql/exec/tez/ReduceRecordSource.java index aa80510..c8182aa 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/tez/ReduceRecordSource.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/tez/ReduceRecordSource.java @@ -17,6 +17,7 @@ */ package org.apache.hadoop.hive.ql.exec.tez; +import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; import java.util.Iterator; @@ -28,6 +29,9 @@ import org.apache.hadoop.hive.ql.exec.CommonMergeJoinOperator; import org.apache.hadoop.hive.ql.exec.Operator; import org.apache.hadoop.hive.ql.exec.Utilities; +import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.ColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.VectorDeserializeRow; import org.apache.hadoop.hive.ql.exec.vector.VectorizedBatchUtil; import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatchCtx; @@ -40,6 +44,9 @@ import org.apache.hadoop.hive.serde2.SerDe; import org.apache.hadoop.hive.serde2.SerDeException; import org.apache.hadoop.hive.serde2.SerDeUtils; +import org.apache.hadoop.hive.serde2.binarysortable.BinarySortableSerDe; +import org.apache.hadoop.hive.serde2.binarysortable.deserializeread.BinarySortableDeserializeRead; +import org.apache.hadoop.hive.serde2.lazybinary.deserializeread.LazyBinaryDeserializeRead; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils; @@ -47,7 +54,6 @@ import org.apache.hadoop.hive.serde2.objectinspector.StructField; import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; import org.apache.hadoop.io.BytesWritable; -import org.apache.hadoop.io.DataOutputBuffer; import org.apache.hadoop.mapred.JobConf; import org.apache.hadoop.util.ReflectionUtils; import 
org.apache.hadoop.util.StringUtils; @@ -85,13 +91,15 @@ private boolean vectorized = false; - private DataOutputBuffer keyBuffer; - private DataOutputBuffer valueBuffer; + private VectorDeserializeRow keyBinarySortableDeserializeToRow; + + private VectorDeserializeRow valueLazyBinaryDeserializeToRow; + private VectorizedRowBatchCtx batchContext; private VectorizedRowBatch batch; // number of columns pertaining to keys in a vectorized row batch - private int keysColumnOffset; + private int firstValueColumnOffset; private final int BATCH_SIZE = VectorizedRowBatch.DEFAULT_SIZE; private StructObjectInspector keyStructInspector; @@ -114,9 +122,12 @@ void init(JobConf jconf, Operator reducer, boolean vectorized, TableDesc keyTableDesc, TableDesc valueTableDesc, KeyValuesReader reader, boolean handleGroupKey, byte tag, - Map<String, Map<Integer, String>> scratchColumnVectorTypes) + Map<Integer, String> reduceShuffleScratchColumnTypeMap) throws Exception { + // Configuration conf = jconf; + // HiveConf hiveConf = (HiveConf) conf; + ObjectInspector keyObjectInspector; this.reducer = reducer; @@ -134,9 +145,7 @@ void init(JobConf jconf, Operator reducer, boolean vectorized, TableDesc keyT if(vectorized) { keyStructInspector = (StructObjectInspector) keyObjectInspector; - keysColumnOffset = keyStructInspector.getAllStructFieldRefs().size(); - keyBuffer = new DataOutputBuffer(); - valueBuffer = new DataOutputBuffer(); + firstValueColumnOffset = keyStructInspector.getAllStructFieldRefs().size(); } // We should initialize the SerDe with the TypeInfo when available. @@ -153,7 +162,7 @@ void init(JobConf jconf, Operator reducer, boolean vectorized, TableDesc keyT /* vectorization only works with struct object inspectors */ valueStructInspectors = (StructObjectInspector) valueObjectInspector; - final int totalColumns = keysColumnOffset + + final int totalColumns = firstValueColumnOffset + valueStructInspectors.getAllStructFieldRefs().size(); valueStringWriters = new ArrayList(totalColumns); valueStringWriters.addAll(Arrays @@ -180,11 +189,32 @@ void init(JobConf jconf, Operator reducer, boolean vectorized, TableDesc keyT } rowObjectInspector = ObjectInspectorFactory.getStandardStructObjectInspector(colNames, ois); - Map<Integer, String> reduceShuffleScratchColumnTypeMap = - scratchColumnVectorTypes.get("_REDUCE_SHUFFLE_"); batchContext = new VectorizedRowBatchCtx(); batchContext.init(reduceShuffleScratchColumnTypeMap, (StructObjectInspector) rowObjectInspector); batch = batchContext.createVectorizedRowBatch(); + + // Set up vectorized deserialization for the key and value. + BinarySortableSerDe binarySortableSerDe = (BinarySortableSerDe) inputKeyDeserializer; + + keyBinarySortableDeserializeToRow = new VectorDeserializeRow( + new BinarySortableDeserializeRead(binarySortableSerDe.getSortOrders())); + keyBinarySortableDeserializeToRow.init(keyStructInspector, 0); + + final int valuesSize = valueStructInspectors.getAllStructFieldRefs().size(); + if (valuesSize > 0) { + valueLazyBinaryDeserializeToRow = new VectorDeserializeRow( + new LazyBinaryDeserializeRead(valuesSize)); + valueLazyBinaryDeserializeToRow.init(valueStructInspectors, firstValueColumnOffset); + + // Create data buffers for value bytes column vectors. 
+ for (int i = firstValueColumnOffset; i < batch.numCols; i++) { + ColumnVector colVector = batch.cols[i]; + if (colVector instanceof BytesColumnVector) { + BytesColumnVector bytesColumnVector = (BytesColumnVector) colVector; + bytesColumnVector.initBuffer(); + } + } + } } else { ois.add(keyObjectInspector); ois.add(valueObjectInspector); @@ -211,9 +241,12 @@ public final boolean isGrouped() { @Override public boolean pushRecord() throws HiveException { - BytesWritable keyWritable; - if (!vectorized && groupIterator.hasNext()) { + if (vectorized) { + return pushRecordVector(); + } + + if (groupIterator.hasNext()) { // if we have records left in the group we push one of those groupIterator.next(); return true; @@ -222,11 +255,11 @@ public boolean pushRecord() throws HiveException { try { if (!reader.next()) { return false; - } else { - keyWritable = (BytesWritable) reader.getCurrentKey(); - valueWritables = reader.getCurrentValues(); } + BytesWritable keyWritable = (BytesWritable) reader.getCurrentKey(); + valueWritables = reader.getCurrentValues(); + //Set the key, check if this is a new group or same group try { keyObject = inputKeyDeserializer.deserialize(keyWritable); @@ -250,13 +283,9 @@ public boolean pushRecord() throws HiveException { reducer.setGroupKeyObject(keyObject); } - if(vectorized) { - processVectors(valueWritables, tag); - } else { - groupIterator.initialize(valueWritables, keyObject, tag); - if (groupIterator.hasNext()) { - groupIterator.next(); // push first record of group - } + groupIterator.initialize(valueWritables, keyObject, tag); + if (groupIterator.hasNext()) { + groupIterator.next(); // push first record of group } return true; } catch (Throwable e) { @@ -340,48 +369,95 @@ public void next() throws HiveException { } } + private boolean pushRecordVector() { + try { + if (!reader.next()) { + return false; + } + + BytesWritable keyWritable = (BytesWritable) reader.getCurrentKey(); + valueWritables = reader.getCurrentValues(); + + // Check if this is a new group or same group + if (handleGroupKey && !keyWritable.equals(this.groupKey)) { + // If an operator wants to do some work at the beginning of a group + if (groupKey == null) { // the first group + this.groupKey = new BytesWritable(); + } else { + // If an operator wants to do some work at the end of a group + reducer.endGroup(); + } + + groupKey.set(keyWritable.getBytes(), 0, keyWritable.getLength()); + reducer.startGroup(); + } + + processVectorGroup(keyWritable, valueWritables, tag); + return true; + } catch (Throwable e) { + abort = true; + if (e instanceof OutOfMemoryError) { + // Don't create a new object if we are already out of memory + throw (OutOfMemoryError) e; + } else { + l4j.fatal(StringUtils.stringifyException(e)); + throw new RuntimeException(e); + } + } + } + /** * @param values * @return true if it is not done and can take more inputs */ - private void processVectors(Iterable<Object> values, byte tag) throws HiveException { - /* deserialize key into columns */ - VectorizedBatchUtil.addRowToBatchFrom(keyObject, keyStructInspector, - 0, 0, batch, keyBuffer); - for(int i = 0; i < keysColumnOffset; i++) { + private void processVectorGroup(BytesWritable keyWritable, + Iterable<Object> values, byte tag) throws HiveException, IOException { + + // Deserialize key into vector row columns. + // Since we are referencing the byte column vector byte arrays by reference, we don't need + // a data buffer. 
+ byte[] keyBytes = keyWritable.getBytes(); + int keyLength = keyWritable.getLength(); + keyBinarySortableDeserializeToRow.setBytes(keyBytes, 0, keyLength); + keyBinarySortableDeserializeToRow.deserializeByReference(batch, 0); + for(int i = 0; i < firstValueColumnOffset; i++) { VectorizedBatchUtil.setRepeatingColumn(batch, i); } int rowIdx = 0; try { for (Object value : values) { - /* deserialize value into columns */ - BytesWritable valueWritable = (BytesWritable) value; - Object valueObj = deserializeValue(valueWritable, tag); - - VectorizedBatchUtil.addRowToBatchFrom(valueObj, valueStructInspectors, - rowIdx, keysColumnOffset, batch, valueBuffer); + if (valueLazyBinaryDeserializeToRow != null) { + // Deserialize value into vector row columns. + BytesWritable valueWritable = (BytesWritable) value; + byte[] valueBytes = valueWritable.getBytes(); + int valueLength = valueWritable.getLength(); + valueLazyBinaryDeserializeToRow.setBytes(valueBytes, 0, valueLength); + // TODO: Perhaps we could use "by reference" if we knew the stability of the + // values iterator objects... + valueLazyBinaryDeserializeToRow.deserializeByValue(batch, rowIdx); + } rowIdx++; if (rowIdx >= BATCH_SIZE) { VectorizedBatchUtil.setBatchSize(batch, rowIdx); + // VectorizedBatchUtil.debugDisplayBatch(batch, "processVectorGroup"); reducer.processOp(batch, tag); // Reset just the value columns and value buffer. - for (int i = keysColumnOffset; i < batch.numCols; i++) { + for (int i = firstValueColumnOffset; i < batch.numCols; i++) { + // Note that reset also resets the data buffer for bytes column vectors. batch.cols[i].reset(); } - valueBuffer.reset(); rowIdx = 0; } } if (rowIdx > 0) { // Flush final partial batch. VectorizedBatchUtil.setBatchSize(batch, rowIdx); + // VectorizedBatchUtil.debugDisplayBatch(batch, "processVectorGroup"); reducer.processOp(batch, tag); } batch.reset(); - keyBuffer.reset(); - valueBuffer.reset(); } catch (Exception e) { String rowString = null; try { diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorAppMasterEventOperator.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorAppMasterEventOperator.java index d05cc23..440fc83 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorAppMasterEventOperator.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorAppMasterEventOperator.java @@ -18,24 +18,12 @@ package org.apache.hadoop.hive.ql.exec.vector; -import java.io.IOException; - import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hive.ql.exec.AppMasterEventOperator; -import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpressionWriter; -import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpressionWriterFactory; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.plan.AppMasterEventDesc; import org.apache.hadoop.hive.ql.plan.OperatorDesc; -import org.apache.hadoop.hive.common.StatsSetupConst; -import org.apache.hadoop.hive.serde2.SerDeException; -import org.apache.hadoop.hive.serde2.SerDeStats; -import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils; -import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils.ObjectInspectorCopyOption; import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; -import org.apache.hadoop.io.ObjectWritable; -import org.apache.hadoop.io.Text; import org.apache.hadoop.io.Writable; /** @@ -45,14 +33,23 @@ private static 
final long serialVersionUID = 1L; - protected transient Object[] singleRow; + private VectorizationContext vContext; + + // The above members are initialized by the constructor and must not be + // transient. + //--------------------------------------------------------------------------- - protected transient VectorExpressionWriter[] valueWriters; + private transient boolean firstBatch; - public VectorAppMasterEventOperator(VectorizationContext context, + private transient VectorExtractRowSameBatch vectorExtractRowSameBatch; + + protected transient Object[] singleRow; + + public VectorAppMasterEventOperator(VectorizationContext vContext, OperatorDesc conf) { super(); this.conf = (AppMasterEventDesc) conf; + this.vContext = vContext; } public VectorAppMasterEventOperator() { @@ -60,69 +57,69 @@ public VectorAppMasterEventOperator() { @Override public void initializeOp(Configuration hconf) throws HiveException { + + // We need a input object inspector that is for the row we will extract out of the + // vectorized row batch, not for example, an original inspector for an ORC table, etc. + inputObjInspectors[0] = + VectorizedBatchUtil.convertToStandardStructObjectInspector((StructObjectInspector) inputObjInspectors[0]); + + // Call AppMasterEventOperator with new input inspector. super.initializeOp(hconf); - valueWriters = VectorExpressionWriterFactory.getExpressionWriters( - (StructObjectInspector) inputObjInspectors[0]); - singleRow = new Object[valueWriters.length]; + + firstBatch = true; } @Override public void processOp(Object data, int tag) throws HiveException { - - VectorizedRowBatch vrg = (VectorizedRowBatch) data; - - Writable [] records = null; - Writable recordValue = null; - boolean vectorizedSerde = false; - try { - if (serializer instanceof VectorizedSerde) { - recordValue = ((VectorizedSerde) serializer).serializeVector(vrg, - inputObjInspectors[0]); - records = (Writable[]) ((ObjectWritable) recordValue).get(); - vectorizedSerde = true; - } - } catch (SerDeException e1) { - throw new HiveException(e1); + VectorizedRowBatch batch = (VectorizedRowBatch) data; + if (firstBatch) { + vectorExtractRowSameBatch = new VectorExtractRowSameBatch(); + vectorExtractRowSameBatch.init((StructObjectInspector) inputObjInspectors[0], vContext.getProjectedColumns()); + vectorExtractRowSameBatch.setOneBatch(batch); + + singleRow = new Object[vectorExtractRowSameBatch.getCount()]; + + firstBatch = false; } - - for (int i = 0; i < vrg.size; i++) { - Writable row = null; - if (vectorizedSerde) { - row = records[i]; - } else { - if (vrg.valueWriters == null) { - vrg.setValueWriters(this.valueWriters); - } - try { - row = serializer.serialize(getRowObject(vrg, i), inputObjInspectors[0]); - } catch (SerDeException ex) { - throw new HiveException(ex); + + if (hasReachedMaxSize) { + return; + } + + try { + Writable writableRow; + if (batch.selectedInUse) { + int selected[] = batch.selected; + for (int logical = 0 ; logical < batch.size; logical++) { + int batchIndex = selected[logical]; + vectorExtractRowSameBatch.extractRow(batchIndex, singleRow); + writableRow = serializer.serialize(singleRow, inputObjInspectors[0]); + writableRow.write(buffer); + if (buffer.getLength() > MAX_SIZE) { + LOG.info("Disabling AM events. Buffer size too large: " + buffer.getLength()); + hasReachedMaxSize = true; + buffer = null; + break; + } } - } - try { - row.write(buffer); - if (buffer.getLength() > MAX_SIZE) { - LOG.info("Disabling AM events. 
Buffer size too large: " + buffer.getLength()); - hasReachedMaxSize = true; - buffer = null; + } else { + for (int batchIndex = 0 ; batchIndex < batch.size; batchIndex++) { + vectorExtractRowSameBatch.extractRow(batchIndex, singleRow); + writableRow = serializer.serialize(singleRow, inputObjInspectors[0]); + writableRow.write(buffer); + if (buffer.getLength() > MAX_SIZE) { + LOG.info("Disabling AM events. Buffer size too large: " + buffer.getLength()); + hasReachedMaxSize = true; + buffer = null; + break; + } } - } catch (Exception e) { - throw new HiveException(e); } + } catch (Exception e) { + throw new HiveException(e); } - } - private Object[] getRowObject(VectorizedRowBatch vrg, int rowIndex) - throws HiveException { - int batchIndex = rowIndex; - if (vrg.selectedInUse) { - batchIndex = vrg.selected[rowIndex]; - } - for (int i = 0; i < vrg.projectionSize; i++) { - ColumnVector vectorColumn = vrg.cols[vrg.projectedColumns[i]]; - singleRow[i] = vrg.valueWriters[i].writeValue(vectorColumn, batchIndex); - } - return singleRow; + // TODO: Regular operator calls forward??? } } diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorAssignRow.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorAssignRow.java new file mode 100644 index 0000000..4cbcb10 --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorAssignRow.java @@ -0,0 +1,546 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.hive.ql.exec.vector; + +import java.sql.Timestamp; +import java.util.List; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.hive.common.type.HiveChar; +import org.apache.hadoop.hive.common.type.HiveDecimal; +import org.apache.hadoop.hive.common.type.HiveVarchar; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.serde2.io.ByteWritable; +import org.apache.hadoop.hive.serde2.io.DateWritable; +import org.apache.hadoop.hive.serde2.io.DoubleWritable; +import org.apache.hadoop.hive.serde2.io.HiveCharWritable; +import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable; +import org.apache.hadoop.hive.serde2.io.HiveVarcharWritable; +import org.apache.hadoop.hive.serde2.io.ShortWritable; +import org.apache.hadoop.hive.serde2.io.TimestampWritable; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory; +import org.apache.hadoop.hive.serde2.objectinspector.StructField; +import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; +import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils; +import org.apache.hadoop.io.BooleanWritable; +import org.apache.hadoop.io.BytesWritable; +import org.apache.hadoop.io.FloatWritable; +import org.apache.hadoop.io.IntWritable; +import org.apache.hadoop.io.LongWritable; +import org.apache.hadoop.io.Text; + +/** + * This class assigns specified columns of a row from a Writable row Object[]. + * + * The caller provides the hive type names and target column numbers in the order desired to + * assign from the Writable row Object[]. + * + * This class is abstract to allow the subclasses to control batch reuse. + */ +public abstract class VectorAssignRow { + private static final long serialVersionUID = 1L; + private static final Log LOG = LogFactory.getLog(VectorAssignRow.class); + + protected abstract class Assigner { + protected int columnIndex; + + Assigner(int columnIndex) { + this.columnIndex = columnIndex; + } + + public int getColumnIndex() { + return columnIndex; + } + + abstract void setColumnVector(VectorizedRowBatch batch); + + abstract void forgetColumnVector(); + + abstract void assign(int batchIndex, Object object); + } + + private class VoidAssigner extends Assigner { + + VoidAssigner(int columnIndex) { + super(columnIndex); + } + + @Override + void setColumnVector(VectorizedRowBatch batch) { + } + + @Override + void forgetColumnVector() { + } + + @Override + void assign(int batchIndex, Object object) { + // This is no-op, there is no column to assign to and the object is expected to be null. 
+ assert (object == null); + } + } + + private abstract class AbstractLongAssigner extends Assigner { + + protected LongColumnVector colVector; + protected long[] vector; + + AbstractLongAssigner(int columnIndex) { + super(columnIndex); + } + + @Override + void setColumnVector(VectorizedRowBatch batch) { + colVector = (LongColumnVector) batch.cols[columnIndex]; + vector = colVector.vector; + } + + @Override + void forgetColumnVector() { + colVector = null; + vector = null; + } + } + + protected class BooleanAssigner extends AbstractLongAssigner { + + BooleanAssigner(int columnIndex) { + super(columnIndex); + } + + @Override + void assign(int batchIndex, Object object) { + if (object == null) { + VectorizedBatchUtil.setNullColIsNullValue(colVector, batchIndex); + } else { + BooleanWritable bw = (BooleanWritable) object; + vector[batchIndex] = (bw.get() ? 1 : 0); + } + } + } + + protected class ByteAssigner extends AbstractLongAssigner { + + ByteAssigner(int columnIndex) { + super(columnIndex); + } + + @Override + void assign(int batchIndex, Object object) { + if (object == null) { + VectorizedBatchUtil.setNullColIsNullValue(colVector, batchIndex); + } else { + ByteWritable bw = (ByteWritable) object; + vector[batchIndex] = bw.get(); + } + } + } + + private class ShortAssigner extends AbstractLongAssigner { + + ShortAssigner(int columnIndex) { + super(columnIndex); + } + + @Override + void assign(int batchIndex, Object object) { + if (object == null) { + VectorizedBatchUtil.setNullColIsNullValue(colVector, batchIndex); + } else { + ShortWritable sw = (ShortWritable) object; + vector[batchIndex] = sw.get(); + } + } + } + + private class IntAssigner extends AbstractLongAssigner { + + IntAssigner(int columnIndex) { + super(columnIndex); + } + + @Override + void assign(int batchIndex, Object object) { + if (object == null) { + VectorizedBatchUtil.setNullColIsNullValue(colVector, batchIndex); + } else { + IntWritable iw = (IntWritable) object; + vector[batchIndex] = iw.get(); + } + } + } + + private class LongAssigner extends AbstractLongAssigner { + + LongAssigner(int columnIndex) { + super(columnIndex); + } + + @Override + void assign(int batchIndex, Object object) { + if (object == null) { + VectorizedBatchUtil.setNullColIsNullValue(colVector, batchIndex); + } else { + LongWritable lw = (LongWritable) object; + vector[batchIndex] = lw.get(); + } + } + } + + private class DateAssigner extends AbstractLongAssigner { + + DateAssigner(int columnIndex) { + super(columnIndex); + } + + @Override + void assign(int batchIndex, Object object) { + if (object == null) { + VectorizedBatchUtil.setNullColIsNullValue(colVector, batchIndex); + } else { + DateWritable bw = (DateWritable) object; + vector[batchIndex] = bw.getDays(); + } + } + } + + private class TimestampAssigner extends AbstractLongAssigner { + + TimestampAssigner(int columnIndex) { + super(columnIndex); + } + + @Override + void assign(int batchIndex, Object object) { + if (object == null) { + VectorizedBatchUtil.setNullColIsNullValue(colVector, batchIndex); + } else { + TimestampWritable tw = (TimestampWritable) object; + Timestamp t = tw.getTimestamp(); + vector[batchIndex] = TimestampUtils.getTimeNanoSec(t); + } + } + } + + private abstract class AbstractDoubleAssigner extends Assigner { + + protected DoubleColumnVector colVector; + protected double[] vector; + + AbstractDoubleAssigner(int columnIndex) { + super(columnIndex); + } + + @Override + void setColumnVector(VectorizedRowBatch batch) { + colVector = (DoubleColumnVector) 
batch.cols[columnIndex]; + vector = colVector.vector; + } + + @Override + void forgetColumnVector() { + colVector = null; + vector = null; + } + } + + private class FloatAssigner extends AbstractDoubleAssigner { + + FloatAssigner(int columnIndex) { + super(columnIndex); + } + + @Override + void assign(int batchIndex, Object object) { + if (object == null) { + VectorizedBatchUtil.setNullColIsNullValue(colVector, batchIndex); + } else { + FloatWritable fw = (FloatWritable) object; + vector[batchIndex] = fw.get(); + } + } + } + + private class DoubleAssigner extends AbstractDoubleAssigner { + + DoubleAssigner(int columnIndex) { + super(columnIndex); + } + + @Override + void assign(int batchIndex, Object object) { + if (object == null) { + VectorizedBatchUtil.setNullColIsNullValue(colVector, batchIndex); + } else { + DoubleWritable dw = (DoubleWritable) object; + vector[batchIndex] = dw.get(); + } + } + } + + private abstract class AbstractBytesAssigner extends Assigner { + + protected BytesColumnVector colVector; + + AbstractBytesAssigner(int columnIndex) { + super(columnIndex); + } + + @Override + void setColumnVector(VectorizedRowBatch batch) { + colVector = (BytesColumnVector) batch.cols[columnIndex]; + } + + @Override + void forgetColumnVector() { + colVector = null; + } + } + + private class BinaryAssigner extends AbstractBytesAssigner { + + BinaryAssigner(int columnIndex) { + super(columnIndex); + } + + @Override + void assign(int batchIndex, Object object) { + if (object == null) { + VectorizedBatchUtil.setNullColIsNullValue(colVector, batchIndex); + } else { + BytesWritable bw = (BytesWritable) object; + colVector.setVal(batchIndex, bw.getBytes(), 0, bw.getLength()); + } + } + } + + private class StringAssigner extends AbstractBytesAssigner { + + StringAssigner(int columnIndex) { + super(columnIndex); + } + + @Override + void assign(int batchIndex, Object object) { + if (object == null) { + VectorizedBatchUtil.setNullColIsNullValue(colVector, batchIndex); + } else { + Text tw = (Text) object; + colVector.setVal(batchIndex, tw.getBytes(), 0, tw.getLength()); + } + } + } + + private class VarCharAssigner extends AbstractBytesAssigner { + + VarCharAssigner(int columnIndex) { + super(columnIndex); + } + + @Override + void assign(int batchIndex, Object object) { + if (object == null) { + VectorizedBatchUtil.setNullColIsNullValue(colVector, batchIndex); + } else { + // We store VARCHAR type stripped of pads. + HiveVarchar hiveVarchar; + if (object instanceof HiveVarchar) { + hiveVarchar = (HiveVarchar) object; + } else { + hiveVarchar = ((HiveVarcharWritable) object).getHiveVarchar(); + } + byte[] bytes = hiveVarchar.getValue().getBytes(); + colVector.setVal(batchIndex, bytes, 0, bytes.length); + } + } + } + + private class CharAssigner extends AbstractBytesAssigner { + + CharAssigner(int columnIndex) { + super(columnIndex); + } + + @Override + void assign(int batchIndex, Object object) { + if (object == null) { + VectorizedBatchUtil.setNullColIsNullValue(colVector, batchIndex); + } else { + // We store CHAR type stripped of pads. + HiveChar hiveChar; + if (object instanceof HiveChar) { + hiveChar = (HiveChar) object; + } else { + hiveChar = ((HiveCharWritable) object).getHiveChar(); + } + + // We store CHAR in vector row batch with padding stripped. 
+ byte[] bytes = hiveChar.getStrippedValue().getBytes(); + colVector.setVal(batchIndex, bytes, 0, bytes.length); + } + } + } + + private class DecimalAssigner extends Assigner { + + protected DecimalColumnVector colVector; + + DecimalAssigner(int columnIndex) { + super(columnIndex); + } + + @Override + void setColumnVector(VectorizedRowBatch batch) { + colVector = (DecimalColumnVector) batch.cols[columnIndex]; + } + + @Override + void forgetColumnVector() { + colVector = null; + } + + @Override + void assign(int batchIndex, Object object) { + if (object == null) { + VectorizedBatchUtil.setNullColIsNullValue(colVector, batchIndex); + } else { + if (object instanceof HiveDecimal) { + colVector.set(batchIndex, (HiveDecimal) object); + } else { + colVector.set(batchIndex, (HiveDecimalWritable) object); + } + } + } + } + + private Assigner createAssigner(PrimitiveTypeInfo primitiveTypeInfo, int columnIndex) throws HiveException { + PrimitiveCategory primitiveCategory = primitiveTypeInfo.getPrimitiveCategory(); + Assigner assigner; + switch (primitiveCategory) { + case VOID: + assigner = new VoidAssigner(columnIndex); + break; + case BOOLEAN: + assigner = new BooleanAssigner(columnIndex); + break; + case BYTE: + assigner = new ByteAssigner(columnIndex); + break; + case SHORT: + assigner = new ShortAssigner(columnIndex); + break; + case INT: + assigner = new IntAssigner(columnIndex); + break; + case LONG: + assigner = new LongAssigner(columnIndex); + break; + case TIMESTAMP: + assigner = new TimestampAssigner(columnIndex); + break; + case DATE: + assigner = new DateAssigner(columnIndex); + break; + case FLOAT: + assigner = new FloatAssigner(columnIndex); + break; + case DOUBLE: + assigner = new DoubleAssigner(columnIndex); + break; + case BINARY: + assigner = new BinaryAssigner(columnIndex); + break; + case STRING: + assigner = new StringAssigner(columnIndex); + break; + case VARCHAR: + assigner = new VarCharAssigner(columnIndex); + break; + case CHAR: + assigner = new CharAssigner(columnIndex); + break; + case DECIMAL: + assigner = new DecimalAssigner(columnIndex); + break; + default: + throw new HiveException("No vector row assigner for primitive category " + + primitiveCategory); + } + return assigner; + } + + Assigner[] assigners; + + public void init(StructObjectInspector structObjectInspector, List<Integer> projectedColumns) throws HiveException { + + List<? extends StructField> fields = structObjectInspector.getAllStructFieldRefs(); + assigners = new Assigner[fields.size()]; + + int i = 0; + for (StructField field : fields) { + int columnIndex = projectedColumns.get(i); + ObjectInspector fieldInspector = field.getFieldObjectInspector(); + PrimitiveTypeInfo primitiveTypeInfo = (PrimitiveTypeInfo) TypeInfoUtils.getTypeInfoFromTypeString( + fieldInspector.getTypeName()); + assigners[i] = createAssigner(primitiveTypeInfo, columnIndex); + i++; + } + } + + public void init(List<String> typeNames) throws HiveException { + + assigners = new Assigner[typeNames.size()]; + + int i = 0; + for (String typeName : typeNames) { + PrimitiveTypeInfo primitiveTypeInfo = (PrimitiveTypeInfo) TypeInfoUtils.getTypeInfoFromTypeString(typeName); + assigners[i] = createAssigner(primitiveTypeInfo, i); + i++; + } + } + + protected void setBatch(VectorizedRowBatch batch) throws HiveException { + for (Assigner assigner : assigners) { + assigner.setColumnVector(batch); + } + for (int i = 0; i < assigners.length; i++) { + Assigner assigner = assigners[i]; + int columnIndex = assigner.getColumnIndex(); + if (batch.cols[columnIndex] == null) { + throw new 
HiveException("Unexpected null vector column " + columnIndex); + } + assigner.setColumnVector(batch); + } + + } + + public void assignRowColumn(int batchIndex, int logicalColumnIndex, Object object) { + assigners[logicalColumnIndex].assign(batchIndex, object); + } + + public void assignRow(int batchIndex, Object[] objects) { + int i = 0; + for (Assigner assigner : assigners) { + assigner.assign(batchIndex, objects[i++]); + } + } + +} \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorAssignRowSameBatch.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorAssignRowSameBatch.java new file mode 100644 index 0000000..8c7c2ad --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorAssignRowSameBatch.java @@ -0,0 +1,36 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.vector; + +import org.apache.hadoop.hive.ql.metadata.HiveException; + +/** + * This class assigns specified columns of a VectorizedRowBatch row from a Writable row Object[]. + * + * The caller provides the hive type names and target column numbers in the order desired to + * assign from the Writable row Object[]. + * + * This class is for use when the batch being assigned is always the same. + */ +public class VectorAssignRowSameBatch extends VectorAssignRow { + + public void setOneBatch(VectorizedRowBatch batch) throws HiveException { + setBatch(batch); + } +} \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorColumnMapping.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorColumnMapping.java new file mode 100644 index 0000000..1cba4f7 --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorColumnMapping.java @@ -0,0 +1,73 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.hive.ql.exec.vector; + +import java.util.Arrays; + +import org.apache.hadoop.hive.ql.exec.vector.VectorColumnOrderedMap.Mapping; + +/** + * This class collects column information for copying a row from one VectorizedRowBatch to + * same/another batch. + */ +public abstract class VectorColumnMapping { + + private static final long serialVersionUID = 1L; + + protected int[] sourceColumns; + protected int[] outputColumns; + protected String[] typeNames; + + protected VectorColumnOrderedMap vectorColumnMapping; + + public VectorColumnMapping() { + this.vectorColumnMapping = new VectorColumnOrderedMap(); + } + + public abstract void add(int sourceColumn, int outputColumn, String typeName); + + public abstract void finalize(); + + public int getCount() { + return sourceColumns.length; + } + + public int[] getInputColumns() { + return sourceColumns; + } + + public int[] getOutputColumns() { + return outputColumns; + } + + public String[] getTypeNames() { + return typeNames; + } + + @Override + public String toString() { + StringBuilder sb = new StringBuilder(); + sb.append("source columns: " + Arrays.toString(sourceColumns)); + sb.append(", "); + sb.append("output columns: " + Arrays.toString(outputColumns)); + sb.append(", "); + sb.append("type names: " + Arrays.toString(typeNames)); + return sb.toString(); + } +} diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorColumnOrderedMap.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorColumnOrderedMap.java new file mode 100644 index 0000000..96a4f83 --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorColumnOrderedMap.java @@ -0,0 +1,112 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.vector; + +import java.util.ArrayList; +import java.util.Map; +import java.util.TreeMap; + +import org.apache.commons.lang.ArrayUtils; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +/** + * This class collects column information for mapping vector columns, including the hive type name. + * + * The column information is kept ordered by a specified column. + * + * Call getMapping to collect the results into convenient arrays. 
+ */ +public class VectorColumnOrderedMap { + protected static transient final Log LOG = LogFactory.getLog(VectorColumnOrderedMap.class); + + private TreeMap<Integer, Value> orderedTreeMap; + + private class Value { + int valueColumn; + + String typeName; + + Value(int valueColumn, String typeName) { + this.valueColumn = valueColumn; + this.typeName = typeName; + } + } + + public class Mapping { + + private final int[] orderedColumns; + private final int[] valueColumns; + private final String[] typeNames; + + Mapping(int[] orderedColumns, int[] valueColumns, String[] typeNames) { + this.orderedColumns = orderedColumns; + this.valueColumns = valueColumns; + this.typeNames = typeNames; + } + + public int getCount() { + return orderedColumns.length; + } + + public int[] getOrderedColumns() { + return orderedColumns; + } + + public int[] getValueColumns() { + return valueColumns; + } + + public String[] getTypeNames() { + return typeNames; + } + } + + public VectorColumnOrderedMap() { + orderedTreeMap = new TreeMap<Integer, Value>(); + } + + public void add(int orderedColumn, int valueColumn, String typeName) { + if (orderedTreeMap.containsKey(orderedColumn)) { + throw new Error("Duplicate column " + orderedColumn + " in ordered column map"); + } + orderedTreeMap.put(orderedColumn, new Value(valueColumn, typeName)); + } + + public boolean orderedColumnsContain(int orderedColumn) { + return orderedTreeMap.containsKey(orderedColumn); + } + + public Mapping getMapping() { + ArrayList<Integer> orderedColumns = new ArrayList<Integer>(); + ArrayList<Integer> valueColumns = new ArrayList<Integer>(); + ArrayList<String> typeNames = new ArrayList<String>(); + for (Map.Entry<Integer, Value> entry : orderedTreeMap.entrySet()) { + orderedColumns.add(entry.getKey()); + Value value = entry.getValue(); + valueColumns.add(value.valueColumn); + typeNames.add(value.typeName); + } + return new Mapping( + ArrayUtils.toPrimitive(orderedColumns.toArray(new Integer[0])), + ArrayUtils.toPrimitive(valueColumns.toArray(new Integer[0])), + typeNames.toArray(new String[0])); + + } +} diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorColumnOutputMapping.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorColumnOutputMapping.java new file mode 100644 index 0000000..491e8a4 --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorColumnOutputMapping.java @@ -0,0 +1,56 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.vector; + +import org.apache.hadoop.hive.ql.exec.vector.VectorColumnOrderedMap.Mapping; + +/** + * This class collects column information for copying a row from one VectorizedRowBatch to + * same/another batch. + * + * In this variation, column information is ordered by the output column number. 
+ */ +public class VectorColumnOutputMapping extends VectorColumnMapping { + + private static final long serialVersionUID = 1L; + + @Override + public void add(int sourceColumn, int outputColumn, String typeName) { + // Order on outputColumn. + vectorColumnMapping.add(outputColumn, sourceColumn, typeName); + } + + public boolean containsOutputColumn(int outputColumn) { + return vectorColumnMapping.orderedColumnsContain(outputColumn); + } + + @Override + public void finalize() { + Mapping mapping = vectorColumnMapping.getMapping(); + + // Ordered columns are the output columns. + sourceColumns = mapping.getValueColumns(); + outputColumns = mapping.getOrderedColumns(); + typeNames = mapping.getTypeNames(); + + // Not needed anymore. + vectorColumnMapping = null; + } + +} \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorColumnSourceMapping.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorColumnSourceMapping.java new file mode 100644 index 0000000..c1c53ba --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorColumnSourceMapping.java @@ -0,0 +1,61 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.vector; + +import org.apache.hadoop.hive.ql.exec.vector.VectorColumnOrderedMap.Mapping; + +/** + * This class collects column information for copying a row from one VectorizedRowBatch to + * same/another batch. + * + * In this variation, column information is ordered by the source column number. + */ +public class VectorColumnSourceMapping extends VectorColumnMapping { + + private static final long serialVersionUID = 1L; + + @Override + public void add(int sourceColumn, int outputColumn, String typeName) { + // Order on sourceColumn. + vectorColumnMapping.add(sourceColumn, outputColumn, typeName); + } + + @Override + public void finalize() { + Mapping mapping = vectorColumnMapping.getMapping(); + + // Ordered columns are the source columns. + sourceColumns = mapping.getOrderedColumns(); + outputColumns = mapping.getValueColumns(); + typeNames = mapping.getTypeNames(); + + // Not needed anymore. 
+ vectorColumnMapping = null; + } + + public boolean isSourceSequenceGood() { + int count = sourceColumns.length; + for (int i = 0; i < count; i++) { + if (sourceColumns[i] != i) { + return false; + } + } + return true; + } +} \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorCopyRow.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorCopyRow.java new file mode 100644 index 0000000..e010e45 --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorCopyRow.java @@ -0,0 +1,246 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.vector; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +/** + * This class copies specified columns of a row from one VectorizedRowBatch to another. + */ +public class VectorCopyRow { + + protected static transient final Log LOG = LogFactory.getLog(VectorCopyRow.class); + + private abstract class CopyRow { + protected int inColumnIndex; + protected int outColumnIndex; + + CopyRow(int inColumnIndex, int outColumnIndex) { + this.inColumnIndex = inColumnIndex; + this.outColumnIndex = outColumnIndex; + } + + abstract void copy(VectorizedRowBatch inBatch, int inBatchIndex, VectorizedRowBatch outBatch, int outBatchIndex); + } + + private class LongCopyRow extends CopyRow { + + LongCopyRow(int inColumnIndex, int outColumnIndex) { + super(inColumnIndex, outColumnIndex); + } + + @Override + void copy(VectorizedRowBatch inBatch, int inBatchIndex, VectorizedRowBatch outBatch, int outBatchIndex) { + LongColumnVector inColVector = (LongColumnVector) inBatch.cols[inColumnIndex]; + LongColumnVector outColVector = (LongColumnVector) outBatch.cols[outColumnIndex]; + + if (inColVector.isRepeating) { + if (inColVector.noNulls || !inColVector.isNull[0]) { + outColVector.vector[outBatchIndex] = inColVector.vector[0]; + } else { + VectorizedBatchUtil.setNullColIsNullValue(outColVector, outBatchIndex); + } + } else { + if (inColVector.noNulls || !inColVector.isNull[inBatchIndex]) { + outColVector.vector[outBatchIndex] = inColVector.vector[inBatchIndex]; + } else { + VectorizedBatchUtil.setNullColIsNullValue(outColVector, outBatchIndex); + } + } + } + } + + private class DoubleCopyRow extends CopyRow { + + DoubleCopyRow(int inColumnIndex, int outColumnIndex) { + super(inColumnIndex, outColumnIndex); + } + + @Override + void copy(VectorizedRowBatch inBatch, int inBatchIndex, VectorizedRowBatch outBatch, int outBatchIndex) { + DoubleColumnVector inColVector = (DoubleColumnVector) inBatch.cols[inColumnIndex]; + DoubleColumnVector outColVector = (DoubleColumnVector) outBatch.cols[outColumnIndex]; + + if (inColVector.isRepeating) { + if (inColVector.noNulls || !inColVector.isNull[0]) { + 
outColVector.vector[outBatchIndex] = inColVector.vector[0]; + } else { + VectorizedBatchUtil.setNullColIsNullValue(outColVector, outBatchIndex); + } + } else { + if (inColVector.noNulls || !inColVector.isNull[inBatchIndex]) { + outColVector.vector[outBatchIndex] = inColVector.vector[inBatchIndex]; + } else { + VectorizedBatchUtil.setNullColIsNullValue(outColVector, outBatchIndex); + } + } + } + } + + private abstract class AbstractBytesCopyRow extends CopyRow { + + AbstractBytesCopyRow(int inColumnIndex, int outColumnIndex) { + super(inColumnIndex, outColumnIndex); + } + + } + + private class BytesCopyRowByValue extends AbstractBytesCopyRow { + + BytesCopyRowByValue(int inColumnIndex, int outColumnIndex) { + super(inColumnIndex, outColumnIndex); + } + + @Override + void copy(VectorizedRowBatch inBatch, int inBatchIndex, VectorizedRowBatch outBatch, int outBatchIndex) { + BytesColumnVector inColVector = (BytesColumnVector) inBatch.cols[inColumnIndex]; + BytesColumnVector outColVector = (BytesColumnVector) outBatch.cols[outColumnIndex]; + + if (inColVector.isRepeating) { + if (inColVector.noNulls || !inColVector.isNull[0]) { + outColVector.setVal(outBatchIndex, inColVector.vector[0], inColVector.start[0], inColVector.length[0]); + } else { + VectorizedBatchUtil.setNullColIsNullValue(outColVector, outBatchIndex); + } + } else { + if (inColVector.noNulls || !inColVector.isNull[inBatchIndex]) { + outColVector.setVal(outBatchIndex, inColVector.vector[inBatchIndex], inColVector.start[inBatchIndex], inColVector.length[inBatchIndex]); + } else { + VectorizedBatchUtil.setNullColIsNullValue(outColVector, outBatchIndex); + } + } + } + } + + private class BytesCopyRowByReference extends AbstractBytesCopyRow { + + BytesCopyRowByReference(int inColumnIndex, int outColumnIndex) { + super(inColumnIndex, outColumnIndex); + } + + @Override + void copy(VectorizedRowBatch inBatch, int inBatchIndex, VectorizedRowBatch outBatch, int outBatchIndex) { + BytesColumnVector inColVector = (BytesColumnVector) inBatch.cols[inColumnIndex]; + BytesColumnVector outColVector = (BytesColumnVector) outBatch.cols[outColumnIndex]; + + if (inColVector.isRepeating) { + if (inColVector.noNulls || !inColVector.isNull[0]) { + outColVector.setRef(outBatchIndex, inColVector.vector[0], inColVector.start[0], inColVector.length[0]); + } else { + VectorizedBatchUtil.setNullColIsNullValue(outColVector, outBatchIndex); + } + } else { + if (inColVector.noNulls || !inColVector.isNull[inBatchIndex]) { + outColVector.setRef(outBatchIndex, inColVector.vector[inBatchIndex], inColVector.start[inBatchIndex], inColVector.length[inBatchIndex]); + } else { + VectorizedBatchUtil.setNullColIsNullValue(outColVector, outBatchIndex); + } + } + } + } + + private class DecimalCopyRow extends CopyRow { + + DecimalCopyRow(int inColumnIndex, int outColumnIndex) { + super(inColumnIndex, outColumnIndex); + } + + @Override + void copy(VectorizedRowBatch inBatch, int inBatchIndex, VectorizedRowBatch outBatch, int outBatchIndex) { + DecimalColumnVector inColVector = (DecimalColumnVector) inBatch.cols[inColumnIndex]; + DecimalColumnVector outColVector = (DecimalColumnVector) outBatch.cols[outColumnIndex]; + + if (inColVector.isRepeating) { + if (inColVector.noNulls || !inColVector.isNull[0]) { + outColVector.set(outBatchIndex, inColVector.vector[0]); + } else { + VectorizedBatchUtil.setNullColIsNullValue(outColVector, outBatchIndex); + } + } else { + if (inColVector.noNulls || !inColVector.isNull[inBatchIndex]) { + outColVector.set(outBatchIndex, 
inColVector.vector[inBatchIndex]); + } else { + VectorizedBatchUtil.setNullColIsNullValue(outColVector, outBatchIndex); + } + } + } + } + + private CopyRow[] subRowToBatchCopiersByValue; + private CopyRow[] subRowToBatchCopiersByReference; + + public void init(VectorColumnMapping columnMapping) { + int count = columnMapping.getCount(); + subRowToBatchCopiersByValue = new CopyRow[count]; + subRowToBatchCopiersByReference = new CopyRow[count]; + + for (int i = 0; i < count; i++) { + int inputColumn = columnMapping.getInputColumns()[i]; + int outputColumn = columnMapping.getOutputColumns()[i]; + String typeName = columnMapping.getTypeNames()[i]; + + CopyRow copyRowByValue = null; + CopyRow copyRowByReference = null; + + if (VectorizationContext.isIntFamily(typeName) || + VectorizationContext.isDatetimeFamily(typeName)) { + copyRowByValue = new LongCopyRow(inputColumn, outputColumn); + } else if (VectorizationContext.isFloatFamily(typeName)) { + copyRowByValue = new DoubleCopyRow(inputColumn, outputColumn); + } else if (VectorizationContext.isStringFamily(typeName)) { + copyRowByValue = new BytesCopyRowByValue(inputColumn, outputColumn); + copyRowByReference = new BytesCopyRowByReference(inputColumn, outputColumn); + } else if (VectorizationContext.decimalTypePattern.matcher(typeName).matches()){ + copyRowByValue = new DecimalCopyRow(inputColumn, outputColumn); + } else { + throw new Error("Cannot allocate vector copy row for " + typeName); + } + subRowToBatchCopiersByValue[i] = copyRowByValue; + if (copyRowByReference == null) { + subRowToBatchCopiersByReference[i] = copyRowByValue; + } else { + subRowToBatchCopiersByReference[i] = copyRowByReference; + } + } + } + + /* + * Use this copy method when the source batch may get reused before the target batch is finished. + * Any bytes column vector values will be copied to the target by value into the column's + * data buffer. + */ + public void copyByValue(VectorizedRowBatch inBatch, int inBatchIndex, VectorizedRowBatch outBatch, int outBatchIndex) { + for (CopyRow copyRow : subRowToBatchCopiersByValue) { + copyRow.copy(inBatch, inBatchIndex, outBatch, outBatchIndex); + } + } + + /* + * Use this copy method when the source batch is safe and will remain around until the target + * batch is finished. + * + * Any bytes column vector values will be referenced by the target column instead of copying. + */ + public void copyByReference(VectorizedRowBatch inBatch, int inBatchIndex, VectorizedRowBatch outBatch, int outBatchIndex) { + for (CopyRow copyRow : subRowToBatchCopiersByReference) { + copyRow.copy(inBatch, inBatchIndex, outBatch, outBatchIndex); + } + } +} \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorDeserializeRow.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorDeserializeRow.java new file mode 100644 index 0000000..b3dd99e --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorDeserializeRow.java @@ -0,0 +1,552 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.vector; + +import java.io.IOException; +import java.sql.Timestamp; +import java.util.List; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.hive.common.type.HiveDecimal; +import org.apache.hadoop.hive.ql.exec.vector.expressions.StringExpr; +import org.apache.hadoop.hive.serde2.deserializeread.DeserializeRead; +import org.apache.hadoop.hive.serde2.deserializeread.DeserializeRead.ReadDateResults; +import org.apache.hadoop.hive.serde2.deserializeread.DeserializeRead.ReadDecimalResults; +import org.apache.hadoop.hive.serde2.deserializeread.DeserializeRead.ReadBytesResults; +import org.apache.hadoop.hive.serde2.deserializeread.DeserializeRead.ReadTimestampResults; +import org.apache.hadoop.hive.serde2.objectinspector.StructField; +import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; +import org.apache.hadoop.hive.serde2.typeinfo.CharTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.DecimalTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils; +import org.apache.hadoop.hive.serde2.typeinfo.VarcharTypeInfo; + +/** + * This class deserializes a serialization format into a row of a VectorizedRowBatch. + * + * The caller provides the hive type names and output column numbers in the order desired to + * deserialize. + * + * This class uses a provided DeserializeRead object to directly deserialize by reading + * field-by-field from a serialization format into the primitive values of the VectorizedRowBatch. + */ + +public class VectorDeserializeRow { + private static final long serialVersionUID = 1L; + private static final Log LOG = LogFactory.getLog(VectorDeserializeRow.class); + + private DeserializeRead deserializeRead; + + private Reader[] readersByValue; + private Reader[] readersByReference; + + public VectorDeserializeRow(DeserializeRead deserializeRead) { + this(); + this.deserializeRead = deserializeRead; + } + + // Not public since we must have the deserialize read object. + private VectorDeserializeRow() { + } + + private abstract class Reader { + protected int columnIndex; + + Reader(int columnIndex) { + this.columnIndex = columnIndex; + } + + abstract void apply(VectorizedRowBatch batch, int batchIndex) throws IOException; + } + + private abstract class AbstractLongReader extends Reader { + + AbstractLongReader(int columnIndex) { + super(columnIndex); + } + } + + private class BooleanReader extends AbstractLongReader { + + BooleanReader(int columnIndex) { + super(columnIndex); + } + + @Override + void apply(VectorizedRowBatch batch, int batchIndex) throws IOException { + LongColumnVector colVector = (LongColumnVector) batch.cols[columnIndex]; + + if (deserializeRead.readCheckNull()) { + VectorizedBatchUtil.setNullColIsNullValue(colVector, batchIndex); + } else { + boolean value = deserializeRead.readBoolean(); + colVector.vector[batchIndex] = (value ? 
1 : 0); + } + } + } + + private class ByteReader extends AbstractLongReader { + + ByteReader(int columnIndex) { + super(columnIndex); + } + + @Override + void apply(VectorizedRowBatch batch, int batchIndex) throws IOException { + LongColumnVector colVector = (LongColumnVector) batch.cols[columnIndex]; + + if (deserializeRead.readCheckNull()) { + VectorizedBatchUtil.setNullColIsNullValue(colVector, batchIndex); + } else { + byte value = deserializeRead.readByte(); + colVector.vector[batchIndex] = (long) value; + } + } + } + + private class ShortReader extends AbstractLongReader { + + ShortReader(int columnIndex) { + super(columnIndex); + } + + @Override + void apply(VectorizedRowBatch batch, int batchIndex) throws IOException { + LongColumnVector colVector = (LongColumnVector) batch.cols[columnIndex]; + + if (deserializeRead.readCheckNull()) { + VectorizedBatchUtil.setNullColIsNullValue(colVector, batchIndex); + } else { + short value = deserializeRead.readShort(); + colVector.vector[batchIndex] = (long) value; + } + } + } + + private class IntReader extends AbstractLongReader { + + IntReader(int columnIndex) { + super(columnIndex); + } + + @Override + void apply(VectorizedRowBatch batch, int batchIndex) throws IOException { + LongColumnVector colVector = (LongColumnVector) batch.cols[columnIndex]; + + if (deserializeRead.readCheckNull()) { + VectorizedBatchUtil.setNullColIsNullValue(colVector, batchIndex); + } else { + int value = deserializeRead.readInt(); + colVector.vector[batchIndex] = (long) value; + } + } + } + + private class LongReader extends AbstractLongReader { + + LongReader(int columnIndex) { + super(columnIndex); + } + + @Override + void apply(VectorizedRowBatch batch, int batchIndex) throws IOException { + LongColumnVector colVector = (LongColumnVector) batch.cols[columnIndex]; + + if (deserializeRead.readCheckNull()) { + VectorizedBatchUtil.setNullColIsNullValue(colVector, batchIndex); + } else { + long value = deserializeRead.readLong(); + colVector.vector[batchIndex] = value; + } + } + } + + private class DateReader extends AbstractLongReader { + + ReadDateResults readDateResults; + + DateReader(int columnIndex) { + super(columnIndex); + readDateResults = deserializeRead.createReadDateResults(); + } + + @Override + void apply(VectorizedRowBatch batch, int batchIndex) throws IOException { + LongColumnVector colVector = (LongColumnVector) batch.cols[columnIndex]; + + if (deserializeRead.readCheckNull()) { + VectorizedBatchUtil.setNullColIsNullValue(colVector, batchIndex); + } else { + deserializeRead.readDate(readDateResults); + colVector.vector[batchIndex] = (long) readDateResults.getDays(); + } + } + } + + private class TimestampReader extends AbstractLongReader { + + ReadTimestampResults readTimestampResults; + + TimestampReader(int columnIndex) { + super(columnIndex); + readTimestampResults = deserializeRead.createReadTimestampResults(); + } + + @Override + void apply(VectorizedRowBatch batch, int batchIndex) throws IOException { + LongColumnVector colVector = (LongColumnVector) batch.cols[columnIndex]; + + if (deserializeRead.readCheckNull()) { + VectorizedBatchUtil.setNullColIsNullValue(colVector, batchIndex); + } else { + deserializeRead.readTimestamp(readTimestampResults); + Timestamp t = readTimestampResults.getTimestamp(); + colVector.vector[batchIndex] = TimestampUtils.getTimeNanoSec(t); + } + } + } + + private abstract class AbstractDoubleReader extends Reader { + + AbstractDoubleReader(int columnIndex) { + super(columnIndex); + } + } + + private class 
FloatReader extends AbstractDoubleReader { + + FloatReader(int columnIndex) { + super(columnIndex); + } + + @Override + void apply(VectorizedRowBatch batch, int batchIndex) throws IOException { + DoubleColumnVector colVector = (DoubleColumnVector) batch.cols[columnIndex]; + + if (deserializeRead.readCheckNull()) { + VectorizedBatchUtil.setNullColIsNullValue(colVector, batchIndex); + } else { + float value = deserializeRead.readFloat(); + colVector.vector[batchIndex] = (double) value; + } + } + } + + private class DoubleReader extends AbstractDoubleReader { + + DoubleReader(int columnIndex) { + super(columnIndex); + } + + @Override + void apply(VectorizedRowBatch batch, int batchIndex) throws IOException { + DoubleColumnVector colVector = (DoubleColumnVector) batch.cols[columnIndex]; + + if (deserializeRead.readCheckNull()) { + VectorizedBatchUtil.setNullColIsNullValue(colVector, batchIndex); + } else { + double value = deserializeRead.readDouble(); + colVector.vector[batchIndex] = value; + } + } + } + + private abstract class AbstractBytesReader extends Reader { + protected ReadBytesResults readBytesResults; + + AbstractBytesReader(int columnIndex) { + super(columnIndex); + readBytesResults = deserializeRead.createReadBytesResults(); + } + } + + private class BytesReaderByValue extends AbstractBytesReader { + + BytesReaderByValue(int columnIndex) { + super(columnIndex); + } + + @Override + void apply(VectorizedRowBatch batch, int batchIndex) throws IOException { + BytesColumnVector colVector = (BytesColumnVector) batch.cols[columnIndex]; + + if (deserializeRead.readCheckNull()) { + VectorizedBatchUtil.setNullColIsNullValue(colVector, batchIndex); + } else { + deserializeRead.readBytes(readBytesResults); + colVector.setVal(batchIndex, readBytesResults.bytes, + readBytesResults.start, readBytesResults.length); + } + } + } + + private class BytesReaderByReference extends AbstractBytesReader { + + BytesReaderByReference(int columnIndex) { + super(columnIndex); + } + + @Override + void apply(VectorizedRowBatch batch, int batchIndex) throws IOException { + BytesColumnVector colVector = (BytesColumnVector) batch.cols[columnIndex]; + + if (deserializeRead.readCheckNull()) { + VectorizedBatchUtil.setNullColIsNullValue(colVector, batchIndex); + } else { + deserializeRead.readBytes(readBytesResults); + colVector.setRef(batchIndex, readBytesResults.bytes, + readBytesResults.start, readBytesResults.length); + } + } + } + + private class CharReaderByValue extends AbstractBytesReader { + + private CharTypeInfo charTypeInfo; + + CharReaderByValue(CharTypeInfo charTypeInfo, int columnIndex) { + super(columnIndex); + this.charTypeInfo = charTypeInfo; + } + + @Override + void apply(VectorizedRowBatch batch, int batchIndex) throws IOException { + BytesColumnVector colVector = (BytesColumnVector) batch.cols[columnIndex]; + + if (deserializeRead.readCheckNull()) { + VectorizedBatchUtil.setNullColIsNullValue(colVector, batchIndex); + } else { + // Use the basic bytes read to get access, then use our optimal truncate/trim method that + // does not use Java String objects. 
+ deserializeRead.readBytes(readBytesResults); + int adjustedLength = StringExpr.rightTrimAndTruncate(readBytesResults.bytes, + readBytesResults.start, readBytesResults.length, charTypeInfo.getLength()); + colVector.setVal(batchIndex, readBytesResults.bytes, readBytesResults.start, adjustedLength); + } + } + } + + private class CharReaderByReference extends AbstractBytesReader { + + private CharTypeInfo charTypeInfo; + + CharReaderByReference(CharTypeInfo charTypeInfo, int columnIndex) { + super(columnIndex); + this.charTypeInfo = charTypeInfo; + } + + @Override + void apply(VectorizedRowBatch batch, int batchIndex) throws IOException { + BytesColumnVector colVector = (BytesColumnVector) batch.cols[columnIndex]; + + if (deserializeRead.readCheckNull()) { + VectorizedBatchUtil.setNullColIsNullValue(colVector, batchIndex); + } else { + // Use the basic bytes read to get access, then use our optimal truncate/trim method that + // does not use Java String objects. + deserializeRead.readBytes(readBytesResults); + int adjustedLength = StringExpr.rightTrimAndTruncate(readBytesResults.bytes, + readBytesResults.start, readBytesResults.length, charTypeInfo.getLength()); + colVector.setRef(batchIndex, readBytesResults.bytes, readBytesResults.start, adjustedLength); + } + } + } + + private class VarcharReaderByValue extends AbstractBytesReader { + + private VarcharTypeInfo varcharTypeInfo; + + VarcharReaderByValue(VarcharTypeInfo varcharTypeInfo, int columnIndex) { + super(columnIndex); + this.varcharTypeInfo = varcharTypeInfo; + } + + @Override + void apply(VectorizedRowBatch batch, int batchIndex) throws IOException { + BytesColumnVector colVector = (BytesColumnVector) batch.cols[columnIndex]; + + if (deserializeRead.readCheckNull()) { + VectorizedBatchUtil.setNullColIsNullValue(colVector, batchIndex); + } else { + // Use the basic bytes read to get access, then use our optimal truncate method that + // does not use Java String objects. + deserializeRead.readBytes(readBytesResults); + int adjustedLength = StringExpr.truncate(readBytesResults.bytes, + readBytesResults.start, readBytesResults.length, varcharTypeInfo.getLength()); + colVector.setVal(batchIndex, readBytesResults.bytes, readBytesResults.start, adjustedLength); + } + } + } + + private class VarcharReaderByReference extends AbstractBytesReader { + + private VarcharTypeInfo varcharTypeInfo; + + VarcharReaderByReference(VarcharTypeInfo varcharTypeInfo, int columnIndex) { + super(columnIndex); + this.varcharTypeInfo = varcharTypeInfo; + } + + @Override + void apply(VectorizedRowBatch batch, int batchIndex) throws IOException { + BytesColumnVector colVector = (BytesColumnVector) batch.cols[columnIndex]; + + if (deserializeRead.readCheckNull()) { + VectorizedBatchUtil.setNullColIsNullValue(colVector, batchIndex); + } else { + // Use the basic bytes read to get access, then use our optimal truncate method that + // does not use Java String objects. 
+ deserializeRead.readBytes(readBytesResults); + int adjustedLength = StringExpr.truncate(readBytesResults.bytes, + readBytesResults.start, readBytesResults.length, varcharTypeInfo.getLength()); + colVector.setRef(batchIndex, readBytesResults.bytes, readBytesResults.start, adjustedLength); + } + } + } + + private class HiveDecimalReader extends Reader { + + private ReadDecimalResults readDecimalResults; + + HiveDecimalReader(DecimalTypeInfo decimalTypeInfo, int columnIndex) { + super(columnIndex); + readDecimalResults = deserializeRead.createReadDecimalResults(decimalTypeInfo); + } + + @Override + void apply(VectorizedRowBatch batch, int batchIndex) throws IOException { + DecimalColumnVector colVector = (DecimalColumnVector) batch.cols[columnIndex]; + + if (deserializeRead.readCheckNull()) { + VectorizedBatchUtil.setNullColIsNullValue(colVector, batchIndex); + } else { + deserializeRead.readHiveDecimal(readDecimalResults); + HiveDecimal hiveDecimal = readDecimalResults.getHiveDecimal(); + colVector.vector[batchIndex].set(hiveDecimal); + } + } + } + + private void addReader(String typeName, int index, int outputColumn) { + Reader readerByValue = null; + Reader readerByReference = null; + + if (typeName.equals("boolean")) { + readerByValue = new BooleanReader(outputColumn); + } else if (typeName.equals("tinyint")) { + readerByValue = new ByteReader(outputColumn); + } else if (typeName.equals("smallint")) { + readerByValue = new ShortReader(outputColumn); + } else if (typeName.equals("int")) { + readerByValue = new IntReader(outputColumn); + } else if (typeName.equals("bigint") || typeName.equals("long")) { + readerByValue = new LongReader(outputColumn); + } else if (typeName.equals("timestamp")) { + readerByValue = new TimestampReader(outputColumn); + } else if (typeName.equals("date")) { + readerByValue = new DateReader(outputColumn); + } else if (typeName.equals("float")) { + readerByValue = new FloatReader(outputColumn); + } else if (typeName.equals("double")) { + readerByValue = new DoubleReader(outputColumn); + } else if (typeName.equals("string") || + typeName.equals("binary")) { + readerByValue = new BytesReaderByValue(outputColumn); + readerByReference = new BytesReaderByReference(outputColumn); + } else if (VectorizationContext.charTypePattern.matcher(typeName).matches()) { + TypeInfo typeInfo = TypeInfoUtils.getTypeInfoFromTypeString(typeName); + CharTypeInfo charTypeInfo = (CharTypeInfo) typeInfo; + readerByValue = new CharReaderByValue(charTypeInfo, outputColumn); + readerByReference = new CharReaderByReference(charTypeInfo, outputColumn); + } else if (VectorizationContext.varcharTypePattern.matcher(typeName).matches()) { + TypeInfo typeInfo = TypeInfoUtils.getTypeInfoFromTypeString(typeName); + VarcharTypeInfo varcharTypeInfo = (VarcharTypeInfo) typeInfo; + readerByValue = new VarcharReaderByValue(varcharTypeInfo, outputColumn); + readerByReference = new VarcharReaderByReference(varcharTypeInfo, outputColumn); + } else if (VectorizationContext.decimalTypePattern.matcher(typeName).matches()) { + TypeInfo typeInfo = TypeInfoUtils.getTypeInfoFromTypeString(typeName); + readerByValue = new HiveDecimalReader((DecimalTypeInfo) typeInfo, outputColumn); + } else { + throw new Error("Cannot allocate deserialize read row for " + typeName); + } + readersByValue[index] = readerByValue; + if (readerByReference == null) { + readersByReference[index] = readerByValue; + } else { + readersByReference[index] = readerByReference; + } + } + + public void init(String[] typeNames, int[] columns) { + 
readersByValue = new Reader[typeNames.length]; + readersByReference = new Reader[typeNames.length]; + + for (int i = 0; i < typeNames.length; i++) { + String typeName = typeNames[i].toLowerCase(); + int outputColumn = columns[i]; + + addReader(typeName, i, outputColumn); + } + } + + public void init(StructObjectInspector structInspector, int startColumn) { + List fields = structInspector.getAllStructFieldRefs(); + final int fieldsSize = fields.size(); + + readersByValue = new Reader[fieldsSize]; + readersByReference = new Reader[fieldsSize]; + + for (int i = 0; i < fieldsSize; i++) { + StructField field = fields.get(i); + String typeName = field.getFieldObjectInspector().getTypeName().toLowerCase(); + int outputColumn = startColumn + i; + + addReader(typeName, i, outputColumn); + } + } + + public void init(List typeNames) { + readersByValue = new Reader[typeNames.size()]; + readersByReference = new Reader[typeNames.size()]; + + for (int i = 0; i < typeNames.size(); i++) { + String typeName = typeNames.get(i).toLowerCase(); + addReader(typeName, i, i); + } + } + + public void setBytes(byte[] bytes, int offset, int length) { + deserializeRead.set(bytes, offset, length); + } + + public void deserializeByValue(VectorizedRowBatch batch, int batchIndex) throws IOException { + for (Reader reader : readersByValue) { + reader.apply(batch, batchIndex); + } + } + + public void deserializeByReference(VectorizedRowBatch batch, int batchIndex) throws IOException { + for (Reader reader : readersByReference) { + reader.apply(batch, batchIndex); + } + } + +} \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorExtractRow.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorExtractRow.java new file mode 100644 index 0000000..b31c54a --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorExtractRow.java @@ -0,0 +1,678 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
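For orientation, a hedged usage sketch of the deserializer above: bind one serialized record with setBytes(), then write its fields into one batch row with deserializeByValue(). The fillBatch method, the rows array, and the assumption that the batch arrives empty are illustrative only; deserializeRead behind the VectorDeserializeRow is assumed to be some concrete DeserializeRead for the wire format in use, and exception handling is left to the caller.

// Illustrative only: fill one VectorizedRowBatch from already-serialized records.
// Assumes `deserializeRow` was constructed over a concrete DeserializeRead and
// initialized with the batch's column types, and that `batch` arrives with size == 0.
void fillBatch(VectorDeserializeRow deserializeRow, byte[][] rows, VectorizedRowBatch batch)
    throws IOException {
  for (byte[] row : rows) {
    deserializeRow.setBytes(row, 0, row.length);
    // Copy string/binary fields into the batch's own buffers; deserializeByReference
    // is only safe if `row` stays untouched for as long as the batch is in flight.
    deserializeRow.deserializeByValue(batch, batch.size);
    if (++batch.size == VectorizedRowBatch.DEFAULT_SIZE) {
      break;  // batch is full; the caller would forward it and continue with a fresh one
    }
  }
}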
+ */ + +package org.apache.hadoop.hive.ql.exec.vector; + +import java.io.IOException; +import java.sql.Date; +import java.sql.Timestamp; +import java.util.Arrays; +import java.util.List; +import java.util.Map; + +import org.apache.commons.lang.ArrayUtils; +import org.apache.commons.lang.StringUtils; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.hive.common.type.HiveChar; +import org.apache.hadoop.hive.common.type.HiveDecimal; +import org.apache.hadoop.hive.common.type.HiveVarchar; +import org.apache.hadoop.hive.ql.exec.vector.VectorAssignRow.Assigner; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.serde2.io.DateWritable; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory; +import org.apache.hadoop.hive.serde2.objectinspector.StructField; +import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableHiveCharObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableHiveDecimalObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableHiveVarcharObjectInspector; +import org.apache.hadoop.hive.serde2.typeinfo.CharTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.DecimalTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils; +import org.apache.hadoop.hive.serde2.typeinfo.VarcharTypeInfo; +import org.apache.hadoop.io.BytesWritable; +import org.apache.hadoop.io.DataOutputBuffer; +import org.apache.hadoop.io.Text; + +/** + * This class extracts specified VectorizedRowBatch row columns into a Writable row Object[]. + * + * The caller provides the hive type names and target column numbers in the order desired to + * extract from the Writable row Object[]. + * + * This class is abstract to allow the subclasses to control batch reuse. + */ +public abstract class VectorExtractRow { + private static final long serialVersionUID = 1L; + private static final Log LOG = LogFactory.getLog(VectorExtractRow.class); + + private boolean tolerateNullColumns; + + public VectorExtractRow() { + // UNDONE: For now allow null columns until vector_decimal_mapjoin.q is understood... 
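Every Extractor below follows the same repeating/null convention: when isRepeating is set, element 0 stands in for every row, and a value is produced only when noNulls is set or the isNull flag at that adjusted index is clear. A small self-contained sketch of just that convention, using a made-up MiniLongColumn rather than Hive's LongColumnVector:

// Simplified stand-in illustrating the isRepeating / noNulls convention used by the extractors.
final class MiniLongColumn {
  long[] vector = new long[1024];
  boolean[] isNull = new boolean[1024];
  boolean noNulls = true;
  boolean isRepeating = false;
}

final class NullConventionDemo {
  // Returns null for SQL NULL, otherwise the long value for the requested row.
  static Long extractLong(MiniLongColumn col, int batchIndex) {
    int adjustedIndex = col.isRepeating ? 0 : batchIndex;   // repeating: row 0 represents all rows
    if (col.noNulls || !col.isNull[adjustedIndex]) {
      return col.vector[adjustedIndex];
    }
    return null;
  }

  public static void main(String[] args) {
    MiniLongColumn col = new MiniLongColumn();
    col.isRepeating = true;
    col.vector[0] = 42L;
    System.out.println(extractLong(col, 7));   // 42: every row sees the repeated value

    col.noNulls = false;
    col.isNull[0] = true;
    System.out.println(extractLong(col, 7));   // null: the repeated value is NULL
  }
}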
+ tolerateNullColumns = true; + } + + protected abstract class Extractor { + protected int columnIndex; + protected Object object; + + public Extractor(int columnIndex) { + this.columnIndex = columnIndex; + } + + public int getColumnIndex() { + return columnIndex; + } + + abstract void setColumnVector(VectorizedRowBatch batch); + + abstract void forgetColumnVector(); + + abstract Object extract(int batchIndex); + } + + private class VoidExtractor extends Extractor { + + VoidExtractor(int columnIndex) { + super(columnIndex); + } + + @Override + void setColumnVector(VectorizedRowBatch batch) { + } + + @Override + void forgetColumnVector() { + } + + @Override + Object extract(int batchIndex) { + return null; + } + } + + private abstract class AbstractLongExtractor extends Extractor { + + protected LongColumnVector colVector; + protected long[] vector; + + AbstractLongExtractor(int columnIndex) { + super(columnIndex); + } + + @Override + void setColumnVector(VectorizedRowBatch batch) { + colVector = (LongColumnVector) batch.cols[columnIndex]; + vector = colVector.vector; + } + + @Override + void forgetColumnVector() { + colVector = null; + vector = null; + } + } + + protected class BooleanExtractor extends AbstractLongExtractor { + + BooleanExtractor(int columnIndex) { + super(columnIndex); + object = PrimitiveObjectInspectorFactory.writableBooleanObjectInspector.create(false); + } + + @Override + Object extract(int batchIndex) { + int adjustedIndex = (colVector.isRepeating ? 0 : batchIndex); + if (colVector.noNulls || !colVector.isNull[adjustedIndex]) { + long value = vector[adjustedIndex]; + PrimitiveObjectInspectorFactory.writableBooleanObjectInspector.set(object, value == 0 ? false : true); + return object; + } else { + return null; + } + } + } + + protected class ByteExtractor extends AbstractLongExtractor { + + ByteExtractor(int columnIndex) { + super(columnIndex); + object = PrimitiveObjectInspectorFactory.writableByteObjectInspector.create((byte) 0); + } + + @Override + Object extract(int batchIndex) { + int adjustedIndex = (colVector.isRepeating ? 0 : batchIndex); + if (colVector.noNulls || !colVector.isNull[adjustedIndex]) { + long value = vector[adjustedIndex]; + PrimitiveObjectInspectorFactory.writableByteObjectInspector.set(object, (byte) value); + return object; + } else { + return null; + } + } + } + + private class ShortExtractor extends AbstractLongExtractor { + + ShortExtractor(int columnIndex) { + super(columnIndex); + object = PrimitiveObjectInspectorFactory.writableShortObjectInspector.create((short) 0); + } + + @Override + Object extract(int batchIndex) { + int adjustedIndex = (colVector.isRepeating ? 0 : batchIndex); + if (colVector.noNulls || !colVector.isNull[adjustedIndex]) { + long value = vector[adjustedIndex]; + PrimitiveObjectInspectorFactory.writableShortObjectInspector.set(object, (short) value); + return object; + } else { + return null; + } + } + } + + private class IntExtractor extends AbstractLongExtractor { + + IntExtractor(int columnIndex) { + super(columnIndex); + object = PrimitiveObjectInspectorFactory.writableIntObjectInspector.create(0); + } + + @Override + Object extract(int batchIndex) { + int adjustedIndex = (colVector.isRepeating ? 
0 : batchIndex); + if (colVector.noNulls || !colVector.isNull[adjustedIndex]) { + long value = vector[adjustedIndex]; + PrimitiveObjectInspectorFactory.writableIntObjectInspector.set(object, (int) value); + return object; + } else { + return null; + } + } + } + + private class LongExtractor extends AbstractLongExtractor { + + LongExtractor(int columnIndex) { + super(columnIndex); + object = PrimitiveObjectInspectorFactory.writableLongObjectInspector.create(0); + } + + @Override + Object extract(int batchIndex) { + int adjustedIndex = (colVector.isRepeating ? 0 : batchIndex); + if (colVector.noNulls || !colVector.isNull[adjustedIndex]) { + long value = vector[adjustedIndex]; + PrimitiveObjectInspectorFactory.writableLongObjectInspector.set(object, value); + return object; + } else { + return null; + } + } + } + + private class DateExtractor extends AbstractLongExtractor { + + private Date date; + + DateExtractor(int columnIndex) { + super(columnIndex); + object = PrimitiveObjectInspectorFactory.writableDateObjectInspector.create(new Date(0)); + date = new Date(0); + } + + @Override + Object extract(int batchIndex) { + int adjustedIndex = (colVector.isRepeating ? 0 : batchIndex); + if (colVector.noNulls || !colVector.isNull[adjustedIndex]) { + long value = vector[adjustedIndex]; + date.setTime(DateWritable.daysToMillis((int) value)); + PrimitiveObjectInspectorFactory.writableDateObjectInspector.set(object, date); + return object; + } else { + return null; + } + } + } + + private class TimestampExtractor extends AbstractLongExtractor { + + private Timestamp timestamp; + + TimestampExtractor(int columnIndex) { + super(columnIndex); + object = PrimitiveObjectInspectorFactory.writableTimestampObjectInspector.create(new Timestamp(0)); + timestamp = new Timestamp(0); + } + + @Override + Object extract(int batchIndex) { + int adjustedIndex = (colVector.isRepeating ? 0 : batchIndex); + if (colVector.noNulls || !colVector.isNull[adjustedIndex]) { + long value = vector[adjustedIndex]; + TimestampUtils.assignTimeInNanoSec(value, timestamp); + PrimitiveObjectInspectorFactory.writableTimestampObjectInspector.set(object, timestamp); + return object; + } else { + return null; + } + } + } + + private abstract class AbstractDoubleExtractor extends Extractor { + + protected DoubleColumnVector colVector; + protected double[] vector; + + AbstractDoubleExtractor(int columnIndex) { + super(columnIndex); + } + + @Override + void setColumnVector(VectorizedRowBatch batch) { + colVector = (DoubleColumnVector) batch.cols[columnIndex]; + vector = colVector.vector; + } + + @Override + void forgetColumnVector() { + colVector = null; + vector = null; + } + } + + private class FloatExtractor extends AbstractDoubleExtractor { + + FloatExtractor(int columnIndex) { + super(columnIndex); + object = PrimitiveObjectInspectorFactory.writableFloatObjectInspector.create(0f); + } + + @Override + Object extract(int batchIndex) { + int adjustedIndex = (colVector.isRepeating ? 
0 : batchIndex); + if (colVector.noNulls || !colVector.isNull[adjustedIndex]) { + double value = vector[adjustedIndex]; + PrimitiveObjectInspectorFactory.writableFloatObjectInspector.set(object, (float) value); + return object; + } else { + return null; + } + } + } + + private class DoubleExtractor extends AbstractDoubleExtractor { + + DoubleExtractor(int columnIndex) { + super(columnIndex); + object = PrimitiveObjectInspectorFactory.writableDoubleObjectInspector.create(0f); + } + + @Override + Object extract(int batchIndex) { + int adjustedIndex = (colVector.isRepeating ? 0 : batchIndex); + if (colVector.noNulls || !colVector.isNull[adjustedIndex]) { + double value = vector[adjustedIndex]; + PrimitiveObjectInspectorFactory.writableDoubleObjectInspector.set(object, value); + return object; + } else { + return null; + } + } + } + + private abstract class AbstractBytesExtractor extends Extractor { + + protected BytesColumnVector colVector; + + AbstractBytesExtractor(int columnIndex) { + super(columnIndex); + } + + @Override + void setColumnVector(VectorizedRowBatch batch) { + colVector = (BytesColumnVector) batch.cols[columnIndex]; + } + + @Override + void forgetColumnVector() { + colVector = null; + } + } + + private class BinaryExtractorByValue extends AbstractBytesExtractor { + + private DataOutputBuffer buffer; + + // Use the BytesWritable instance here as a reference to data saved in buffer. We do not + // want to pass the binary object inspector a byte[] since we would need to allocate it on the + // heap each time to get the length correct. + private BytesWritable bytesWritable; + + BinaryExtractorByValue(int columnIndex) { + super(columnIndex); + object = PrimitiveObjectInspectorFactory.writableBinaryObjectInspector.create(ArrayUtils.EMPTY_BYTE_ARRAY); + buffer = new DataOutputBuffer(); + bytesWritable = new BytesWritable(); + } + + @Override + Object extract(int batchIndex) { + int adjustedIndex = (colVector.isRepeating ? 0 : batchIndex); + if (colVector.noNulls || !colVector.isNull[adjustedIndex]) { + byte[] bytes = colVector.vector[adjustedIndex]; + int start = colVector.start[adjustedIndex]; + int length = colVector.length[adjustedIndex]; + + // Save a copy of the binary data. + buffer.reset(); + try { + buffer.write(bytes, start, length); + } catch (IOException ioe) { + throw new IllegalStateException("bad write", ioe); + } + + bytesWritable.set(buffer.getData(), 0, buffer.getLength()); + PrimitiveObjectInspectorFactory.writableBinaryObjectInspector.set(object, bytesWritable); + return object; + } else { + return null; + } + } + } + + private class StringExtractorByValue extends AbstractBytesExtractor { + + // Use org.apache.hadoop.io.Text as our helper to go from byte[] to String. + private Text text; + + StringExtractorByValue(int columnIndex) { + super(columnIndex); + object = PrimitiveObjectInspectorFactory.writableStringObjectInspector.create(StringUtils.EMPTY); + text = new Text(); + } + + @Override + Object extract(int batchIndex) { + int adjustedIndex = (colVector.isRepeating ? 0 : batchIndex); + if (colVector.noNulls || !colVector.isNull[adjustedIndex]) { + byte[] value = colVector.vector[adjustedIndex]; + int start = colVector.start[adjustedIndex]; + int length = colVector.length[adjustedIndex]; + + // Use org.apache.hadoop.io.Text as our helper to go from byte[] to String. 
+ text.set(value, start, length); + + PrimitiveObjectInspectorFactory.writableStringObjectInspector.set(object, text); + return object; + } else { + return null; + } + } + } + + private class VarCharExtractorByValue extends AbstractBytesExtractor { + + // We need our own instance of the VARCHAR object inspector to hold the maximum length + // from the TypeInfo. + private WritableHiveVarcharObjectInspector writableVarcharObjectInspector; + + // Use org.apache.hadoop.io.Text as our helper to go from byte[] to String. + private Text text; + + /* + * @param varcharTypeInfo + * We need the VARCHAR type information that contains the maximum length. + * @param columnIndex + * The vector row batch column that contains the bytes for the VARCHAR. + */ + VarCharExtractorByValue(VarcharTypeInfo varcharTypeInfo, int columnIndex) { + super(columnIndex); + writableVarcharObjectInspector = new WritableHiveVarcharObjectInspector(varcharTypeInfo); + object = writableVarcharObjectInspector.create(new HiveVarchar(StringUtils.EMPTY, -1)); + text = new Text(); + } + + @Override + Object extract(int batchIndex) { + int adjustedIndex = (colVector.isRepeating ? 0 : batchIndex); + if (colVector.noNulls || !colVector.isNull[adjustedIndex]) { + byte[] value = colVector.vector[adjustedIndex]; + int start = colVector.start[adjustedIndex]; + int length = colVector.length[adjustedIndex]; + + // Use org.apache.hadoop.io.Text as our helper to go from byte[] to String. + text.set(value, start, length); + + writableVarcharObjectInspector.set(object, text.toString()); + return object; + } else { + return null; + } + } + } + + private class CharExtractorByValue extends AbstractBytesExtractor { + + // We need our own instance of the CHAR object inspector to hold the maximum length + // from the TypeInfo. + private WritableHiveCharObjectInspector writableCharObjectInspector; + + // Use org.apache.hadoop.io.Text as our helper to go from byte[] to String. + private Text text; + + /* + * @param varcharTypeInfo + * We need the CHAR type information that contains the maximum length. + * @param columnIndex + * The vector row batch column that contains the bytes for the CHAR. + */ + CharExtractorByValue(CharTypeInfo charTypeInfo, int columnIndex) { + super(columnIndex); + writableCharObjectInspector = new WritableHiveCharObjectInspector(charTypeInfo); + object = writableCharObjectInspector.create(new HiveChar(StringUtils.EMPTY, -1)); + text = new Text(); + } + + @Override + Object extract(int batchIndex) { + int adjustedIndex = (colVector.isRepeating ? 0 : batchIndex); + if (colVector.noNulls || !colVector.isNull[adjustedIndex]) { + byte[] value = colVector.vector[adjustedIndex]; + int start = colVector.start[adjustedIndex]; + int length = colVector.length[adjustedIndex]; + + // Use org.apache.hadoop.io.Text as our helper to go from byte[] to String. + text.set(value, start, length); + + writableCharObjectInspector.set(object, text.toString()); + return object; + } else { + return null; + } + } + } + + private class DecimalExtractor extends Extractor { + + private WritableHiveDecimalObjectInspector writableDecimalObjectInspector; + protected DecimalColumnVector colVector; + + /* + * @param decimalTypeInfo + * We need the DECIMAL type information that contains scale and precision. + * @param columnIndex + * The vector row batch column that contains the bytes for the VARCHAR. 
+        text.set(value, start, length);
+
+        PrimitiveObjectInspectorFactory.writableStringObjectInspector.set(object, text);
+        return object;
+      } else {
+        return null;
+      }
+    }
+  }
+
+  private class VarCharExtractorByValue extends AbstractBytesExtractor {
+
+    // We need our own instance of the VARCHAR object inspector to hold the maximum length
+    // from the TypeInfo.
+    private WritableHiveVarcharObjectInspector writableVarcharObjectInspector;
+
+    // Use org.apache.hadoop.io.Text as our helper to go from byte[] to String.
+    private Text text;
+
+    /*
+     * @param varcharTypeInfo
+     *          We need the VARCHAR type information that contains the maximum length.
+     * @param columnIndex
+     *          The vector row batch column that contains the bytes for the VARCHAR.
+     */
+    VarCharExtractorByValue(VarcharTypeInfo varcharTypeInfo, int columnIndex) {
+      super(columnIndex);
+      writableVarcharObjectInspector = new WritableHiveVarcharObjectInspector(varcharTypeInfo);
+      object = writableVarcharObjectInspector.create(new HiveVarchar(StringUtils.EMPTY, -1));
+      text = new Text();
+    }
+
+    @Override
+    Object extract(int batchIndex) {
+      int adjustedIndex = (colVector.isRepeating ? 0 : batchIndex);
+      if (colVector.noNulls || !colVector.isNull[adjustedIndex]) {
+        byte[] value = colVector.vector[adjustedIndex];
+        int start = colVector.start[adjustedIndex];
+        int length = colVector.length[adjustedIndex];
+
+        // Use org.apache.hadoop.io.Text as our helper to go from byte[] to String.
+        text.set(value, start, length);
+
+        writableVarcharObjectInspector.set(object, text.toString());
+        return object;
+      } else {
+        return null;
+      }
+    }
+  }
+
+  private class CharExtractorByValue extends AbstractBytesExtractor {
+
+    // We need our own instance of the CHAR object inspector to hold the maximum length
+    // from the TypeInfo.
+    private WritableHiveCharObjectInspector writableCharObjectInspector;
+
+    // Use org.apache.hadoop.io.Text as our helper to go from byte[] to String.
+    private Text text;
+
+    /*
+     * @param charTypeInfo
+     *          We need the CHAR type information that contains the maximum length.
+     * @param columnIndex
+     *          The vector row batch column that contains the bytes for the CHAR.
+     */
+    CharExtractorByValue(CharTypeInfo charTypeInfo, int columnIndex) {
+      super(columnIndex);
+      writableCharObjectInspector = new WritableHiveCharObjectInspector(charTypeInfo);
+      object = writableCharObjectInspector.create(new HiveChar(StringUtils.EMPTY, -1));
+      text = new Text();
+    }
+
+    @Override
+    Object extract(int batchIndex) {
+      int adjustedIndex = (colVector.isRepeating ? 0 : batchIndex);
+      if (colVector.noNulls || !colVector.isNull[adjustedIndex]) {
+        byte[] value = colVector.vector[adjustedIndex];
+        int start = colVector.start[adjustedIndex];
+        int length = colVector.length[adjustedIndex];
+
+        // Use org.apache.hadoop.io.Text as our helper to go from byte[] to String.
+        text.set(value, start, length);
+
+        writableCharObjectInspector.set(object, text.toString());
+        return object;
+      } else {
+        return null;
+      }
+    }
+  }
+
+  private class DecimalExtractor extends Extractor {
+
+    private WritableHiveDecimalObjectInspector writableDecimalObjectInspector;
+    protected DecimalColumnVector colVector;
+
+    /*
+     * @param decimalTypeInfo
+     *          We need the DECIMAL type information that contains scale and precision.
+     * @param columnIndex
+     *          The vector row batch column that contains the decimal values.
+ */ + DecimalExtractor(DecimalTypeInfo decimalTypeInfo, int columnIndex) { + super(columnIndex); + writableDecimalObjectInspector = new WritableHiveDecimalObjectInspector(decimalTypeInfo); + object = writableDecimalObjectInspector.create(HiveDecimal.ZERO); + } + + @Override + void setColumnVector(VectorizedRowBatch batch) { + colVector = (DecimalColumnVector) batch.cols[columnIndex]; + } + + @Override + void forgetColumnVector() { + colVector = null; + } + + @Override + Object extract(int batchIndex) { + int adjustedIndex = (colVector.isRepeating ? 0 : batchIndex); + if (colVector.noNulls || !colVector.isNull[adjustedIndex]) { + HiveDecimal value = colVector.vector[adjustedIndex].getHiveDecimal(); + writableDecimalObjectInspector.set(object, value); + return object; + } else { + return null; + } + } + } + + private Extractor createExtractor(PrimitiveTypeInfo primitiveTypeInfo, int columnIndex) throws HiveException { + PrimitiveCategory primitiveCategory = primitiveTypeInfo.getPrimitiveCategory(); + Extractor extracter; + switch (primitiveCategory) { + case VOID: + extracter = new VoidExtractor(columnIndex); + break; + case BOOLEAN: + extracter = new BooleanExtractor(columnIndex); + break; + case BYTE: + extracter = new ByteExtractor(columnIndex); + break; + case SHORT: + extracter = new ShortExtractor(columnIndex); + break; + case INT: + extracter = new IntExtractor(columnIndex); + break; + case LONG: + extracter = new LongExtractor(columnIndex); + break; + case TIMESTAMP: + extracter = new TimestampExtractor(columnIndex); + break; + case DATE: + extracter = new DateExtractor(columnIndex); + break; + case FLOAT: + extracter = new FloatExtractor(columnIndex); + break; + case DOUBLE: + extracter = new DoubleExtractor(columnIndex); + break; + case BINARY: + extracter = new BinaryExtractorByValue(columnIndex); + break; + case STRING: + extracter = new StringExtractorByValue(columnIndex); + break; + case VARCHAR: + extracter = new VarCharExtractorByValue((VarcharTypeInfo) primitiveTypeInfo, columnIndex); + break; + case CHAR: + extracter = new CharExtractorByValue((CharTypeInfo) primitiveTypeInfo, columnIndex); + break; + case DECIMAL: + extracter = new DecimalExtractor((DecimalTypeInfo) primitiveTypeInfo, columnIndex); + break; + default: + throw new HiveException("No vector row extracter for primitive category " + + primitiveCategory); + } + return extracter; + } + + Extractor[] extracters; + + public void init(StructObjectInspector structObjectInspector, List projectedColumns) throws HiveException { + + extracters = new Extractor[projectedColumns.size()]; + + List fields = structObjectInspector.getAllStructFieldRefs(); + + int i = 0; + for (StructField field : fields) { + int columnIndex = projectedColumns.get(i); + ObjectInspector fieldInspector = field.getFieldObjectInspector(); + PrimitiveTypeInfo primitiveTypeInfo = (PrimitiveTypeInfo) TypeInfoUtils.getTypeInfoFromTypeString( + fieldInspector.getTypeName()); + extracters[i] = createExtractor(primitiveTypeInfo, columnIndex); + i++; + } + } + + public void init(List typeNames) throws HiveException { + + extracters = new Extractor[typeNames.size()]; + + int i = 0; + for (String typeName : typeNames) { + PrimitiveTypeInfo primitiveTypeInfo = (PrimitiveTypeInfo) TypeInfoUtils.getTypeInfoFromTypeString(typeName); + extracters[i] = createExtractor(primitiveTypeInfo, i); + i++; + } + } + + public int getCount() { + return extracters.length; + } + + protected void setBatch(VectorizedRowBatch batch) throws HiveException { + + for (int i = 0; i < 
extracters.length; i++) { + Extractor extracter = extracters[i]; + int columnIndex = extracter.getColumnIndex(); + if (batch.cols[columnIndex] == null) { + if (tolerateNullColumns) { + // Replace with void... + extracter = new VoidExtractor(columnIndex); + extracters[i] = extracter; + } else { + throw new HiveException("Unexpected null vector column " + columnIndex); + } + } + extracter.setColumnVector(batch); + } + } + + public Object extractRowColumn(int batchIndex, int logicalColumnIndex) { + return extracters[logicalColumnIndex].extract(batchIndex); + } + + public void extractRow(int batchIndex, Object[] objects) { + int i = 0; + for (Extractor extracter : extracters) { + objects[i++] = extracter.extract(batchIndex); + } + } +} \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorExtractRowSameBatch.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorExtractRowSameBatch.java new file mode 100644 index 0000000..faec0aa --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorExtractRowSameBatch.java @@ -0,0 +1,36 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.vector; + +import org.apache.hadoop.hive.ql.metadata.HiveException; + +/** + * This class extracts specified VectorizedRowBatch row columns into a Writable row Object[]. + * + * The caller provides the hive type names and target column numbers in the order desired to + * extract from the Writable row Object[]. + * + * This class is for use when the batch being assigned is always the same. 
+ */ +public class VectorExtractRowSameBatch extends VectorExtractRow { + + public void setOneBatch(VectorizedRowBatch batch) throws HiveException { + setBatch(batch); + } +} \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorFileSinkOperator.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorFileSinkOperator.java index 858604c..a1006d3 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorFileSinkOperator.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorFileSinkOperator.java @@ -20,12 +20,9 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hive.ql.exec.FileSinkOperator; -import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpressionWriter; -import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpressionWriterFactory; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.plan.FileSinkDesc; import org.apache.hadoop.hive.ql.plan.OperatorDesc; -import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; /** @@ -35,14 +32,23 @@ private static final long serialVersionUID = 1L; - protected transient Object[] singleRow; + private VectorizationContext vContext; + + // The above members are initialized by the constructor and must not be + // transient. + //--------------------------------------------------------------------------- - protected transient VectorExpressionWriter[] valueWriters; + private transient boolean firstBatch; - public VectorFileSinkOperator(VectorizationContext context, + private transient VectorExtractRowSameBatch vectorExtractRowSameBatch; + + protected transient Object[] singleRow; + + public VectorFileSinkOperator(VectorizationContext vContext, OperatorDesc conf) { super(); this.conf = (FileSinkDesc) conf; + this.vContext = vContext; } public VectorFileSinkOperator() { @@ -53,41 +59,40 @@ public VectorFileSinkOperator() { protected void initializeOp(Configuration hconf) throws HiveException { // We need a input object inspector that is for the row we will extract out of the // vectorized row batch, not for example, an original inspector for an ORC table, etc. - VectorExpressionWriterFactory.processVectorInspector( - (StructObjectInspector) inputObjInspectors[0], - new VectorExpressionWriterFactory.SingleOIDClosure() { - @Override - public void assign(VectorExpressionWriter[] writers, - ObjectInspector objectInspector) { - valueWriters = writers; - inputObjInspectors[0] = objectInspector; - } - }); - singleRow = new Object[valueWriters.length]; + inputObjInspectors[0] = + VectorizedBatchUtil.convertToStandardStructObjectInspector((StructObjectInspector) inputObjInspectors[0]); // Call FileSinkOperator with new input inspector. 
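A hedged usage sketch of the same-batch extractor: initialize it against the row-mode inspector, bind it once to the batch with setOneBatch(), and then copy rows out with extractRow(). Here rowInspector, projectedColumns, and processRow stand for whatever the enclosing operator provides, and exception handling is omitted; only the VectorExtractRowSameBatch calls themselves come from this patch.

// Illustrative fragment (would live inside a method that throws HiveException).
VectorExtractRowSameBatch extractRow = new VectorExtractRowSameBatch();
extractRow.init(rowInspector, projectedColumns);   // row-mode StructObjectInspector plus column mapping
extractRow.setOneBatch(batch);                     // the operator keeps reusing this one batch object
Object[] singleRow = new Object[extractRow.getCount()];

for (int batchIndex = 0; batchIndex < batch.size; batchIndex++) {
  extractRow.extractRow(batchIndex, singleRow);    // the writables in singleRow are reused per row
  processRow(singleRow);                           // e.g. hand the row to a row-mode operator
}
// (selected[] handling is omitted here; the processOp change below shows the full idiom.)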
super.initializeOp(hconf); + + firstBatch = true; } @Override public void processOp(Object data, int tag) throws HiveException { - VectorizedRowBatch vrg = (VectorizedRowBatch)data; - for (int i = 0; i < vrg.size; i++) { - Object[] row = getRowObject(vrg, i); - super.processOp(row, tag); - } - } + VectorizedRowBatch batch = (VectorizedRowBatch) data; + if (firstBatch) { + vectorExtractRowSameBatch = new VectorExtractRowSameBatch(); + vectorExtractRowSameBatch.init((StructObjectInspector) inputObjInspectors[0], vContext.getProjectedColumns()); + vectorExtractRowSameBatch.setOneBatch(batch); - private Object[] getRowObject(VectorizedRowBatch vrg, int rowIndex) - throws HiveException { - int batchIndex = rowIndex; - if (vrg.selectedInUse) { - batchIndex = vrg.selected[rowIndex]; + singleRow = new Object[vectorExtractRowSameBatch.getCount()]; + + firstBatch = false; } - for (int i = 0; i < vrg.projectionSize; i++) { - ColumnVector vectorColumn = vrg.cols[vrg.projectedColumns[i]]; - singleRow[i] = valueWriters[i].writeValue(vectorColumn, batchIndex); + + if (batch.selectedInUse) { + int selected[] = batch.selected; + for (int logical = 0 ; logical < batch.size; logical++) { + int batchIndex = selected[logical]; + vectorExtractRowSameBatch.extractRow(batchIndex, singleRow); + super.processOp(singleRow, tag); + } + } else { + for (int batchIndex = 0 ; batchIndex < batch.size; batchIndex++) { + vectorExtractRowSameBatch.extractRow(batchIndex, singleRow); + super.processOp(singleRow, tag); + } } - return singleRow; } } diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorGroupByOperator.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorGroupByOperator.java index 402b324..d348845 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorGroupByOperator.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorGroupByOperator.java @@ -99,7 +99,7 @@ private transient VectorizedRowBatch outputBatch; private transient VectorizedRowBatchCtx vrbCtx; - private transient VectorColumnAssign[] vectorColumnAssign; + private transient VectorAssignRowSameBatch vectorAssignRowSameBatch; /** * Interface for processing mode: global, hash, unsorted streaming, or group batch @@ -753,8 +753,7 @@ public VectorGroupByOperator(VectorizationContext vContext, OperatorDesc conf) isVectorOutput = desc.getVectorDesc().isVectorOutput(); - vOutContext = new VectorizationContext(desc.getOutputColumnNames()); - vOutContext.setFileKey(vContext.getFileKey() + "/_GROUPBY_"); + vOutContext = new VectorizationContext(getName(), desc.getOutputColumnNames()); } public VectorGroupByOperator() { @@ -799,11 +798,12 @@ protected void initializeOp(Configuration hconf) throws HiveException { outputObjInspector = ObjectInspectorFactory.getStandardStructObjectInspector( outputFieldNames, objectInspectors); if (isVectorOutput) { - vrbCtx = new VectorizedRowBatchCtx(); - vrbCtx.init(vOutContext.getScratchColumnTypeMap(), (StructObjectInspector) outputObjInspector); - outputBatch = vrbCtx.createVectorizedRowBatch(); - vectorColumnAssign = VectorColumnAssignFactory.buildAssigners( - outputBatch, outputObjInspector, vOutContext.getProjectionColumnMap(), conf.getOutputColumnNames()); + vrbCtx = new VectorizedRowBatchCtx(); + vrbCtx.init(vOutContext.getScratchColumnTypeMap(), (StructObjectInspector) outputObjInspector); + outputBatch = vrbCtx.createVectorizedRowBatch(); + vectorAssignRowSameBatch = new VectorAssignRowSameBatch(); + vectorAssignRowSameBatch.init((StructObjectInspector) outputObjInspector, 
vOutContext.getProjectedColumns()); + vectorAssignRowSameBatch.setOneBatch(outputBatch); } } catch (HiveException he) { @@ -890,12 +890,12 @@ private void writeSingleRow(VectorHashKeyWrapper kw, VectorAggregationBufferRow } else { // Output keys and aggregates into the output batch. for (int i = 0; i < outputKeyLength; ++i) { - vectorColumnAssign[fi++].assignObjectValue(keyWrappersBatch.getWritableKeyValue ( - kw, i, keyOutputWriters[i]), outputBatch.size); + vectorAssignRowSameBatch.assignRowColumn(outputBatch.size, fi++, + keyWrappersBatch.getWritableKeyValue (kw, i, keyOutputWriters[i])); } for (int i = 0; i < aggregators.length; ++i) { - vectorColumnAssign[fi++].assignObjectValue(aggregators[i].evaluateOutput( - agg.getAggregationBuffer(i)), outputBatch.size); + vectorAssignRowSameBatch.assignRowColumn(outputBatch.size, fi++, + aggregators[i].evaluateOutput(agg.getAggregationBuffer(i))); } ++outputBatch.size; if (outputBatch.size == VectorizedRowBatch.DEFAULT_SIZE) { @@ -915,8 +915,8 @@ private void writeGroupRow(VectorAggregationBufferRow agg, DataOutputBuffer buff throws HiveException { int fi = outputKeyLength; // Start after group keys. for (int i = 0; i < aggregators.length; ++i) { - vectorColumnAssign[fi++].assignObjectValue(aggregators[i].evaluateOutput( - agg.getAggregationBuffer(i)), outputBatch.size); + vectorAssignRowSameBatch.assignRowColumn(outputBatch.size, fi++, + aggregators[i].evaluateOutput(agg.getAggregationBuffer(i))); } ++outputBatch.size; if (outputBatch.size == VectorizedRowBatch.DEFAULT_SIZE) { diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorMapJoinOperator.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorMapJoinOperator.java index 2c8aee1..b83a53e 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorMapJoinOperator.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorMapJoinOperator.java @@ -65,7 +65,7 @@ private transient VectorizedRowBatch outputBatch; private transient VectorExpressionWriter[] valueWriters; - private transient Map outputVectorAssigners; + private transient Map outputVectorAssignRowMap; // These members are used as out-of-band params // for the inner-loop supper.processOp callbacks @@ -107,8 +107,7 @@ public VectorMapJoinOperator (VectorizationContext vContext, OperatorDesc conf) bigTableValueExpressions = vContext.getVectorExpressions(exprs.get(posBigTable)); // We are making a new output vectorized row batch. 
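The reworked processOp above visits rows in one of two ways: through the selected array when selectedInUse is set, or densely from 0 to size otherwise. The same idiom recurs across the vectorized operators; a minimal sketch, with handleRow standing in for whatever per-row work the operator does:

// selectedInUse: selected[0..size) maps logical positions to physical row indexes.
// Otherwise the first `size` rows of the batch are the live rows.
void forEachRow(VectorizedRowBatch batch) throws HiveException {
  if (batch.selectedInUse) {
    int[] selected = batch.selected;
    for (int logical = 0; logical < batch.size; logical++) {
      handleRow(batch, selected[logical]);
    }
  } else {
    for (int batchIndex = 0; batchIndex < batch.size; batchIndex++) {
      handleRow(batch, batchIndex);
    }
  }
}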
- vOutContext = new VectorizationContext(desc.getOutputColumnNames()); - vOutContext.setFileKey(vContext.getFileKey() + "/MAP_JOIN_" + desc.getBigTableAlias()); + vOutContext = new VectorizationContext(getName(), desc.getOutputColumnNames()); } @Override @@ -177,7 +176,7 @@ protected Object _evaluate(Object row, int version) throws HiveException { // Filtering is handled in the input batch processing filterMaps[posBigTable] = null; - outputVectorAssigners = new HashMap(); + outputVectorAssignRowMap = new HashMap(); } /** @@ -186,15 +185,16 @@ protected Object _evaluate(Object row, int version) throws HiveException { @Override protected void internalForward(Object row, ObjectInspector outputOI) throws HiveException { Object[] values = (Object[]) row; - VectorColumnAssign[] vcas = outputVectorAssigners.get(outputOI); - if (null == vcas) { - vcas = VectorColumnAssignFactory.buildAssigners( - outputBatch, outputOI, vOutContext.getProjectionColumnMap(), conf.getOutputColumnNames()); - outputVectorAssigners.put(outputOI, vcas); - } - for (int i=0; i outputVectorAssigners; + private transient Map outputVectorAssignRowMap; private transient int batchIndex = -1; @@ -115,10 +115,9 @@ public VectorSMBMapJoinOperator(VectorizationContext vContext, OperatorDesc conf bigTableValueExpressions = vContext.getVectorExpressions(exprs.get(posBigTable)); // We are making a new output vectorized row batch. - vOutContext = new VectorizationContext(desc.getOutputColumnNames()); - vOutContext.setFileKey(vContext.getFileKey() + "/SMB_JOIN_" + desc.getBigTableAlias()); + vOutContext = new VectorizationContext(getName(), desc.getOutputColumnNames()); } - + @Override protected List smbJoinComputeKeys(Object row, byte alias) throws HiveException { if (alias == this.posBigTable) { @@ -140,7 +139,7 @@ protected void initializeOp(Configuration hconf) throws HiveException { keyWrapperBatch = VectorHashKeyWrapperBatch.compileKeyWrapperBatch(keyExpressions); - outputVectorAssigners = new HashMap(); + outputVectorAssignRowMap = new HashMap(); // This key evaluator translates from the vectorized VectorHashKeyWrapper format // into the row-mode MapJoinKey @@ -269,15 +268,16 @@ public void closeOp(boolean aborted) throws HiveException { @Override protected void internalForward(Object row, ObjectInspector outputOI) throws HiveException { Object[] values = (Object[]) row; - VectorColumnAssign[] vcas = outputVectorAssigners.get(outputOI); - if (null == vcas) { - vcas = VectorColumnAssignFactory.buildAssigners( - outputBatch, outputOI, vOutContext.getProjectionColumnMap(), conf.getOutputColumnNames()); - outputVectorAssigners.put(outputOI, vcas); - } - for (int i = 0; i < values.length; ++i) { - vcas[i].assignObjectValue(values[i], outputBatch.size); + VectorAssignRowSameBatch va = outputVectorAssignRowMap.get(outputOI); + if (va == null) { + va = new VectorAssignRowSameBatch(); + va.init((StructObjectInspector) outputOI, vOutContext.getProjectedColumns()); + va.setOneBatch(outputBatch); + outputVectorAssignRowMap.put(outputOI, va); } + + va.assignRow(outputBatch.size, values); + ++outputBatch.size; if (outputBatch.size == VectorizedRowBatch.DEFAULT_SIZE) { flushOutput(); diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorSelectOperator.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorSelectOperator.java index 458dc5a..8bfcab7 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorSelectOperator.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorSelectOperator.java @@ -65,10 
+65,7 @@ public VectorSelectOperator(VectorizationContext vContext, OperatorDesc conf) * Create a new vectorization context to create a new projection, but keep * same output column manager must be inherited to track the scratch the columns. */ - vOutContext = new VectorizationContext(vContext); - - // Set a fileKey, although this operator doesn't use it. - vOutContext.setFileKey(vContext.getFileKey() + "/_SELECT_"); + vOutContext = new VectorizationContext(getName(), vContext); vOutContext.resetProjectionColumns(); for (int i=0; i < colList.size(); ++i) { diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorSerializeRow.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorSerializeRow.java new file mode 100644 index 0000000..cb025ba --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorSerializeRow.java @@ -0,0 +1,523 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.vector; + +import java.sql.Timestamp; +import java.util.List; + +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.serde2.ByteStream.Output; +import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category; +import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory; +import org.apache.hadoop.hive.serde2.serializewrite.SerializeWrite; +import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils; + +/** + * This class serializes columns from a row in a VectorizedRowBatch into a serialization format. + * + * The caller provides the hive type names and column numbers in the order desired to + * serialize. + * + * This class uses an provided SerializeWrite object to directly serialize by writing + * field-by-field into a serialization format from the primitive values of the VectorizedRowBatch. + * + * Note that when serializing a row, the logical mapping using selected in use has already + * been performed. + */ +public class VectorSerializeRow { + + private SerializeWrite serializeWrite; + + public VectorSerializeRow(SerializeWrite serializeWrite) { + this(); + this.serializeWrite = serializeWrite; + } + + // Not public since we must have the serialize write object. 
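A hedged usage sketch of this serializer, relying on the setOutput() and serializeWrite() entry points defined further down in the class: serializeWrite is assumed to be some concrete SerializeWrite for the target format, output a caller-owned ByteStream.Output, and exception handling is omitted.

// Illustrative fragment: append one batch row to `output` in the chosen wire format.
VectorSerializeRow serializeRow = new VectorSerializeRow(serializeWrite);
serializeRow.init(typeNames);                 // one Writer per column, in serialization order
serializeRow.setOutput(output);
boolean anyNulls = serializeRow.serializeWrite(batch, batchIndex);
// batchIndex is already a physical row index (any selected[] mapping was applied earlier);
// anyNulls reports whether any field of this row was written as NULL.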
+ private VectorSerializeRow() { + } + + private abstract class Writer { + protected int columnIndex; + + Writer(int columnIndex) { + this.columnIndex = columnIndex; + } + + abstract boolean apply(VectorizedRowBatch batch, int batchIndex); + } + + private abstract class AbstractLongWriter extends Writer { + + AbstractLongWriter(int columnIndex) { + super(columnIndex); + } + } + + private class BooleanWriter extends AbstractLongWriter { + + BooleanWriter(int columnIndex) { + super(columnIndex); + } + + @Override + boolean apply(VectorizedRowBatch batch, int batchIndex) { + LongColumnVector colVector = (LongColumnVector) batch.cols[columnIndex]; + + if (colVector.isRepeating) { + if (colVector.noNulls || !colVector.isNull[0]) { + serializeWrite.writeBoolean(colVector.vector[0] != 0); + return true; + } else { + serializeWrite.writeNull(); + return false; + } + } else { + if (colVector.noNulls || !colVector.isNull[batchIndex]) { + serializeWrite.writeBoolean(colVector.vector[batchIndex] != 0); + return true; + } else { + serializeWrite.writeNull(); + return false; + } + } + } + } + + private class ByteWriter extends AbstractLongWriter { + + ByteWriter(int columnIndex) { + super(columnIndex); + } + + @Override + boolean apply(VectorizedRowBatch batch, int batchIndex) { + LongColumnVector colVector = (LongColumnVector) batch.cols[columnIndex]; + + if (colVector.isRepeating) { + if (colVector.noNulls || !colVector.isNull[0]) { + serializeWrite.writeByte((byte) colVector.vector[0]); + return true; + } else { + serializeWrite.writeNull(); + return false; + } + } else { + if (colVector.noNulls || !colVector.isNull[batchIndex]) { + serializeWrite.writeByte((byte) colVector.vector[batchIndex]); + return true; + } else { + serializeWrite.writeNull(); + return false; + } + } + } + } + + private class ShortWriter extends AbstractLongWriter { + + ShortWriter(int columnIndex) { + super(columnIndex); + } + + @Override + boolean apply(VectorizedRowBatch batch, int batchIndex) { + LongColumnVector colVector = (LongColumnVector) batch.cols[columnIndex]; + + if (colVector.isRepeating) { + if (colVector.noNulls || !colVector.isNull[0]) { + serializeWrite.writeShort((short) colVector.vector[0]); + return true; + } else { + serializeWrite.writeNull(); + return false; + } + } else { + if (colVector.noNulls || !colVector.isNull[batchIndex]) { + serializeWrite.writeShort((short) colVector.vector[batchIndex]); + return true; + } else { + serializeWrite.writeNull(); + return false; + } + } + } + } + + private class IntWriter extends AbstractLongWriter { + + IntWriter(int columnIndex) { + super(columnIndex); + } + + @Override + boolean apply(VectorizedRowBatch batch, int batchIndex) { + LongColumnVector colVector = (LongColumnVector) batch.cols[columnIndex]; + + if (colVector.isRepeating) { + if (colVector.noNulls || !colVector.isNull[0]) { + serializeWrite.writeInt((int) colVector.vector[0]); + return true; + } else { + serializeWrite.writeNull(); + return false; + } + } else { + if (colVector.noNulls || !colVector.isNull[batchIndex]) { + serializeWrite.writeInt((int) colVector.vector[batchIndex]); + return true; + } else { + serializeWrite.writeNull(); + return false; + } + } + } + } + + private class LongWriter extends AbstractLongWriter { + + LongWriter(int columnIndex) { + super(columnIndex); + } + + @Override + boolean apply(VectorizedRowBatch batch, int batchIndex) { + LongColumnVector colVector = (LongColumnVector) batch.cols[columnIndex]; + + if (colVector.isRepeating) { + if (colVector.noNulls || 
!colVector.isNull[0]) { + serializeWrite.writeLong(colVector.vector[0]); + return true; + } else { + serializeWrite.writeNull(); + return false; + } + } else { + if (colVector.noNulls || !colVector.isNull[batchIndex]) { + serializeWrite.writeLong(colVector.vector[batchIndex]); + return true; + } else { + serializeWrite.writeNull(); + return false; + } + } + } + } + + private class DateWriter extends AbstractLongWriter { + + DateWriter(int columnIndex) { + super(columnIndex); + } + + @Override + boolean apply(VectorizedRowBatch batch, int batchIndex) { + LongColumnVector colVector = (LongColumnVector) batch.cols[columnIndex]; + + if (colVector.isRepeating) { + if (colVector.noNulls || !colVector.isNull[0]) { + serializeWrite.writeDate((int) colVector.vector[0]); + return true; + } else { + serializeWrite.writeNull(); + return false; + } + } else { + if (colVector.noNulls || !colVector.isNull[batchIndex]) { + serializeWrite.writeDate((int) colVector.vector[batchIndex]); + return true; + } else { + serializeWrite.writeNull(); + return false; + } + } + } + } + + private class TimestampWriter extends AbstractLongWriter { + + Timestamp scratchTimestamp; + + TimestampWriter(int columnIndex) { + super(columnIndex); + scratchTimestamp = new Timestamp(0); + } + + @Override + boolean apply(VectorizedRowBatch batch, int batchIndex) { + LongColumnVector colVector = (LongColumnVector) batch.cols[columnIndex]; + + if (colVector.isRepeating) { + if (colVector.noNulls || !colVector.isNull[0]) { + TimestampUtils.assignTimeInNanoSec(colVector.vector[0], scratchTimestamp); + serializeWrite.writeTimestamp(scratchTimestamp); + return true; + } else { + serializeWrite.writeNull(); + return false; + } + } else { + if (colVector.noNulls || !colVector.isNull[batchIndex]) { + TimestampUtils.assignTimeInNanoSec(colVector.vector[batchIndex], scratchTimestamp); + serializeWrite.writeTimestamp(scratchTimestamp); + return true; + } else { + serializeWrite.writeNull(); + return false; + } + } + } + } + + private abstract class AbstractDoubleWriter extends Writer { + + AbstractDoubleWriter(int columnIndex) { + super(columnIndex); + } + } + + private class FloatWriter extends AbstractDoubleWriter { + + FloatWriter(int columnIndex) { + super(columnIndex); + } + + @Override + boolean apply(VectorizedRowBatch batch, int batchIndex) { + DoubleColumnVector colVector = (DoubleColumnVector) batch.cols[columnIndex]; + + if (colVector.isRepeating) { + if (colVector.noNulls || !colVector.isNull[0]) { + serializeWrite.writeFloat((float) colVector.vector[0]); + return true; + } else { + serializeWrite.writeNull(); + return false; + } + } else { + if (colVector.noNulls || !colVector.isNull[batchIndex]) { + serializeWrite.writeFloat((float) colVector.vector[batchIndex]); + return true; + } else { + serializeWrite.writeNull(); + return false; + } + } + } + } + + private class DoubleWriter extends AbstractDoubleWriter { + + DoubleWriter(int columnIndex) { + super(columnIndex); + } + + @Override + boolean apply(VectorizedRowBatch batch, int batchIndex) { + DoubleColumnVector colVector = (DoubleColumnVector) batch.cols[columnIndex]; + + if (colVector.isRepeating) { + if (colVector.noNulls || !colVector.isNull[0]) { + serializeWrite.writeDouble(colVector.vector[0]); + return true; + } else { + serializeWrite.writeNull(); + return false; + } + } else { + if (colVector.noNulls || !colVector.isNull[batchIndex]) { + serializeWrite.writeDouble(colVector.vector[batchIndex]); + return true; + } else { + serializeWrite.writeNull(); + return false; + 
} + } + } + } + + private class BytesWriter extends Writer { + + BytesWriter(int columnIndex) { + super(columnIndex); + } + + @Override + boolean apply(VectorizedRowBatch batch, int batchIndex) { + BytesColumnVector colVector = (BytesColumnVector) batch.cols[columnIndex]; + + if (colVector.isRepeating) { + if (colVector.noNulls || !colVector.isNull[0]) { + serializeWrite.writeBytes(colVector.vector[0], colVector.start[0], colVector.length[0]); + return true; + } else { + serializeWrite.writeNull(); + return false; + } + } else { + if (colVector.noNulls || !colVector.isNull[batchIndex]) { + serializeWrite.writeBytes(colVector.vector[batchIndex], + colVector.start[batchIndex], colVector.length[batchIndex]); + return true; + } else { + serializeWrite.writeNull(); + return false; + } + } + } + } + + private class HiveDecimalWriter extends Writer { + protected HiveDecimalWritable[] vector; + + HiveDecimalWriter(int columnIndex) { + super(columnIndex); + } + + @Override + boolean apply(VectorizedRowBatch batch, int batchIndex) { + DecimalColumnVector colVector = (DecimalColumnVector) batch.cols[columnIndex]; + + if (colVector.isRepeating) { + if (colVector.noNulls || !colVector.isNull[0]) { + serializeWrite.writeHiveDecimal(colVector.vector[0].getHiveDecimal()); + return true; + } else { + serializeWrite.writeNull(); + return false; + } + } else { + if (colVector.noNulls || !colVector.isNull[batchIndex]) { + serializeWrite.writeHiveDecimal(colVector.vector[batchIndex].getHiveDecimal()); + return true; + } else { + serializeWrite.writeNull(); + return false; + } + } + } + } + + private Writer[] writers; + + private Writer createWriter(TypeInfo typeInfo, int columnIndex) throws HiveException { + Writer writer; + Category category = typeInfo.getCategory(); + switch (category) { + case PRIMITIVE: + { + PrimitiveTypeInfo primitiveTypeInfo = (PrimitiveTypeInfo) typeInfo; + PrimitiveCategory primitiveCategory = primitiveTypeInfo.getPrimitiveCategory(); + switch (primitiveCategory) { + // case VOID: + // UNDONE: + // break; + case BOOLEAN: + writer = new BooleanWriter(columnIndex); + break; + case BYTE: + writer = new ByteWriter(columnIndex); + break; + case SHORT: + writer = new ShortWriter(columnIndex); + break; + case INT: + writer = new IntWriter(columnIndex); + break; + case LONG: + writer = new LongWriter(columnIndex); + break; + case DATE: + writer = new DateWriter(columnIndex); + break; + case TIMESTAMP: + writer = new TimestampWriter(columnIndex); + break; + case FLOAT: + writer = new FloatWriter(columnIndex); + break; + case DOUBLE: + writer = new DoubleWriter(columnIndex); + break; + case STRING: + case CHAR: + case VARCHAR: + case BINARY: + writer = new BytesWriter(columnIndex); + break; + case DECIMAL: + writer = new HiveDecimalWriter(columnIndex); + break; + default: + throw new HiveException("Unexpected primitive type category " + primitiveCategory); + } + } + break; + default: + throw new HiveException("Unexpected type category " + category); + } + return writer; + } + + public void init(List typeNames, int[] columnMap) throws HiveException { + writers = new Writer[typeNames.size()]; + for (int i = 0; i < typeNames.size(); i++) { + String typeName = typeNames.get(i); + TypeInfo typeInfo = TypeInfoUtils.getTypeInfoFromTypeString(typeName); + int columnIndex = columnMap[i]; + Writer writer = createWriter(typeInfo, columnIndex); + writers[i] = writer; + } + } + + public void init(List typeNames) throws HiveException { + writers = new Writer[typeNames.size()]; + for (int i = 0; i < 
typeNames.size(); i++) { + String typeName = typeNames.get(i); + TypeInfo typeInfo = TypeInfoUtils.getTypeInfoFromTypeString(typeName); + Writer writer = createWriter(typeInfo, i); + writers[i] = writer; + } + } + + public int getCount() { + return writers.length; + } + + public void setOutput(Output output) { + serializeWrite.set(output); + } + + /* + * Note that when serializing a row, the logical mapping using selected in use has already + * been performed. batchIndex is the actual index of the row. + */ + public boolean serializeWrite(VectorizedRowBatch batch, int batchIndex) { + boolean anyNulls = false; + for (Writer writer : writers) { + if (!writer.apply(batch, batchIndex)) { + anyNulls = true; + } + } + return anyNulls; + } +} \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorSerializeRowNoNulls.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorSerializeRowNoNulls.java new file mode 100644 index 0000000..e04ee03 --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorSerializeRowNoNulls.java @@ -0,0 +1,338 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.vector; + +import java.sql.Timestamp; +import java.util.List; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.serde2.ByteStream.Output; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category; +import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory; +import org.apache.hadoop.hive.serde2.serializewrite.SerializeWrite; +import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils; + +/** + * This class serializes columns from a row in a VectorizedRowBatch into a serialization format. + * + * The caller provides the hive type names and column numbers in the order desired to + * serialize. + * + * This class uses an provided SerializeWrite object to directly serialize by writing + * field-by-field into a serialization format from the primitive values of the VectorizedRowBatch. + * + * Note that when serializing a row, the logical mapping using selected in use has already + * been performed. + * + * NOTE: This class is a variation of VectorSerializeRow for serialization of columns that + * have no nulls. 
+ */ +public class VectorSerializeRowNoNulls { + private static final Log LOG = LogFactory.getLog(VectorSerializeRowNoNulls.class.getName()); + + private SerializeWrite serializeWrite; + + public VectorSerializeRowNoNulls(SerializeWrite serializeWrite) { + this(); + this.serializeWrite = serializeWrite; + } + + // Not public since we must have the serialize write object. + private VectorSerializeRowNoNulls() { + } + + private abstract class Writer { + protected int columnIndex; + + Writer(int columnIndex) { + this.columnIndex = columnIndex; + } + + abstract void apply(VectorizedRowBatch batch, int batchIndex); + } + + private abstract class AbstractLongWriter extends Writer { + + AbstractLongWriter(int columnIndex) { + super(columnIndex); + } + } + + private class BooleanWriter extends AbstractLongWriter { + + BooleanWriter(int columnIndex) { + super(columnIndex); + } + + @Override + void apply(VectorizedRowBatch batch, int batchIndex) { + LongColumnVector colVector = (LongColumnVector) batch.cols[columnIndex]; + serializeWrite.writeBoolean(colVector.vector[colVector.isRepeating ? 0 : batchIndex] != 0); + } + } + + private class ByteWriter extends AbstractLongWriter { + + ByteWriter(int columnIndex) { + super(columnIndex); + } + + @Override + void apply(VectorizedRowBatch batch, int batchIndex) { + LongColumnVector colVector = (LongColumnVector) batch.cols[columnIndex]; + serializeWrite.writeByte((byte) colVector.vector[colVector.isRepeating ? 0 : batchIndex]); + } + } + + private class ShortWriter extends AbstractLongWriter { + + ShortWriter(int columnIndex) { + super(columnIndex); + } + + @Override + void apply(VectorizedRowBatch batch, int batchIndex) { + LongColumnVector colVector = (LongColumnVector) batch.cols[columnIndex]; + serializeWrite.writeShort((short) colVector.vector[colVector.isRepeating ? 0 : batchIndex]); + } + } + + private class IntWriter extends AbstractLongWriter { + + IntWriter(int columnIndex) { + super(columnIndex); + } + + @Override + void apply(VectorizedRowBatch batch, int batchIndex) { + LongColumnVector colVector = (LongColumnVector) batch.cols[columnIndex]; + serializeWrite.writeInt((int) colVector.vector[colVector.isRepeating ? 0 : batchIndex]); + } + } + + private class LongWriter extends AbstractLongWriter { + + LongWriter(int columnIndex) { + super(columnIndex); + } + + @Override + void apply(VectorizedRowBatch batch, int batchIndex) { + LongColumnVector colVector = (LongColumnVector) batch.cols[columnIndex]; + serializeWrite.writeLong(colVector.vector[colVector.isRepeating ? 0 : batchIndex]); + } + } + + private class DateWriter extends AbstractLongWriter { + + DateWriter(int columnIndex) { + super(columnIndex); + } + + @Override + void apply(VectorizedRowBatch batch, int batchIndex) { + LongColumnVector colVector = (LongColumnVector) batch.cols[columnIndex]; + serializeWrite.writeDate((int) colVector.vector[colVector.isRepeating ? 0 : batchIndex]); + } + } + + private class TimestampWriter extends AbstractLongWriter { + + Timestamp scratchTimestamp; + + TimestampWriter(int columnIndex) { + super(columnIndex); + scratchTimestamp = new Timestamp(0); + } + + @Override + void apply(VectorizedRowBatch batch, int batchIndex) { + LongColumnVector colVector = (LongColumnVector) batch.cols[columnIndex]; + TimestampUtils.assignTimeInNanoSec(colVector.vector[colVector.isRepeating ? 
0 : batchIndex], scratchTimestamp); + serializeWrite.writeTimestamp(scratchTimestamp); + } + } + + private abstract class AbstractDoubleWriter extends Writer { + + AbstractDoubleWriter(int columnIndex) { + super(columnIndex); + } + } + + private class FloatWriter extends AbstractDoubleWriter { + + FloatWriter(int columnIndex) { + super(columnIndex); + } + + @Override + void apply(VectorizedRowBatch batch, int batchIndex) { + DoubleColumnVector colVector = (DoubleColumnVector) batch.cols[columnIndex]; + serializeWrite.writeFloat((float) colVector.vector[colVector.isRepeating ? 0 : batchIndex]); + } + } + + private class DoubleWriter extends AbstractDoubleWriter { + + DoubleWriter(int columnIndex) { + super(columnIndex); + } + + @Override + void apply(VectorizedRowBatch batch, int batchIndex) { + DoubleColumnVector colVector = (DoubleColumnVector) batch.cols[columnIndex]; + serializeWrite.writeDouble(colVector.vector[colVector.isRepeating ? 0 : batchIndex]); + } + } + + private class BytesWriter extends Writer { + + BytesWriter(int columnIndex) { + super(columnIndex); + } + + @Override + void apply(VectorizedRowBatch batch, int batchIndex) { + BytesColumnVector colVector = (BytesColumnVector) batch.cols[columnIndex]; + + if (colVector.isRepeating) { + serializeWrite.writeBytes(colVector.vector[0], colVector.start[0], colVector.length[0]); + } else { + serializeWrite.writeBytes(colVector.vector[batchIndex], colVector.start[batchIndex], colVector.length[batchIndex]); + } + } + } + + private class HiveDecimalWriter extends Writer { + + HiveDecimalWriter(int columnIndex) { + super(columnIndex); + } + + @Override + void apply(VectorizedRowBatch batch, int batchIndex) { + DecimalColumnVector colVector = (DecimalColumnVector) batch.cols[columnIndex]; + serializeWrite.writeHiveDecimal(colVector.vector[colVector.isRepeating ? 
0 : batchIndex].getHiveDecimal()); + } + } + + private Writer[] writers; + + private Writer createWriter(TypeInfo typeInfo, int columnIndex) throws HiveException { + Writer writer; + Category category = typeInfo.getCategory(); + switch (category) { + case PRIMITIVE: + { + PrimitiveTypeInfo primitiveTypeInfo = (PrimitiveTypeInfo) typeInfo; + PrimitiveCategory primitiveCategory = primitiveTypeInfo.getPrimitiveCategory(); + switch (primitiveCategory) { + // case VOID: + // UNDONE: + // break; + case BOOLEAN: + writer = new BooleanWriter(columnIndex); + break; + case BYTE: + writer = new ByteWriter(columnIndex); + break; + case SHORT: + writer = new ShortWriter(columnIndex); + break; + case INT: + writer = new IntWriter(columnIndex); + break; + case LONG: + writer = new LongWriter(columnIndex); + break; + case DATE: + writer = new DateWriter(columnIndex); + break; + case TIMESTAMP: + writer = new TimestampWriter(columnIndex); + break; + case FLOAT: + writer = new FloatWriter(columnIndex); + break; + case DOUBLE: + writer = new DoubleWriter(columnIndex); + break; + case STRING: + case CHAR: + case VARCHAR: + case BINARY: + writer = new BytesWriter(columnIndex); + break; + case DECIMAL: + writer = new HiveDecimalWriter(columnIndex); + break; + default: + throw new HiveException("Unexpected primitive type category " + primitiveCategory); + } + } + break; + default: + throw new HiveException("Unexpected type category " + category); + } + return writer; + } + + public void init(List typeNames, int[] columnMap) throws HiveException { + writers = new Writer[typeNames.size()]; + for (int i = 0; i < typeNames.size(); i++) { + String typeName = typeNames.get(i); + TypeInfo typeInfo = TypeInfoUtils.getTypeInfoFromTypeString(typeName); + int columnIndex = columnMap[i]; + Writer writer = createWriter(typeInfo, columnIndex); + writers[i] = writer; + } + } + + public void init(List typeNames) throws HiveException { + writers = new Writer[typeNames.size()]; + for (int i = 0; i < typeNames.size(); i++) { + String typeName = typeNames.get(i); + TypeInfo typeInfo = TypeInfoUtils.getTypeInfoFromTypeString(typeName); + Writer writer = createWriter(typeInfo, i); + writers[i] = writer; + } + } + + public int getCount() { + return writers.length; + } + + public void setOutput(Output output) { + serializeWrite.set(output); + } + + /* + * Note that when serializing a row, the logical mapping using selected in use has already + * been performed. batchIndex is the actual index of the row. 
+ */ + public void serializeWriteNoNulls(VectorizedRowBatch batch, int batchIndex) { + for (Writer writer : writers) { + writer.apply(batch, batchIndex); + } + } +} \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java index 5201c57..4d2f9c7 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java @@ -31,8 +31,10 @@ import java.util.Map; import java.util.Set; import java.util.TreeMap; +import java.util.TreeSet; import java.util.regex.Pattern; +import org.apache.commons.lang.ArrayUtils; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable; @@ -125,6 +127,9 @@ private static final Log LOG = LogFactory.getLog( VectorizationContext.class.getName()); + private String contextName; + private int level; + VectorExpressionDescriptor vMap; private List projectedColumns; @@ -137,7 +142,10 @@ // Convenient constructor for initial batch creation takes // a list of columns names and maps them to 0..n-1 indices. - public VectorizationContext(List initialColumnNames) { + public VectorizationContext(String contextName, List initialColumnNames) { + this.contextName = contextName; + level = 0; + LOG.info("VectorizationContext constructor contextName " + contextName + " level " + level + " initialColumnNames " + initialColumnNames.toString()); this.projectionColumnNames = initialColumnNames; projectedColumns = new ArrayList(); @@ -154,8 +162,11 @@ public VectorizationContext(List initialColumnNames) { // Constructor to with the individual addInitialColumn method // followed by a call to finishedAddingInitialColumns. - public VectorizationContext() { - projectedColumns = new ArrayList(); + public VectorizationContext(String contextName) { + this.contextName = contextName; + level = 0; + LOG.info("VectorizationContext constructor contextName " + contextName + " level " + level); + projectedColumns = new ArrayList(); projectionColumnNames = new ArrayList(); projectionColumnMap = new HashMap(); this.ocm = new OutputColumnManager(0); @@ -166,7 +177,10 @@ public VectorizationContext() { // Constructor useful making a projection vectorization context. // Use with resetProjectionColumns and addProjectionColumn. // Keeps existing output column map, etc. - public VectorizationContext(VectorizationContext vContext) { + public VectorizationContext(String contextName, VectorizationContext vContext) { + this.contextName = contextName; + level = vContext.level + 1; + LOG.info("VectorizationContext constructor reference contextName " + contextName + " level " + level); this.projectedColumns = new ArrayList(); this.projectionColumnNames = new ArrayList(); this.projectionColumnMap = new HashMap(); @@ -235,13 +249,6 @@ public void addProjectionColumn(String columnName, int vectorBatchColIndex) { //Map column number to type private OutputColumnManager ocm; - // File key is used by operators to retrieve the scratch vectors - // from mapWork at runtime. The operators that modify the structure of - // a vector row batch, need to allocate scratch vectors as well. Every - // operator that creates a new Vectorization context should set a unique - // fileKey. - private String fileKey = null; - // Set of UDF classes for type casting data types in row-mode.
private static Set> castExpressionUdfs = new HashSet>(); static { @@ -263,14 +270,6 @@ public void addProjectionColumn(String columnName, int vectorBatchColIndex) { castExpressionUdfs.add(UDFToShort.class); } - public String getFileKey() { - return fileKey; - } - - public void setFileKey(String fileKey) { - this.fileKey = fileKey; - } - protected int getInputColumnIndex(String name) throws HiveException { if (name == null) { throw new HiveException("Null column name"); @@ -311,6 +310,7 @@ int allocateOutputColumn(String hiveTypeName) { // We need to differentiate DECIMAL columns by their precision and scale... String normalizedTypeName = getNormalizedName(hiveTypeName); int relativeCol = allocateOutputColumnInternal(normalizedTypeName); + // LOG.info("allocateOutputColumn for hiveTypeName " + hiveTypeName + " column " + (initialOutputCol + relativeCol)); return initialOutputCol + relativeCol; } @@ -352,6 +352,22 @@ void freeOutputColumn(int index) { usedOutputColumns.remove(index-initialOutputCol); } } + + public int[] currentScratchColumns() { + TreeSet treeSet = new TreeSet(); + for (Integer col : usedOutputColumns) { + treeSet.add(initialOutputCol + col); + } + return ArrayUtils.toPrimitive(treeSet.toArray(new Integer[0])); + } + } + + public int allocateScratchColumn(String hiveTypeName) { + return ocm.allocateOutputColumn(hiveTypeName); + } + + public int[] currentScratchColumns() { + return ocm.currentScratchColumns(); } private VectorExpression getColumnVectorExpression(ExprNodeColumnDesc @@ -2066,6 +2082,10 @@ public VectorAggregateExpression getAggregatorExpression(AggregationDesc desc, b "\" for type: \"" + inputType.name() + " (reduce-side = " + isReduce + ")"); } + public int firstOutputColumnIndex() { + return firstOutputColumnIndex; + } + public Map getScratchColumnTypeMap() { Map map = new HashMap(); for (int i = 0; i < ocm.outputColCount; i++) { @@ -2077,7 +2097,7 @@ public VectorAggregateExpression getAggregatorExpression(AggregationDesc desc, b public String toString() { StringBuilder sb = new StringBuilder(32); - sb.append("Context key ").append(getFileKey()).append(", "); + sb.append("Context name ").append(contextName).append(", level " + level + ", "); Comparator comparerInteger = new Comparator() { @Override @@ -2089,11 +2109,11 @@ public int compare(Integer o1, Integer o2) { for (Map.Entry entry : projectionColumnMap.entrySet()) { sortedColumnMap.put(entry.getValue(), entry.getKey()); } - sb.append("sortedProjectionColumnMap ").append(sortedColumnMap).append(", "); + sb.append("sorted projectionColumnMap ").append(sortedColumnMap).append(", "); Map sortedScratchColumnTypeMap = new TreeMap(comparerInteger); sortedScratchColumnTypeMap.putAll(getScratchColumnTypeMap()); - sb.append("sortedScratchColumnTypeMap ").append(sortedScratchColumnTypeMap); + sb.append("sorted scratchColumnTypeMap ").append(sortedScratchColumnTypeMap); return sb.toString(); } diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizedBatchUtil.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizedBatchUtil.java index e304cf8..e3841d0 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizedBatchUtil.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizedBatchUtil.java @@ -20,6 +20,7 @@ import java.io.IOException; import java.sql.Timestamp; +import java.util.ArrayList; import java.util.LinkedList; import java.util.List; @@ -27,7 +28,6 @@ import org.apache.commons.logging.LogFactory; import org.apache.hadoop.hive.common.type.HiveChar; import 
org.apache.hadoop.hive.common.type.HiveVarchar; -import org.apache.hadoop.hive.ql.exec.vector.expressions.StringExpr; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.serde2.io.ByteWritable; import org.apache.hadoop.hive.serde2.io.DateWritable; @@ -39,10 +39,14 @@ import org.apache.hadoop.hive.serde2.io.TimestampWritable; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.StandardStructObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.StructField; import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; import org.apache.hadoop.hive.serde2.typeinfo.DecimalTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils; import org.apache.hadoop.io.BooleanWritable; import org.apache.hadoop.io.BytesWritable; import org.apache.hadoop.io.DataOutputBuffer; @@ -491,5 +495,82 @@ private static void setVector(Object row, poi.getPrimitiveCategory()); } } -} + public static StandardStructObjectInspector convertToStandardStructObjectInspector( + StructObjectInspector structObjectInspector) throws HiveException { + + List fields = structObjectInspector.getAllStructFieldRefs(); + List oids = new ArrayList(); + ArrayList columnNames = new ArrayList(); + + for(StructField field : fields) { + TypeInfo typeInfo = TypeInfoUtils.getTypeInfoFromTypeString( + field.getFieldObjectInspector().getTypeName()); + ObjectInspector standardWritableObjectInspector = + TypeInfoUtils.getStandardWritableObjectInspectorFromTypeInfo(typeInfo); + oids.add(standardWritableObjectInspector); + columnNames.add(field.getFieldName()); + } + return ObjectInspectorFactory.getStandardStructObjectInspector(columnNames,oids); + } + + public static String displayBytes(byte[] bytes, int start, int length) { + StringBuilder sb = new StringBuilder(); + for (int i = start; i < start + length; i++) { + char ch = (char) bytes[i]; + if (ch < ' ' || ch > '~') { + sb.append(String.format("\\%03d", (int) (bytes[i] & 0xff))); + } else { + sb.append(ch); + } + } + return sb.toString(); + } + + public static void debugDisplayOneRow(VectorizedRowBatch batch, int index, String prefix) { + StringBuffer sb = new StringBuffer(); + sb.append(prefix + " row " + index + " "); + for (int i = 0; i < batch.projectionSize; i++) { + int column = batch.projectedColumns[i]; + ColumnVector colVector = batch.cols[column]; + if (colVector == null) { + sb.append("(null colVector " + column + ")"); + } else { + boolean isRepeating = colVector.isRepeating; + index = (isRepeating ? 
0 : index); + if (colVector.noNulls || !colVector.isNull[index]) { + if (colVector instanceof LongColumnVector) { + sb.append(((LongColumnVector) colVector).vector[index]); + } else if (colVector instanceof DoubleColumnVector) { + sb.append(((DoubleColumnVector) colVector).vector[index]); + } else if (colVector instanceof BytesColumnVector) { + BytesColumnVector bytesColumnVector = (BytesColumnVector) colVector; + byte[] bytes = bytesColumnVector.vector[index]; + int start = bytesColumnVector.start[index]; + int length = bytesColumnVector.length[index]; + if (bytes == null) { + sb.append("(Unexpected null bytes with start " + start + " length " + length + ")"); + } else { + sb.append(displayBytes(bytes, start, length)); + } + } else if (colVector instanceof DecimalColumnVector) { + sb.append(((DecimalColumnVector) colVector).vector[index].toString()); + } else { + sb.append("Unknown"); + } + } else { + sb.append("NULL"); + } + } + sb.append(" "); + } + System.out.println(sb.toString()); + } + + public static void debugDisplayBatch(VectorizedRowBatch batch, String prefix) throws HiveException { + for (int i = 0; i < batch.size; i++) { + int index = (batch.selectedInUse ? batch.selected[i] : i); + debugDisplayOneRow(batch, index, prefix); + } + } +} diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizedRowBatch.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizedRowBatch.java index 4364572..7e41384 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizedRowBatch.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizedRowBatch.java @@ -190,6 +190,7 @@ public void setValueWriters(VectorExpressionWriter[] valueWriters) { * - sets size to 0 * - sets endOfFile to false * - resets each column + * - inits each column */ public void reset() { selectedInUse = false; @@ -198,6 +199,7 @@ public void reset() { for (ColumnVector vc : cols) { if (vc != null) { vc.reset(); + vc.init(); } } } diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizedRowBatchCtx.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizedRowBatchCtx.java index 88ec2b2..c034dd0 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizedRowBatchCtx.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizedRowBatchCtx.java @@ -40,7 +40,6 @@ import org.apache.hadoop.hive.ql.io.HiveFileFormatUtils; import org.apache.hadoop.hive.ql.io.IOPrepareCache; import org.apache.hadoop.hive.ql.metadata.HiveException; -import org.apache.hadoop.hive.ql.plan.MapredWork; import org.apache.hadoop.hive.ql.plan.PartitionDesc; import org.apache.hadoop.hive.serde2.ColumnProjectionUtils; import org.apache.hadoop.hive.serde2.Deserializer; @@ -170,9 +169,8 @@ public void init(Configuration hiveConf, FileSplit split) throws ClassNotFoundEx split.getPath(), IOPrepareCache.get().getPartitionDescMap()); String partitionPath = split.getPath().getParent().toString(); - scratchColumnTypeMap = Utilities - .getMapWorkAllScratchColumnVectorTypeMaps(hiveConf) - .get(partitionPath); + scratchColumnTypeMap = Utilities.getMapWorkVectorScratchColumnTypeMap(hiveConf); + // LOG.info("VectorizedRowBatchCtx init scratchColumnTypeMap " + scratchColumnTypeMap.toString()); Properties partProps = (part.getPartSpec() == null || part.getPartSpec().isEmpty()) ? 
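As a usage note for the debug helpers added to VectorizedBatchUtil earlier in this patch: displayBytes escapes any byte outside the printable ASCII range as a three-digit decimal escape, which keeps raw row bytes readable when they are printed. A minimal sketch follows; the sample buffer is made up for illustration.

import java.nio.charset.StandardCharsets;

import org.apache.hadoop.hive.ql.exec.vector.VectorizedBatchUtil;

public class DisplayBytesSketch {
  public static void main(String[] args) {
    // 'a', 'b', a non-printable 0x01 byte, 'c'
    byte[] bytes = "ab\u0001c".getBytes(StandardCharsets.ISO_8859_1);

    // Prints: ab\001c  (non-printable bytes become \NNN decimal escapes)
    System.out.println(VectorizedBatchUtil.displayBytes(bytes, 0, bytes.length));
  }
}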
@@ -601,7 +599,7 @@ private void addScratchColumnsToBatch(VectorizedRowBatch vrb) throws HiveExcepti for (int i = origNumCols; i < newNumCols; i++) { String typeName = scratchColumnTypeMap.get(i); if (typeName == null) { - throw new HiveException("No type found for column type entry " + i); + throw new HiveException("No type entry found for column " + i + " in map " + scratchColumnTypeMap.toString()); } vrb.cols[i] = allocateColumnVector(typeName, VectorizedRowBatch.DEFAULT_SIZE); @@ -616,7 +614,7 @@ private void addScratchColumnsToBatch(VectorizedRowBatch vrb) throws HiveExcepti * @param decimalType The given decimal type string. * @return An integer array of size 2 with first element set to precision and second set to scale. */ - private int[] getScalePrecisionFromDecimalType(String decimalType) { + private static int[] getScalePrecisionFromDecimalType(String decimalType) { Pattern p = Pattern.compile("\\d+"); Matcher m = p.matcher(decimalType); m.find(); @@ -627,7 +625,7 @@ private void addScratchColumnsToBatch(VectorizedRowBatch vrb) throws HiveExcepti return precScale; } - private ColumnVector allocateColumnVector(String type, int defaultSize) { + public static ColumnVector allocateColumnVector(String type, int defaultSize) { if (type.equalsIgnoreCase("double")) { return new DoubleColumnVector(defaultSize); } else if (VectorizationContext.isStringFamily(type)) { @@ -643,18 +641,4 @@ private ColumnVector allocateColumnVector(String type, int defaultSize) { throw new Error("Cannot allocate vector column for " + type); } } - - public VectorColumnAssign[] buildObjectAssigners(VectorizedRowBatch outputBatch) - throws HiveException { - List fieldRefs = rowOI.getAllStructFieldRefs(); - assert outputBatch.numCols == fieldRefs.size(); - VectorColumnAssign[] assigners = new VectorColumnAssign[fieldRefs.size()]; - for(int i = 0; i < assigners.length; ++i) { - StructField fieldRef = fieldRefs.get(i); - ObjectInspector fieldOI = fieldRef.getFieldObjectInspector(); - assigners[i] = VectorColumnAssignFactory.buildObjectAssign( - outputBatch, i, fieldOI); - } - return assigners; - } } diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java index aca4273..638c86e 100644 --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java @@ -135,7 +135,8 @@ Set supportedAggregationUdfs = new HashSet(); - private PhysicalContext physicalContext = null;; + private PhysicalContext physicalContext = null; + private HiveConf hiveConf; public Vectorizer() { @@ -281,13 +282,13 @@ public Vectorizer() { class VectorizationDispatcher implements Dispatcher { - private final PhysicalContext pctx; + private final PhysicalContext physicalContext; private List reduceColumnNames; private List reduceTypeInfos; - public VectorizationDispatcher(PhysicalContext pctx) { - this.pctx = pctx; + public VectorizationDispatcher(PhysicalContext physicalContext) { + this.physicalContext = physicalContext; reduceColumnNames = null; reduceTypeInfos = null; } @@ -305,7 +306,7 @@ public Object dispatch(Node nd, Stack stack, Object... nodeOutputs) convertMapWork((MapWork) w, true); } else if (w instanceof ReduceWork) { // We are only vectorizing Reduce under Tez. 
- if (HiveConf.getBoolVar(pctx.getConf(), + if (HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVE_VECTORIZATION_REDUCE_ENABLED)) { convertReduceWork((ReduceWork) w); } @@ -317,7 +318,7 @@ public Object dispatch(Node nd, Stack stack, Object... nodeOutputs) if (baseWork instanceof MapWork) { convertMapWork((MapWork) baseWork, false); } else if (baseWork instanceof ReduceWork - && HiveConf.getBoolVar(pctx.getConf(), + && HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVE_VECTORIZATION_REDUCE_ENABLED)) { convertReduceWork((ReduceWork) baseWork); } @@ -388,13 +389,12 @@ private void vectorizeMapWork(MapWork mapWork) throws SemanticException { HashMap nodeOutput = new HashMap(); ogw.startWalking(topNodes, nodeOutput); - Map> allScratchColumnVectorTypeMaps = vnp.getAllScratchColumnVectorTypeMaps(); - mapWork.setAllScratchColumnVectorTypeMaps(allScratchColumnVectorTypeMaps); - Map> allColumnVectorMaps = vnp.getAllColumnVectorMaps(); - mapWork.setAllColumnVectorMaps(allColumnVectorMaps); + mapWork.setVectorColumnNameMap(vnp.getVectorColumnNameMap()); + mapWork.setVectorColumnTypeMap(vnp.getVectorColumnTypeMap()); + mapWork.setVectorScratchColumnTypeMap(vnp.getVectorScratchColumnTypeMap()); if (LOG.isDebugEnabled()) { - debugDisplayAllMaps(allColumnVectorMaps, allScratchColumnVectorTypeMaps); + debugDisplayAllMaps(mapWork); } return; @@ -490,7 +490,7 @@ private void vectorizeReduceWork(ReduceWork reduceWork) throws SemanticException // VectorizationContext... Do we use PreOrderWalker instead of DefaultGraphWalker. Map opRules = new LinkedHashMap(); ReduceWorkVectorizationNodeProcessor vnp = - new ReduceWorkVectorizationNodeProcessor(reduceColumnNames); + new ReduceWorkVectorizationNodeProcessor(reduceColumnNames, reduceTypeInfos); addReduceWorkRules(opRules, vnp); Dispatcher disp = new DefaultRuleDispatcher(vnp, opRules, null); GraphWalker ogw = new PreOrderWalker(disp); @@ -505,14 +505,12 @@ private void vectorizeReduceWork(ReduceWork reduceWork) throws SemanticException // Necessary since we are vectorizing the root operator in reduce. reduceWork.setReducer(vnp.getRootVectorOp()); - Map> allScratchColumnVectorTypeMaps = vnp.getAllScratchColumnVectorTypeMaps(); - reduceWork.setAllScratchColumnVectorTypeMaps(allScratchColumnVectorTypeMaps); - Map> allColumnVectorMaps = vnp.getAllColumnVectorMaps(); - reduceWork.setAllColumnVectorMaps(allColumnVectorMaps); - + reduceWork.setVectorColumnNameMap(vnp.getVectorColumnNameMap()); + reduceWork.setVectorColumnTypeMap(vnp.getVectorColumnTypeMap()); + reduceWork.setVectorScratchColumnTypeMap(vnp.getVectorScratchColumnTypeMap()); if (LOG.isDebugEnabled()) { - debugDisplayAllMaps(allColumnVectorMaps, allScratchColumnVectorTypeMaps); + debugDisplayAllMaps(reduceWork); } } } @@ -569,38 +567,34 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, // ReduceWorkVectorizationNodeProcessor. class VectorizationNodeProcessor implements NodeProcessor { - // This is used to extract scratch column types for each file key - protected final Map scratchColumnContext = - new HashMap(); + // The vectorization context for the Map or Reduce task. + protected VectorizationContext taskVectorizationContext; - protected final Map, VectorizationContext> vContextsByOp = - new HashMap, VectorizationContext>(); + // The input projection column type name map for the Map or Reduce task. 
+ protected Map taskColumnTypeNameMap; - protected final Set> opsDone = - new HashSet>(); + VectorizationNodeProcessor() { + taskColumnTypeNameMap = new HashMap(); + } - public Map> getAllScratchColumnVectorTypeMaps() { - Map> allScratchColumnVectorTypeMaps = - new HashMap>(); - for (String onefile : scratchColumnContext.keySet()) { - VectorizationContext vc = scratchColumnContext.get(onefile); - Map cmap = vc.getScratchColumnTypeMap(); - allScratchColumnVectorTypeMaps.put(onefile, cmap); - } - return allScratchColumnVectorTypeMaps; + public Map getVectorColumnNameMap() { + return taskVectorizationContext.getProjectionColumnMap(); } - public Map> getAllColumnVectorMaps() { - Map> allColumnVectorMaps = - new HashMap>(); - for(String oneFile: scratchColumnContext.keySet()) { - VectorizationContext vc = scratchColumnContext.get(oneFile); - Map cmap = vc.getProjectionColumnMap(); - allColumnVectorMaps.put(oneFile, cmap); - } - return allColumnVectorMaps; + public Map getVectorColumnTypeMap() { + return taskColumnTypeNameMap; + } + + public Map getVectorScratchColumnTypeMap() { + return taskVectorizationContext.getScratchColumnTypeMap(); } + protected final Set> opsDone = + new HashSet>(); + + protected final Map, Operator> opToVectorOpMap = + new HashMap, Operator>(); + public VectorizationContext walkStackToFindVectorizationContext(Stack stack, Operator op) throws SemanticException { VectorizationContext vContext = null; @@ -617,7 +611,18 @@ public VectorizationContext walkStackToFindVectorizationContext(Stack stac return null; } Operator opParent = (Operator) stack.get(i); - vContext = vContextsByOp.get(opParent); + Operator vectorOpParent = opToVectorOpMap.get(opParent); + if (vectorOpParent != null) { + if (vectorOpParent instanceof VectorizationContextRegion) { + VectorizationContextRegion vcRegion = (VectorizationContextRegion) vectorOpParent; + vContext = vcRegion.getOuputVectorizationContext(); + LOG.info("walkStackToFindVectorizationContext " + vectorOpParent.getName() + " has new vectorization context " + vContext.toString()); + } else { + LOG.info("walkStackToFindVectorizationContext " + vectorOpParent.getName() + " does not have new vectorization context"); + } + } else { + LOG.info("walkStackToFindVectorizationContext " + opParent.getName() + " is not vectorized"); + } --i; } return vContext; @@ -631,14 +636,9 @@ public VectorizationContext walkStackToFindVectorizationContext(Stack stac vectorOp = vectorizeOperator(op, vContext); opsDone.add(op); if (vectorOp != op) { + opToVectorOpMap.put(op, vectorOp); opsDone.add(vectorOp); } - if (vectorOp instanceof VectorizationContextRegion) { - VectorizationContextRegion vcRegion = (VectorizationContextRegion) vectorOp; - VectorizationContext vOutContext = vcRegion.getOuputVectorizationContext(); - vContextsByOp.put(op, vOutContext); - scratchColumnContext.put(vOutContext.getFileKey(), vOutContext); - } } } catch (HiveException e) { throw new SemanticException(e); @@ -658,6 +658,7 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, private final MapWork mWork; public MapWorkVectorizationNodeProcessor(MapWork mWork) { + super(); this.mWork = mWork; } @@ -666,41 +667,26 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, Object... 
nodeOutputs) throws SemanticException { Operator op = (Operator) nd; - LOG.info("MapWorkVectorizationNodeProcessor processing Operator: " + op.getName() + "..."); VectorizationContext vContext = null; if (op instanceof TableScanOperator) { - vContext = getVectorizationContext(op, physicalContext); - for (String onefile : mWork.getPathToAliases().keySet()) { - List aliases = mWork.getPathToAliases().get(onefile); - for (String alias : aliases) { - Operator opRoot = mWork.getAliasToWork().get(alias); - if (op == opRoot) { - // The same vectorization context is copied multiple times into - // the MapWork scratch columnMap - // Each partition gets a copy - // - vContext.setFileKey(onefile); - scratchColumnContext.put(onefile, vContext); - if (LOG.isDebugEnabled()) { - LOG.debug("Vectorized MapWork operator " + op.getName() + " vectorization context " + vContext.toString()); - } - break; - } - } + if (taskVectorizationContext == null) { + taskVectorizationContext = getVectorizationContext(op.getSchema(), op.getName(), + taskColumnTypeNameMap); } - vContextsByOp.put(op, vContext); + vContext = taskVectorizationContext; } else { + LOG.info("MapWorkVectorizationNodeProcessor process going to walk the operator stack to get vectorization context for " + op.getName()); vContext = walkStackToFindVectorizationContext(stack, op); if (vContext == null) { - throw new SemanticException( - String.format("Did not find vectorization context for operator %s in operator stack", - op.getName())); + // No operator has "pushed" a new context -- so use the task vectorization context. + vContext = taskVectorizationContext; } } assert vContext != null; + LOG.info("MapWorkVectorizationNodeProcessor process operator " + op.getName() + " using vectorization context" + vContext.toString()); // When Vectorized GROUPBY outputs rows instead of vectorized row batchs, we don't // vectorize the operators below it. 
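The map-side rule above -- create one task-level vectorization context at the TableScan and fall back to it whenever no parent operator has published a new output context -- can be restated as a standalone sketch. ParentInfo and the plain Object contexts below are illustrative stand-ins, not Hive types; the real code checks whether the vectorized parent implements VectorizationContextRegion.

import java.util.List;

final class ContextResolutionSketch {

  // Stand-in for an entry on the operator stack (nearest parent first).
  static final class ParentInfo {
    final Object newOutputContext; // non-null if this parent published a new context
    ParentInfo(Object newOutputContext) {
      this.newOutputContext = newOutputContext;
    }
  }

  // The nearest parent's published context wins; otherwise use the task-level context.
  static Object resolve(List<ParentInfo> parentsNearestFirst, Object taskContext) {
    for (ParentInfo parent : parentsNearestFirst) {
      if (parent.newOutputContext != null) {
        return parent.newOutputContext;
      }
    }
    return taskContext;
  }
}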
@@ -715,9 +701,10 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, Operator vectorOp = doVectorize(op, vContext); if (LOG.isDebugEnabled()) { - LOG.debug("Vectorized MapWork operator " + vectorOp.getName() + " vectorization context " + vContext.toString()); if (vectorOp instanceof VectorizationContextRegion) { - LOG.debug("Vectorized MapWork operator " + vectorOp.getName() + " added vectorization context " + vContext.toString()); + VectorizationContextRegion vcRegion = (VectorizationContextRegion) vectorOp; + VectorizationContext vNewContext = vcRegion.getOuputVectorizationContext(); + LOG.debug("Vectorized MapWork operator " + vectorOp.getName() + " added vectorization context " + vNewContext.toString()); } } @@ -728,8 +715,7 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, class ReduceWorkVectorizationNodeProcessor extends VectorizationNodeProcessor { private final List reduceColumnNames; - - private VectorizationContext reduceShuffleVectorizationContext; + private final List reduceTypeInfos; private Operator rootVectorOp; @@ -737,10 +723,12 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, return rootVectorOp; } - public ReduceWorkVectorizationNodeProcessor(List reduceColumnNames) { + public ReduceWorkVectorizationNodeProcessor(List reduceColumnNames, + List reduceTypeInfos) { + super(); this.reduceColumnNames = reduceColumnNames; + this.reduceTypeInfos = reduceTypeInfos; rootVectorOp = null; - reduceShuffleVectorizationContext = null; } @Override @@ -748,8 +736,6 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, Object... nodeOutputs) throws SemanticException { Operator op = (Operator) nd; - LOG.info("ReduceWorkVectorizationNodeProcessor processing Operator: " + - op.getName() + "..."); VectorizationContext vContext = null; @@ -758,25 +744,30 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, if (op.getParentOperators().size() == 0) { LOG.info("ReduceWorkVectorizationNodeProcessor process reduceColumnNames " + reduceColumnNames.toString()); - vContext = new VectorizationContext(reduceColumnNames); - vContext.setFileKey("_REDUCE_SHUFFLE_"); - scratchColumnContext.put("_REDUCE_SHUFFLE_", vContext); - reduceShuffleVectorizationContext = vContext; + vContext = new VectorizationContext("__Reduce_Shuffle__", reduceColumnNames); + taskVectorizationContext = vContext; + int i = 0; + for (TypeInfo typeInfo : reduceTypeInfos) { + taskColumnTypeNameMap.put(i, typeInfo.getTypeName()); + i++; + } saveRootVectorOp = true; if (LOG.isDebugEnabled()) { LOG.debug("Vectorized ReduceWork reduce shuffle vectorization context " + vContext.toString()); } } else { + LOG.info("ReduceWorkVectorizationNodeProcessor process going to walk the operator stack to get vectorization context for " + op.getName()); vContext = walkStackToFindVectorizationContext(stack, op); if (vContext == null) { // If we didn't find a context among the operators, assume the top -- reduce shuffle's // vectorization context. - vContext = reduceShuffleVectorizationContext; + vContext = taskVectorizationContext; } } assert vContext != null; + LOG.info("ReduceWorkVectorizationNodeProcessor process operator " + op.getName() + " using vectorization context" + vContext.toString()); // When Vectorized GROUPBY outputs rows instead of vectorized row batchs, we don't // vectorize the operators below it. 
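For the reduce side, the processor above builds a single named context from the reduce shuffle columns and records their type names by position. The following is a minimal usage sketch of the two-argument constructor introduced by this patch; the column names and types are made up for illustration, and the map mirrors what later becomes the work object's vectorColumnTypeMap.

import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.hadoop.hive.ql.exec.vector.VectorizationContext;

public class ReduceShuffleContextSketch {
  public static VectorizationContext build() {
    List<String> reduceColumnNames = Arrays.asList("_col0", "_col1");
    List<String> reduceTypeNames = Arrays.asList("bigint", "string");

    // Named context for the reduce shuffle input columns.
    VectorizationContext vContext =
        new VectorizationContext("__Reduce_Shuffle__", reduceColumnNames);

    // Record the input column types by position; this is the shape of map
    // that would be handed to ReduceWork.setVectorColumnTypeMap(...).
    Map<Integer, String> columnTypeMap = new HashMap<Integer, String>();
    for (int i = 0; i < reduceTypeNames.size(); i++) {
      columnTypeMap.put(i, reduceTypeNames.get(i));
    }
    return vContext;
  }
}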
@@ -791,9 +782,10 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, Operator vectorOp = doVectorize(op, vContext); if (LOG.isDebugEnabled()) { - LOG.debug("Vectorized ReduceWork operator " + vectorOp.getName() + " vectorization context " + vContext.toString()); if (vectorOp instanceof VectorizationContextRegion) { - LOG.debug("Vectorized ReduceWork operator " + vectorOp.getName() + " added vectorization context " + vContext.toString()); + VectorizationContextRegion vcRegion = (VectorizationContextRegion) vectorOp; + VectorizationContext vNewContext = vcRegion.getOuputVectorizationContext(); + LOG.debug("Vectorized ReduceWork operator " + vectorOp.getName() + " added vectorization context " + vNewContext.toString()); } } if (vectorOp instanceof VectorGroupByOperator) { @@ -811,7 +803,7 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, private static class ValidatorVectorizationContext extends VectorizationContext { private ValidatorVectorizationContext() { - super(); + super("No Name"); } @Override @@ -826,25 +818,27 @@ protected int getInputColumnIndex(ExprNodeColumnDesc colExpr) { } @Override - public PhysicalContext resolve(PhysicalContext pctx) throws SemanticException { - this.physicalContext = pctx; - boolean vectorPath = HiveConf.getBoolVar(pctx.getConf(), + public PhysicalContext resolve(PhysicalContext physicalContext) throws SemanticException { + this.physicalContext = physicalContext; + hiveConf = physicalContext.getConf(); + + boolean vectorPath = HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVE_VECTORIZATION_ENABLED); if (!vectorPath) { LOG.info("Vectorization is disabled"); - return pctx; + return physicalContext; } // create dispatcher and graph walker - Dispatcher disp = new VectorizationDispatcher(pctx); + Dispatcher disp = new VectorizationDispatcher(physicalContext); TaskGraphWalker ogw = new TaskGraphWalker(disp); // get all the tasks nodes from root task ArrayList topNodes = new ArrayList(); - topNodes.addAll(pctx.getRootTasks()); + topNodes.addAll(physicalContext.getRootTasks()); // begin to walk through the task tree. ogw.startWalking(topNodes, null); - return pctx; + return physicalContext; } boolean validateMapWorkOperator(Operator op, MapWork mWork, boolean isTez) { @@ -896,7 +890,7 @@ boolean validateReduceWorkOperator(Operator op) { } break; case GROUPBY: - if (HiveConf.getBoolVar(physicalContext.getConf(), + if (HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVE_VECTORIZATION_REDUCE_GROUPBY_ENABLED)) { ret = validateGroupByOperator((GroupByOperator) op, true, true); } else { @@ -1257,20 +1251,24 @@ private boolean validateDataType(String type) { return supportedDataTypesPattern.matcher(type.toLowerCase()).matches(); } - private VectorizationContext getVectorizationContext(Operator op, - PhysicalContext pctx) { - RowSchema rs = op.getSchema(); + private VectorizationContext getVectorizationContext(RowSchema rowSchema, String contextName, + Map typeNameMap) { + + VectorizationContext vContext = new VectorizationContext(contextName); // Add all non-virtual columns to make a vectorization context for // the TableScan operator. - VectorizationContext vContext = new VectorizationContext(); - for (ColumnInfo c : rs.getSignature()) { + int i = 0; + for (ColumnInfo c : rowSchema.getSignature()) { // Earlier, validation code should have eliminated virtual columns usage (HIVE-5560). 
if (!isVirtualColumn(c)) { vContext.addInitialColumn(c.getInternalName()); + typeNameMap.put(i, c.getTypeName()); + i++; } } vContext.finishedAddingInitialColumns(); + return vContext; } @@ -1328,40 +1326,14 @@ private boolean isVirtualColumn(ColumnInfo column) { return false; } - public void debugDisplayAllMaps(Map> allColumnVectorMaps, - Map> allScratchColumnVectorTypeMaps) { - - // Context keys grow in length since they are a path... - Comparator comparerShorterString = new Comparator() { - @Override - public int compare(String o1, String o2) { - Integer length1 = o1.length(); - Integer length2 = o2.length(); - return length1.compareTo(length2); - }}; - - Comparator comparerInteger = new Comparator() { - @Override - public int compare(Integer o1, Integer o2) { - return o1.compareTo(o2); - }}; - - Map> sortedAllColumnVectorMaps = new TreeMap>(comparerShorterString); - for (Map.Entry> entry : allColumnVectorMaps.entrySet()) { - Map sortedColumnMap = new TreeMap(comparerInteger); - for (Map.Entry innerEntry : entry.getValue().entrySet()) { - sortedColumnMap.put(innerEntry.getValue(), innerEntry.getKey()); - } - sortedAllColumnVectorMaps.put(entry.getKey(), sortedColumnMap); - } - LOG.debug("sortedAllColumnVectorMaps " + sortedAllColumnVectorMaps); + public void debugDisplayAllMaps(BaseWork work) { - Map> sortedAllScratchColumnVectorTypeMap = new TreeMap>(comparerShorterString); - for (Map.Entry> entry : allScratchColumnVectorTypeMaps.entrySet()) { - Map sortedScratchColumnTypeMap = new TreeMap(comparerInteger); - sortedScratchColumnTypeMap.putAll(entry.getValue()); - sortedAllScratchColumnVectorTypeMap.put(entry.getKey(), sortedScratchColumnTypeMap); - } - LOG.debug("sortedAllScratchColumnVectorTypeMap " + sortedAllScratchColumnVectorTypeMap); + Map columnNameMap = work.getVectorColumnNameMap(); + Map columnTypeMap = work.getVectorColumnTypeMap(); + Map scratchColumnTypeMap = work.getVectorScratchColumnTypeMap(); + + LOG.debug("debugDisplayAllMaps columnNameMap " + columnNameMap.toString()); + LOG.debug("debugDisplayAllMaps columnTypeMap " + columnTypeMap.toString()); + LOG.debug("debugDisplayAllMaps scratchColumnTypeMap " + scratchColumnTypeMap.toString()); } } diff --git ql/src/java/org/apache/hadoop/hive/ql/plan/BaseWork.java ql/src/java/org/apache/hadoop/hive/ql/plan/BaseWork.java index 1737a34..b56159a 100644 --- ql/src/java/org/apache/hadoop/hive/ql/plan/BaseWork.java +++ ql/src/java/org/apache/hadoop/hive/ql/plan/BaseWork.java @@ -58,8 +58,9 @@ public BaseWork(String name) { private String name; // Vectorization. 
- protected Map> allScratchColumnVectorTypeMaps = null; - protected Map> allColumnVectorMaps = null; + protected Map vectorColumnNameMap; + protected Map vectorColumnTypeMap; + protected Map vectorScratchColumnTypeMap; protected boolean vectorMode = false; public void setGatheringStats(boolean gatherStats) { @@ -142,21 +143,28 @@ public void addDummyOp(HashTableDummyOperator dummyOp) { return returnSet; } - public Map> getAllScratchColumnVectorTypeMaps() { - return allScratchColumnVectorTypeMaps; + public Map getVectorColumnNameMap() { + return vectorColumnNameMap; } - public void setAllScratchColumnVectorTypeMaps( - Map> allScratchColumnVectorTypeMaps) { - this.allScratchColumnVectorTypeMaps = allScratchColumnVectorTypeMaps; + public void setVectorColumnNameMap(Map vectorColumnNameMap) { + this.vectorColumnNameMap = vectorColumnNameMap; } - public Map> getAllColumnVectorMaps() { - return allColumnVectorMaps; + public Map getVectorColumnTypeMap() { + return vectorColumnTypeMap; } - public void setAllColumnVectorMaps(Map> allColumnVectorMaps) { - this.allColumnVectorMaps = allColumnVectorMaps; + public void setVectorColumnTypeMap(Map vectorColumnTypeMap) { + this.vectorColumnTypeMap = vectorColumnTypeMap; + } + + public Map getVectorScratchColumnTypeMap() { + return vectorScratchColumnTypeMap; + } + + public void setVectorScratchColumnTypeMap(Map vectorScratchColumnTypeMap) { + this.vectorScratchColumnTypeMap = vectorScratchColumnTypeMap; } /** diff --git ql/src/test/org/apache/hadoop/hive/ql/exec/vector/RandomRowObjectSource.java ql/src/test/org/apache/hadoop/hive/ql/exec/vector/RandomRowObjectSource.java new file mode 100644 index 0000000..df46c2e --- /dev/null +++ ql/src/test/org/apache/hadoop/hive/ql/exec/vector/RandomRowObjectSource.java @@ -0,0 +1,210 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.hive.ql.exec.vector; + +import java.sql.Date; +import java.util.ArrayList; +import java.util.List; +import java.util.Random; + +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; +import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory; +import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableBooleanObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableByteObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableDateObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableDoubleObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableFloatObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableIntObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableLongObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableShortObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableStringObjectInspector; +import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils; +import org.apache.hadoop.io.BooleanWritable; + +/** + * Generate object inspector and random row object[]. + */ +public class RandomRowObjectSource { + + private Random r; + + private int columnCount; + + private List typeNames; + + private PrimitiveCategory[] primitiveCategories; + + private List primitiveObjectInspectorList; + + private StructObjectInspector rowStructObjectInspector; + + public List typeNames() { + return typeNames; + } + + public PrimitiveCategory[] primitiveCategories() { + return primitiveCategories; + } + + public StructObjectInspector rowStructObjectInspector() { + return rowStructObjectInspector; + } + + public void init(Random r) { + this.r = r; + chooseSchema(); + } + + private static String[] possibleHiveTypeNames = { + "boolean", + "tinyint", + "smallint", + "int", + "bigint", + "date", + "float", + "double", + "string" + }; + + private void chooseSchema() { + columnCount = 1 + r.nextInt(20); + typeNames = new ArrayList(columnCount); + primitiveCategories = new PrimitiveCategory[columnCount]; + primitiveObjectInspectorList = new ArrayList(columnCount); + List columnNames = new ArrayList(columnCount); + for (int c = 0; c < columnCount; c++) { + columnNames.add(String.format("col%d", c)); + int typeNum = r.nextInt(possibleHiveTypeNames.length); + String typeName = possibleHiveTypeNames[typeNum]; + PrimitiveTypeInfo primitiveTypeInfo = (PrimitiveTypeInfo) TypeInfoUtils.getTypeInfoFromTypeString(typeName); + PrimitiveCategory primitiveCategory = primitiveTypeInfo.getPrimitiveCategory(); + primitiveCategories[c] = primitiveCategory; + primitiveObjectInspectorList.add(PrimitiveObjectInspectorFactory.getPrimitiveWritableObjectInspector(primitiveCategory)); + typeNames.add(typeName); + } + rowStructObjectInspector = ObjectInspectorFactory.getStandardStructObjectInspector(columnNames, primitiveObjectInspectorList); + } + + public Object[][] randomRows(int n) { + Object[][] result = new Object[n][]; + for (int i = 0; i < n; i++) { + result[i] = 
randomRow(); + } + return result; + } + + public Object[] randomRow() { + Object row[] = new Object[columnCount]; + for (int c = 0; c < columnCount; c++) { + Object object = randomObject(c); + if (object == null) { + throw new Error("Unexpected null for column " + c); + } + row[c] = getWritableObject(c, object); + if (row[c] == null) { + throw new Error("Unexpected null for writable for column " + c); + } + } + return row; + } + + public Object getWritableObject(int column, Object object) { + ObjectInspector objectInspector = primitiveObjectInspectorList.get(column); + PrimitiveCategory primitiveCategory = primitiveCategories[column]; + switch (primitiveCategory) { + case BOOLEAN: + return ((WritableBooleanObjectInspector) objectInspector).create((boolean) object); + case BYTE: + return ((WritableByteObjectInspector) objectInspector).create((byte) object); + case SHORT: + return ((WritableShortObjectInspector) objectInspector).create((short) object); + case INT: + return ((WritableIntObjectInspector) objectInspector).create((int) object); + case LONG: + return ((WritableLongObjectInspector) objectInspector).create((long) object); + case DATE: + return ((WritableDateObjectInspector) objectInspector).create((Date) object); + case FLOAT: + return ((WritableFloatObjectInspector) objectInspector).create((float) object); + case DOUBLE: + return ((WritableDoubleObjectInspector) objectInspector).create((double) object); + case STRING: + return ((WritableStringObjectInspector) objectInspector).create((String) object); + default: + throw new Error("Unknown primitive category " + primitiveCategory); + } + } + + public Object randomObject(int column) { + PrimitiveCategory primitiveCategory = primitiveCategories[column]; + switch (primitiveCategory) { + case BOOLEAN: + return Boolean.valueOf(r.nextInt(1) == 1); + case BYTE: + return Byte.valueOf((byte) r.nextInt()); + case SHORT: + return Short.valueOf((short) r.nextInt()); + case INT: + return Integer.valueOf(r.nextInt()); + case LONG: + return Long.valueOf(r.nextLong()); + case DATE: + return getRandDate(r); + case FLOAT: + return Float.valueOf(r.nextFloat() * 10 - 5); + case DOUBLE: + return Double.valueOf(r.nextDouble() * 10 - 5); + case STRING: + return getRandString(r); + default: + throw new Error("Unknown primitive category " + primitiveCategory); + } + } + + public static String getRandString(Random r) { + return getRandString(r, null, r.nextInt(10)); + } + + public static String getRandString(Random r, String characters, int length) { + StringBuilder sb = new StringBuilder(); + for (int i = 0; i < length; i++) { + if (characters == null) { + sb.append((char) (r.nextInt(128))); + } else { + sb.append(characters.charAt(r.nextInt(characters.length()))); + } + } + return sb.toString(); + } + + public static Date getRandDate(Random r) { + String dateStr = String.format("%d-%02d-%02d", + Integer.valueOf(1800 + r.nextInt(500)), // year + Integer.valueOf(1 + r.nextInt(12)), // month + Integer.valueOf(1 + r.nextInt(28))); // day + Date dateVal = Date.valueOf(dateStr); + return dateVal; + } + +} diff --git ql/src/test/org/apache/hadoop/hive/ql/exec/vector/TestVectorFilterOperator.java ql/src/test/org/apache/hadoop/hive/ql/exec/vector/TestVectorFilterOperator.java index ac3cb81..8008f3e 100644 --- ql/src/test/org/apache/hadoop/hive/ql/exec/vector/TestVectorFilterOperator.java +++ ql/src/test/org/apache/hadoop/hive/ql/exec/vector/TestVectorFilterOperator.java @@ -87,7 +87,7 @@ private VectorFilterOperator getAVectorFilterOperator() throws HiveException { 
ExprNodeColumnDesc col1Expr = new ExprNodeColumnDesc(Long.class, "col1", "table", false); List columns = new ArrayList(); columns.add("col1"); - VectorizationContext vc = new VectorizationContext(columns); + VectorizationContext vc = new VectorizationContext("name", columns); FilterDesc fdesc = new FilterDesc(); fdesc.setPredicate(col1Expr); return new VectorFilterOperator(vc, fdesc); diff --git ql/src/test/org/apache/hadoop/hive/ql/exec/vector/TestVectorGroupByOperator.java ql/src/test/org/apache/hadoop/hive/ql/exec/vector/TestVectorGroupByOperator.java index fbb7ff2..13d1fd0 100644 --- ql/src/test/org/apache/hadoop/hive/ql/exec/vector/TestVectorGroupByOperator.java +++ ql/src/test/org/apache/hadoop/hive/ql/exec/vector/TestVectorGroupByOperator.java @@ -172,7 +172,7 @@ public void testMemoryPressureFlush() throws HiveException { List mapColumnNames = new ArrayList(); mapColumnNames.add("Key"); mapColumnNames.add("Value"); - VectorizationContext ctx = new VectorizationContext(mapColumnNames); + VectorizationContext ctx = new VectorizationContext("name", mapColumnNames); GroupByDesc desc = buildKeyGroupByDesc (ctx, "max", "Value", TypeInfoFactory.longTypeInfo, @@ -1709,7 +1709,7 @@ private void testMultiKey( mapColumnNames.put("value", i); outputColumnNames.add("value"); - VectorizationContext ctx = new VectorizationContext(outputColumnNames); + VectorizationContext ctx = new VectorizationContext("name", outputColumnNames); ArrayList aggs = new ArrayList(1); aggs.add( @@ -1820,7 +1820,7 @@ private void testKeyTypeAggregate( List mapColumnNames = new ArrayList(); mapColumnNames.add("Key"); mapColumnNames.add("Value"); - VectorizationContext ctx = new VectorizationContext(mapColumnNames); + VectorizationContext ctx = new VectorizationContext("name", mapColumnNames); Set keys = new HashSet(); AggregationDesc agg = buildAggregationDesc(ctx, aggregateName, @@ -2234,7 +2234,7 @@ public void testAggregateCountStarIterable ( Object expected) throws HiveException { List mapColumnNames = new ArrayList(); mapColumnNames.add("A"); - VectorizationContext ctx = new VectorizationContext(mapColumnNames); + VectorizationContext ctx = new VectorizationContext("name", mapColumnNames); GroupByDesc desc = buildGroupByDescCountStar (ctx); @@ -2263,7 +2263,7 @@ public void testAggregateCountReduceIterable ( Object expected) throws HiveException { List mapColumnNames = new ArrayList(); mapColumnNames.add("A"); - VectorizationContext ctx = new VectorizationContext(mapColumnNames); + VectorizationContext ctx = new VectorizationContext("name", mapColumnNames); GroupByDesc desc = buildGroupByDescType(ctx, "count", "A", TypeInfoFactory.longTypeInfo); VectorGroupByDesc vectorDesc = desc.getVectorDesc(); @@ -2295,7 +2295,7 @@ public void testAggregateStringIterable ( Object expected) throws HiveException { List mapColumnNames = new ArrayList(); mapColumnNames.add("A"); - VectorizationContext ctx = new VectorizationContext(mapColumnNames); + VectorizationContext ctx = new VectorizationContext("name", mapColumnNames); GroupByDesc desc = buildGroupByDescType(ctx, aggregateName, "A", TypeInfoFactory.stringTypeInfo); @@ -2326,7 +2326,7 @@ public void testAggregateDecimalIterable ( Object expected) throws HiveException { List mapColumnNames = new ArrayList(); mapColumnNames.add("A"); - VectorizationContext ctx = new VectorizationContext(mapColumnNames); + VectorizationContext ctx = new VectorizationContext("name", mapColumnNames); GroupByDesc desc = buildGroupByDescType(ctx, aggregateName, "A", 
TypeInfoFactory.getDecimalTypeInfo(30, 4)); @@ -2358,7 +2358,7 @@ public void testAggregateDoubleIterable ( Object expected) throws HiveException { List mapColumnNames = new ArrayList(); mapColumnNames.add("A"); - VectorizationContext ctx = new VectorizationContext(mapColumnNames); + VectorizationContext ctx = new VectorizationContext("name", mapColumnNames); GroupByDesc desc = buildGroupByDescType (ctx, aggregateName, "A", TypeInfoFactory.doubleTypeInfo); @@ -2389,7 +2389,7 @@ public void testAggregateLongIterable ( Object expected) throws HiveException { List mapColumnNames = new ArrayList(); mapColumnNames.add("A"); - VectorizationContext ctx = new VectorizationContext(mapColumnNames); + VectorizationContext ctx = new VectorizationContext("name", mapColumnNames); GroupByDesc desc = buildGroupByDescType(ctx, aggregateName, "A", TypeInfoFactory.longTypeInfo); @@ -2420,7 +2420,7 @@ public void testAggregateLongKeyIterable ( List mapColumnNames = new ArrayList(); mapColumnNames.add("Key"); mapColumnNames.add("Value"); - VectorizationContext ctx = new VectorizationContext(mapColumnNames); + VectorizationContext ctx = new VectorizationContext("name", mapColumnNames); Set keys = new HashSet(); @@ -2487,7 +2487,7 @@ public void testAggregateStringKeyIterable ( List mapColumnNames = new ArrayList(); mapColumnNames.add("Key"); mapColumnNames.add("Value"); - VectorizationContext ctx = new VectorizationContext(mapColumnNames); + VectorizationContext ctx = new VectorizationContext("name", mapColumnNames); Set keys = new HashSet(); GroupByDesc desc = buildKeyGroupByDesc (ctx, aggregateName, "Value", diff --git ql/src/test/org/apache/hadoop/hive/ql/exec/vector/TestVectorRowObject.java ql/src/test/org/apache/hadoop/hive/ql/exec/vector/TestVectorRowObject.java new file mode 100644 index 0000000..0f8712e --- /dev/null +++ ql/src/test/org/apache/hadoop/hive/ql/exec/vector/TestVectorRowObject.java @@ -0,0 +1,100 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.vector; + +import java.util.HashMap; +import java.util.Map; +import java.util.Random; + +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; + +import junit.framework.TestCase; + +/** + * Unit test for the vectorized conversion to and from row object[]. 
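+ *
+ * In outline, the round trip exercised below is (a sketch of the calls the test
+ * methods make; the class and method names come from this patch):
+ *
+ *   VectorAssignRowSameBatch vectorAssignRow = new VectorAssignRowSameBatch();
+ *   vectorAssignRow.init(source.typeNames());
+ *   vectorAssignRow.setOneBatch(batch);
+ *   vectorAssignRow.assignRow(batch.size, row);    // copy an Object[] row into the batch
+ *
+ *   VectorExtractRowSameBatch vectorExtractRow = new VectorExtractRowSameBatch();
+ *   vectorExtractRow.init(source.typeNames());
+ *   vectorExtractRow.setOneBatch(batch);
+ *   vectorExtractRow.extractRow(i, row);           // copy batch row i back into an Object[]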
+ */ +public class TestVectorRowObject extends TestCase { + + void examineBatch(VectorizedRowBatch batch, VectorExtractRowSameBatch vectorExtractRow, + Object[][] randomRows, int firstRandomRowIndex ) { + + int rowSize = vectorExtractRow.getCount(); + Object[] row = new Object[rowSize]; + for (int i = 0; i < batch.size; i++) { + vectorExtractRow.extractRow(i, row); + Object[] expectedRow = randomRows[firstRandomRowIndex + i]; + for (int c = 0; c < rowSize; c++) { + if (!row[c].equals(expectedRow[c])) { + fail("Row " + (firstRandomRowIndex + i) + " and column " + c + " mismatch"); + } + } + } + } + + void testVectorRowObject(int caseNum, Random r) throws HiveException { + + Map emptyScratchMap = new HashMap(); + + RandomRowObjectSource source = new RandomRowObjectSource(); + source.init(r); + + VectorizedRowBatchCtx batchContext = new VectorizedRowBatchCtx(); + batchContext.init(emptyScratchMap, source.rowStructObjectInspector()); + VectorizedRowBatch batch = batchContext.createVectorizedRowBatch(); + + VectorAssignRowSameBatch vectorAssignRow = new VectorAssignRowSameBatch(); + vectorAssignRow.init(source.typeNames()); + vectorAssignRow.setOneBatch(batch); + + VectorExtractRowSameBatch vectorExtractRow = new VectorExtractRowSameBatch(); + vectorExtractRow.init(source.typeNames()); + vectorExtractRow.setOneBatch(batch); + + Object[][] randomRows = source.randomRows(100000); + int firstRandomRowIndex = 0; + for (int i = 0; i < randomRows.length; i++) { + Object[] row = randomRows[i]; + + vectorAssignRow.assignRow(batch.size, row); + batch.size++; + if (batch.size == batch.DEFAULT_SIZE) { + examineBatch(batch, vectorExtractRow, randomRows, firstRandomRowIndex); + firstRandomRowIndex = i + 1; + batch.reset(); + } + } + if (batch.size > 0) { + examineBatch(batch, vectorExtractRow, randomRows, firstRandomRowIndex); + } + } + + public void testVectorRowObject() throws Throwable { + + try { + Random r = new Random(5678); + for (int c = 0; c < 10; c++) { + testVectorRowObject(c, r); + } + } catch (Throwable e) { + e.printStackTrace(); + throw e; + } + } +} \ No newline at end of file diff --git ql/src/test/org/apache/hadoop/hive/ql/exec/vector/TestVectorSelectOperator.java ql/src/test/org/apache/hadoop/hive/ql/exec/vector/TestVectorSelectOperator.java index 3c004a1..872ef6e 100644 --- ql/src/test/org/apache/hadoop/hive/ql/exec/vector/TestVectorSelectOperator.java +++ ql/src/test/org/apache/hadoop/hive/ql/exec/vector/TestVectorSelectOperator.java @@ -88,7 +88,7 @@ public void testSelectOperator() throws HiveException { columns.add("a"); columns.add("b"); columns.add("c"); - VectorizationContext vc = new VectorizationContext(columns); + VectorizationContext vc = new VectorizationContext("name", columns); SelectDesc selDesc = new SelectDesc(false); List colList = new ArrayList(); diff --git ql/src/test/org/apache/hadoop/hive/ql/exec/vector/TestVectorSerDeRow.java ql/src/test/org/apache/hadoop/hive/ql/exec/vector/TestVectorSerDeRow.java new file mode 100644 index 0000000..6f20b4e --- /dev/null +++ ql/src/test/org/apache/hadoop/hive/ql/exec/vector/TestVectorSerDeRow.java @@ -0,0 +1,410 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.vector; + +import java.io.IOException; +import java.sql.Date; +import java.util.Arrays; +import java.util.HashMap; +import java.util.Map; +import java.util.Random; + +import org.apache.hadoop.hive.serde2.io.ByteWritable; +import org.apache.hadoop.hive.serde2.io.DateWritable; +import org.apache.hadoop.hive.serde2.io.DoubleWritable; +import org.apache.hadoop.hive.serde2.io.ShortWritable; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.serde2.ByteStream.Output; +import org.apache.hadoop.hive.serde2.binarysortable.deserializeread.BinarySortableDeserializeRead; +import org.apache.hadoop.hive.serde2.binarysortable.serializewrite.BinarySortableSerializeWrite; +import org.apache.hadoop.hive.serde2.deserializeread.DeserializeRead; +import org.apache.hadoop.hive.serde2.deserializeread.DeserializeRead.ReadBytesResults; +import org.apache.hadoop.hive.serde2.deserializeread.DeserializeRead.ReadDateResults; +import org.apache.hadoop.hive.serde2.lazybinary.deserializeread.LazyBinaryDeserializeRead; +import org.apache.hadoop.hive.serde2.lazybinary.serializewrite.LazyBinarySerializeWrite; +import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableByteObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableDateObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableDoubleObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableFloatObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableIntObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableLongObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableShortObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableStringObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; +import org.apache.hadoop.hive.serde2.serializewrite.SerializeWrite; +import org.apache.hadoop.io.BooleanWritable; +import org.apache.hadoop.io.FloatWritable; +import org.apache.hadoop.io.IntWritable; +import org.apache.hadoop.io.LongWritable; +import org.apache.hadoop.io.Text; + +import junit.framework.TestCase; + +/** + * Unit test for the vectorized serialize and deserialize row. 
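+ *
+ * In outline (a sketch of the calls made in the test methods below; the
+ * SerializeWrite/DeserializeRead implementations are picked per SerializationType):
+ *
+ *   // Serialize row i of a VectorizedRowBatch into an Output buffer.
+ *   VectorSerializeRow vectorSerializeRow = new VectorSerializeRow(serializeWrite);
+ *   vectorSerializeRow.init(source.typeNames());
+ *   vectorSerializeRow.setOutput(output);
+ *   vectorSerializeRow.serializeWrite(batch, i);
+ *
+ *   // Deserialize those bytes into the next row of a batch.
+ *   VectorDeserializeRow vectorDeserializeRow = new VectorDeserializeRow(deserializeRead);
+ *   vectorDeserializeRow.init(source.typeNames());
+ *   vectorDeserializeRow.setBytes(output.getData(), 0, output.getLength());
+ *   vectorDeserializeRow.deserializeByValue(batch, batch.size);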
+ */ +public class TestVectorSerDeRow extends TestCase { + + public static enum SerializationType { + NONE, + BINARY_SORTABLE, + LAZY_BINARY + } + + void deserializeAndVerify(Output output, DeserializeRead deserializeRead, + RandomRowObjectSource source, Object[] expectedRow) + throws HiveException, IOException { + deserializeRead.set(output.getData(), 0, output.getLength()); + PrimitiveCategory[] primitiveCategories = source.primitiveCategories(); + for (int i = 0; i < primitiveCategories.length; i++) { + Object expected = expectedRow[i]; + PrimitiveCategory primitiveCategory = primitiveCategories[i]; + if (deserializeRead.readCheckNull()) { + throw new HiveException("Unexpected NULL"); + } + switch (primitiveCategory) { + case BOOLEAN: + { + Boolean value = deserializeRead.readBoolean(); + BooleanWritable expectedWritable = (BooleanWritable) expected; + if (!value.equals(expectedWritable.get())) { + TestCase.fail("Boolean field mismatch (expected " + expected + " found " + value + ")"); + } + } + break; + case BYTE: + { + Byte value = deserializeRead.readByte(); + ByteWritable expectedWritable = (ByteWritable) expected; + if (!value.equals(expectedWritable.get())) { + TestCase.fail("Byte field mismatch (expected " + (int) expected + " found " + (int) value + ")"); + } + } + break; + case SHORT: + { + Short value = deserializeRead.readShort(); + ShortWritable expectedWritable = (ShortWritable) expected; + if (!value.equals(expectedWritable.get())) { + TestCase.fail("Short field mismatch (expected " + expected + " found " + value + ")"); + } + } + break; + case INT: + { + Integer value = deserializeRead.readInt(); + IntWritable expectedWritable = (IntWritable) expected; + if (!value.equals(expectedWritable.get())) { + TestCase.fail("Int field mismatch (expected " + expected + " found " + value + ")"); + } + } + break; + case LONG: + { + Long value = deserializeRead.readLong(); + LongWritable expectedWritable = (LongWritable) expected; + if (!value.equals(expectedWritable.get())) { + TestCase.fail("Long field mismatch (expected " + expected + " found " + value + ")"); + } + } + break; + case DATE: + { + ReadDateResults readDateResults = deserializeRead.createReadDateResults(); + deserializeRead.readDate(readDateResults); + Date value = readDateResults.getDate(); + DateWritable expectedWritable = (DateWritable) expected; + if (!value.equals(expectedWritable.get())) { + TestCase.fail("Date field mismatch (expected " + expected.toString() + " found " + value.toString() + ")"); + } + } + break; + case FLOAT: + { + Float value = deserializeRead.readFloat(); + FloatWritable expectedWritable = (FloatWritable) expected; + if (!value.equals(expectedWritable.get())) { + TestCase.fail("Float field mismatch (expected " + expected + " found " + value + ")"); + } + } + break; + case DOUBLE: + { + Double value = deserializeRead.readDouble(); + DoubleWritable expectedWritable = (DoubleWritable) expected; + if (!value.equals(expectedWritable.get())) { + TestCase.fail("Double field mismatch (expected " + expected + " found " + value + ")"); + } + } + break; + case STRING: + { + ReadBytesResults readBytesResults = deserializeRead.createReadBytesResults(); + deserializeRead.readBytes(readBytesResults); + byte[] stringBytes = Arrays.copyOfRange(readBytesResults.bytes, readBytesResults.start, readBytesResults.start + readBytesResults.length); + Text text = new Text(stringBytes); + String value = text.toString(); + Text expectedWritable = (Text) expected; + if (!value.equals(expectedWritable.toString())) { + 
TestCase.fail("String field mismatch (expected '" + expected + "' found '" + value + "')"); + } + } + break; + default: + throw new HiveException("Unexpected primitive category " + primitiveCategory); + } + } + } + + void serializeBatch(VectorizedRowBatch batch, VectorSerializeRow vectorSerializeRow, + DeserializeRead deserializeRead, RandomRowObjectSource source, Object[][] randomRows, + int firstRandomRowIndex) throws HiveException, IOException { + + Output output = new Output(); + for (int i = 0; i < batch.size; i++) { + output.reset(); + vectorSerializeRow.setOutput(output); + vectorSerializeRow.serializeWrite(batch, i); + Object[] expectedRow = randomRows[firstRandomRowIndex + i]; + deserializeAndVerify(output, deserializeRead, source,expectedRow); + } + } + + void testVectorSerializeRow(int caseNum, Random r, SerializationType serializationType) throws HiveException, IOException { + + Map emptyScratchMap = new HashMap(); + + RandomRowObjectSource source = new RandomRowObjectSource(); + source.init(r); + + VectorizedRowBatchCtx batchContext = new VectorizedRowBatchCtx(); + batchContext.init(emptyScratchMap, source.rowStructObjectInspector()); + VectorizedRowBatch batch = batchContext.createVectorizedRowBatch(); + + VectorAssignRowSameBatch vectorAssignRow = new VectorAssignRowSameBatch(); + vectorAssignRow.init(source.typeNames()); + vectorAssignRow.setOneBatch(batch); + + int fieldCount = source.typeNames().size(); + DeserializeRead deserializeRead; + SerializeWrite serializeWrite; + switch (serializationType) { + case BINARY_SORTABLE: + deserializeRead = new BinarySortableDeserializeRead(fieldCount); + serializeWrite = new BinarySortableSerializeWrite(fieldCount); + break; + case LAZY_BINARY: + deserializeRead = new LazyBinaryDeserializeRead(fieldCount); + serializeWrite = new LazyBinarySerializeWrite(fieldCount, false); + break; + default: + throw new Error("Unknown serialization type " + serializationType); + } + VectorSerializeRow vectorSerializeRow = new VectorSerializeRow(serializeWrite); + vectorSerializeRow.init(source.typeNames()); + + Object[][] randomRows = source.randomRows(100000); + int firstRandomRowIndex = 0; + for (int i = 0; i < randomRows.length; i++) { + Object[] row = randomRows[i]; + + vectorAssignRow.assignRow(batch.size, row); + batch.size++; + if (batch.size == batch.DEFAULT_SIZE) { + serializeBatch(batch, vectorSerializeRow, deserializeRead, source, randomRows, firstRandomRowIndex); + firstRandomRowIndex = i + 1; + batch.reset(); + } + } + if (batch.size > 0) { + serializeBatch(batch, vectorSerializeRow, deserializeRead, source, randomRows, firstRandomRowIndex); + } + } + + void examineBatch(VectorizedRowBatch batch, VectorExtractRowSameBatch vectorExtractRow, + Object[][] randomRows, int firstRandomRowIndex ) { + + int rowSize = vectorExtractRow.getCount(); + Object[] row = new Object[rowSize]; + for (int i = 0; i < batch.size; i++) { + vectorExtractRow.extractRow(i, row); + Object[] expectedRow = randomRows[firstRandomRowIndex + i]; + for (int c = 0; c < rowSize; c++) { + if (!row[c].equals(expectedRow[c])) { + fail("Row " + (firstRandomRowIndex + i) + " and column " + c + " mismatch"); + } + } + } + } + + private Output serializeRow(Object[] row, RandomRowObjectSource source, SerializeWrite serializeWrite) throws HiveException { + Output output = new Output(); + serializeWrite.set(output); + PrimitiveCategory[] primitiveCategories = source.primitiveCategories(); + for (int i = 0; i < primitiveCategories.length; i++) { + Object object = row[i]; + 
PrimitiveCategory primitiveCategory = primitiveCategories[i]; + switch (primitiveCategory) { + case BOOLEAN: + { + BooleanWritable expectedWritable = (BooleanWritable) object; + boolean value = expectedWritable.get(); + serializeWrite.writeBoolean(value); + } + break; + case BYTE: + { + ByteWritable expectedWritable = (ByteWritable) object; + byte value = expectedWritable.get(); + serializeWrite.writeByte(value); + } + break; + case SHORT: + { + ShortWritable expectedWritable = (ShortWritable) object; + short value = expectedWritable.get(); + serializeWrite.writeShort(value); + } + break; + case INT: + { + IntWritable expectedWritable = (IntWritable) object; + int value = expectedWritable.get(); + serializeWrite.writeInt(value); + } + break; + case LONG: + { + LongWritable expectedWritable = (LongWritable) object; + long value = expectedWritable.get(); + serializeWrite.writeLong(value); + } + break; + case DATE: + { + DateWritable expectedWritable = (DateWritable) object; + Date value = expectedWritable.get(); + serializeWrite.writeDate(value); + } + break; + case FLOAT: + { + FloatWritable expectedWritable = (FloatWritable) object; + float value = expectedWritable.get(); + serializeWrite.writeFloat(value); + } + break; + case DOUBLE: + { + DoubleWritable expectedWritable = (DoubleWritable) object; + double value = expectedWritable.get(); + serializeWrite.writeDouble(value); + } + break; + case STRING: + { + Text text = (Text) object; + serializeWrite.writeBytes(text.getBytes(), 0, text.getLength()); + } + break; + default: + throw new HiveException("Unexpected primitive category " + primitiveCategory); + } + } + return output; + } + + void testVectorDeserializeRow(int caseNum, Random r, SerializationType serializationType) throws HiveException, IOException { + + Map emptyScratchMap = new HashMap(); + + RandomRowObjectSource source = new RandomRowObjectSource(); + source.init(r); + + VectorizedRowBatchCtx batchContext = new VectorizedRowBatchCtx(); + batchContext.init(emptyScratchMap, source.rowStructObjectInspector()); + VectorizedRowBatch batch = batchContext.createVectorizedRowBatch(); + + int fieldCount = source.typeNames().size(); + DeserializeRead deserializeRead; + SerializeWrite serializeWrite; + switch (serializationType) { + case BINARY_SORTABLE: + deserializeRead = new BinarySortableDeserializeRead(fieldCount); + serializeWrite = new BinarySortableSerializeWrite(fieldCount); + break; + case LAZY_BINARY: + deserializeRead = new LazyBinaryDeserializeRead(fieldCount); + serializeWrite = new LazyBinarySerializeWrite(fieldCount, false); + break; + default: + throw new Error("Unknown serialization type " + serializationType); + } + VectorDeserializeRow vectorDeserializeRow = new VectorDeserializeRow(deserializeRead); + vectorDeserializeRow.init(source.typeNames()); + + VectorExtractRowSameBatch vectorExtractRow = new VectorExtractRowSameBatch(); + vectorExtractRow.init(source.typeNames()); + vectorExtractRow.setOneBatch(batch); + + Object[][] randomRows = source.randomRows(100000); + int firstRandomRowIndex = 0; + for (int i = 0; i < randomRows.length; i++) { + Object[] row = randomRows[i]; + + Output output = serializeRow(row, source, serializeWrite); + vectorDeserializeRow.setBytes(output.getData(), 0, output.getLength()); + vectorDeserializeRow.deserializeByValue(batch, batch.size); + batch.size++; + if (batch.size == batch.DEFAULT_SIZE) { + examineBatch(batch, vectorExtractRow, randomRows, firstRandomRowIndex); + firstRandomRowIndex = i + 1; + batch.reset(); + } + } + if 
(batch.size > 0) { + examineBatch(batch, vectorExtractRow, randomRows, firstRandomRowIndex); + } + } + + public void testVectorSerDeRow() throws Throwable { + + try { + Random r = new Random(5678); + for (int c = 0; c < 10; c++) { + testVectorSerializeRow(c, r, SerializationType.BINARY_SORTABLE); + } + for (int c = 0; c < 10; c++) { + testVectorSerializeRow(c, r, SerializationType.LAZY_BINARY); + } + + for (int c = 0; c < 10; c++) { + testVectorDeserializeRow(c, r, SerializationType.BINARY_SORTABLE); + } + for (int c = 0; c < 10; c++) { + testVectorDeserializeRow(c, r, SerializationType.LAZY_BINARY); + } + + } catch (Throwable e) { + e.printStackTrace(); + throw e; + } + } +} \ No newline at end of file diff --git ql/src/test/org/apache/hadoop/hive/ql/exec/vector/TestVectorizationContext.java ql/src/test/org/apache/hadoop/hive/ql/exec/vector/TestVectorizationContext.java index efe2efe..58784fc 100644 --- ql/src/test/org/apache/hadoop/hive/ql/exec/vector/TestVectorizationContext.java +++ ql/src/test/org/apache/hadoop/hive/ql/exec/vector/TestVectorizationContext.java @@ -247,7 +247,7 @@ public void testArithmeticExpressionVectorization() throws HiveException { children5.add(col6Expr); modExpr.setChildren(children5); - VectorizationContext vc = new VectorizationContext(); + VectorizationContext vc = new VectorizationContext("name"); vc.addInitialColumn("col1"); vc.addInitialColumn("col2"); vc.addInitialColumn("col3"); @@ -297,7 +297,7 @@ public void testStringFilterExpressions() throws HiveException { columns.add("col0"); columns.add("col1"); columns.add("col2"); - VectorizationContext vc = new VectorizationContext(columns); + VectorizationContext vc = new VectorizationContext("name", columns); VectorExpression ve = vc.getVectorExpression(exprDesc, VectorExpressionDescriptor.Mode.FILTER); @@ -322,7 +322,7 @@ public void testFilterStringColCompareStringColumnExpressions() throws HiveExcep columns.add("col0"); columns.add("col1"); columns.add("col2"); - VectorizationContext vc = new VectorizationContext(columns); + VectorizationContext vc = new VectorizationContext("name", columns); VectorExpression ve = vc.getVectorExpression(exprDesc, VectorExpressionDescriptor.Mode.FILTER); @@ -341,7 +341,7 @@ public void testFilterStringColCompareStringColumnExpressions() throws HiveExcep children1.add(col2Expr); exprDesc.setChildren(children1); - vc = new VectorizationContext(columns); + vc = new VectorizationContext("name", columns); ve = vc.getVectorExpression(exprDesc, VectorExpressionDescriptor.Mode.FILTER); @@ -360,7 +360,7 @@ public void testFilterStringColCompareStringColumnExpressions() throws HiveExcep children1.add(col2Expr); exprDesc.setChildren(children1); - vc = new VectorizationContext(columns); + vc = new VectorizationContext("name", columns); ve = vc.getVectorExpression(exprDesc, VectorExpressionDescriptor.Mode.FILTER); @@ -378,7 +378,7 @@ public void testFilterStringColCompareStringColumnExpressions() throws HiveExcep children1.add(col2Expr); exprDesc.setChildren(children1); - vc = new VectorizationContext(columns); + vc = new VectorizationContext("name", columns); ve = vc.getVectorExpression(exprDesc, VectorExpressionDescriptor.Mode.FILTER); @@ -395,7 +395,7 @@ public void testFilterStringColCompareStringColumnExpressions() throws HiveExcep children1.add(col2Expr); exprDesc.setChildren(children1); - vc = new VectorizationContext(columns); + vc = new VectorizationContext("name", columns); ve = vc.getVectorExpression(exprDesc, VectorExpressionDescriptor.Mode.FILTER); @@ -412,7 +412,7 @@ 
public void testFilterStringColCompareStringColumnExpressions() throws HiveExcep children1.add(col2Expr); exprDesc.setChildren(children1); - vc = new VectorizationContext(columns); + vc = new VectorizationContext("name", columns); ve = vc.getVectorExpression(exprDesc, VectorExpressionDescriptor.Mode.FILTER); @@ -434,7 +434,7 @@ public void testFloatInExpressions() throws HiveException { List columns = new ArrayList(); columns.add("col1"); - VectorizationContext vc = new VectorizationContext(columns); + VectorizationContext vc = new VectorizationContext("name", columns); VectorExpression ve = vc.getVectorExpression(exprDesc, VectorExpressionDescriptor.Mode.PROJECTION); @@ -480,7 +480,7 @@ public void testVectorizeFilterAndOrExpression() throws HiveException { columns.add("col0"); columns.add("col1"); columns.add("col2"); - VectorizationContext vc = new VectorizationContext(columns); + VectorizationContext vc = new VectorizationContext("name", columns); VectorExpression ve = vc.getVectorExpression(andExprDesc, VectorExpressionDescriptor.Mode.FILTER); @@ -530,7 +530,7 @@ public void testVectorizeAndOrProjectionExpression() throws HiveException { List columns = new ArrayList(); columns.add("col1"); columns.add("col2"); - VectorizationContext vc = new VectorizationContext(columns); + VectorizationContext vc = new VectorizationContext("name", columns); VectorExpression veAnd = vc.getVectorExpression(andExprDesc, VectorExpressionDescriptor.Mode.FILTER); assertEquals(veAnd.getClass(), FilterExprAndExpr.class); assertEquals(veAnd.getChildExpressions()[0].getClass(), FilterLongColGreaterLongScalar.class); @@ -555,7 +555,7 @@ public void testVectorizeAndOrProjectionExpression() throws HiveException { orExprDesc.setChildren(children4); //Allocate new Vectorization context to reset the intermediate columns. 
- vc = new VectorizationContext(columns); + vc = new VectorizationContext("name", columns); VectorExpression veOr = vc.getVectorExpression(orExprDesc, VectorExpressionDescriptor.Mode.FILTER); assertEquals(veOr.getClass(), FilterExprOrExpr.class); assertEquals(veOr.getChildExpressions()[0].getClass(), FilterLongColGreaterLongScalar.class); @@ -596,7 +596,7 @@ public void testNotExpression() throws HiveException { columns.add("col0"); columns.add("col1"); columns.add("col2"); - VectorizationContext vc = new VectorizationContext(columns); + VectorizationContext vc = new VectorizationContext("name", columns); VectorExpression ve = vc.getVectorExpression(notExpr, VectorExpressionDescriptor.Mode.FILTER); @@ -633,7 +633,7 @@ public void testNullExpressions() throws HiveException { List columns = new ArrayList(); columns.add("col1"); columns.add("col2"); - VectorizationContext vc = new VectorizationContext(columns); + VectorizationContext vc = new VectorizationContext("name", columns); VectorExpression ve = vc.getVectorExpression(isNullExpr, VectorExpressionDescriptor.Mode.FILTER); @@ -674,7 +674,7 @@ public void testNotNullExpressions() throws HiveException { List columns = new ArrayList(); columns.add("col1"); columns.add("col2"); - VectorizationContext vc = new VectorizationContext(columns); + VectorizationContext vc = new VectorizationContext("name", columns); VectorExpression ve = vc.getVectorExpression(isNotNullExpr, VectorExpressionDescriptor.Mode.FILTER); @@ -703,7 +703,7 @@ public void testVectorizeScalarColumnExpression() throws HiveException { List columns = new ArrayList(); columns.add("a"); - VectorizationContext vc = new VectorizationContext(columns); + VectorizationContext vc = new VectorizationContext("name", columns); VectorExpression ve = vc.getVectorExpression(scalarMinusConstant, VectorExpressionDescriptor.Mode.PROJECTION); assertEquals(ve.getClass(), LongScalarSubtractLongColumn.class); @@ -726,7 +726,7 @@ public void testFilterWithNegativeScalar() throws HiveException { columns.add("col0"); columns.add("col1"); columns.add("col2"); - VectorizationContext vc = new VectorizationContext(columns); + VectorizationContext vc = new VectorizationContext("name", columns); VectorExpression ve = vc.getVectorExpression(exprDesc, VectorExpressionDescriptor.Mode.FILTER); @@ -744,7 +744,7 @@ public void testUnaryMinusColumnLong() throws HiveException { List columns = new ArrayList(); columns.add("col0"); columns.add("col1"); - VectorizationContext vc = new VectorizationContext(columns); + VectorizationContext vc = new VectorizationContext("name", columns); VectorExpression ve = vc.getVectorExpression(negExprDesc, VectorExpressionDescriptor.Mode.PROJECTION); @@ -762,7 +762,7 @@ public void testUnaryMinusColumnDouble() throws HiveException { List columns = new ArrayList(); columns.add("col0"); columns.add("col1"); - VectorizationContext vc = new VectorizationContext(columns); + VectorizationContext vc = new VectorizationContext("name", columns); VectorExpression ve = vc.getVectorExpression(negExprDesc, VectorExpressionDescriptor.Mode.PROJECTION); @@ -787,7 +787,7 @@ public void testFilterScalarCompareColumn() throws HiveException { List columns = new ArrayList(); columns.add("a"); - VectorizationContext vc = new VectorizationContext(columns); + VectorizationContext vc = new VectorizationContext("name", columns); VectorExpression ve = vc.getVectorExpression(scalarGreaterColExpr, VectorExpressionDescriptor.Mode.FILTER); assertEquals(FilterLongScalarGreaterLongColumn.class, ve.getClass()); } @@ 
-810,7 +810,7 @@ public void testFilterBooleanColumnCompareBooleanScalar() throws HiveException { List columns = new ArrayList(); columns.add("a"); - VectorizationContext vc = new VectorizationContext(columns); + VectorizationContext vc = new VectorizationContext("name", columns); VectorExpression ve = vc.getVectorExpression(colEqualScalar, VectorExpressionDescriptor.Mode.FILTER); assertEquals(FilterLongColEqualLongScalar.class, ve.getClass()); } @@ -833,7 +833,7 @@ public void testBooleanColumnCompareBooleanScalar() throws HiveException { List columns = new ArrayList(); columns.add("a"); - VectorizationContext vc = new VectorizationContext(columns); + VectorizationContext vc = new VectorizationContext("name", columns); VectorExpression ve = vc.getVectorExpression(colEqualScalar, VectorExpressionDescriptor.Mode.PROJECTION); assertEquals(LongColEqualLongScalar.class, ve.getClass()); } @@ -850,7 +850,7 @@ public void testUnaryStringExpressions() throws HiveException { List columns = new ArrayList(); columns.add("b"); columns.add("a"); - VectorizationContext vc = new VectorizationContext(columns); + VectorizationContext vc = new VectorizationContext("name", columns); GenericUDF stringLower = new GenericUDFLower(); stringUnary.setGenericUDF(stringLower); @@ -860,7 +860,7 @@ public void testUnaryStringExpressions() throws HiveException { assertEquals(1, ((StringLower) ve).getColNum()); assertEquals(2, ((StringLower) ve).getOutputColumn()); - vc = new VectorizationContext(columns); + vc = new VectorizationContext("name", columns); ExprNodeGenericFuncDesc anotherUnary = new ExprNodeGenericFuncDesc(); anotherUnary.setTypeInfo(TypeInfoFactory.stringTypeInfo); @@ -895,7 +895,7 @@ public void testMathFunctions() throws HiveException { List columns = new ArrayList(); columns.add("b"); columns.add("a"); - VectorizationContext vc = new VectorizationContext(columns); + VectorizationContext vc = new VectorizationContext("name", columns); // Sin(double) GenericUDFBridge gudfBridge = new GenericUDFBridge("sin", false, UDFSin.class.getName()); @@ -986,7 +986,7 @@ public void testTimeStampUdfs() throws HiveException { List columns = new ArrayList(); columns.add("b"); columns.add("a"); - VectorizationContext vc = new VectorizationContext(columns); + VectorizationContext vc = new VectorizationContext("name", columns); //UDFYear GenericUDFBridge gudfBridge = new GenericUDFBridge("year", false, UDFYear.class.getName()); @@ -1024,7 +1024,7 @@ public void testBetweenFilters() throws HiveException { columns.add("col0"); columns.add("col1"); columns.add("col2"); - VectorizationContext vc = new VectorizationContext(columns); + VectorizationContext vc = new VectorizationContext("name", columns); VectorExpression ve = vc.getVectorExpression(exprDesc, VectorExpressionDescriptor.Mode.FILTER); assertTrue(ve instanceof FilterStringColumnBetween); @@ -1050,7 +1050,7 @@ public void testBetweenFilters() throws HiveException { exprDesc = new ExprNodeGenericFuncDesc(TypeInfoFactory.booleanTypeInfo, udf, children1); - vc = new VectorizationContext(columns); + vc = new VectorizationContext("name", columns); ve = vc.getVectorExpression(exprDesc, VectorExpressionDescriptor.Mode.FILTER); assertTrue(ve instanceof FilterCharColumnBetween); @@ -1075,7 +1075,7 @@ public void testBetweenFilters() throws HiveException { exprDesc = new ExprNodeGenericFuncDesc(TypeInfoFactory.booleanTypeInfo, udf, children1); - vc = new VectorizationContext(columns); + vc = new VectorizationContext("name", columns); ve = vc.getVectorExpression(exprDesc, 
VectorExpressionDescriptor.Mode.FILTER); assertTrue(ve instanceof FilterVarCharColumnBetween); @@ -1144,7 +1144,7 @@ public void testInFiltersAndExprs() throws HiveException { columns.add("col0"); columns.add("col1"); columns.add("col2"); - VectorizationContext vc = new VectorizationContext(columns); + VectorizationContext vc = new VectorizationContext("name", columns); VectorExpression ve = vc.getVectorExpression(exprDesc, VectorExpressionDescriptor.Mode.FILTER); assertTrue(ve instanceof FilterStringColumnInList); ve = vc.getVectorExpression(exprDesc, VectorExpressionDescriptor.Mode.PROJECTION); @@ -1199,7 +1199,7 @@ public void testIfConditionalExprs() throws HiveException { columns.add("col1"); columns.add("col2"); columns.add("col3"); - VectorizationContext vc = new VectorizationContext(columns); + VectorizationContext vc = new VectorizationContext("name", columns); VectorExpression ve = vc.getVectorExpression(exprDesc); assertTrue(ve instanceof IfExprLongColumnLongColumn); diff --git ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java index 2cc3d7a..c62264a 100644 --- ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java +++ ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java @@ -1315,9 +1315,6 @@ JobConf createMockExecutionEnvironment(Path workDir, } mapWork.setPathToAliases(aliasMap); mapWork.setPathToPartitionInfo(partMap); - mapWork.setAllColumnVectorMaps(new HashMap>()); - mapWork.setAllScratchColumnVectorTypeMaps(new HashMap>()); // write the plan out FileSystem localFs = FileSystem.getLocal(conf).getRaw(); diff --git ql/src/test/org/apache/hadoop/hive/ql/optimizer/physical/TestVectorizer.java ql/src/test/org/apache/hadoop/hive/ql/optimizer/physical/TestVectorizer.java index ec47c08..d12c137 100644 --- ql/src/test/org/apache/hadoop/hive/ql/optimizer/physical/TestVectorizer.java +++ ql/src/test/org/apache/hadoop/hive/ql/optimizer/physical/TestVectorizer.java @@ -52,7 +52,7 @@ public void setUp() { columns.add("col3"); //Generate vectorized expression - vContext = new VectorizationContext(columns); + vContext = new VectorizationContext("name", columns); } @Description(name = "fake", value = "FAKE") diff --git serde/src/java/org/apache/hadoop/hive/serde2/ByteStream.java serde/src/java/org/apache/hadoop/hive/serde2/ByteStream.java index 390d9de..0a54c2d 100644 --- serde/src/java/org/apache/hadoop/hive/serde2/ByteStream.java +++ serde/src/java/org/apache/hadoop/hive/serde2/ByteStream.java @@ -19,9 +19,11 @@ package org.apache.hadoop.hive.serde2; import java.io.IOException; +import java.util.Arrays; import org.apache.hadoop.hive.common.io.NonSyncByteArrayInputStream; import org.apache.hadoop.hive.common.io.NonSyncByteArrayOutputStream; +import org.apache.hadoop.hive.serde2.ByteStream.Output; /** * Extensions to bytearrayinput/output streams. 
@@ -88,15 +90,33 @@ public void writeInt(long offset, int value) { } @Override + public void writeByte(long offset, byte value) { + getData()[(int) offset] = value; + } + + @Override public void reserve(int byteCount) { for (int i = 0; i < byteCount; ++i) { write(0); } } + + public boolean arraysEquals(Output output) { + if (count != output.count) { + return false; + } + for (int i = 0; i < count; i++) { + if (buf[i] != output.buf[i]) { + return false; + } + } + return true; + } } public static interface RandomAccessOutput { public void writeInt(long offset, int value); + public void writeByte(long offset, byte value); public void reserve(int byteCount); public void write(int b); public void write(byte b[]) throws IOException; diff --git serde/src/java/org/apache/hadoop/hive/serde2/WriteBuffers.java serde/src/java/org/apache/hadoop/hive/serde2/WriteBuffers.java index bed4d0a..6ea0a7d 100644 --- serde/src/java/org/apache/hadoop/hive/serde2/WriteBuffers.java +++ serde/src/java/org/apache/hadoop/hive/serde2/WriteBuffers.java @@ -276,6 +276,38 @@ public boolean isEqual(byte[] left, int leftLength, long rightOffset, int rightL return true; } + /** + * Compares part of the buffer with a part of an external byte array. + * Does not modify readPoint. + */ + public boolean isEqual(byte[] left, int leftOffset, int leftLength, long rightOffset, int rightLength) { + if (rightLength != leftLength) { + return false; + } + int rightIndex = getBufferIndex(rightOffset), rightFrom = getOffset(rightOffset); + byte[] rightBuffer = writeBuffers.get(rightIndex); + if (rightFrom + rightLength <= wbSize) { + // TODO: allow using unsafe optionally. + for (int i = 0; i < leftLength; ++i) { + if (left[leftOffset + i] != rightBuffer[rightFrom + i]) { + return false; + } + } + return true; + } + for (int i = 0; i < rightLength; ++i) { + if (rightFrom == wbSize) { + ++rightIndex; + rightBuffer = writeBuffers.get(rightIndex); + rightFrom = 0; + } + if (left[leftOffset + i] != rightBuffer[rightFrom++]) { + return false; + } + } + return true; + } + public void clear() { writeBuffers.clear(); currentWriteBuffer = currentReadBuffer = null; @@ -478,8 +510,23 @@ public void writeInt(long offset, int v) { currentWriteOffset = prevOffset; } + + @Override + public void writeByte(long offset, byte value) { + int prevIndex = currentWriteBufferIndex, prevOffset = currentWriteOffset; + setWritePoint(offset); + if (isAllInOneWriteBuffer(1)) { + currentWriteBuffer[currentWriteOffset] = value; + } else { + setByte(offset, value); + } + currentWriteBufferIndex = prevIndex; + currentWriteBuffer = writeBuffers.get(currentWriteBufferIndex); + currentWriteOffset = prevOffset; + } + // Lifted from org.apache.hadoop.util.hash.MurmurHash... but supports offset. 
- private static int murmurHash(byte[] data, int offset, int length) { + public static int murmurHash(byte[] data, int offset, int length) { int m = 0x5bd1e995; int r = 24; diff --git serde/src/java/org/apache/hadoop/hive/serde2/binarysortable/BinarySortableSerDe.java serde/src/java/org/apache/hadoop/hive/serde2/binarysortable/BinarySortableSerDe.java index 2b7fba6..fae6fda 100644 --- serde/src/java/org/apache/hadoop/hive/serde2/binarysortable/BinarySortableSerDe.java +++ serde/src/java/org/apache/hadoop/hive/serde2/binarysortable/BinarySortableSerDe.java @@ -126,7 +126,7 @@ boolean[] columnSortOrderIsDesc; private static byte[] decimalBuffer = null; - private static Charset decimalCharSet = Charset.forName("US-ASCII"); + public static Charset decimalCharSet = Charset.forName("US-ASCII"); @Override public void initialize(Configuration conf, Properties tbl) @@ -546,7 +546,7 @@ static int getCharacterMaxLength(TypeInfo type) { return ((BaseCharTypeInfo)type).getLength(); } - static Text deserializeText(InputByteBuffer buffer, boolean invert, Text r) + public static Text deserializeText(InputByteBuffer buffer, boolean invert, Text r) throws IOException { // Get the actual length first int start = buffer.tell(); @@ -610,7 +610,7 @@ public Writable serialize(Object obj, ObjectInspector objInspector) throws SerDe return serializeBytesWritable; } - private static void writeByte(RandomAccessOutput buffer, byte b, boolean invert) { + public static void writeByte(RandomAccessOutput buffer, byte b, boolean invert) { if (invert) { b = (byte) (0xff ^ b); } @@ -857,7 +857,7 @@ static void serialize(ByteStream.Output buffer, Object o, ObjectInspector oi, } - private static void serializeBytes( + public static void serializeBytes( ByteStream.Output buffer, byte[] data, int length, boolean invert) { for (int i = 0; i < length; i++) { if (data[i] == 0 || data[i] == 1) { @@ -870,7 +870,20 @@ private static void serializeBytes( writeByte(buffer, (byte) 0, invert); } - private static void serializeInt(ByteStream.Output buffer, int v, boolean invert) { + public static void serializeBytes( + ByteStream.Output buffer, byte[] data, int offset, int length, boolean invert) { + for (int i = offset; i < offset + length; i++) { + if (data[i] == 0 || data[i] == 1) { + writeByte(buffer, (byte) 1, invert); + writeByte(buffer, (byte) (data[i] + 1), invert); + } else { + writeByte(buffer, data[i], invert); + } + } + writeByte(buffer, (byte) 0, invert); + } + + public static void serializeInt(ByteStream.Output buffer, int v, boolean invert) { writeByte(buffer, (byte) ((v >> 24) ^ 0x80), invert); writeByte(buffer, (byte) (v >> 16), invert); writeByte(buffer, (byte) (v >> 8), invert); diff --git serde/src/java/org/apache/hadoop/hive/serde2/binarysortable/deserializeread/BinarySortableDeserializeRead.java serde/src/java/org/apache/hadoop/hive/serde2/binarysortable/deserializeread/BinarySortableDeserializeRead.java new file mode 100644 index 0000000..51669e1 --- /dev/null +++ serde/src/java/org/apache/hadoop/hive/serde2/binarysortable/deserializeread/BinarySortableDeserializeRead.java @@ -0,0 +1,495 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.serde2.binarysortable.deserializeread; + +import java.io.IOException; +import java.math.BigInteger; +import java.util.Arrays; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.hive.common.type.HiveDecimal; +import org.apache.hadoop.hive.serde2.binarysortable.BinarySortableSerDe; +import org.apache.hadoop.hive.serde2.binarysortable.InputByteBuffer; +import org.apache.hadoop.hive.serde2.deserializeread.DeserializeRead; +import org.apache.hadoop.hive.serde2.io.DateWritable; +import org.apache.hadoop.hive.serde2.io.HiveCharWritable; +import org.apache.hadoop.hive.serde2.io.HiveVarcharWritable; +import org.apache.hadoop.hive.serde2.io.TimestampWritable; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableHiveDecimalObjectInspector; +import org.apache.hadoop.hive.serde2.typeinfo.CharTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.DecimalTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.VarcharTypeInfo; +import org.apache.hadoop.io.Text; + +/* + * Directly deserialize with the caller reading field-by-field the LazyBinary serialization format. + * + * The caller is responsible for calling the read method for the right type of each field + * (after calling readCheckNull). + * + * Reading some fields require a results object to receive value information. A separate + * results object is created by the caller at initialization per different field even for the same + * type. + * + * Some type values are by reference to either bytes in the deserialization buffer or to + * other type specific buffers. So, those references are only valid until the next time set is + * called. + */ +public class BinarySortableDeserializeRead implements DeserializeRead { + public static final Log LOG = LogFactory.getLog(BinarySortableDeserializeRead.class.getName()); + + // The sort order (ascending/descending) for each field. Set to true when descending (invert). + private boolean[] columnSortOrderIsDesc; + + // Which field we are on. We start with -1 so readCheckNull can increment once and the read + // field data methods don't increment. + private int index; + + private int fieldCount; + + private InputByteBuffer inputByteBuffer = new InputByteBuffer(); + + public BinarySortableDeserializeRead(boolean[] columnSortOrderIsDesc) { + this(); + this.fieldCount = columnSortOrderIsDesc.length; + this.columnSortOrderIsDesc = columnSortOrderIsDesc; + } + + /* + * Use this constructor when only ascending sort order is used. + */ + public BinarySortableDeserializeRead(int fieldCount) { + this(); + this.fieldCount = fieldCount; + columnSortOrderIsDesc = new boolean[fieldCount]; + Arrays.fill(columnSortOrderIsDesc, false); + } + + // Not public since we must have the field count or column sort order information. + private BinarySortableDeserializeRead() { + index = -1; + inputByteBuffer = new InputByteBuffer(); + } + + /* + * Set the range of bytes to be deserialized. 
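+ *
+ * A typical caller sequence is (sketch; the field order and read methods must
+ * match the types that were serialized):
+ *
+ *   deserializeRead.set(bytes, offset, length);
+ *   if (!deserializeRead.readCheckNull()) {
+ *     long longField = deserializeRead.readLong();
+ *   }
+ *   if (!deserializeRead.readCheckNull()) {
+ *     deserializeRead.readBytes(readBytesResults);  // results object from createReadBytesResults()
+ *   }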
+ */ + @Override + public void set(byte[] bytes, int offset, int length) { + index = -1; + inputByteBuffer.reset(bytes, offset, length); + } + + /* + * Reads the NULL information for a field. + * + * @return Returns true when the field is NULL; reading is positioned to the next field. + * Otherwise, false when the field is NOT NULL; reading is positioned to the field data. + */ + @Override + public boolean readCheckNull() throws IOException { + // We start with index as -1 so we can increment once here and then the read + // field data methods don't increment. + byte isNull = inputByteBuffer.read(columnSortOrderIsDesc[++index]); + return (isNull == 0); + } + + /* + * BOOLEAN. + */ + @Override + public boolean readBoolean() throws IOException { + byte b = inputByteBuffer.read(columnSortOrderIsDesc[index]); + return (b == 2); + } + + /* + * BYTE. + */ + @Override + public byte readByte() throws IOException { + return (byte) (inputByteBuffer.read(columnSortOrderIsDesc[index]) ^ 0x80); + } + + /* + * SHORT. + */ + @Override + public short readShort() throws IOException { + final boolean invert = columnSortOrderIsDesc[index]; + int v = inputByteBuffer.read(invert) ^ 0x80; + v = (v << 8) + (inputByteBuffer.read(invert) & 0xff); + return (short) v; + } + + /* + * INT. + */ + @Override + public int readInt() throws IOException { + final boolean invert = columnSortOrderIsDesc[index]; + int v = inputByteBuffer.read(invert) ^ 0x80; + for (int i = 0; i < 3; i++) { + v = (v << 8) + (inputByteBuffer.read(invert) & 0xff); + } + return v; + } + + /* + * LONG. + */ + @Override + public long readLong() throws IOException { + final boolean invert = columnSortOrderIsDesc[index]; + long v = inputByteBuffer.read(invert) ^ 0x80; + for (int i = 0; i < 7; i++) { + v = (v << 8) + (inputByteBuffer.read(invert) & 0xff); + } + return v; + } + + /* + * FLOAT. + */ + @Override + public float readFloat() throws IOException { + final boolean invert = columnSortOrderIsDesc[index]; + int v = 0; + for (int i = 0; i < 4; i++) { + v = (v << 8) + (inputByteBuffer.read(invert) & 0xff); + } + if ((v & (1 << 31)) == 0) { + // negative number, flip all bits + v = ~v; + } else { + // positive number, flip the first bit + v = v ^ (1 << 31); + } + return Float.intBitsToFloat(v); + } + + /* + * DOUBLE. + */ + @Override + public double readDouble() throws IOException { + final boolean invert = columnSortOrderIsDesc[index]; + long v = 0; + for (int i = 0; i < 8; i++) { + v = (v << 8) + (inputByteBuffer.read(invert) & 0xff); + } + if ((v & (1L << 63)) == 0) { + // negative number, flip all bits + v = ~v; + } else { + // positive number, flip the first bit + v = v ^ (1L << 63); + } + return Double.longBitsToDouble(v); + } + + /* + * STRING and BINARY. + * + * Can be used to read CHAR and VARCHAR when the caller takes responsibility for + * truncation/padding issues. + */ + + // This class is for internal use. + private static class BinarySortableReadBytesResults extends ReadBytesResults { + + // Use an org.apache.hadoop.io.Text object as a buffer to decode the BinarySortable + // format string into. + private Text text; + + public BinarySortableReadBytesResults() { + super(); + text = new Text(); + } + } + + // Reading a bytes field require a results object to receive value information. A separate + // results object is created by the caller at initialization per different bytes field. 
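+ // For example (sketch):
+ //   ReadBytesResults bytesResults = deserializeRead.createReadBytesResults();
+ //   deserializeRead.readBytes(bytesResults);
+ //   // the value bytes are bytesResults.bytes[bytesResults.start .. bytesResults.start + bytesResults.length - 1]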
+ @Override + public ReadBytesResults createReadBytesResults() { + return new BinarySortableReadBytesResults(); + } + + + @Override + public void readBytes(ReadBytesResults readBytesResults) throws IOException { + BinarySortableReadBytesResults binarySortableReadBytesResults = (BinarySortableReadBytesResults) readBytesResults; + + BinarySortableSerDe.deserializeText(inputByteBuffer, columnSortOrderIsDesc[index], binarySortableReadBytesResults.text); + readBytesResults.bytes = binarySortableReadBytesResults.text.getBytes(); + readBytesResults.start = 0; + readBytesResults.length = binarySortableReadBytesResults.text.getLength(); + } + + + /* + * CHAR. + */ + + // This class is for internal use. + private static class BinarySortableReadHiveCharResults extends ReadHiveCharResults { + + public BinarySortableReadHiveCharResults(CharTypeInfo charTypeInfo) { + super(charTypeInfo); + } + + public HiveCharWritable getHiveCharWritable() { + return hiveCharWritable; + } + } + + // Reading a CHAR field require a results object to receive value information. A separate + // results object is created by the caller at initialization per different CHAR field. + @Override + public ReadHiveCharResults createReadHiveCharResults(CharTypeInfo charTypeInfo) { + return new BinarySortableReadHiveCharResults(charTypeInfo); + } + + public void readHiveChar(ReadHiveCharResults readHiveCharResults) throws IOException { + BinarySortableReadHiveCharResults binarySortableReadHiveCharResults = (BinarySortableReadHiveCharResults) readHiveCharResults; + + HiveCharWritable hiveCharWritable = binarySortableReadHiveCharResults.getHiveCharWritable(); + + // Decode the bytes into our Text buffer, then truncate. + BinarySortableSerDe.deserializeText(inputByteBuffer, columnSortOrderIsDesc[index], hiveCharWritable.getTextValue()); + hiveCharWritable.enforceMaxLength(binarySortableReadHiveCharResults.getMaxLength()); + + readHiveCharResults.bytes = hiveCharWritable.getTextValue().getBytes(); + readHiveCharResults.start = 0; + readHiveCharResults.length = hiveCharWritable.getTextValue().getLength(); + } + + /* + * VARCHAR. + */ + + // This class is for internal use. + private static class BinarySortableReadHiveVarcharResults extends ReadHiveVarcharResults { + + public BinarySortableReadHiveVarcharResults(VarcharTypeInfo varcharTypeInfo) { + super(varcharTypeInfo); + } + + public HiveVarcharWritable getHiveVarcharWritable() { + return hiveVarcharWritable; + } + } + + // Reading a VARCHAR field require a results object to receive value information. A separate + // results object is created by the caller at initialization per different VARCHAR field. + @Override + public ReadHiveVarcharResults createReadHiveVarcharResults(VarcharTypeInfo varcharTypeInfo) { + return new BinarySortableReadHiveVarcharResults(varcharTypeInfo); + } + + public void readHiveVarchar(ReadHiveVarcharResults readHiveVarcharResults) throws IOException { + BinarySortableReadHiveVarcharResults binarySortableReadHiveVarcharResults = (BinarySortableReadHiveVarcharResults) readHiveVarcharResults; + + HiveVarcharWritable hiveVarcharWritable = binarySortableReadHiveVarcharResults.getHiveVarcharWritable(); + + // Decode the bytes into our Text buffer, then truncate. 
+ BinarySortableSerDe.deserializeText(inputByteBuffer, columnSortOrderIsDesc[index], hiveVarcharWritable.getTextValue()); + hiveVarcharWritable.enforceMaxLength(binarySortableReadHiveVarcharResults.getMaxLength()); + + readHiveVarcharResults.bytes = hiveVarcharWritable.getTextValue().getBytes(); + readHiveVarcharResults.start = 0; + readHiveVarcharResults.length = hiveVarcharWritable.getTextValue().getLength(); + } + + /* + * DATE. + */ + + // This class is for internal use. + private static class BinarySortableReadDateResults extends ReadDateResults { + + public BinarySortableReadDateResults() { + super(); + } + + public DateWritable getDateWritable() { + return dateWritable; + } + } + + // Reading a DATE field require a results object to receive value information. A separate + // results object is created by the caller at initialization per different DATE field. + @Override + public ReadDateResults createReadDateResults() { + return new BinarySortableReadDateResults(); + } + + @Override + public void readDate(ReadDateResults readDateResults) throws IOException { + BinarySortableReadDateResults binarySortableReadDateResults = (BinarySortableReadDateResults) readDateResults; + final boolean invert = columnSortOrderIsDesc[index]; + int v = inputByteBuffer.read(invert) ^ 0x80; + for (int i = 0; i < 3; i++) { + v = (v << 8) + (inputByteBuffer.read(invert) & 0xff); + } + DateWritable dateWritable = binarySortableReadDateResults.getDateWritable(); + dateWritable.set(v); + } + + /* + * TIMESTAMP. + */ + + // This class is for internal use. + private static class BinarySortableReadTimestampResults extends ReadTimestampResults { + + private byte[] timestampBytes; + + public BinarySortableReadTimestampResults() { + super(); + timestampBytes = new byte[TimestampWritable.BINARY_SORTABLE_LENGTH]; + } + + public TimestampWritable getTimestampWritable() { + return timestampWritable; + } + } + + // Reading a TIMESTAMP field require a results object to receive value information. A separate + // results object is created by the caller at initialization per different TIMESTAMP field. + @Override + public ReadTimestampResults createReadTimestampResults() { + return new BinarySortableReadTimestampResults(); + } + + @Override + public void readTimestamp(ReadTimestampResults readTimestampResults) throws IOException { + BinarySortableReadTimestampResults binarySortableReadTimestampResults = (BinarySortableReadTimestampResults) readTimestampResults; + final boolean invert = columnSortOrderIsDesc[index]; + for (int i = 0; i < binarySortableReadTimestampResults.timestampBytes.length; i++) { + binarySortableReadTimestampResults.timestampBytes[i] = inputByteBuffer.read(invert); + } + TimestampWritable timestampWritable = binarySortableReadTimestampResults.getTimestampWritable(); + timestampWritable.setBinarySortable(binarySortableReadTimestampResults.timestampBytes, 0); + } + + /* + * DECIMAL. + */ + + // This class is for internal use. + private static class BinarySortableReadDecimalResults extends ReadDecimalResults { + + private byte[] decimalBuffer; + + // We use WritableHiveDecimalObjectInspector in this specialization because SortableBinarySerDe + // uses it and we copied the deserialize decimal code. 
+ private WritableHiveDecimalObjectInspector writableHiveDecimalObjectInspector; + private Object object; + + public BinarySortableReadDecimalResults(DecimalTypeInfo decimalTypeInfo) { + super(decimalTypeInfo); + decimalBuffer = null; + writableHiveDecimalObjectInspector = new WritableHiveDecimalObjectInspector(decimalTypeInfo); + object = writableHiveDecimalObjectInspector.create(HiveDecimal.ZERO); + } + + public void set(HiveDecimal hiveDecimal) { + writableHiveDecimalObjectInspector.set(object, hiveDecimal); + } + + public byte[] getDecimalBuffer() { + return decimalBuffer; + } + + public void setDecimalBuffer(byte[] decimalBuffer) { + this.decimalBuffer = decimalBuffer; + } + + @Override + public HiveDecimal getHiveDecimal() { + return writableHiveDecimalObjectInspector.getPrimitiveJavaObject(object); + } + } + + // Reading a DECIMAL field require a results object to receive value information. A separate + // results object is created by the caller at initialization per different DECIMAL field. + @Override + public ReadDecimalResults createReadDecimalResults(DecimalTypeInfo decimalTypeInfo) { + return new BinarySortableReadDecimalResults(decimalTypeInfo); + } + + @Override + public void readHiveDecimal(ReadDecimalResults readDecimalResults) throws IOException { + BinarySortableReadDecimalResults binarySortableReadDecimalResults = (BinarySortableReadDecimalResults) readDecimalResults; + byte[] decimalBuffer = binarySortableReadDecimalResults.getDecimalBuffer(); + final boolean invert = columnSortOrderIsDesc[index]; + int b = inputByteBuffer.read(invert) - 1; + assert (b == 1 || b == -1 || b == 0); + boolean positive = b != -1; + + int factor = inputByteBuffer.read(invert) ^ 0x80; + for (int i = 0; i < 3; i++) { + factor = (factor << 8) + (inputByteBuffer.read(invert) & 0xff); + } + + if (!positive) { + factor = -factor; + } + + int start = inputByteBuffer.tell(); + int length = 0; + + do { + b = inputByteBuffer.read(positive ? invert : !invert); + assert(b != 1); + + if (b == 0) { + // end of digits + break; + } + + length++; + } while (true); + + if(decimalBuffer == null || decimalBuffer.length < length) { + decimalBuffer = new byte[length]; + } + + inputByteBuffer.seek(start); + for (int i = 0; i < length; ++i) { + decimalBuffer[i] = inputByteBuffer.read(positive ? invert : !invert); + } + + // read the null byte again + inputByteBuffer.read(positive ? invert : !invert); + + String digits = new String(decimalBuffer, 0, length, BinarySortableSerDe.decimalCharSet); + BigInteger bi = new BigInteger(digits); + HiveDecimal bd = HiveDecimal.create(bi).scaleByPowerOfTen(factor-length); + + if (!positive) { + bd = bd.negate(); + } + + binarySortableReadDecimalResults.set(bd); + binarySortableReadDecimalResults.setDecimalBuffer(decimalBuffer); + } +} \ No newline at end of file diff --git serde/src/java/org/apache/hadoop/hive/serde2/binarysortable/serializewrite/BinarySortableSerializeWrite.java serde/src/java/org/apache/hadoop/hive/serde2/binarysortable/serializewrite/BinarySortableSerializeWrite.java new file mode 100644 index 0000000..65bb966 --- /dev/null +++ serde/src/java/org/apache/hadoop/hive/serde2/binarysortable/serializewrite/BinarySortableSerializeWrite.java @@ -0,0 +1,361 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
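readHiveDecimal() above reverses the three-part decimal encoding (sign byte, sign-flipped factor, digit string closed by a marker byte): the value is digits * 10^(factor - numberOfDigits), negated when the sign byte said negative. A small illustration of that arithmetic with plain java.math types, mirroring the HiveDecimal.create(bi).scaleByPowerOfTen(factor - length) step; it is not part of the patch.

import java.math.BigDecimal;
import java.math.BigInteger;

public class DecimalDecodeSketch {
  static BigDecimal decode(boolean positive, int factor, String digits) {
    BigDecimal v = new BigDecimal(new BigInteger(digits))
        .scaleByPowerOfTen(factor - digits.length());
    return positive ? v : v.negate();
  }

  public static void main(String[] args) {
    System.out.println(decode(true, -2, "123"));   // 0.00123
    System.out.println(decode(false, 3, "450"));   // -450
  }
}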
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.serde2.binarysortable.serializewrite; + +import java.sql.Date; +import java.sql.Timestamp; +import java.util.Arrays; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.hive.common.type.HiveChar; +import org.apache.hadoop.hive.common.type.HiveDecimal; +import org.apache.hadoop.hive.common.type.HiveVarchar; +import org.apache.hadoop.hive.serde2.ByteStream.Output; +import org.apache.hadoop.hive.serde2.binarysortable.BinarySortableSerDe; +import org.apache.hadoop.hive.serde2.binarysortable.InputByteBuffer; +import org.apache.hadoop.hive.serde2.io.DateWritable; +import org.apache.hadoop.hive.serde2.io.TimestampWritable; +import org.apache.hadoop.hive.serde2.serializewrite.SerializeWrite; + +/* + * Directly serialize, field-by-field, the BinarySortable format. + * + * This is an alternative way to serialize than what is provided by BinarySortableSerDe. + */ +public class BinarySortableSerializeWrite implements SerializeWrite { + public static final Log LOG = LogFactory.getLog(BinarySortableSerializeWrite.class.getName()); + + private Output output; + + // The sort order (ascending/descending) for each field. Set to true when descending (invert). + private boolean[] columnSortOrderIsDesc; + + // Which field we are on. We start with -1 to be consistent in style with + // BinarySortableDeserializeRead. + private int index; + + private int fieldCount; + + public BinarySortableSerializeWrite(boolean[] columnSortOrderIsDesc) { + super(); + fieldCount = columnSortOrderIsDesc.length; + this.columnSortOrderIsDesc = columnSortOrderIsDesc; + } + + /* + * Use this constructor when only ascending sort order is used. + */ + public BinarySortableSerializeWrite(int fieldCount) { + super(); + this.fieldCount = fieldCount; + columnSortOrderIsDesc = new boolean[fieldCount]; + Arrays.fill(columnSortOrderIsDesc, false); + } + + // Not public since we must have the field count or column sort order information. + private BinarySortableSerializeWrite() { + index = -1; + } + + /* + * Set the buffer that will receive the serialized data. + */ + @Override + public void set(Output output) { + this.output = output; + this.output.reset(); + index = -1; + } + + /* + * Reset the previously supplied buffer that will receive the serialized data. + */ + @Override + public void reset() { + output.reset(); + index = -1; + } + + /* + * Write a NULL field. + */ + @Override + public void writeNull() { + BinarySortableSerDe.writeByte(output, (byte) 0, columnSortOrderIsDesc[++index]); + } + + /* + * BOOLEAN. + */ + @Override + public void writeBoolean(boolean v) { + final boolean invert = columnSortOrderIsDesc[++index]; + + // This field is not a null. + BinarySortableSerDe.writeByte(output, (byte) 1, invert); + + BinarySortableSerDe.writeByte(output, (byte) (v ? 2 : 1), invert); + } + + /* + * BYTE. 
+ */ + @Override + public void writeByte(byte v) { + final boolean invert = columnSortOrderIsDesc[++index]; + + // This field is not a null. + BinarySortableSerDe.writeByte(output, (byte) 1, invert); + + BinarySortableSerDe.writeByte(output, (byte) (v ^ 0x80), invert); + } + + /* + * SHORT. + */ + @Override + public void writeShort(short v) { + final boolean invert = columnSortOrderIsDesc[++index]; + + // This field is not a null. + BinarySortableSerDe.writeByte(output, (byte) 1, invert); + + BinarySortableSerDe.writeByte(output, (byte) ((v >> 8) ^ 0x80), invert); + BinarySortableSerDe.writeByte(output, (byte) v, invert); + } + + /* + * INT. + */ + @Override + public void writeInt(int v) { + final boolean invert = columnSortOrderIsDesc[++index]; + + // This field is not a null. + BinarySortableSerDe.writeByte(output, (byte) 1, invert); + + BinarySortableSerDe.serializeInt(output, v, invert); + } + + /* + * LONG. + */ + @Override + public void writeLong(long v) { + final boolean invert = columnSortOrderIsDesc[++index]; + + // This field is not a null. + BinarySortableSerDe.writeByte(output, (byte) 1, invert); + + BinarySortableSerDe.writeByte(output, (byte) ((v >> 56) ^ 0x80), invert); + BinarySortableSerDe.writeByte(output, (byte) (v >> 48), invert); + BinarySortableSerDe.writeByte(output, (byte) (v >> 40), invert); + BinarySortableSerDe.writeByte(output, (byte) (v >> 32), invert); + BinarySortableSerDe.writeByte(output, (byte) (v >> 24), invert); + BinarySortableSerDe.writeByte(output, (byte) (v >> 16), invert); + BinarySortableSerDe.writeByte(output, (byte) (v >> 8), invert); + BinarySortableSerDe.writeByte(output, (byte) v, invert); + + } + + /* + * FLOAT. + */ + @Override + public void writeFloat(float vf) { + final boolean invert = columnSortOrderIsDesc[++index]; + + // This field is not a null. + BinarySortableSerDe.writeByte(output, (byte) 1, invert); + + int v = Float.floatToIntBits(vf); + if ((v & (1 << 31)) != 0) { + // negative number, flip all bits + v = ~v; + } else { + // positive number, flip the first bit + v = v ^ (1 << 31); + } + BinarySortableSerDe.writeByte(output, (byte) (v >> 24), invert); + BinarySortableSerDe.writeByte(output, (byte) (v >> 16), invert); + BinarySortableSerDe.writeByte(output, (byte) (v >> 8), invert); + BinarySortableSerDe.writeByte(output, (byte) v, invert); + } + + /* + * DOUBLE. + */ + @Override + public void writeDouble(double vd) { + final boolean invert = columnSortOrderIsDesc[++index]; + + // This field is not a null. + BinarySortableSerDe.writeByte(output, (byte) 1, invert); + + long v = Double.doubleToLongBits(vd); + if ((v & (1L << 63)) != 0) { + // negative number, flip all bits + v = ~v; + } else { + // positive number, flip the first bit + v = v ^ (1L << 63); + } + BinarySortableSerDe.writeByte(output, (byte) (v >> 56), invert); + BinarySortableSerDe.writeByte(output, (byte) (v >> 48), invert); + BinarySortableSerDe.writeByte(output, (byte) (v >> 40), invert); + BinarySortableSerDe.writeByte(output, (byte) (v >> 32), invert); + BinarySortableSerDe.writeByte(output, (byte) (v >> 24), invert); + BinarySortableSerDe.writeByte(output, (byte) (v >> 16), invert); + BinarySortableSerDe.writeByte(output, (byte) (v >> 8), invert); + BinarySortableSerDe.writeByte(output, (byte) v, invert); + } + + /* + * STRING and BINARY. + * + * Can be used to write CHAR and VARCHAR when the caller takes responsibility for + * truncation/padding issues. 
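The bit manipulation in writeFloat() and writeDouble() above is the usual trick for making IEEE-754 values byte-comparable: negative values have every bit flipped, positive values only the sign bit. A standalone ordering check, for illustration only and not part of the patch.

public class SortableFloatSketch {
  // Same transform as writeFloat(), returned as an int instead of being written out byte by byte.
  static int sortableBits(float f) {
    int v = Float.floatToIntBits(f);
    return (v & (1 << 31)) != 0 ? ~v : v ^ (1 << 31);
  }

  public static void main(String[] args) {
    float[] samples = { -3.5f, -0.0f, 0.0f, 1.25f, 7.0f };   // already in ascending float order
    long previous = -1L;                                      // smaller than any unsigned 32-bit value
    for (float f : samples) {
      long unsigned = sortableBits(f) & 0xFFFFFFFFL;          // compare as unsigned, like raw bytes
      System.out.printf("%-6s -> %08x %s%n", f, unsigned,
          unsigned >= previous ? "(ordered)" : "(ORDER BROKEN)");
      previous = unsigned;
    }
  }
}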
+ */ + @Override + public void writeBytes(byte[] v) { + final boolean invert = columnSortOrderIsDesc[++index]; + + // This field is not a null. + BinarySortableSerDe.writeByte(output, (byte) 1, invert); + + BinarySortableSerDe.serializeBytes(output, v, 0, v.length, invert); + } + + @Override + public void writeBytes(byte[] v, int start, int length) { + final boolean invert = columnSortOrderIsDesc[++index]; + + // This field is not a null. + BinarySortableSerDe.writeByte(output, (byte) 1, invert); + + BinarySortableSerDe.serializeBytes(output, v, start, length, invert); + } + + /* + * CHAR. + */ + @Override + public void writeHiveChar(HiveChar hiveChar) { + String string = hiveChar.getStrippedValue(); + byte[] bytes = string.getBytes(); + writeBytes(bytes); + } + + /* + * VARCHAR. + */ + @Override + public void writeHiveVarchar(HiveVarchar hiveVarchar) { + String string = hiveVarchar.getValue(); + byte[] bytes = string.getBytes(); + writeBytes(bytes); + } + + /* + * DATE. + */ + @Override + public void writeDate(Date date) { + final boolean invert = columnSortOrderIsDesc[++index]; + + // This field is not a null. + BinarySortableSerDe.writeByte(output, (byte) 1, invert); + + BinarySortableSerDe.serializeInt(output, DateWritable.dateToDays(date), invert); + } + + // We provide a faster way to write a date without a Date object. + @Override + public void writeDate(int dateAsDays) { + final boolean invert = columnSortOrderIsDesc[++index]; + + // This field is not a null. + BinarySortableSerDe.writeByte(output, (byte) 1, invert); + + BinarySortableSerDe.serializeInt(output, dateAsDays, invert); + } + + /* + * TIMESTAMP. + */ + @Override + public void writeTimestamp(Timestamp vt) { + final boolean invert = columnSortOrderIsDesc[++index]; + + // This field is not a null. + BinarySortableSerDe.writeByte(output, (byte) 1, invert); + + byte[] data = TimestampWritable.timeStampToBinarySortable(vt); + for (int i = 0; i < data.length; i++) { + BinarySortableSerDe.writeByte(output, data[i], invert); + } + } + + /* + * DECIMAL. + */ + @Override + public void writeHiveDecimal(HiveDecimal dec) { + final boolean invert = columnSortOrderIsDesc[++index]; + + // This field is not a null. + BinarySortableSerDe.writeByte(output, (byte) 1, invert); + + // decimals are encoded in three pieces: + // sign: 1, 2 or 3 for smaller, equal or larger than 0 respectively + // factor: Number that indicates the amount of digits you have to move + // the decimal point left or right until the resulting number is smaller + // than zero but has something other than 0 as the first digit. + // digits: which is a string of all the digits in the decimal. If the number + // is negative the binary string will be inverted to get the correct ordering. + // Example: 0.00123 + // Sign is 3 (bigger than 0) + // Factor is -2 (move decimal point 2 positions right) + // Digits are: 123 + + // get the sign of the big decimal + int sign = dec.compareTo(HiveDecimal.ZERO); + + // we'll encode the absolute value (sign is separate) + dec = dec.abs(); + + // get the scale factor to turn big decimal into a decimal < 1 + int factor = dec.precision() - dec.scale(); + factor = sign == 1 ? 
factor : -factor; + + // convert the absolute big decimal to string + dec.scaleByPowerOfTen(Math.abs(dec.scale())); + String digits = dec.unscaledValue().toString(); + + // finally write out the pieces (sign, scale, digits) + BinarySortableSerDe.writeByte(output, (byte) ( sign + 1), invert); + BinarySortableSerDe.writeByte(output, (byte) ((factor >> 24) ^ 0x80), invert); + BinarySortableSerDe.writeByte(output, (byte) ( factor >> 16), invert); + BinarySortableSerDe.writeByte(output, (byte) ( factor >> 8), invert); + BinarySortableSerDe.writeByte(output, (byte) factor, invert); + BinarySortableSerDe.serializeBytes(output, digits.getBytes(BinarySortableSerDe.decimalCharSet), + digits.length(), sign == -1 ? !invert : invert); + } +} \ No newline at end of file diff --git serde/src/java/org/apache/hadoop/hive/serde2/deserializeread/DeserializeRead.java serde/src/java/org/apache/hadoop/hive/serde2/deserializeread/DeserializeRead.java new file mode 100644 index 0000000..41e1df3 --- /dev/null +++ serde/src/java/org/apache/hadoop/hive/serde2/deserializeread/DeserializeRead.java @@ -0,0 +1,278 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.serde2.deserializeread; + +import java.io.IOException; +import java.sql.Date; +import java.sql.Timestamp; + +import org.apache.hadoop.hive.common.type.HiveChar; +import org.apache.hadoop.hive.common.type.HiveDecimal; +import org.apache.hadoop.hive.common.type.HiveVarchar; +import org.apache.hadoop.hive.serde2.io.DateWritable; +import org.apache.hadoop.hive.serde2.io.HiveCharWritable; +import org.apache.hadoop.hive.serde2.io.HiveVarcharWritable; +import org.apache.hadoop.hive.serde2.io.TimestampWritable; +import org.apache.hadoop.hive.serde2.typeinfo.CharTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.DecimalTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.VarcharTypeInfo; + +/* + * Directly deserialize with the caller reading field-by-field a serialization format. + * + * The caller is responsible for calling the read method for the right type of each field + * (after calling readCheckNull). + * + * Reading some fields require a results object to receive value information. A separate + * results object is created by the caller at initialization per different field even for the same + * type. + * + * Some type values are by reference to either bytes in the deserialization buffer or to + * other type specific buffers. So, those references are only valid until the next time set is + * called. + */ +public interface DeserializeRead { + + /* + * Set the range of bytes to be deserialized. + */ + void set(byte[] bytes, int offset, int length); + + /* + * Reads the NULL information for a field. 
+ * + * @return Return true when the field is NULL; reading is positioned to the next field. + * Otherwise, false when the field is NOT NULL; reading is positioned to the field data. + */ + boolean readCheckNull() throws IOException; + + /* + * BOOLEAN. + */ + boolean readBoolean() throws IOException; + + /* + * BYTE. + */ + byte readByte() throws IOException; + + /* + * SHORT. + */ + short readShort() throws IOException; + + /* + * INT. + */ + int readInt() throws IOException; + + /* + * LONG. + */ + long readLong() throws IOException; + + /* + * FLOAT. + */ + float readFloat() throws IOException; + + /* + * DOUBLE. + */ + double readDouble() throws IOException; + + /* + * STRING and BINARY. + * + * Can be used to read CHAR and VARCHAR when the caller takes responsibility for + * truncation/padding issues. + */ + + // This class is for abstract since each format may need its own specialization. + public abstract class ReadBytesResults { + + public byte[] bytes; + public int start; + public int length; + + public ReadBytesResults() { + bytes = null; + start = 0; + length = 0; + } + } + + // Reading a bytes field require a results object to receive value information. A separate + // results object is created at initialization per different bytes field. + ReadBytesResults createReadBytesResults(); + + void readBytes(ReadBytesResults readBytesResults) throws IOException; + + /* + * CHAR. + */ + + // This class is for abstract since each format may need its own specialization. + public abstract class ReadHiveCharResults extends ReadBytesResults { + + private CharTypeInfo charTypeInfo; + private int maxLength; + + protected HiveCharWritable hiveCharWritable; + + public ReadHiveCharResults(CharTypeInfo charTypeInfo) { + this.charTypeInfo = charTypeInfo; + this.maxLength = charTypeInfo.getLength(); + hiveCharWritable = new HiveCharWritable(); + } + + // Not public since we must supply CharTypeInfo. + private ReadHiveCharResults() { + } + + public int getMaxLength() { + return maxLength; + } + + public HiveChar getHiveChar() { + return hiveCharWritable.getHiveChar(); + } + } + + // Reading a CHAR field require a results object to receive value information. A separate + // results object is created at initialization per different CHAR field. + ReadHiveCharResults createReadHiveCharResults(CharTypeInfo charTypeInfo); + + void readHiveChar(ReadHiveCharResults readHiveCharResults) throws IOException; + + /* + * VARCHAR. + */ + + // This class is for abstract since each format may need its own specialization. + public abstract class ReadHiveVarcharResults extends ReadBytesResults { + + private VarcharTypeInfo varcharTypeInfo; + private int maxLength; + + protected HiveVarcharWritable hiveVarcharWritable; + + public ReadHiveVarcharResults(VarcharTypeInfo varcharTypeInfo) { + this.varcharTypeInfo = varcharTypeInfo; + this.maxLength = varcharTypeInfo.getLength(); + hiveVarcharWritable = new HiveVarcharWritable(); + } + + // Not public since we must supply VarcharTypeInfo. + private ReadHiveVarcharResults() { + } + + public int getMaxLength() { + return maxLength; + } + + public HiveVarchar getHiveVarchar() { + return hiveVarcharWritable.getHiveVarchar(); + } + } + + // Reading a VARCHAR field require a results object to receive value information. A separate + // results object is created at initialization per different VARCHAR field. 
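The contract of this interface is one readCheckNull() per field, followed by the matching typed read when the field is not NULL; results objects for the by-reference types are created once and reused across rows. A hypothetical caller for an (INT, STRING) pair written against the interface alone; names outside the patch are illustrative.

import java.io.IOException;
import org.apache.hadoop.hive.serde2.deserializeread.DeserializeRead;
import org.apache.hadoop.hive.serde2.deserializeread.DeserializeRead.ReadBytesResults;

public class TwoFieldReadSketch {
  // Assumes the row bytes were produced by the matching SerializeWrite with fields (INT, STRING),
  // and that bytesResults came from reader.createReadBytesResults() at initialization.
  static void dump(DeserializeRead reader, ReadBytesResults bytesResults,
      byte[] row, int offset, int length) throws IOException {
    reader.set(row, offset, length);

    Integer intField = reader.readCheckNull() ? null : reader.readInt();

    String stringField = null;
    if (!reader.readCheckNull()) {
      reader.readBytes(bytesResults);
      // The result references the deserialization buffer, so copy it out before the next set().
      stringField = new String(bytesResults.bytes, bytesResults.start, bytesResults.length);
    }
    System.out.println(intField + ", " + stringField);
  }
}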
+ ReadHiveVarcharResults createReadHiveVarcharResults(VarcharTypeInfo varcharTypeInfo); + + void readHiveVarchar(ReadHiveVarcharResults readHiveVarcharResults) throws IOException; + + /* + * DATE. + */ + + // This class is for abstract since each format may need its own specialization. + public abstract class ReadDateResults { + + protected DateWritable dateWritable; + + public ReadDateResults() { + dateWritable = new DateWritable(); + } + + public Date getDate() { + return dateWritable.get(); + } + + public int getDays() { + return dateWritable.getDays(); + } + } + + // Reading a DATE field require a results object to receive value information. A separate + // results object is created at initialization per different DATE field. + ReadDateResults createReadDateResults(); + + void readDate(ReadDateResults readDateResults) throws IOException; + + /* + * TIMESTAMP. + */ + + // This class is for abstract since each format may need its own specialization. + public abstract class ReadTimestampResults { + + protected TimestampWritable timestampWritable; + + public ReadTimestampResults() { + timestampWritable = new TimestampWritable(); + } + + public Timestamp getTimestamp() { + return timestampWritable.getTimestamp(); + } + } + + // Reading a TIMESTAMP field require a results object to receive value information. A separate + // results object is created at initialization per different TIMESTAMP field. + ReadTimestampResults createReadTimestampResults(); + + void readTimestamp(ReadTimestampResults readTimestampResult) throws IOException; + + /* + * DECIMAL. + */ + + // This class is for abstract since each format may need its own specialization. + public abstract class ReadDecimalResults { + + protected DecimalTypeInfo decimalTypeInfo; + + public ReadDecimalResults(DecimalTypeInfo decimalTypeInfo) { + this.decimalTypeInfo = decimalTypeInfo; + } + + // Not public since we must supply DecimalTypeInfo. + private ReadDecimalResults() { + } + + public abstract HiveDecimal getHiveDecimal(); + } + + // Reading a DECIMAL field require a results object to receive value information. A separate + // results object is created at initialization per different DECIMAL field. + ReadDecimalResults createReadDecimalResults(DecimalTypeInfo decimalTypeInfo); + + void readHiveDecimal(ReadDecimalResults readDecimalResults) throws IOException; +} \ No newline at end of file diff --git serde/src/java/org/apache/hadoop/hive/serde2/io/TimestampWritable.java serde/src/java/org/apache/hadoop/hive/serde2/io/TimestampWritable.java index a738b02..3f3903f 100644 --- serde/src/java/org/apache/hadoop/hive/serde2/io/TimestampWritable.java +++ serde/src/java/org/apache/hadoop/hive/serde2/io/TimestampWritable.java @@ -276,6 +276,21 @@ public void setBinarySortable(byte[] bytes, int binSortOffset) { } /** + * @return byte[] representation of TimestampWritable that is binary + * sortable (7 bytes for seconds, 4 bytes for nanoseconds) + */ + public static byte[] timeStampToBinarySortable(Timestamp t) { + byte[] b = new byte[BINARY_SORTABLE_LENGTH]; + int nanos = t.getNanos(); + // We flip the highest-order bit of the seven-byte representation of seconds to make negative + // values come before positive ones. + long seconds = t.getSeconds() ^ SEVEN_BYTE_LONG_SIGN_FLIP; + sevenByteLongToBytes(seconds, b, 0); + intToBytes(nanos, b, 7); + return b; + } + + /** * The data of TimestampWritable can be stored either in a byte[] * or in a Timestamp object. 
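timeStampToBinarySortable() above packs the 11-byte binary-sortable form (seven sign-flipped bytes of seconds followed by four bytes of nanoseconds) that writeTimestamp() emits and readTimestamp() consumes. A hypothetical round trip through the two new BinarySortable classes; it assumes the reader is constructed from per-column descending-sort flags and that ByteStream.Output exposes getData() alongside getLength().

import java.io.IOException;
import java.sql.Timestamp;
import org.apache.hadoop.hive.serde2.ByteStream.Output;
import org.apache.hadoop.hive.serde2.binarysortable.deserializeread.BinarySortableDeserializeRead;
import org.apache.hadoop.hive.serde2.binarysortable.serializewrite.BinarySortableSerializeWrite;
import org.apache.hadoop.hive.serde2.deserializeread.DeserializeRead.ReadTimestampResults;

public class TimestampRoundTripSketch {
  public static void main(String[] args) throws IOException {
    Output output = new Output();
    BinarySortableSerializeWrite writer = new BinarySortableSerializeWrite(1);  // one ascending field
    writer.set(output);
    writer.writeTimestamp(Timestamp.valueOf("2015-01-15 10:30:00.123456789"));

    BinarySortableDeserializeRead reader =
        new BinarySortableDeserializeRead(new boolean[] { false });             // assumed ctor
    ReadTimestampResults results = reader.createReadTimestampResults();
    reader.set(output.getData(), 0, output.getLength());                        // getData() assumed
    if (!reader.readCheckNull()) {
      reader.readTimestamp(results);
      System.out.println(results.getTimestamp());
    }
  }
}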
Calling this method ensures that the byte[] * is populated from the Timestamp object if previously empty. diff --git serde/src/java/org/apache/hadoop/hive/serde2/lazybinary/deserializeread/LazyBinaryDeserializeRead.java serde/src/java/org/apache/hadoop/hive/serde2/lazybinary/deserializeread/LazyBinaryDeserializeRead.java new file mode 100644 index 0000000..1300e81 --- /dev/null +++ serde/src/java/org/apache/hadoop/hive/serde2/lazybinary/deserializeread/LazyBinaryDeserializeRead.java @@ -0,0 +1,553 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.serde2.lazybinary.deserializeread; + +import java.io.IOException; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.hive.common.type.HiveDecimal; +import org.apache.hadoop.hive.serde2.deserializeread.DeserializeRead; +import org.apache.hadoop.hive.serde2.io.DateWritable; +import org.apache.hadoop.hive.serde2.io.HiveCharWritable; +import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable; +import org.apache.hadoop.hive.serde2.io.HiveVarcharWritable; +import org.apache.hadoop.hive.serde2.io.TimestampWritable; +import org.apache.hadoop.hive.serde2.lazybinary.LazyBinaryUtils; +import org.apache.hadoop.hive.serde2.lazybinary.LazyBinaryUtils.VInt; +import org.apache.hadoop.hive.serde2.lazybinary.LazyBinaryUtils.VLong; +import org.apache.hadoop.hive.serde2.typeinfo.CharTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.DecimalTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.VarcharTypeInfo; + +/* + * Directly deserialize with the caller reading field-by-field the LazyBinary serialization format. + * + * The caller is responsible for calling the read method for the right type of each field + * (after calling readCheckNull). + * + * Reading some fields require a results object to receive value information. A separate + * results object is created by the caller at initialization per different field even for the same + * type. + * + * Some type values are by reference to either bytes in the deserialization buffer or to + * other type specific buffers. So, those references are only valid until the next time set is + * called. + */ +public class LazyBinaryDeserializeRead implements DeserializeRead { + public static final Log LOG = LogFactory.getLog(LazyBinaryDeserializeRead.class.getName()); + + private byte[] bytes; + private int offset; + private int length; + private int fieldCount; + private int index; + private byte nullByte; + + // Object to receive results of reading a decoded variable length int or long. 
+ private VInt tempVInt; + private VLong tempVLong; + + public LazyBinaryDeserializeRead(int fieldCount) { + this(); + this.fieldCount = fieldCount; + } + + // Not public since we must have the field count so every 8 fields NULL bytes can be navigated. + private LazyBinaryDeserializeRead() { + tempVInt = new VInt(); + tempVLong = new VLong(); + + } + + /* + * Set the range of bytes to be deserialized. + */ + @Override + public void set(byte[] bytes, int offset, int length) { + this.bytes = bytes; + this.offset = offset; + this.length = length; + index = 0; + nullByte = bytes[this.offset++]; + } + + /* + * Reads the NULL information for a field. + * + * @return Returns true when the field is NULL; reading is positioned to the next field. + * Otherwise, false when the field is NOT NULL; reading is positioned to the field data. + */ + @Override + public boolean readCheckNull() { + // NOTE: The bit is set to 1 if a field is NOT NULL. + if ((nullByte & (1 << (index % 8))) != 0) { + return false; + } + + // When NULL, we need to move past this field. + index++; + + // Every 8 fields we read a new NULL byte. + if (index < fieldCount) { + if ((index % 8) == 0) { + // Get next null byte. + nullByte = bytes[offset++]; + } + } + + return true; + } + + /* + * BOOLEAN. + */ + @Override + public boolean readBoolean() { + byte result = bytes[offset++]; + + // Move past this NOT NULL field. + index++; + + // Every 8 fields we read a new NULL byte. + if (index < fieldCount) { + if ((index % 8) == 0) { + nullByte = bytes[offset++]; + } + } + + return (result != 0); + } + + /* + * BYTE. + */ + @Override + public byte readByte() { + byte result = bytes[offset++]; + + // Move past this NOT NULL field. + index++; + + // Every 8 fields we read a new NULL byte. + if (index < fieldCount) { + if ((index % 8) == 0) { + nullByte = bytes[offset++]; + } + } + + return result; + } + + /* + * SHORT. + */ + @Override + public short readShort() { + short result = LazyBinaryUtils.byteArrayToShort(bytes, offset); + offset += 2; + + // Move past this NOT NULL field. + index++; + + // Every 8 fields we read a new NULL byte. + if (index < fieldCount) { + if ((index % 8) == 0) { + nullByte = bytes[offset++]; + } + } + + return result; + } + + /* + * INT. + */ + @Override + public int readInt() { + LazyBinaryUtils.readVInt(bytes, offset, tempVInt); + offset += tempVInt.length; + + // Move past this NOT NULL field. + index++; + + // Every 8 fields we read a new NULL byte. + if (index < fieldCount) { + if ((index % 8) == 0) { + nullByte = bytes[offset++]; + } + } + + return tempVInt.value; + } + + /* + * LONG. + */ + @Override + public long readLong() { + LazyBinaryUtils.readVLong(bytes, offset, tempVLong); + offset += tempVLong.length; + + // Move past this NOT NULL field. + index++; + + // Every 8 fields we read a new NULL byte. + if (index < fieldCount) { + if ((index % 8) == 0) { + nullByte = bytes[offset++]; + } + } + + return tempVLong.value; + } + + /* + * FLOAT. + */ + @Override + public float readFloat() { + float result = Float.intBitsToFloat(LazyBinaryUtils.byteArrayToInt(bytes, offset)); + offset += 4; + + // Move past this NOT NULL field. + index++; + + // Every 8 fields we read a new NULL byte. + if (index < fieldCount) { + if ((index % 8) == 0) { + nullByte = bytes[offset++]; + } + } + + return result; + } + + /* + * DOUBLE. + */ + @Override + public double readDouble() { + double result = Double.longBitsToDouble(LazyBinaryUtils.byteArrayToLong(bytes, offset)); + offset += 8; + + // Move past this NOT NULL field. 
+ index++; + + // Every 8 fields we read a new NULL byte. + if (index < fieldCount) { + if ((index % 8) == 0) { + nullByte = bytes[offset++]; + } + } + + return result; + } + + /* + * STRING and BINARY. + * + * Can be used to read CHAR and VARCHAR when the caller takes responsibility for + * truncation/padding issues. + */ + + // This class is for internal use. + private class LazyBinaryReadBytesResults extends ReadBytesResults { + public LazyBinaryReadBytesResults() { + super(); + } + } + + // Reading a bytes field require a results object to receive value information. A separate + // results object is created by the caller at initialization per different bytes field. + @Override + public ReadBytesResults createReadBytesResults() { + return new LazyBinaryReadBytesResults(); + } + + @Override + public void readBytes(ReadBytesResults readBytesResults) { + // using vint instead of 4 bytes + LazyBinaryUtils.readVInt(bytes, offset, tempVInt); + offset += tempVInt.length; + int start = offset; + int length = tempVInt.value; + offset += length; + + // Move past this NOT NULL field. + index++; + + // Every 8 fields we read a new NULL byte. + if (index < fieldCount) { + if ((index % 8) == 0) { + nullByte = bytes[offset++]; + } + } + + readBytesResults.bytes = bytes; + readBytesResults.start = start; + readBytesResults.length = length; + } + + /* + * CHAR. + */ + + // This class is for internal use. + private static class LazyBinaryReadHiveCharResults extends ReadHiveCharResults { + + // Use our bytes reader. + public LazyBinaryReadBytesResults readBytesResults; + + public LazyBinaryReadHiveCharResults(CharTypeInfo charTypeInfo) { + super(charTypeInfo); + } + + public HiveCharWritable getHiveCharWritable() { + return hiveCharWritable; + } + } + + // Reading a CHAR field require a results object to receive value information. A separate + // results object is created by the caller at initialization per different CHAR field. + @Override + public ReadHiveCharResults createReadHiveCharResults(CharTypeInfo charTypeInfo) { + return new LazyBinaryReadHiveCharResults(charTypeInfo); + } + + public void readHiveChar(ReadHiveCharResults readHiveCharResults) throws IOException { + LazyBinaryReadHiveCharResults lazyBinaryReadHiveCharResults = (LazyBinaryReadHiveCharResults) readHiveCharResults; + + if (lazyBinaryReadHiveCharResults.readBytesResults == null) { + lazyBinaryReadHiveCharResults.readBytesResults = new LazyBinaryReadBytesResults(); + } + LazyBinaryReadBytesResults readBytesResults = lazyBinaryReadHiveCharResults.readBytesResults; + + // Read the bytes using our basic method. + readBytes(readBytesResults); + + // Copy the bytes into our Text object, then truncate. + HiveCharWritable hiveCharWritable = lazyBinaryReadHiveCharResults.getHiveCharWritable(); + hiveCharWritable.getTextValue().set(readBytesResults.bytes, readBytesResults.start, readBytesResults.length); + hiveCharWritable.enforceMaxLength(lazyBinaryReadHiveCharResults.getMaxLength()); + + readHiveCharResults.bytes = hiveCharWritable.getTextValue().getBytes(); + readHiveCharResults.start = 0; + readHiveCharResults.length = hiveCharWritable.getTextValue().getLength(); + } + + /* + * VARCHAR. + */ + + // This class is for internal use. + private static class LazyBinaryReadHiveVarcharResults extends ReadHiveVarcharResults { + + // Use our bytes reader. 
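The bookkeeping repeated in every read method above follows directly from the LazyBinary layout: one bitmask byte precedes each group of up to eight fields, and bit i of that byte is 1 when the corresponding field is NOT NULL. A standalone sketch of just the bitmask navigation, for illustration only; a real reader also decodes or skips each field's payload between bitmask bytes, which this sketch assumes away.

import java.util.Arrays;

public class NullByteSketch {
  static boolean[] decodeNullFlags(byte[] row, int fieldCount) {
    boolean[] notNull = new boolean[fieldCount];
    int offset = 0;
    byte nullByte = 0;
    for (int i = 0; i < fieldCount; i++) {
      if (i % 8 == 0) {
        nullByte = row[offset++];                    // a fresh bitmask byte every 8 fields
      }
      notNull[i] = (nullByte & (1 << (i % 8))) != 0; // bit set means NOT NULL
      // A real reader would decode or skip the field's payload here.
    }
    return notNull;
  }

  public static void main(String[] args) {
    // Ten fields: 0x05 marks fields 0 and 2 as not null, 0x03 marks fields 8 and 9.
    System.out.println(Arrays.toString(decodeNullFlags(new byte[] { 0x05, 0x03 }, 10)));
  }
}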
+ public LazyBinaryReadBytesResults readBytesResults; + + public LazyBinaryReadHiveVarcharResults(VarcharTypeInfo varcharTypeInfo) { + super(varcharTypeInfo); + } + + public HiveVarcharWritable getHiveVarcharWritable() { + return hiveVarcharWritable; + } + } + + // Reading a VARCHAR field require a results object to receive value information. A separate + // results object is created by the caller at initialization per different VARCHAR field. + @Override + public ReadHiveVarcharResults createReadHiveVarcharResults(VarcharTypeInfo varcharTypeInfo) { + return new LazyBinaryReadHiveVarcharResults(varcharTypeInfo); + } + + public void readHiveVarchar(ReadHiveVarcharResults readHiveVarcharResults) throws IOException { + LazyBinaryReadHiveVarcharResults lazyBinaryReadHiveVarvarcharResults = (LazyBinaryReadHiveVarcharResults) readHiveVarcharResults; + + if (lazyBinaryReadHiveVarvarcharResults.readBytesResults == null) { + lazyBinaryReadHiveVarvarcharResults.readBytesResults = new LazyBinaryReadBytesResults(); + } + LazyBinaryReadBytesResults readBytesResults = lazyBinaryReadHiveVarvarcharResults.readBytesResults; + + // Read the bytes using our basic method. + readBytes(readBytesResults); + + // Copy the bytes into our Text object, then truncate. + HiveVarcharWritable hiveVarcharWritable = lazyBinaryReadHiveVarvarcharResults.getHiveVarcharWritable(); + hiveVarcharWritable.getTextValue().set(readBytesResults.bytes, readBytesResults.start, readBytesResults.length); + hiveVarcharWritable.enforceMaxLength(lazyBinaryReadHiveVarvarcharResults.getMaxLength()); + + readHiveVarcharResults.bytes = hiveVarcharWritable.getTextValue().getBytes(); + readHiveVarcharResults.start = 0; + readHiveVarcharResults.length = hiveVarcharWritable.getTextValue().getLength(); + } + + /* + * DATE. + */ + + // This class is for internal use. + private static class LazyBinaryReadDateResults extends ReadDateResults { + + public LazyBinaryReadDateResults() { + super(); + } + + public DateWritable getDateWritable() { + return dateWritable; + } + } + + // Reading a DATE field require a results object to receive value information. A separate + // results object is created by the caller at initialization per different DATE field. + @Override + public ReadDateResults createReadDateResults() { + return new LazyBinaryReadDateResults(); + } + + @Override + public void readDate(ReadDateResults readDateResults) { + LazyBinaryReadDateResults lazyBinaryReadDateResults = (LazyBinaryReadDateResults) readDateResults; + LazyBinaryUtils.readVInt(bytes, offset, tempVInt); + offset += tempVInt.length; + + // Move past this NOT NULL field. + index++; + + // Every 8 fields we read a new NULL byte. + if (index < fieldCount) { + if ((index % 8) == 0) { + nullByte = bytes[offset++]; + } + } + + DateWritable dateWritable = lazyBinaryReadDateResults.getDateWritable(); + dateWritable.set(tempVInt.value); + } + + /* + * TIMESTAMP. + */ + + // This class is for internal use. + private static class LazyBinaryReadTimestampResults extends ReadTimestampResults { + + public LazyBinaryReadTimestampResults() { + super(); + } + + public TimestampWritable getTimestampWritable() { + return timestampWritable; + } + } + + // Reading a TIMESTAMP field require a results object to receive value information. A separate + // results object is created by the caller at initialization per different TIMESTAMP field. 
+ @Override + public ReadTimestampResults createReadTimestampResults() { + return new LazyBinaryReadTimestampResults(); + } + + @Override + public void readTimestamp(ReadTimestampResults readTimestampResults) { + LazyBinaryReadTimestampResults lazyBinaryReadTimestampResults = (LazyBinaryReadTimestampResults) readTimestampResults; + int length = TimestampWritable.getTotalLength(bytes, offset); + int start = offset; + offset += length; + + // Move past this NOT NULL field. + index++; + + // Every 8 fields we read a new NULL byte. + if (index < fieldCount) { + if ((index % 8) == 0) { + nullByte = bytes[offset++]; + } + } + + TimestampWritable timestampWritable = lazyBinaryReadTimestampResults.getTimestampWritable(); + timestampWritable.set(bytes, start); + } + + /* + * DECIMAL. + */ + + // This class is for internal use. + private static class LazyBinaryReadDecimalResults extends ReadDecimalResults { + + private int precision; + private int scale; + + // We use HiveDecimalWritable in this specialization because LazyBinaryHiveDecimal uses it. + // The main thing that class does is copy the bytes into an exactly allocated byte[] array + // so it can be passed the BigDecimal with the primitive length. + private HiveDecimalWritable hiveDecimalWritable; + + public LazyBinaryReadDecimalResults(DecimalTypeInfo decimalTypeInfo) { + super(decimalTypeInfo); + precision = decimalTypeInfo.getPrecision(); + scale = decimalTypeInfo.getScale(); + hiveDecimalWritable = new HiveDecimalWritable(); + } + + public HiveDecimalWritable getHiveDecimalWritable() { + return hiveDecimalWritable; + } + + @Override + public HiveDecimal getHiveDecimal() { + return hiveDecimalWritable.getHiveDecimal(precision, scale); + } + } + + // Reading a DECIMAL field require a results object to receive value information. A separate + // results object is created by the caller at initialization per different DECIMAL field. + @Override + public ReadDecimalResults createReadDecimalResults(DecimalTypeInfo decimalTypeInfo) { + return new LazyBinaryReadDecimalResults(decimalTypeInfo); + } + + @Override + public void readHiveDecimal(ReadDecimalResults readDecimalResults) { + LazyBinaryReadDecimalResults lazyBinaryReadDecimalResults = (LazyBinaryReadDecimalResults) readDecimalResults; + + // These calls are to see how much data there is. The setFromBytes call below will do the same + // readVInt reads but actually unpack the decimal. + LazyBinaryUtils.readVInt(bytes, offset, tempVInt); + int start = offset; + offset += tempVInt.length; + LazyBinaryUtils.readVInt(bytes, offset, tempVInt); + offset += tempVInt.length + tempVInt.value; + int length = offset - start; + + // Move past this NOT NULL field. + index++; + + // Every 8 fields we read a new NULL byte. + if (index < fieldCount) { + if ((index % 8) == 0) { + nullByte = bytes[offset++]; + } + } + + HiveDecimalWritable hiveDecimalWritable = lazyBinaryReadDecimalResults.getHiveDecimalWritable(); + hiveDecimalWritable.setFromBytes(bytes, start, length); + } +} \ No newline at end of file diff --git serde/src/java/org/apache/hadoop/hive/serde2/lazybinary/serializewrite/LazyBinarySerializeWrite.java serde/src/java/org/apache/hadoop/hive/serde2/lazybinary/serializewrite/LazyBinarySerializeWrite.java new file mode 100644 index 0000000..5bbe4e3 --- /dev/null +++ serde/src/java/org/apache/hadoop/hive/serde2/lazybinary/serializewrite/LazyBinarySerializeWrite.java @@ -0,0 +1,585 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.serde2.lazybinary.serializewrite; + +import java.sql.Date; +import java.sql.Timestamp; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.hive.common.type.HiveChar; +import org.apache.hadoop.hive.common.type.HiveDecimal; +import org.apache.hadoop.hive.common.type.HiveVarchar; +import org.apache.hadoop.hive.serde2.ByteStream.Output; +import org.apache.hadoop.hive.serde2.io.DateWritable; +import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable; +import org.apache.hadoop.hive.serde2.io.TimestampWritable; +import org.apache.hadoop.hive.serde2.lazybinary.LazyBinaryUtils; +import org.apache.hadoop.hive.serde2.serializewrite.SerializeWrite; +import org.apache.hadoop.io.Text; + +/* + * Directly serialize, field-by-field, the LazyBinary format. +* + * This is an alternative way to serialize than what is provided by LazyBinarySerDe. + */ +public class LazyBinarySerializeWrite implements SerializeWrite { + public static final Log LOG = LogFactory.getLog(LazyBinarySerializeWrite.class.getName()); + + private Output output; + + private int fieldCount; + private boolean skipLengthPrefix; + private int index; + private byte nullByte; + private long nullOffset; + + // For thread safety, we allocate private writable objects for our use only. + private HiveDecimalWritable hiveDecimalWritable; + private TimestampWritable timestampWritable; + + public LazyBinarySerializeWrite(int fieldCount, boolean skipLengthPrefix) { + this(); + this.fieldCount = fieldCount; + this.skipLengthPrefix = skipLengthPrefix; + } + + // Not public since we must have the field count and other information. + private LazyBinarySerializeWrite() { + hiveDecimalWritable = new HiveDecimalWritable(); + timestampWritable = new TimestampWritable(); + } + + /* + * Set the buffer that will receive the serialized data. + */ + @Override + public void set(Output output) { + this.output = output; + output.reset(); + index = 0; + nullByte = 0; + nullOffset = 0; + } + + /* + * Reset the previously supplied buffer that will receive the serialized data. + */ + @Override + public void reset() { + output.reset(); + index = 0; + nullByte = 0; + nullOffset = 0; + } + + /* + * General Pattern: + * + * // Every 8 fields we write a NULL byte. + * IF ((index % 8) == 0), then + * IF (index > 0), then + * Write back previous NullByte + * NullByte = 0 + * Remember write position + * Allocate room for next NULL byte. + * + * WHEN NOT NULL: Set bit in NULL byte; Write value. + * OTHERWISE NULL: We do not set a bit in the nullByte when we are writing a null. + * + * Increment index + * + * IF (index == fieldCount), then + * Write back final NullByte + * + */ + + /* + * Write a NULL field. 
+ */ + @Override + public void writeNull() { + + // Every 8 fields we write a NULL byte. + if ((index % 8) == 0) { + if (index > 0) { + // Write back previous 8 field's NULL byte. + output.writeByte(nullOffset, nullByte); + nullByte = 0; + nullOffset = output.getLength(); + } + // Allocate next NULL byte. + output.reserve(1); + } + + // We DO NOT set a bit in the NULL byte when we are writing a NULL. + + index++; + + if (index == fieldCount) { + // Write back the final NULL byte before the last fields. + output.writeByte(nullOffset, nullByte); + } + } + + /* + * BOOLEAN. + */ + @Override + public void writeBoolean(boolean v) { + + // Every 8 fields we write a NULL byte. + if ((index % 8) == 0) { + if (index > 0) { + // Write back previous 8 field's NULL byte. + output.writeByte(nullOffset, nullByte); + nullByte = 0; + nullOffset = output.getLength(); + } + // Allocate next NULL byte. + output.reserve(1); + } + + // Set bit in NULL byte when a field is NOT NULL. + nullByte |= 1 << (index % 8); + + output.write((byte) (v ? 1 : 0)); + + index++; + + if (index == fieldCount) { + // Write back the final NULL byte before the last fields. + output.writeByte(nullOffset, nullByte); + } + } + + /* + * BYTE. + */ + @Override + public void writeByte(byte v) { + + // Every 8 fields we write a NULL byte. + if ((index % 8) == 0) { + if (index > 0) { + // Write back previous 8 field's NULL byte. + output.writeByte(nullOffset, nullByte); + nullByte = 0; + nullOffset = output.getLength(); + } + // Allocate next NULL byte. + output.reserve(1); + } + + // Set bit in NULL byte when a field is NOT NULL. + nullByte |= 1 << (index % 8); + + output.write(v); + + index++; + + if (index == fieldCount) { + // Write back the final NULL byte before the last fields. + output.writeByte(nullOffset, nullByte); + } + } + + /* + * SHORT. + */ + @Override + public void writeShort(short v) { + + // Every 8 fields we write a NULL byte. + if ((index % 8) == 0) { + if (index > 0) { + // Write back previous 8 field's NULL byte. + output.writeByte(nullOffset, nullByte); + nullByte = 0; + nullOffset = output.getLength(); + } + // Allocate next NULL byte. + output.reserve(1); + } + + // Set bit in NULL byte when a field is NOT NULL. + nullByte |= 1 << (index % 8); + + output.write((byte) (v >> 8)); + output.write((byte) (v)); + + index++; + + if (index == fieldCount) { + // Write back the final NULL byte before the last fields. + output.writeByte(nullOffset, nullByte); + } + } + + /* + * INT. + */ + @Override + public void writeInt(int v) { + + // Every 8 fields we write a NULL byte. + if ((index % 8) == 0) { + if (index > 0) { + // Write back previous 8 field's NULL byte. + output.writeByte(nullOffset, nullByte); + nullByte = 0; + nullOffset = output.getLength(); + } + // Allocate next NULL byte. + output.reserve(1); + } + + // Set bit in NULL byte when a field is NOT NULL. + nullByte |= 1 << (index % 8); + + LazyBinaryUtils.writeVInt(output, v); + + index++; + + if (index == fieldCount) { + // Write back the final NULL byte before the last fields. + output.writeByte(nullOffset, nullByte); + } + } + + /* + * LONG. + */ + @Override + public void writeLong(long v) { + + // Every 8 fields we write a NULL byte. + if ((index % 8) == 0) { + if (index > 0) { + // Write back previous 8 field's NULL byte. + output.writeByte(nullOffset, nullByte); + nullByte = 0; + nullOffset = output.getLength(); + } + // Allocate next NULL byte. + output.reserve(1); + } + + // Set bit in NULL byte when a field is NOT NULL. 
+ nullByte |= 1 << (index % 8); + + LazyBinaryUtils.writeVLong(output, v); + + index++; + + if (index == fieldCount) { + // Write back the final NULL byte before the last fields. + output.writeByte(nullOffset, nullByte); + } + } + + /* + * FLOAT. + */ + @Override + public void writeFloat(float vf) { + + // Every 8 fields we write a NULL byte. + if ((index % 8) == 0) { + if (index > 0) { + // Write back previous 8 field's NULL byte. + output.writeByte(nullOffset, nullByte); + nullByte = 0; + nullOffset = output.getLength(); + } + // Allocate next NULL byte. + output.reserve(1); + } + + // Set bit in NULL byte when a field is NOT NULL. + nullByte |= 1 << (index % 8); + + int v = Float.floatToIntBits(vf); + output.write((byte) (v >> 24)); + output.write((byte) (v >> 16)); + output.write((byte) (v >> 8)); + output.write((byte) (v)); + + index++; + + if (index == fieldCount) { + // Write back the final NULL byte before the last fields. + output.writeByte(nullOffset, nullByte); + } + } + + /* + * DOUBLE. + */ + @Override + public void writeDouble(double v) { + + // Every 8 fields we write a NULL byte. + if ((index % 8) == 0) { + if (index > 0) { + // Write back previous 8 field's NULL byte. + output.writeByte(nullOffset, nullByte); + nullByte = 0; + nullOffset = output.getLength(); + } + // Allocate next NULL byte. + output.reserve(1); + } + + // Set bit in NULL byte when a field is NOT NULL. + nullByte |= 1 << (index % 8); + + LazyBinaryUtils.writeDouble(output, v); + + index++; + + if (index == fieldCount) { + // Write back the final NULL byte before the last fields. + output.writeByte(nullOffset, nullByte); + } + } + + /* + * STRING and BINARY. + * + * Can be used to write CHAR and VARCHAR when the caller takes responsibility for + * truncation/padding issues. + */ + @Override + public void writeBytes(byte[] v) { + + // Every 8 fields we write a NULL byte. + if ((index % 8) == 0) { + if (index > 0) { + // Write back previous 8 field's NULL byte. + output.writeByte(nullOffset, nullByte); + nullByte = 0; + nullOffset = output.getLength(); + } + // Allocate next NULL byte. + output.reserve(1); + } + + // Set bit in NULL byte when a field is NOT NULL. + nullByte |= 1 << (index % 8); + + int length = v.length; + if(!skipLengthPrefix){ + LazyBinaryUtils.writeVInt(output, length); + } else { + if (length == 0){ + throw new RuntimeException("LazyBinaryColumnarSerde cannot serialize a non-null zero " + + "length binary field. Consider using either LazyBinarySerde or ColumnarSerde."); + } + } + output.write(v, 0, length); + + index++; + + if (index == fieldCount) { + // Write back the final NULL byte before the last fields. + output.writeByte(nullOffset, nullByte); + } + } + + @Override + public void writeBytes(byte[] v, int start, int length) { + + // Every 8 fields we write a NULL byte. + if ((index % 8) == 0) { + if (index > 0) { + // Write back previous 8 field's NULL byte. + output.writeByte(nullOffset, nullByte); + nullByte = 0; + nullOffset = output.getLength(); + } + // Allocate next NULL byte. + output.reserve(1); + } + + // Set bit in NULL byte when a field is NOT NULL. + nullByte |= 1 << (index % 8); + + if(!skipLengthPrefix){ + LazyBinaryUtils.writeVInt(output, length); + } else { + if (length == 0){ + throw new RuntimeException("LazyBinaryColumnarSerde cannot serialize a non-null zero " + + "length binary field. 
Consider using either LazyBinarySerde or ColumnarSerde."); + } + } + output.write(v, start, length); + + index++; + + if (index == fieldCount) { + // Write back the final NULL byte before the last fields. + output.writeByte(nullOffset, nullByte); + } + } + + /* + * CHAR. + */ + @Override + public void writeHiveChar(HiveChar hiveChar) { + String string = hiveChar.getStrippedValue(); + byte[] bytes = string.getBytes(); + writeBytes(bytes); + } + + /* + * VARCHAR. + */ + @Override + public void writeHiveVarchar(HiveVarchar hiveVarchar) { + String string = hiveVarchar.getValue(); + byte[] bytes = string.getBytes(); + writeBytes(bytes); + } + + /* + * DATE. + */ + @Override + public void writeDate(Date date) { + + // Every 8 fields we write a NULL byte. + if ((index % 8) == 0) { + if (index > 0) { + // Write back previous 8 field's NULL byte. + output.writeByte(nullOffset, nullByte); + nullByte = 0; + nullOffset = output.getLength(); + } + // Allocate next NULL byte. + output.reserve(1); + } + + // Set bit in NULL byte when a field is NOT NULL. + nullByte |= 1 << (index % 8); + + LazyBinaryUtils.writeVInt(output, DateWritable.dateToDays(date)); + + index++; + + if (index == fieldCount) { + // Write back the final NULL byte before the last fields. + output.writeByte(nullOffset, nullByte); + } + } + + // We provide a faster way to write a date without a Date object. + @Override + public void writeDate(int dateAsDays) { + + // Every 8 fields we write a NULL byte. + if ((index % 8) == 0) { + if (index > 0) { + // Write back previous 8 field's NULL byte. + output.writeByte(nullOffset, nullByte); + nullByte = 0; + nullOffset = output.getLength(); + } + // Allocate next NULL byte. + output.reserve(1); + } + + // Set bit in NULL byte when a field is NOT NULL. + nullByte |= 1 << (index % 8); + + LazyBinaryUtils.writeVInt(output, dateAsDays); + + index++; + + if (index == fieldCount) { + // Write back the final NULL byte before the last fields. + output.writeByte(nullOffset, nullByte); + } + } + + /* + * TIMESTAMP. + */ + @Override + public void writeTimestamp(Timestamp v) { + + // Every 8 fields we write a NULL byte. + if ((index % 8) == 0) { + if (index > 0) { + // Write back previous 8 field's NULL byte. + output.writeByte(nullOffset, nullByte); + nullByte = 0; + nullOffset = output.getLength(); + } + // Allocate next NULL byte. + output.reserve(1); + } + + // Set bit in NULL byte when a field is NOT NULL. + nullByte |= 1 << (index % 8); + + timestampWritable.set(v); + timestampWritable.writeToByteStream(output); + + index++; + + if (index == fieldCount) { + // Write back the final NULL byte before the last fields. + output.writeByte(nullOffset, nullByte); + } + } + + /* + * DECIMAL. + */ + @Override + public void writeHiveDecimal(HiveDecimal v) { + + // Every 8 fields we write a NULL byte. + if ((index % 8) == 0) { + if (index > 0) { + // Write back previous 8 field's NULL byte. + output.writeByte(nullOffset, nullByte); + nullByte = 0; + nullOffset = output.getLength(); + } + // Allocate next NULL byte. + output.reserve(1); + } + + // Set bit in NULL byte when a field is NOT NULL. + nullByte |= 1 << (index % 8); + + hiveDecimalWritable.set(v); + hiveDecimalWritable.writeToByteStream(output); + + index++; + + if (index == fieldCount) { + // Write back the final NULL byte before the last fields. 
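Every write method above repeats the same NULL-byte prologue and epilogue. Purely as an illustration of that shared pattern (this refactoring is not part of the patch), the bookkeeping could be isolated in two helpers on a minimal stand-in class with the same fields; writeNull() would call the same helpers but skip the bit-setting line.

import org.apache.hadoop.hive.serde2.ByteStream.Output;

public class NullBytePatternSketch {
  private final Output output = new Output();
  private final int fieldCount;
  private int index;
  private byte nullByte;
  private long nullOffset;

  public NullBytePatternSketch(int fieldCount) {
    this.fieldCount = fieldCount;
  }

  void beginNotNullField() {
    if ((index % 8) == 0) {                         // every 8 fields, start a new bitmask byte
      if (index > 0) {
        output.writeByte(nullOffset, nullByte);     // flush the previous group's bitmask
        nullByte = 0;
      }
      nullOffset = output.getLength();
      output.reserve(1);                            // reserve room for the new bitmask byte
    }
    nullByte |= 1 << (index % 8);                   // mark this field NOT NULL
  }

  void finishField() {
    index++;
    if (index == fieldCount) {
      output.writeByte(nullOffset, nullByte);       // flush the bitmask covering the final group
    }
  }
}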
+ output.writeByte(nullOffset, nullByte); + } + } +} \ No newline at end of file diff --git serde/src/java/org/apache/hadoop/hive/serde2/objectinspector/primitive/WritableHiveDecimalObjectInspector.java serde/src/java/org/apache/hadoop/hive/serde2/objectinspector/primitive/WritableHiveDecimalObjectInspector.java index f650409..e156f4d 100644 --- serde/src/java/org/apache/hadoop/hive/serde2/objectinspector/primitive/WritableHiveDecimalObjectInspector.java +++ serde/src/java/org/apache/hadoop/hive/serde2/objectinspector/primitive/WritableHiveDecimalObjectInspector.java @@ -29,7 +29,7 @@ public WritableHiveDecimalObjectInspector() { } - protected WritableHiveDecimalObjectInspector(DecimalTypeInfo typeInfo) { + public WritableHiveDecimalObjectInspector(DecimalTypeInfo typeInfo) { super(typeInfo); } diff --git serde/src/java/org/apache/hadoop/hive/serde2/serializewrite/SerializeWrite.java serde/src/java/org/apache/hadoop/hive/serde2/serializewrite/SerializeWrite.java new file mode 100644 index 0000000..a2d124e --- /dev/null +++ serde/src/java/org/apache/hadoop/hive/serde2/serializewrite/SerializeWrite.java @@ -0,0 +1,124 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.serde2.serializewrite; + +import java.sql.Date; +import java.sql.Timestamp; + +import org.apache.hadoop.hive.common.type.HiveChar; +import org.apache.hadoop.hive.common.type.HiveDecimal; +import org.apache.hadoop.hive.common.type.HiveVarchar; +import org.apache.hadoop.hive.serde2.ByteStream.Output; + +/* + * Directly serialize with the caller writing field-by-field a serialization format. + * + * The caller is responsible for calling the write method for the right type of each field + * (or calling writeNull if the field is a NULL). + * + */ +public interface SerializeWrite { + + /* + * Set the buffer that will receive the serialized data. + */ + void set(Output output); + + /* + * Reset the previously supplied buffer that will receive the serialized data. + */ + void reset(); + + /* + * Write a NULL field. + */ + void writeNull(); + + /* + * BOOLEAN. + */ + void writeBoolean(boolean v); + + /* + * BYTE. + */ + void writeByte(byte v); + + /* + * SHORT. + */ + void writeShort(short v); + + /* + * INT. + */ + void writeInt(int v); + + /* + * LONG. + */ + void writeLong(long v); + + /* + * FLOAT. + */ + void writeFloat(float vf); + + /* + * DOUBLE. + */ + void writeDouble(double vd); + + /* + * STRING and BINARY. + * + * Can be used to write CHAR and VARCHAR when the caller takes responsibility for + * truncation/padding issues. + */ + void writeBytes(byte[] v); + void writeBytes(byte[] v, int start, int length); + + /* + * CHAR. + */ + void writeHiveChar(HiveChar hiveChar); + + /* + * VARCHAR. 
+ */ + void writeHiveVarchar(HiveVarchar hiveVarchar); + + /* + * DATE. + */ + void writeDate(Date date); + + // We provide a faster way to write a date without a Date object. + void writeDate(int dateAsDays); + + /* + * TIMESTAMP. + */ + void writeTimestamp(Timestamp vt); + + /* + * DECIMAL. + */ + void writeHiveDecimal(HiveDecimal dec); +} diff --git serde/src/test/org/apache/hadoop/hive/serde2/TestStatsSerde.java serde/src/test/org/apache/hadoop/hive/serde2/TestStatsSerde.java index 3226114..561ab11 100644 --- serde/src/test/org/apache/hadoop/hive/serde2/TestStatsSerde.java +++ serde/src/test/org/apache/hadoop/hive/serde2/TestStatsSerde.java @@ -30,7 +30,9 @@ import org.apache.hadoop.hive.serde.serdeConstants; import org.apache.hadoop.hive.serde2.binarysortable.MyTestClass; import org.apache.hadoop.hive.serde2.binarysortable.MyTestInnerStruct; +import org.apache.hadoop.hive.serde2.binarysortable.MyTestPrimitiveClass; import org.apache.hadoop.hive.serde2.binarysortable.TestBinarySortableSerDe; +import org.apache.hadoop.hive.serde2.binarysortable.MyTestPrimitiveClass.ExtraTypeInfo; import org.apache.hadoop.hive.serde2.columnar.BytesRefArrayWritable; import org.apache.hadoop.hive.serde2.columnar.BytesRefWritable; import org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe; @@ -104,24 +106,11 @@ public void testLazyBinarySerDe() throws Throwable { int num = 1000; Random r = new Random(1234); MyTestClass rows[] = new MyTestClass[num]; + ExtraTypeInfo extraTypeInfos[] = MyTestPrimitiveClass.createExtraTypeInfos(num); + for (int i = 0; i < num; i++) { - int randField = r.nextInt(12); - Byte b = randField > 0 ? null : Byte.valueOf((byte) r.nextInt()); - Short s = randField > 1 ? null : Short.valueOf((short) r.nextInt()); - Integer n = randField > 2 ? null : Integer.valueOf(r.nextInt()); - Long l = randField > 3 ? null : Long.valueOf(r.nextLong()); - Float f = randField > 4 ? null : Float.valueOf(r.nextFloat()); - Double d = randField > 5 ? null : Double.valueOf(r.nextDouble()); - String st = randField > 6 ? null : TestBinarySortableSerDe - .getRandString(r); - HiveDecimal bd = randField > 7 ? null : TestBinarySortableSerDe.getRandHiveDecimal(r); - Date date = randField > 8 ? null : TestBinarySortableSerDe.getRandDate(r); - MyTestInnerStruct is = randField > 9 ? null : new MyTestInnerStruct(r - .nextInt(5) - 2, r.nextInt(5) - 2); - List li = randField > 10 ? null : TestBinarySortableSerDe - .getRandIntegerArray(r); - byte[] ba = TestBinarySortableSerDe.getRandBA(r, i); - MyTestClass t = new MyTestClass(b, s, n, l, f, d, st, bd, date, is, li,ba); + MyTestClass t = new MyTestClass(); + t.randomFill(r, extraTypeInfos[i]); rows[i] = t; } diff --git serde/src/test/org/apache/hadoop/hive/serde2/VerifyReadWrite.java serde/src/test/org/apache/hadoop/hive/serde2/VerifyReadWrite.java new file mode 100644 index 0000000..6fc2002 --- /dev/null +++ serde/src/test/org/apache/hadoop/hive/serde2/VerifyReadWrite.java @@ -0,0 +1,300 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.serde2; + +import java.io.IOException; +import java.sql.Date; +import java.util.Arrays; + +import junit.framework.TestCase; + +import org.apache.hadoop.hive.common.type.HiveChar; +import org.apache.hadoop.hive.common.type.HiveDecimal; +import org.apache.hadoop.hive.common.type.HiveVarchar; +import org.apache.hadoop.hive.serde2.binarysortable.MyTestPrimitiveClass.ExtraTypeInfo; +import org.apache.hadoop.hive.serde2.deserializeread.DeserializeRead; +import org.apache.hadoop.hive.serde2.deserializeread.DeserializeRead.ReadBytesResults; +import org.apache.hadoop.hive.serde2.deserializeread.DeserializeRead.ReadDateResults; +import org.apache.hadoop.hive.serde2.deserializeread.DeserializeRead.ReadDecimalResults; +import org.apache.hadoop.hive.serde2.deserializeread.DeserializeRead.ReadHiveCharResults; +import org.apache.hadoop.hive.serde2.deserializeread.DeserializeRead.ReadHiveVarcharResults; +import org.apache.hadoop.hive.serde2.io.HiveCharWritable; +import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory; +import org.apache.hadoop.hive.serde2.serializewrite.SerializeWrite; +import org.apache.hadoop.hive.serde2.typeinfo.CharTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.DecimalTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.VarcharTypeInfo; +import org.apache.hadoop.io.Text; + +/** + * TestBinarySortableSerDe. 
+ * + */ +public class VerifyReadWrite { + + public static void verifyDeserializeRead(DeserializeRead deserializeRead, PrimitiveCategory primitiveCategory, Object object, ExtraTypeInfo extraTypeInfo) throws IOException { + + boolean isNull; + + isNull = deserializeRead.readCheckNull(); + if (isNull) { + if (object != null) { + TestCase.fail("Field reports null but object is not null"); + } + return; + } else if (object == null) { + TestCase.fail("Field report not null but object is null"); + } + switch (primitiveCategory) { + case BOOLEAN: + { + boolean value = deserializeRead.readBoolean(); + Boolean expected = (Boolean) object; + if (value != expected) { + TestCase.fail("Boolean field mismatch (expected " + expected + " found " + value + ")"); + } + } + break; + case BYTE: + { + byte value = deserializeRead.readByte(); + Byte expected = (Byte) object; + if (value != expected) { + TestCase.fail("Byte field mismatch (expected " + (int) expected + " found " + (int) value + ")"); + } + } + break; + case SHORT: + { + short value = deserializeRead.readShort(); + Short expected = (Short) object; + if (value != expected) { + TestCase.fail("Short field mismatch (expected " + expected + " found " + value + ")"); + } + } + break; + case INT: + { + int value = deserializeRead.readInt(); + Integer expected = (Integer) object; + if (value != expected) { + TestCase.fail("Int field mismatch (expected " + expected + " found " + value + ")"); + } + } + break; + case LONG: + { + long value = deserializeRead.readLong(); + Long expected = (Long) object; + if (value != expected) { + TestCase.fail("Long field mismatch (expected " + expected + " found " + value + ")"); + } + } + break; + case FLOAT: + { + float value = deserializeRead.readFloat(); + Float expected = (Float) object; + if (value != expected) { + TestCase.fail("Float field mismatch (expected " + expected + " found " + value + ")"); + } + } + break; + case DOUBLE: + { + double value = deserializeRead.readDouble(); + Double expected = (Double) object; + if (value != expected) { + TestCase.fail("Double field mismatch (expected " + expected + " found " + value + ")"); + } + } + break; + case STRING: + { + ReadBytesResults readBytesResults = deserializeRead.createReadBytesResults(); + deserializeRead.readBytes(readBytesResults); + byte[] stringBytes = Arrays.copyOfRange(readBytesResults.bytes, readBytesResults.start, readBytesResults.start + readBytesResults.length); + Text text = new Text(stringBytes); + String string = text.toString(); + String expected = (String) object; + if (!string.equals(expected)) { + TestCase.fail("String field mismatch (expected '" + expected + "' found '" + string + "')"); + } + } + break; + case CHAR: + { + int maxLength = extraTypeInfo.hiveCharMaxLength; + CharTypeInfo charTypeInfo = new CharTypeInfo(maxLength); + ReadHiveCharResults readHiveCharResults = deserializeRead.createReadHiveCharResults(charTypeInfo); + deserializeRead.readHiveChar(readHiveCharResults); + HiveChar hiveChar = readHiveCharResults.getHiveChar(); + HiveChar expected = (HiveChar) object; + if (!hiveChar.equals(expected)) { + TestCase.fail("Char field mismatch (expected '" + expected + "' found '" + hiveChar + "') maxLength " + maxLength); + } + } + break; + case VARCHAR: + { + int maxLength = extraTypeInfo.hiveVarcharMaxLength; + VarcharTypeInfo varcharTypeInfo = new VarcharTypeInfo(maxLength); + ReadHiveVarcharResults readHiveVarcharResults = deserializeRead.createReadHiveVarcharResults(varcharTypeInfo); + 
deserializeRead.readHiveVarchar(readHiveVarcharResults); + HiveVarchar hiveVarchar = readHiveVarcharResults.getHiveVarchar(); + HiveVarchar expected = (HiveVarchar) object; + if (!hiveVarchar.equals(expected)) { + TestCase.fail("Varchar field mismatch (expected '" + expected + "' found '" + hiveVarchar + "') maxLength " + maxLength); + } + } + break; + case DECIMAL: + { + DecimalTypeInfo decimalTypeInfo = new DecimalTypeInfo(HiveDecimal.SYSTEM_DEFAULT_PRECISION, + HiveDecimal.SYSTEM_DEFAULT_SCALE); + ReadDecimalResults readDecimalResults = deserializeRead.createReadDecimalResults(decimalTypeInfo); + deserializeRead.readHiveDecimal(readDecimalResults); + HiveDecimal value = readDecimalResults.getHiveDecimal(); + HiveDecimal expected = (HiveDecimal) object; + if (!value.equals(expected)) { + TestCase.fail("Decimal field mismatch (expected " + expected.toString() + " found " + value.toString() + ")"); + } + } + break; + case DATE: + { + ReadDateResults readDateResults = deserializeRead.createReadDateResults(); + deserializeRead.readDate(readDateResults); + Date value = readDateResults.getDate(); + Date expected = (Date) object; + if (!value.equals(expected)) { + TestCase.fail("Date field mismatch (expected " + expected.toString() + " found " + value.toString() + ")"); + } + } + break; + case BINARY: + { + ReadBytesResults readBytesResults = deserializeRead.createReadBytesResults(); + deserializeRead.readBytes(readBytesResults); + byte[] byteArray = Arrays.copyOfRange(readBytesResults.bytes, readBytesResults.start, readBytesResults.start + readBytesResults.length); + byte[] expected = (byte[]) object; + if (byteArray.length != expected.length){ + TestCase.fail("Byte Array field mismatch (expected " + expected + " found " + byteArray + ")"); + } + for (int b = 0; b < byteArray.length; b++) { + if (byteArray[b] != expected[b]) { + TestCase.fail("Byte Array field mismatch (expected " + expected + " found " + byteArray + ")"); + } + } + } + break; + } + } + + public static void serializeWrite(SerializeWrite serializeWrite, PrimitiveCategory primitiveCategory, Object object) throws IOException { + if (object == null) { + serializeWrite.writeNull(); + return; + } + switch (primitiveCategory) { + case BOOLEAN: + { + boolean value = (Boolean) object; + serializeWrite.writeBoolean(value); + } + break; + case BYTE: + { + byte value = (Byte) object; + serializeWrite.writeByte(value); + } + break; + case SHORT: + { + short value = (Short) object; + serializeWrite.writeShort(value); + } + break; + case INT: + { + int value = (Integer) object; + serializeWrite.writeInt(value); + } + break; + case LONG: + { + long value = (Long) object; + serializeWrite.writeLong(value); + } + break; + case FLOAT: + { + float value = (Float) object; + serializeWrite.writeFloat(value); + } + break; + case DOUBLE: + { + double value = (Double) object; + serializeWrite.writeDouble(value); + } + break; + case STRING: + { + String value = (String) object; + byte[] stringBytes = value.getBytes(); + int stringLength = stringBytes.length; + serializeWrite.writeBytes(stringBytes, 0, stringLength); + } + break; + case CHAR: + { + HiveChar value = (HiveChar) object; + serializeWrite.writeHiveChar(value); + } + break; + case VARCHAR: + { + HiveVarchar value = (HiveVarchar) object; + serializeWrite.writeHiveVarchar(value); + } + break; + case DECIMAL: + { + HiveDecimal value = (HiveDecimal) object; + serializeWrite.writeHiveDecimal(value); + } + break; + case DATE: + { + Date value = (Date) object; + serializeWrite.writeDate(value); + } 
+ break; + case BINARY: + { + byte[] binaryBytes = (byte[]) object; + int length = binaryBytes.length; + serializeWrite.writeBytes(binaryBytes, 0, length); + } + break; + default: + throw new Error("Unknown primitive category " + primitiveCategory.name()); + } + } +} \ No newline at end of file diff --git serde/src/test/org/apache/hadoop/hive/serde2/binarysortable/MyTestClass.java serde/src/test/org/apache/hadoop/hive/serde2/binarysortable/MyTestClass.java index d1d5760..f545d7a 100644 --- serde/src/test/org/apache/hadoop/hive/serde2/binarysortable/MyTestClass.java +++ serde/src/test/org/apache/hadoop/hive/serde2/binarysortable/MyTestClass.java @@ -18,40 +18,73 @@ package org.apache.hadoop.hive.serde2.binarysortable; import java.sql.Date; +import java.util.ArrayList; import java.util.List; +import java.util.Random; +import org.apache.hadoop.hive.common.type.HiveChar; import org.apache.hadoop.hive.common.type.HiveDecimal; +import org.apache.hadoop.hive.common.type.HiveVarchar; +import org.apache.hadoop.hive.serde2.binarysortable.MyTestPrimitiveClass.ExtraTypeInfo; public class MyTestClass { - Byte myByte; - Short myShort; - Integer myInt; - Long myLong; - Float myFloat; - Double myDouble; - String myString; - HiveDecimal myDecimal; - Date myDate; - MyTestInnerStruct myStruct; - List myList; - byte[] myBA; + + public Boolean myBool; + public Byte myByte; + public Short myShort; + public Integer myInt; + public Long myLong; + public Float myFloat; + public Double myDouble; + public String myString; + public HiveChar myHiveChar; + public HiveVarchar myHiveVarchar; + public byte[] myBinary; + public HiveDecimal myDecimal; + public Date myDate; + + // Add more complex types. + public MyTestInnerStruct myStruct; + public List myList; public MyTestClass() { } - public MyTestClass(Byte b, Short s, Integer i, Long l, Float f, Double d, - String st, HiveDecimal bd, Date date, MyTestInnerStruct is, List li, byte[] ba) { - myByte = b; - myShort = s; - myInt = i; - myLong = l; - myFloat = f; - myDouble = d; - myString = st; - myDecimal = bd; - myDate = date; - myStruct = is; - myList = li; - myBA = ba; + public final static int fieldCount = 15; + + public int randomFill(Random r, ExtraTypeInfo extraTypeInfo) { + int randField = r.nextInt(MyTestClass.fieldCount); + int field = 0; + + myBool = randField > field++ ? null : (r.nextInt(1) == 1); + myByte = randField > field++ ? null : Byte.valueOf((byte) r.nextInt()); + myShort = randField > field++ ? null : Short.valueOf((short) r.nextInt()); + myInt = randField > field++ ? null : Integer.valueOf(r.nextInt()); + myLong = randField > field++ ? null : Long.valueOf(r.nextLong()); + myFloat = randField > field++ ? null : Float + .valueOf(r.nextFloat() * 10 - 5); + myDouble = randField > field++ ? null : Double + .valueOf(r.nextDouble() * 10 - 5); + myString = randField > field++ ? null : MyTestPrimitiveClass.getRandString(r); + myHiveChar = randField > field++ ? null : MyTestPrimitiveClass.getRandHiveChar(r, extraTypeInfo); + myHiveVarchar = randField > field++ ? null : MyTestPrimitiveClass.getRandHiveVarchar(r, extraTypeInfo); + myBinary = MyTestPrimitiveClass.getRandBinary(r, r.nextInt(1000)); + myDecimal = randField > field++ ? null : MyTestPrimitiveClass.getRandHiveDecimal(r); + myDate = randField > field++ ? null : MyTestPrimitiveClass.getRandDate(r); + + myStruct = randField > field++ ? null : new MyTestInnerStruct( + r.nextInt(5) - 2, r.nextInt(5) - 2); + myList = randField > field++ ? 
null : getRandIntegerArray(r); + return field; + } + + public static List getRandIntegerArray(Random r) { + int length = r.nextInt(10); + ArrayList result = new ArrayList(length); + for (int i = 0; i < length; i++) { + result.add(r.nextInt(128)); + } + return result; } + } diff --git serde/src/test/org/apache/hadoop/hive/serde2/binarysortable/MyTestPrimitiveClass.java serde/src/test/org/apache/hadoop/hive/serde2/binarysortable/MyTestPrimitiveClass.java new file mode 100644 index 0000000..c3a1b1e --- /dev/null +++ serde/src/test/org/apache/hadoop/hive/serde2/binarysortable/MyTestPrimitiveClass.java @@ -0,0 +1,361 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.serde2.binarysortable; + +import java.sql.Date; +import java.util.ArrayList; +import java.util.List; +import java.util.Random; + +import org.apache.commons.lang.ArrayUtils; +import org.apache.commons.lang.StringUtils; +import org.apache.hadoop.hive.common.type.HiveBaseChar; +import org.apache.hadoop.hive.common.type.HiveChar; +import org.apache.hadoop.hive.common.type.HiveDecimal; +import org.apache.hadoop.hive.common.type.HiveVarchar; +import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; +import org.apache.hadoop.hive.serde2.objectinspector.StandardStructObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableHiveCharObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableHiveDecimalObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableHiveVarcharObjectInspector; +import org.apache.hadoop.hive.serde2.typeinfo.CharTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.DecimalTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils; +import org.apache.hadoop.hive.serde2.typeinfo.VarcharTypeInfo; +import org.apache.hadoop.io.Writable; + +// Just the primitive types. 
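Note: the `randField > field++ ? null : ...` chains in randomFill above (and in the MyTest* variants that follow) null out a prefix of the row: randField is drawn uniformly from [0, fieldCount), fields 0 through randField-1 stay null, and every later field gets a value, with the binary field as the one exception since it is always populated. A small standalone demo of just that idiom (hypothetical class name):

```java
import java.util.Random;

public class PrefixNullDemo {
  public static void main(String[] args) {
    Random r = new Random(1234);
    int fieldCount = 15;                      // MyTestClass.fieldCount
    int randField = r.nextInt(fieldCount);
    Integer[] row = new Integer[fieldCount];
    int field = 0;
    for (int i = 0; i < fieldCount; i++) {
      // Same idiom as randomFill: null while randField > field, a value afterwards.
      row[i] = randField > field++ ? null : i;
    }
    for (int i = 0; i < fieldCount; i++) {
      if ((row[i] == null) != (i < randField)) {
        throw new AssertionError("unexpected null pattern at field " + i);
      }
    }
    System.out.println("first " + randField + " of " + fieldCount + " fields are null");
  }
}
```

One side note on the boolean fields: r.nextInt(1) always returns 0, so `r.nextInt(1) == 1` is always false and a non-null boolean never comes out true; r.nextInt(2) would exercise both values.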
+public class MyTestPrimitiveClass { + + public Boolean myBool; + public Byte myByte; + public Short myShort; + public Integer myInt; + public Long myLong; + public Float myFloat; + public Double myDouble; + public String myString; + public HiveChar myHiveChar; + public HiveVarchar myHiveVarchar; + public byte[] myBinary; + public HiveDecimal myDecimal; + public Date myDate; + + public MyTestPrimitiveClass() { + } + + public MyTestPrimitiveClass(Boolean bool, Byte b, Short s, Integer i, Long l, Float f, Double d, + String st, HiveChar hiveChar, HiveVarchar hiveVarchar, byte[] binary, HiveDecimal bd, Date date) { + myBool = bool; + myByte = b; + myShort = s; + myInt = i; + myLong = l; + myFloat = f; + myDouble = d; + myString = st; + myHiveChar = hiveChar; + myHiveVarchar = hiveVarchar; + myBinary = binary; + myDecimal = bd; + myDate = date; + } + + public final static int primitiveCount = 13; + + public int randomFill(Random r, ExtraTypeInfo extraTypeInfo) { + int randField = r.nextInt(MyTestClass.fieldCount); + int field = 0; + return randomFill(r, randField, field, extraTypeInfo); + } + + public int randomFill(Random r, int randField, int field, ExtraTypeInfo extraTypeInfo) { + myBool = randField > field++ ? null : Boolean.valueOf(r.nextInt(1) == 1); + myByte = randField > field++ ? null : Byte.valueOf((byte) r.nextInt()); + myShort = randField > field++ ? null : Short.valueOf((short) r.nextInt()); + myInt = randField > field++ ? null : Integer.valueOf(r.nextInt()); + myLong = randField > field++ ? null : Long.valueOf(r.nextLong()); + myFloat = randField > field++ ? null : Float + .valueOf(r.nextFloat() * 10 - 5); + myDouble = randField > field++ ? null : Double + .valueOf(r.nextDouble() * 10 - 5); + myString = randField > field++ ? null : getRandString(r); + myHiveChar = randField > field++ ? null : getRandHiveChar(r, extraTypeInfo); + myHiveVarchar = randField > field++ ? null : getRandHiveVarchar(r, extraTypeInfo); + myBinary = getRandBinary(r, r.nextInt(1000)); + myDecimal = randField > field++ ? null : getRandHiveDecimal(r); + myDate = randField > field++ ? null : getRandDate(r); + return field; + } + + public static class ExtraTypeInfo { + public int hiveCharMaxLength; + public int hiveVarcharMaxLength; + + public ExtraTypeInfo() { + // For NULL fields, make up a valid max length. 
+ hiveCharMaxLength = 1; + hiveVarcharMaxLength = 1; + } + } + + public static ExtraTypeInfo[] createExtraTypeInfos(int num) { + ExtraTypeInfo[] extraTypeInfos = new ExtraTypeInfo[num]; + for (int i = 0; i < num; i++) { + extraTypeInfos[i] = new ExtraTypeInfo(); + } + return extraTypeInfos; + } + + public static String getRandString(Random r) { + return getRandString(r, null, r.nextInt(10)); + } + + public static String getRandString(Random r, String characters, int length) { + StringBuilder sb = new StringBuilder(); + for (int i = 0; i < length; i++) { + if (characters == null) { + sb.append((char) (r.nextInt(128))); + } else { + sb.append(characters.charAt(r.nextInt(characters.length()))); + } + } + return sb.toString(); + } + + public static HiveChar getRandHiveChar(Random r, ExtraTypeInfo extraTypeInfo) { + int maxLength = 10 + r.nextInt(60); + extraTypeInfo.hiveCharMaxLength = maxLength; + String randomString = getRandString(r, "abcdefghijklmnopqrstuvwxyz", 100); + // String truncated = HiveBaseChar.enforceMaxLength(randomString, maxLength); + // System.err.println("getRandHiveChar maxLength " + maxLength + " '" + randomString + "' truncated '" + truncated + "'"); + HiveChar hiveChar = new HiveChar(randomString, maxLength); + return hiveChar; + } + + public static HiveVarchar getRandHiveVarchar(Random r, ExtraTypeInfo extraTypeInfo) { + int maxLength = 10 + r.nextInt(60); + extraTypeInfo.hiveVarcharMaxLength = maxLength; + String randomString = getRandString(r, "abcdefghijklmnopqrstuvwxyz", 100); + // String truncated = HiveBaseChar.enforceMaxLength(randomString, maxLength); + // System.err.println("getRandHiveVarchar maxLength " + maxLength + " '" + randomString + "' truncated '" + truncated + "'"); + HiveVarchar hiveVarchar = new HiveVarchar(randomString, maxLength); + return hiveVarchar; + } + + public static byte[] getRandBinary(Random r, int len){ + byte[] bytes = new byte[len]; + for (int j = 0; j < len; j++){ + bytes[j] = Byte.valueOf((byte) r.nextInt()); + } + return bytes; + } + + private static final String DECIMAL_CHARS = "0123456789"; + + public static HiveDecimal getRandHiveDecimal(Random r) { + StringBuilder sb = new StringBuilder(); + int l1 = 1+r.nextInt(18), l2 = r.nextInt(19); + + if (r.nextBoolean()) { + sb.append("-"); + } + + sb.append(getRandString(r, DECIMAL_CHARS, l1)); + if (l2 != 0) { + sb.append("."); + sb.append(getRandString(r, DECIMAL_CHARS, l2)); + } + + HiveDecimal bd = HiveDecimal.create(sb.toString()); + return bd; + } + + public static Date getRandDate(Random r) { + String dateStr = String.format("%d-%02d-%02d", + Integer.valueOf(1800 + r.nextInt(500)), // year + Integer.valueOf(1 + r.nextInt(12)), // month + Integer.valueOf(1 + r.nextInt(28))); // day + Date dateVal = Date.valueOf(dateStr); + return dateVal; + } + + public Object getPrimitiveObject(int index) { + int field = 0; + if (index == field++) { + return myBool; + } else if (index == field++) { + return myByte; + } else if (index == field++) { + return myShort; + } else if (index == field++) { + return myInt; + } else if (index == field++) { + return myLong; + } else if (index == field++) { + return myFloat; + } else if (index == field++) { + return myDouble; + } else if (index == field++) { + return myString; + } else if (index == field++) { + return myHiveChar; + } else if (index == field++) { + return myHiveVarchar; + } else if (index == field++) { + return myBinary; + } else if (index == field++) { + return myDecimal; + } else if (index == field++) { + return myDate; + } else { + throw 
new Error("Field " + " field not handled"); + } + } + + public Object getPrimitiveWritableObject(int index, ExtraTypeInfo extraTypeInfo) { + int field = 0; + if (index == field++) { + return (myBool == null ? null : PrimitiveObjectInspectorFactory.writableBooleanObjectInspector.create((boolean) myBool)); + } else if (index == field++) { + return (myByte == null ? null : PrimitiveObjectInspectorFactory.writableByteObjectInspector.create((byte) myByte)); + } else if (index == field++) { + return (myShort == null ? null : PrimitiveObjectInspectorFactory.writableShortObjectInspector.create((short) myShort)); + } else if (index == field++) { + return (myInt == null ? null : PrimitiveObjectInspectorFactory.writableIntObjectInspector.create((int) myInt)); + } else if (index == field++) { + return (myLong == null ? null : PrimitiveObjectInspectorFactory.writableLongObjectInspector.create((long) myLong)); + } else if (index == field++) { + return (myFloat == null ? null : PrimitiveObjectInspectorFactory.writableFloatObjectInspector.create((float) myFloat)); + } else if (index == field++) { + return (myDouble == null ? null : PrimitiveObjectInspectorFactory.writableDoubleObjectInspector.create((double) myDouble)); + } else if (index == field++) { + return (myString == null ? null : PrimitiveObjectInspectorFactory.writableStringObjectInspector.create(myString)); + } else if (index == field++) { + if (myHiveChar == null) { + return null; + } + int maxLength = extraTypeInfo.hiveCharMaxLength; + CharTypeInfo charTypeInfo = new CharTypeInfo(maxLength); + WritableHiveCharObjectInspector writableCharObjectInspector = new WritableHiveCharObjectInspector(charTypeInfo); + return writableCharObjectInspector.create(myHiveChar); + } else if (index == field++) { + if (myHiveVarchar == null) { + return null; + } + int maxLength = extraTypeInfo.hiveVarcharMaxLength; + VarcharTypeInfo varcharTypeInfo = new VarcharTypeInfo(maxLength); + WritableHiveVarcharObjectInspector writableVarcharObjectInspector = new WritableHiveVarcharObjectInspector(varcharTypeInfo); + return writableVarcharObjectInspector.create(myHiveVarchar); + } else if (index == field++) { + return (myBinary == null ? null : PrimitiveObjectInspectorFactory.writableBinaryObjectInspector.create(myBinary)); + } else if (index == field++) { + if (myDecimal == null) { + return null; + } + DecimalTypeInfo decimalTypeInfo = new DecimalTypeInfo(myDecimal.precision(), myDecimal.scale()); + WritableHiveDecimalObjectInspector writableDecimalObjectInspector = new WritableHiveDecimalObjectInspector(decimalTypeInfo); + return writableDecimalObjectInspector.create(myDecimal); + } else if (index == field++) { + return (myDate == null ? 
null : PrimitiveObjectInspectorFactory.writableDateObjectInspector.create(myDate)); + } else { + throw new Error("Field " + " field not handled"); + } + } + + + public PrimitiveCategory getPrimitiveCategory(int index) { + int field = 0; + if (index == field++) { + return PrimitiveCategory.BOOLEAN; + } else if (index == field++) { + return PrimitiveCategory.BYTE; + } else if (index == field++) { + return PrimitiveCategory.SHORT; + } else if (index == field++) { + return PrimitiveCategory.INT; + } else if (index == field++) { + return PrimitiveCategory.LONG; + } else if (index == field++) { + return PrimitiveCategory.FLOAT; + } else if (index == field++) { + return PrimitiveCategory.DOUBLE; + } else if (index == field++) { + return PrimitiveCategory.STRING; + } else if (index == field++) { + return PrimitiveCategory.CHAR; + } else if (index == field++) { + return PrimitiveCategory.VARCHAR; + } else if (index == field++) { + return PrimitiveCategory.BINARY; + } else if (index == field++) { + return PrimitiveCategory.DECIMAL; + } else if (index == field++) { + return PrimitiveCategory.DATE; + } else { + throw new Error("Field " + " field not handled"); + } + } + + public PrimitiveTypeInfo getPrimitiveTypeInfo(int index, ExtraTypeInfo extraTypeInfo) { + PrimitiveCategory primitiveCategory = getPrimitiveCategory(index); + String typeName; + switch (primitiveCategory) { + case BYTE: + typeName = "tinyint"; + break; + case SHORT: + typeName = "smallint"; + break; + case LONG: + typeName = "bigint"; + break; + case CHAR: + typeName = String.format("char(%d)", extraTypeInfo.hiveCharMaxLength); + break; + case VARCHAR: + typeName = String.format("varchar(%d)", extraTypeInfo.hiveVarcharMaxLength); + break; + default: + // No type name difference or adornment. + typeName = primitiveCategory.name().toLowerCase(); + break; + } + PrimitiveTypeInfo primitiveTypeInfo = (PrimitiveTypeInfo) TypeInfoUtils.getTypeInfoFromTypeString(typeName); + return primitiveTypeInfo; + } + + public StructObjectInspector getRowInspector(ExtraTypeInfo extraTypeInfo) { + List columnNames = new ArrayList(MyTestPrimitiveClass.primitiveCount); + List primitiveObjectInspectorList = new ArrayList(MyTestPrimitiveClass.primitiveCount); + for (int index = 0; index < MyTestPrimitiveClass.primitiveCount; index++) { + columnNames.add(String.format("col%d", index)); + PrimitiveTypeInfo primitiveTypeInfo = getPrimitiveTypeInfo(index, extraTypeInfo); + PrimitiveCategory primitiveCategory = primitiveTypeInfo.getPrimitiveCategory(); + primitiveObjectInspectorList.add(PrimitiveObjectInspectorFactory.getPrimitiveWritableObjectInspector(primitiveCategory)); + } + StandardStructObjectInspector rowOI = ObjectInspectorFactory.getStandardStructObjectInspector(columnNames, primitiveObjectInspectorList); + return rowOI; + } +} diff --git serde/src/test/org/apache/hadoop/hive/serde2/binarysortable/TestBinarySortableReadWrite.java serde/src/test/org/apache/hadoop/hive/serde2/binarysortable/TestBinarySortableReadWrite.java new file mode 100644 index 0000000..852f176 --- /dev/null +++ serde/src/test/org/apache/hadoop/hive/serde2/binarysortable/TestBinarySortableReadWrite.java @@ -0,0 +1,209 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
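Note: the TestBinarySortableReadWrite test introduced here drives the new SerializeWrite implementation field by field into an Output buffer, reads the same bytes back with the matching DeserializeRead, cross-checks both directions against BinarySortableSerDe, and asserts that BytesWritable.compareTo on the serialized rows agrees with the row ordering. Stripped of that scaffolding, the core round trip looks like the sketch below (a single ascending int column is assumed and the names are illustrative):

```java
import org.apache.hadoop.hive.serde2.ByteStream.Output;
import org.apache.hadoop.hive.serde2.binarysortable.deserializeread.BinarySortableDeserializeRead;
import org.apache.hadoop.hive.serde2.binarysortable.serializewrite.BinarySortableSerializeWrite;

public class BinarySortableRoundTripSketch {
  public static void main(String[] args) throws Exception {
    boolean[] columnSortOrderIsDesc = new boolean[] { false };  // one ascending column

    // Write side: the caller drives SerializeWrite field by field into an Output buffer.
    BinarySortableSerializeWrite writer = new BinarySortableSerializeWrite(columnSortOrderIsDesc);
    Output output = new Output();
    writer.set(output);
    writer.writeInt(42);

    // Read side: DeserializeRead walks the same bytes field by field.
    BinarySortableDeserializeRead reader = new BinarySortableDeserializeRead(columnSortOrderIsDesc);
    reader.set(output.getData(), 0, output.getLength());
    if (!reader.readCheckNull()) {
      System.out.println(reader.readInt());   // prints 42
    }
  }
}
```

For a null field the caller uses writeNull() instead of the typed write, and readCheckNull() reports it on the read side; that is the pairing VerifyReadWrite.serializeWrite and verifyDeserializeRead dispatch to per primitive category.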
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.serde2.binarysortable; + +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Random; + +import junit.framework.TestCase; + +import org.apache.commons.lang.StringUtils; +import org.apache.hadoop.hive.serde2.ByteStream.Output; +import org.apache.hadoop.hive.serde2.SerDe; +import org.apache.hadoop.hive.serde2.VerifyReadWrite; +import org.apache.hadoop.hive.serde2.binarysortable.MyTestPrimitiveClass.ExtraTypeInfo; +import org.apache.hadoop.hive.serde2.binarysortable.deserializeread.BinarySortableDeserializeRead; +import org.apache.hadoop.hive.serde2.binarysortable.serializewrite.BinarySortableSerializeWrite; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory.ObjectInspectorOptions; +import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory; +import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; +import org.apache.hadoop.io.BytesWritable; + +/** + * TestBinarySortableSerDe. + * + */ +public class TestBinarySortableReadWrite extends TestCase { + + private void testBinarySortableReadWrite(MyTestPrimitiveClass[] myTestPrimitiveClasses, + boolean[] columnSortOrderIsDesc, SerDe serde, StructObjectInspector rowOI, boolean ascending, + Map extraTypeInfoMap) throws Throwable { + + BinarySortableSerializeWrite binarySortableSerializeWrite = new BinarySortableSerializeWrite(columnSortOrderIsDesc); + + // Try to serialize + + // One Writable per row. + BytesWritable serializeWriteBytes[] = new BytesWritable[myTestPrimitiveClasses.length]; + + for (int i = 0; i < myTestPrimitiveClasses.length; i++) { + MyTestPrimitiveClass t = myTestPrimitiveClasses[i]; + ExtraTypeInfo extraTypeInfo = extraTypeInfoMap.get(t); + Output output = new Output(); + binarySortableSerializeWrite.set(output); + + for (int index = 0; index < MyTestPrimitiveClass.primitiveCount; index++) { + Object object = t.getPrimitiveObject(index); + PrimitiveCategory primitiveCategory = t.getPrimitiveCategory(index); + VerifyReadWrite.serializeWrite(binarySortableSerializeWrite, primitiveCategory, object); + } + + BytesWritable bytesWritable = new BytesWritable(); + bytesWritable.set(output.getData(), 0, output.getLength()); + serializeWriteBytes[i] = bytesWritable; + if (i > 0) { + int compareResult = serializeWriteBytes[i - 1].compareTo(serializeWriteBytes[i]); + if ((compareResult < 0 && !ascending) + || (compareResult > 0 && ascending)) { + System.out.println("Test failed in " + + (ascending ? 
"ascending" : "descending") + " order with " + + (i - 1) + " and " + i); + System.out.println("serialized data [" + (i - 1) + "] = " + + TestBinarySortableSerDe.hexString(serializeWriteBytes[i - 1])); + System.out.println("serialized data [" + i + "] = " + + TestBinarySortableSerDe.hexString(serializeWriteBytes[i])); + fail("Sort order of serialized " + (i - 1) + " and " + i + + " are reversed!"); + } + } + } + + BinarySortableDeserializeRead binarySortableDeserializeRead = new BinarySortableDeserializeRead(columnSortOrderIsDesc); + + // Try to deserialize using DeserializeRead our Writable row objects created by SerializeWrite. + for (int i = 0; i < myTestPrimitiveClasses.length; i++) { + MyTestPrimitiveClass t = myTestPrimitiveClasses[i]; + + BytesWritable bytesWritable = serializeWriteBytes[i]; + binarySortableDeserializeRead.set(bytesWritable.getBytes(), 0, bytesWritable.getLength()); + + for (int index = 0; index < MyTestPrimitiveClass.primitiveCount; index++) { + Object object = t.getPrimitiveObject(index); + ExtraTypeInfo extraTypeInfo = extraTypeInfoMap.get(t); + PrimitiveCategory primitiveCategory = t.getPrimitiveCategory(index); + VerifyReadWrite.verifyDeserializeRead(binarySortableDeserializeRead, primitiveCategory, object, extraTypeInfo); + } + } + + // Try to deserialize using SerDe class our Writable row objects created by SerializeWrite. + for (int i = 0; i < myTestPrimitiveClasses.length; i++) { + BytesWritable bytesWritable = serializeWriteBytes[i]; + List deserializedRow = (List) serde.deserialize(bytesWritable); + + MyTestPrimitiveClass t = myTestPrimitiveClasses[i]; + ExtraTypeInfo extraTypeInfo = extraTypeInfoMap.get(t); + for (int index = 0; index < MyTestPrimitiveClass.primitiveCount; index++) { + Object expected = t.getPrimitiveWritableObject(index, extraTypeInfo); + Object object = deserializedRow.get(index); + if (expected == null || object == null) { + if (expected != null || object != null) { + fail("SerDe deserialized NULL column mismatch"); + } + } else { + if (!object.equals(expected)) { + fail("SerDe deserialized value does not match"); + } + } + } + } + + // One Writable per row. + BytesWritable serdeBytes[] = new BytesWritable[myTestPrimitiveClasses.length]; + + // Serialize using the SerDe, then below deserialize using DeserializeRead. + for (int i = 0; i < myTestPrimitiveClasses.length; i++) { + MyTestPrimitiveClass t = myTestPrimitiveClasses[i]; + + // Since SerDe reuses memory, we will need to make a copy. + BytesWritable serialized = (BytesWritable) serde.serialize(t, rowOI); + BytesWritable bytesWritable = new BytesWritable(); + bytesWritable.set(serialized); + byte[] bytes1 = Arrays.copyOfRange(bytesWritable.getBytes(), 0, bytesWritable.getLength()); + + byte[] bytes2 = Arrays.copyOfRange(serializeWriteBytes[i].getBytes(), 0, serializeWriteBytes[i].getLength()); + if (!Arrays.equals(bytes1, bytes2)) { + fail("SerializeWrite and SerDe serialization does not match"); + } + serdeBytes[i] = bytesWritable; + } + + // Try to deserialize using DeserializeRead our Writable row objects created by SerDe. 
+ for (int i = 0; i < myTestPrimitiveClasses.length; i++) { + MyTestPrimitiveClass t = myTestPrimitiveClasses[i]; + + BytesWritable bytesWritable = serdeBytes[i]; + binarySortableDeserializeRead.set(bytesWritable.getBytes(), 0, bytesWritable.getLength()); + + for (int index = 0; index < MyTestPrimitiveClass.primitiveCount; index++) { + Object object = t.getPrimitiveObject(index); + ExtraTypeInfo extraTypeInfo = extraTypeInfoMap.get(t); + PrimitiveCategory primitiveCategory = t.getPrimitiveCategory(index); + VerifyReadWrite.verifyDeserializeRead(binarySortableDeserializeRead, primitiveCategory, object, extraTypeInfo); + } + } + } + + public void testBinarySortableReadWrite() throws Throwable { + try { + + int num = 1000; + Random r = new Random(1234); + MyTestPrimitiveClass myTestPrimitiveClasses[] = new MyTestPrimitiveClass[num]; + // Need a map because we sort. + Map extraTypeInfoMap = new HashMap(); + + for (int i = 0; i < num; i++) { + int randField = r.nextInt(MyTestPrimitiveClass.primitiveCount); + MyTestPrimitiveClass t = new MyTestPrimitiveClass(); + int field = 0; + ExtraTypeInfo extraTypeInfo = new ExtraTypeInfo(); + t.randomFill(r, randField, field, extraTypeInfo); + myTestPrimitiveClasses[i] = t; + extraTypeInfoMap.put(t, extraTypeInfo); + } + + StructObjectInspector rowOI = (StructObjectInspector) ObjectInspectorFactory + .getReflectionObjectInspector(MyTestPrimitiveClass.class, + ObjectInspectorOptions.JAVA); + + TestBinarySortableSerDe.sort(myTestPrimitiveClasses, rowOI); + + String fieldNames = ObjectInspectorUtils.getFieldNames(rowOI); + String fieldTypes = ObjectInspectorUtils.getFieldTypes(rowOI); + String order; + order = StringUtils.leftPad("", MyTestPrimitiveClass.primitiveCount, '+'); + SerDe serde_ascending = TestBinarySortableSerDe.getSerDe(fieldNames, fieldTypes, order); + order = StringUtils.leftPad("", MyTestPrimitiveClass.primitiveCount, '-'); + SerDe serde_descending = TestBinarySortableSerDe.getSerDe(fieldNames, fieldTypes, order); + + boolean[] columnSortOrderIsDesc = new boolean[MyTestPrimitiveClass.primitiveCount]; + Arrays.fill(columnSortOrderIsDesc, false); + testBinarySortableReadWrite(myTestPrimitiveClasses, columnSortOrderIsDesc, serde_ascending, rowOI, true, extraTypeInfoMap); + Arrays.fill(columnSortOrderIsDesc, true); + testBinarySortableReadWrite(myTestPrimitiveClasses, columnSortOrderIsDesc, serde_descending, rowOI, false, extraTypeInfoMap); + } catch (Throwable e) { + e.printStackTrace(); + throw e; + } + } +} \ No newline at end of file diff --git serde/src/test/org/apache/hadoop/hive/serde2/binarysortable/TestBinarySortableSerDe.java serde/src/test/org/apache/hadoop/hive/serde2/binarysortable/TestBinarySortableSerDe.java index cefb72e..d91a494 100644 --- serde/src/test/org/apache/hadoop/hive/serde2/binarysortable/TestBinarySortableSerDe.java +++ serde/src/test/org/apache/hadoop/hive/serde2/binarysortable/TestBinarySortableSerDe.java @@ -26,11 +26,13 @@ import junit.framework.TestCase; +import org.apache.commons.lang.StringUtils; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hive.common.type.HiveDecimal; import org.apache.hadoop.hive.serde.serdeConstants; import org.apache.hadoop.hive.serde2.SerDe; import org.apache.hadoop.hive.serde2.SerDeUtils; +import org.apache.hadoop.hive.serde2.binarysortable.MyTestPrimitiveClass.ExtraTypeInfo; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; import 
org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory.ObjectInspectorOptions; @@ -64,7 +66,7 @@ public static String hexString(BytesWritable bytes) { return sb.toString(); } - private SerDe getSerDe(String fieldNames, String fieldTypes, String order) + public static SerDe getSerDe(String fieldNames, String fieldTypes, String order) throws Throwable { Properties schema = new Properties(); schema.setProperty(serdeConstants.LIST_COLUMNS, fieldNames); @@ -124,7 +126,7 @@ private void testBinarySortableSerDe(Object[] rows, ObjectInspector rowOI, } } - private void sort(Object[] structs, ObjectInspector oi) { + public static void sort(Object[] structs, ObjectInspector oi) { for (int i = 0; i < structs.length; i++) { for (int j = i + 1; j < structs.length; j++) { if (ObjectInspectorUtils.compare(structs[i], oi, structs[j], oi) > 0) { @@ -136,66 +138,6 @@ private void sort(Object[] structs, ObjectInspector oi) { } } - public static HiveDecimal getRandHiveDecimal(Random r) { - StringBuilder sb = new StringBuilder(); - int l1 = 1+r.nextInt(18), l2 = r.nextInt(19); - - if (r.nextBoolean()) { - sb.append("-"); - } - - sb.append(getRandString(r, DECIMAL_CHARS, l1)); - if (l2 != 0) { - sb.append("."); - sb.append(getRandString(r, DECIMAL_CHARS, l2)); - } - - HiveDecimal bd = HiveDecimal.create(sb.toString()); - return bd; - } - - public static Date getRandDate(Random r) { - String dateStr = String.format("%d-%02d-%02d", - Integer.valueOf(1800 + r.nextInt(500)), // year - Integer.valueOf(1 + r.nextInt(12)), // month - Integer.valueOf(1 + r.nextInt(28))); // day - Date dateVal = Date.valueOf(dateStr); - return dateVal; - } - - public static String getRandString(Random r) { - return getRandString(r, null, r.nextInt(10)); - } - - public static String getRandString(Random r, String characters, int length) { - StringBuilder sb = new StringBuilder(); - for (int i = 0; i < length; i++) { - if (characters == null) { - sb.append((char) (r.nextInt(128))); - } else { - sb.append(characters.charAt(r.nextInt(characters.length()))); - } - } - return sb.toString(); - } - - public static List getRandIntegerArray(Random r) { - int length = r.nextInt(10); - ArrayList result = new ArrayList(length); - for (int i = 0; i < length; i++) { - result.add(r.nextInt(128)); - } - return result; - } - - public static byte[] getRandBA(Random r, int len){ - byte[] bytes = new byte[len]; - for (int j = 0; j < len; j++){ - bytes[j] = Byte.valueOf((byte) r.nextInt()); - } - return bytes; - } - public void testBinarySortableSerDe() throws Throwable { try { @@ -204,25 +146,11 @@ public void testBinarySortableSerDe() throws Throwable { int num = 1000; Random r = new Random(1234); MyTestClass rows[] = new MyTestClass[num]; + ExtraTypeInfo extraTypeInfos[] = MyTestPrimitiveClass.createExtraTypeInfos(num); for (int i = 0; i < num; i++) { - int randField = r.nextInt(11); MyTestClass t = new MyTestClass(); - t.myByte = randField > 0 ? null : Byte.valueOf((byte) r.nextInt()); - t.myShort = randField > 1 ? null : Short.valueOf((short) r.nextInt()); - t.myInt = randField > 2 ? null : Integer.valueOf(r.nextInt()); - t.myLong = randField > 3 ? null : Long.valueOf(r.nextLong()); - t.myFloat = randField > 4 ? null : Float - .valueOf(r.nextFloat() * 10 - 5); - t.myDouble = randField > 5 ? null : Double - .valueOf(r.nextDouble() * 10 - 5); - t.myString = randField > 6 ? null : getRandString(r); - t.myDecimal = randField > 7 ? null : getRandHiveDecimal(r); - t.myDate = randField > 8 ? null : getRandDate(r); - t.myStruct = randField > 9 ? 
null : new MyTestInnerStruct( - r.nextInt(5) - 2, r.nextInt(5) - 2); - t.myList = randField > 10 ? null : getRandIntegerArray(r); - t.myBA = getRandBA(r, i); + t.randomFill(r, extraTypeInfos[i]); rows[i] = t; } @@ -234,10 +162,13 @@ public void testBinarySortableSerDe() throws Throwable { String fieldNames = ObjectInspectorUtils.getFieldNames(rowOI); String fieldTypes = ObjectInspectorUtils.getFieldTypes(rowOI); + String order; + order = StringUtils.leftPad("", MyTestClass.fieldCount, '+'); testBinarySortableSerDe(rows, rowOI, getSerDe(fieldNames, fieldTypes, - "++++++++++++"), true); + order), true); + order = StringUtils.leftPad("", MyTestClass.fieldCount, '-'); testBinarySortableSerDe(rows, rowOI, getSerDe(fieldNames, fieldTypes, - "------------"), false); + order), false); System.out.println("Test testTBinarySortableProtocol passed!"); } catch (Throwable e) { diff --git serde/src/test/org/apache/hadoop/hive/serde2/lazybinary/MyTestClassBigger.java serde/src/test/org/apache/hadoop/hive/serde2/lazybinary/MyTestClassBigger.java index b6467ef..b8aa591 100644 --- serde/src/test/org/apache/hadoop/hive/serde2/lazybinary/MyTestClassBigger.java +++ serde/src/test/org/apache/hadoop/hive/serde2/lazybinary/MyTestClassBigger.java @@ -18,49 +18,109 @@ package org.apache.hadoop.hive.serde2.lazybinary; import java.sql.Date; +import java.util.ArrayList; +import java.util.HashMap; import java.util.List; import java.util.Map; +import java.util.Random; +import org.apache.hadoop.hive.common.type.HiveChar; import org.apache.hadoop.hive.common.type.HiveDecimal; +import org.apache.hadoop.hive.common.type.HiveVarchar; +import org.apache.hadoop.hive.serde2.binarysortable.MyTestClass; import org.apache.hadoop.hive.serde2.binarysortable.MyTestInnerStruct; +import org.apache.hadoop.hive.serde2.binarysortable.MyTestPrimitiveClass; +import org.apache.hadoop.hive.serde2.binarysortable.TestBinarySortableSerDe; +import org.apache.hadoop.hive.serde2.binarysortable.MyTestPrimitiveClass.ExtraTypeInfo; /** * MyTestClassBigger. * */ public class MyTestClassBigger { - Byte myByte; - Short myShort; - Integer myInt; - Long myLong; - Float myFloat; - Double myDouble; - String myString; - HiveDecimal myDecimal; - Date myDate; - MyTestInnerStruct myStruct; - List myList; - byte[] myBA; + + // The primitives. + public Boolean myBool; + public Byte myByte; + public Short myShort; + public Integer myInt; + public Long myLong; + public Float myFloat; + public Double myDouble; + public String myString; + public HiveChar myHiveChar; + public HiveVarchar myHiveVarchar; + public byte[] myBinary; + public HiveDecimal myDecimal; + public Date myDate; + + // Add more complex types. + public MyTestInnerStruct myStruct; + public List myList; + + // Bigger addition. Map> myMap; + public final static int mapPos = 15; + public MyTestClassBigger() { } - public MyTestClassBigger(Byte b, Short s, Integer i, Long l, Float f, - Double d, String st, HiveDecimal bd, Date date, MyTestInnerStruct is, List li, - byte[] ba, Map> mp) { - myByte = b; - myShort = s; - myInt = i; - myLong = l; - myFloat = f; - myDouble = d; - myString = st; - myDecimal = bd; - myDate = date; - myStruct = is; - myList = li; - myBA = ba; - myMap = mp; + public final static int biggerCount = 16; + + public int randomFill(Random r, ExtraTypeInfo extraTypeInfo) { + int randField = r.nextInt(biggerCount); + int field = 0; + myBool = randField > field++ ? null : (r.nextInt(1) == 1); + myByte = randField > field++ ? 
null : Byte.valueOf((byte) r.nextInt()); + myShort = randField > field++ ? null : Short.valueOf((short) r.nextInt()); + myInt = randField > field++ ? null : Integer.valueOf(r.nextInt()); + myLong = randField > field++ ? null : Long.valueOf(r.nextLong()); + myFloat = randField > field++ ? null : Float + .valueOf(r.nextFloat() * 10 - 5); + myDouble = randField > field++ ? null : Double + .valueOf(r.nextDouble() * 10 - 5); + myString = randField > field++ ? null : MyTestPrimitiveClass.getRandString(r); + myHiveChar = randField > field++ ? null : MyTestPrimitiveClass.getRandHiveChar(r, extraTypeInfo); + myHiveVarchar = randField > field++ ? null : MyTestPrimitiveClass.getRandHiveVarchar(r, extraTypeInfo); + myBinary = MyTestPrimitiveClass.getRandBinary(r, r.nextInt(1000)); + myDecimal = randField > field++ ? null : MyTestPrimitiveClass.getRandHiveDecimal(r); + myDate = randField > field++ ? null : MyTestPrimitiveClass.getRandDate(r); + + myStruct = randField > field++ ? null : new MyTestInnerStruct( + r.nextInt(5) - 2, r.nextInt(5) - 2); + myList = randField > field++ ? null : MyTestClass.getRandIntegerArray(r); + + Map> mp = new HashMap>(); + String key = MyTestPrimitiveClass.getRandString(r); + List value = randField > 9 ? null + : getRandStructArray(r); + mp.put(key, value); + String key1 = MyTestPrimitiveClass.getRandString(r); + mp.put(key1, null); + String key2 = MyTestPrimitiveClass.getRandString(r); + List value2 = getRandStructArray(r); + mp.put(key2, value2); + myMap = mp; + return field; + } + + /** + * Generate a random struct array. + * + * @param r + * random number generator + * @return an struct array + */ + static List getRandStructArray(Random r) { + int length = r.nextInt(10); + ArrayList result = new ArrayList( + length); + for (int i = 0; i < length; i++) { + MyTestInnerStruct ti = new MyTestInnerStruct(r.nextInt(), r.nextInt()); + result.add(ti); + } + return result; } + } diff --git serde/src/test/org/apache/hadoop/hive/serde2/lazybinary/MyTestClassSmaller.java serde/src/test/org/apache/hadoop/hive/serde2/lazybinary/MyTestClassSmaller.java index 8c7ffba..cd793dd 100644 --- serde/src/test/org/apache/hadoop/hive/serde2/lazybinary/MyTestClassSmaller.java +++ serde/src/test/org/apache/hadoop/hive/serde2/lazybinary/MyTestClassSmaller.java @@ -18,36 +18,59 @@ package org.apache.hadoop.hive.serde2.lazybinary; import java.sql.Date; +import java.util.Random; +import org.apache.hadoop.hive.common.type.HiveChar; import org.apache.hadoop.hive.common.type.HiveDecimal; +import org.apache.hadoop.hive.common.type.HiveVarchar; +import org.apache.hadoop.hive.serde2.binarysortable.MyTestClass; import org.apache.hadoop.hive.serde2.binarysortable.MyTestInnerStruct; +import org.apache.hadoop.hive.serde2.binarysortable.MyTestPrimitiveClass; +import org.apache.hadoop.hive.serde2.binarysortable.MyTestPrimitiveClass.ExtraTypeInfo; public class MyTestClassSmaller { - Byte myByte; - Short myShort; - Integer myInt; - Long myLong; - Float myFloat; - Double myDouble; - String myString; - HiveDecimal myDecimal; - Date myDate; + + public Boolean myBool; + public Byte myByte; + public Short myShort; + public Integer myInt; + public Long myLong; + public Float myFloat; + public Double myDouble; + public String myString; + public HiveChar myHiveChar; + public HiveVarchar myHiveVarchar; + public byte[] myBinary; + public HiveDecimal myDecimal; + public Date myDate; + MyTestInnerStruct myStruct; - public MyTestClassSmaller() { - } + public final static int smallerCount = 14; - public MyTestClassSmaller(Byte 
b, Short s, Integer i, Long l, Float f, - Double d, String st, HiveDecimal bd, Date date, MyTestInnerStruct is) { - myByte = b; - myShort = s; - myInt = i; - myLong = l; - myFloat = f; - myDouble = d; - myString = st; - myDecimal = bd; - myDate = date; - myStruct = is; + public int randomFill(Random r, ExtraTypeInfo extraTypeInfo) { + int randField = r.nextInt(smallerCount); + int field = 0; + + myBool = randField > field++ ? null : (r.nextInt(1) == 1); + myByte = randField > field++ ? null : Byte.valueOf((byte) r.nextInt()); + myShort = randField > field++ ? null : Short.valueOf((short) r.nextInt()); + myInt = randField > field++ ? null : Integer.valueOf(r.nextInt()); + myLong = randField > field++ ? null : Long.valueOf(r.nextLong()); + myFloat = randField > field++ ? null : Float + .valueOf(r.nextFloat() * 10 - 5); + myDouble = randField > field++ ? null : Double + .valueOf(r.nextDouble() * 10 - 5); + myString = randField > field++ ? null : MyTestPrimitiveClass.getRandString(r); + myHiveChar = randField > field++ ? null : MyTestPrimitiveClass.getRandHiveChar(r, extraTypeInfo); + myHiveVarchar = randField > field++ ? null : MyTestPrimitiveClass.getRandHiveVarchar(r, extraTypeInfo); + myBinary = MyTestPrimitiveClass.getRandBinary(r, r.nextInt(1000)); + myDecimal = randField > field++ ? null : MyTestPrimitiveClass.getRandHiveDecimal(r); + myDate = randField > field++ ? null : MyTestPrimitiveClass.getRandDate(r); + + myStruct = randField > field++ ? null : new MyTestInnerStruct( + r.nextInt(5) - 2, r.nextInt(5) - 2); + return field; } + } diff --git serde/src/test/org/apache/hadoop/hive/serde2/lazybinary/TestLazyBinaryReadWrite.java serde/src/test/org/apache/hadoop/hive/serde2/lazybinary/TestLazyBinaryReadWrite.java new file mode 100644 index 0000000..1873a65 --- /dev/null +++ serde/src/test/org/apache/hadoop/hive/serde2/lazybinary/TestLazyBinaryReadWrite.java @@ -0,0 +1,193 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
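Note: the TestLazyBinaryReadWrite test that follows mirrors the BinarySortable round trip above, but the LazyBinary pair is parameterized by field count rather than per-column sort order, and it is the format whose null-indicator bytes are produced by the LazyBinarySerializeWrite methods shown earlier in this patch. A condensed single-column sketch (names are illustrative; the false constructor flag simply mirrors the call used in the test):

```java
import org.apache.hadoop.hive.serde2.ByteStream.Output;
import org.apache.hadoop.hive.serde2.lazybinary.deserializeread.LazyBinaryDeserializeRead;
import org.apache.hadoop.hive.serde2.lazybinary.serializewrite.LazyBinarySerializeWrite;

public class LazyBinaryRoundTripSketch {
  public static void main(String[] args) throws Exception {
    int fieldCount = 1;

    // Write side: the field count is fixed up front so the final null byte can be patched back.
    LazyBinarySerializeWrite writer = new LazyBinarySerializeWrite(fieldCount, false);
    Output output = new Output();
    writer.set(output);
    writer.writeLong(123456789L);

    // Read side: constructed with the same field count.
    LazyBinaryDeserializeRead reader = new LazyBinaryDeserializeRead(fieldCount);
    reader.set(output.getData(), 0, output.getLength());
    if (!reader.readCheckNull()) {
      System.out.println(reader.readLong());  // prints 123456789
    }
  }
}
```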
+ */ +package org.apache.hadoop.hive.serde2.lazybinary; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Random; + +import junit.framework.TestCase; + +import org.apache.hadoop.hive.serde2.ByteStream.Output; +import org.apache.hadoop.hive.serde2.SerDe; +import org.apache.hadoop.hive.serde2.VerifyReadWrite; +import org.apache.hadoop.hive.serde2.binarysortable.MyTestPrimitiveClass; +import org.apache.hadoop.hive.serde2.binarysortable.MyTestPrimitiveClass.ExtraTypeInfo; +import org.apache.hadoop.hive.serde2.lazybinary.deserializeread.LazyBinaryDeserializeRead; +import org.apache.hadoop.hive.serde2.lazybinary.serializewrite.LazyBinarySerializeWrite; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory.ObjectInspectorOptions; +import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory; +import org.apache.hadoop.hive.serde2.objectinspector.StandardStructObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; +import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; +import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils; +import org.apache.hadoop.io.BytesWritable; + +/** + * TestBinarySortableSerDe. + * + */ +public class TestLazyBinaryReadWrite extends TestCase { + + private void testLazyBinaryReadWrite(MyTestPrimitiveClass[] myTestPrimitiveClasses, SerDe[] serdes, StructObjectInspector[] rowOIs, ExtraTypeInfo extraTypeInfos[]) throws Throwable { + + LazyBinarySerializeWrite lazyBinarySerializeWrite = new LazyBinarySerializeWrite(MyTestPrimitiveClass.primitiveCount, false); + + // Try to serialize + BytesWritable serializeWriteBytes[] = new BytesWritable[myTestPrimitiveClasses.length]; + for (int i = 0; i < myTestPrimitiveClasses.length; i++) { + MyTestPrimitiveClass t = myTestPrimitiveClasses[i]; + Output output = new Output(); + lazyBinarySerializeWrite.set(output); + + for (int index = 0; index < MyTestPrimitiveClass.primitiveCount; index++) { + Object object = t.getPrimitiveObject(index); + PrimitiveCategory primitiveCategory = t.getPrimitiveCategory(index); + VerifyReadWrite.serializeWrite(lazyBinarySerializeWrite, primitiveCategory, object); + } + + BytesWritable bytesWritable = new BytesWritable(); + bytesWritable.set(output.getData(), 0, output.getLength()); + serializeWriteBytes[i] = bytesWritable; + } + + LazyBinaryDeserializeRead lazyBinaryDeserializeRead = new LazyBinaryDeserializeRead(MyTestPrimitiveClass.primitiveCount); + + // Try to deserialize + for (int i = 0; i < myTestPrimitiveClasses.length; i++) { + MyTestPrimitiveClass t = myTestPrimitiveClasses[i]; + + BytesWritable bytesWritable = serializeWriteBytes[i]; + lazyBinaryDeserializeRead.set(bytesWritable.getBytes(), 0, bytesWritable.getLength()); + + for (int index = 0; index < MyTestPrimitiveClass.primitiveCount; index++) { + Object object = t.getPrimitiveObject(index); + PrimitiveCategory primitiveCategory = t.getPrimitiveCategory(index); + VerifyReadWrite.verifyDeserializeRead(lazyBinaryDeserializeRead, primitiveCategory, object, extraTypeInfos[i]); + } + } + + // Try to deserialize using SerDe class our Writable row objects created by 
SerializeWrite. + for (int i = 0; i < myTestPrimitiveClasses.length; i++) { + BytesWritable bytesWritable = serializeWriteBytes[i]; + LazyBinaryStruct lazyBinaryStruct = (LazyBinaryStruct) serdes[i].deserialize(bytesWritable); + + MyTestPrimitiveClass t = myTestPrimitiveClasses[i]; + ExtraTypeInfo extraTypeInfo = extraTypeInfos[i]; + for (int index = 0; index < MyTestPrimitiveClass.primitiveCount; index++) { + Object expected = t.getPrimitiveWritableObject(index, extraTypeInfo); + Object object = lazyBinaryStruct.getField(index); + if (expected == null || object == null) { + if (expected != null || object != null) { + fail("SerDe deserialized NULL column mismatch"); + } + } else { + if (!object.equals(expected)) { + fail("SerDe deserialized value does not match"); + } + } + } + } + + // One Writable per row. + BytesWritable serdeBytes[] = new BytesWritable[myTestPrimitiveClasses.length]; + + // Serialize using the SerDe, then below deserialize using DeserializeRead. + Object[] row = new Object[MyTestPrimitiveClass.primitiveCount]; + for (int i = 0; i < myTestPrimitiveClasses.length; i++) { + MyTestPrimitiveClass t = myTestPrimitiveClasses[i]; + + // LazyBinary seems to work better with a row object array instead of a Java object... + for (int index = 0; index < MyTestPrimitiveClass.primitiveCount; index++) { + ExtraTypeInfo extraTypeInfo = extraTypeInfos[i]; + Object object = t.getPrimitiveWritableObject(index, extraTypeInfo); + row[index] = object; + } + + BytesWritable serialized = (BytesWritable) serdes[i].serialize(row, rowOIs[i]); + BytesWritable bytesWritable = new BytesWritable(); + bytesWritable.set(serialized); + byte[] bytes1 = Arrays.copyOfRange(bytesWritable.getBytes(), 0, bytesWritable.getLength()); + + byte[] bytes2 = Arrays.copyOfRange(serializeWriteBytes[i].getBytes(), 0, serializeWriteBytes[i].getLength()); + if (!Arrays.equals(bytes1, bytes2)) { + fail("SerializeWrite and SerDe serialization does not match"); + } + serdeBytes[i] = bytesWritable; + } + + // Try to deserialize, using DeserializeRead, the Writable row objects created by the SerDe. + for (int i = 0; i < myTestPrimitiveClasses.length; i++) { + MyTestPrimitiveClass t = myTestPrimitiveClasses[i]; + + BytesWritable bytesWritable = serdeBytes[i]; + lazyBinaryDeserializeRead.set(bytesWritable.getBytes(), 0, bytesWritable.getLength()); + + for (int index = 0; index < MyTestPrimitiveClass.primitiveCount; index++) { + Object object = t.getPrimitiveObject(index); + ExtraTypeInfo extraTypeInfo = extraTypeInfos[i]; + PrimitiveCategory primitiveCategory = t.getPrimitiveCategory(index); + VerifyReadWrite.verifyDeserializeRead(lazyBinaryDeserializeRead, primitiveCategory, object, extraTypeInfo); + } + } + } + + public void testLazyBinaryReadWrite() throws Throwable { + try { + + int num = 1000; + Random r = new Random(1234); + MyTestPrimitiveClass rows[] = new MyTestPrimitiveClass[num]; + ExtraTypeInfo extraTypeInfos[] = MyTestPrimitiveClass.createExtraTypeInfos(num); + for (int i = 0; i < num; i++) { + int randField = r.nextInt(MyTestPrimitiveClass.primitiveCount); + MyTestPrimitiveClass t = new MyTestPrimitiveClass(); + int field = 0; + t.randomFill(r, randField, field, extraTypeInfos[i]); + rows[i] = t; + } + + // To get the specific type information for CHAR and VARCHAR, it seems we need an + // inspector and SerDe per row... 
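+ // (Presumably because getRandHiveChar/getRandHiveVarchar choose a random maximum length per row and record it in that row's ExtraTypeInfo, so no single fixed schema would describe every row.)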
+ StructObjectInspector[] rowOIs = new StructObjectInspector[num]; + SerDe[] serdes = new SerDe[num]; + for (int i = 0; i < num; i++) { + MyTestPrimitiveClass t = rows[i]; + ExtraTypeInfo extraTypeInfo = extraTypeInfos[i]; + + StructObjectInspector rowOI = t.getRowInspector(extraTypeInfo); + + String fieldNames = ObjectInspectorUtils.getFieldNames(rowOI); + String fieldTypes = ObjectInspectorUtils.getFieldTypes(rowOI); + + rowOIs[i] = rowOI; + serdes[i] = TestLazyBinarySerDe.getSerDe(fieldNames, fieldTypes); + } + + testLazyBinaryReadWrite(rows, serdes, rowOIs, extraTypeInfos); + } catch (Throwable e) { + e.printStackTrace(); + throw e; + } + } +} \ No newline at end of file diff --git serde/src/test/org/apache/hadoop/hive/serde2/lazybinary/TestLazyBinarySerDe.java serde/src/test/org/apache/hadoop/hive/serde2/lazybinary/TestLazyBinarySerDe.java index 02ae6f8..8f32353 100644 --- serde/src/test/org/apache/hadoop/hive/serde2/lazybinary/TestLazyBinarySerDe.java +++ serde/src/test/org/apache/hadoop/hive/serde2/lazybinary/TestLazyBinarySerDe.java @@ -36,7 +36,9 @@ import org.apache.hadoop.hive.serde2.SerDeUtils; import org.apache.hadoop.hive.serde2.binarysortable.MyTestClass; import org.apache.hadoop.hive.serde2.binarysortable.MyTestInnerStruct; +import org.apache.hadoop.hive.serde2.binarysortable.MyTestPrimitiveClass; import org.apache.hadoop.hive.serde2.binarysortable.TestBinarySortableSerDe; +import org.apache.hadoop.hive.serde2.binarysortable.MyTestPrimitiveClass.ExtraTypeInfo; import org.apache.hadoop.hive.serde2.lazy.ByteArrayRef; import org.apache.hadoop.hive.serde2.lazy.LazyBinary; import org.apache.hadoop.hive.serde2.lazy.LazyFactory; @@ -91,7 +93,7 @@ * @return the initialized LazyBinarySerDe * @throws Throwable */ - private SerDe getSerDe(String fieldNames, String fieldTypes) throws Throwable { + protected static SerDe getSerDe(String fieldNames, String fieldTypes) throws Throwable { Properties schema = new Properties(); schema.setProperty(serdeConstants.LIST_COLUMNS, fieldNames); schema.setProperty(serdeConstants.LIST_COLUMN_TYPES, fieldTypes); @@ -194,46 +196,20 @@ private void testShorterSchemaDeserialization(Random r) throws Throwable { int num = 100; for (int itest = 0; itest < num; itest++) { - int randField = r.nextInt(11); - Byte b = randField > 0 ? null : Byte.valueOf((byte) r.nextInt()); - Short s = randField > 1 ? null : Short.valueOf((short) r.nextInt()); - Integer n = randField > 2 ? null : Integer.valueOf(r.nextInt()); - Long l = randField > 3 ? null : Long.valueOf(r.nextLong()); - Float f = randField > 4 ? null : Float.valueOf(r.nextFloat()); - Double d = randField > 5 ? null : Double.valueOf(r.nextDouble()); - String st = randField > 6 ? null : TestBinarySortableSerDe - .getRandString(r); - HiveDecimal bd = randField > 7 ? null : TestBinarySortableSerDe.getRandHiveDecimal(r); - Date date = randField > 8 ? null : TestBinarySortableSerDe.getRandDate(r); - MyTestInnerStruct is = randField > 9 ? null : new MyTestInnerStruct(r - .nextInt(5) - 2, r.nextInt(5) - 2); - List li = randField > 10 ? null : TestBinarySortableSerDe - .getRandIntegerArray(r); - byte[] ba = TestBinarySortableSerDe.getRandBA(r, itest); - Map> mp = new HashMap>(); - String key = TestBinarySortableSerDe.getRandString(r); - List value = randField > 9 ? 
null - : getRandStructArray(r); - mp.put(key, value); - String key1 = TestBinarySortableSerDe.getRandString(r); - mp.put(key1, null); - String key2 = TestBinarySortableSerDe.getRandString(r); - List value2 = getRandStructArray(r); - mp.put(key2, value2); - - MyTestClassBigger input = new MyTestClassBigger(b, s, n, l, f, d, st, bd, date, is, - li, ba, mp); - BytesWritable bw = (BytesWritable) serde1.serialize(input, rowOI1); + MyTestClassBigger t = new MyTestClassBigger(); + ExtraTypeInfo extraTypeInfo = new ExtraTypeInfo(); + t.randomFill(r, extraTypeInfo); + BytesWritable bw = (BytesWritable) serde1.serialize(t, rowOI1); Object output = serde2.deserialize(bw); - if (0 != compareDiffSizedStructs(input, rowOI1, output, serdeOI2)) { + if (0 != compareDiffSizedStructs(t, rowOI1, output, serdeOI2)) { System.out.println("structs = " - + SerDeUtils.getJSONString(input, rowOI1)); + + SerDeUtils.getJSONString(t, rowOI1)); System.out.println("deserialized = " + SerDeUtils.getJSONString(output, serdeOI2)); System.out.println("serialized = " + TestBinarySortableSerDe.hexString(bw)); - assertEquals(input, output); + assertEquals(t, output); } } } @@ -263,34 +239,20 @@ private void testShorterSchemaDeserialization1(Random r) throws Throwable { int num = 100; for (int itest = 0; itest < num; itest++) { - int randField = r.nextInt(12); - Byte b = randField > 0 ? null : Byte.valueOf((byte) r.nextInt()); - Short s = randField > 1 ? null : Short.valueOf((short) r.nextInt()); - Integer n = randField > 2 ? null : Integer.valueOf(r.nextInt()); - Long l = randField > 3 ? null : Long.valueOf(r.nextLong()); - Float f = randField > 4 ? null : Float.valueOf(r.nextFloat()); - Double d = randField > 5 ? null : Double.valueOf(r.nextDouble()); - String st = randField > 6 ? null : TestBinarySortableSerDe - .getRandString(r); - HiveDecimal bd = randField > 7 ? null : TestBinarySortableSerDe.getRandHiveDecimal(r); - Date date = randField > 8 ? null : TestBinarySortableSerDe.getRandDate(r); - MyTestInnerStruct is = randField > 9 ? null : new MyTestInnerStruct(r - .nextInt(5) - 2, r.nextInt(5) - 2); - List li = randField > 10 ? null : TestBinarySortableSerDe - .getRandIntegerArray(r); - byte[] ba = TestBinarySortableSerDe.getRandBA(r, itest); - MyTestClass input = new MyTestClass(b, s, n, l, f, d, st, bd, date, is, li, ba); - BytesWritable bw = (BytesWritable) serde1.serialize(input, rowOI1); + MyTestClass t = new MyTestClass(); + ExtraTypeInfo extraTypeInfo = new ExtraTypeInfo(); + t.randomFill(r, extraTypeInfo); + BytesWritable bw = (BytesWritable) serde1.serialize(t, rowOI1); Object output = serde2.deserialize(bw); - if (0 != compareDiffSizedStructs(input, rowOI1, output, serdeOI2)) { + if (0 != compareDiffSizedStructs(t, rowOI1, output, serdeOI2)) { System.out.println("structs = " - + SerDeUtils.getJSONString(input, rowOI1)); + + SerDeUtils.getJSONString(t, rowOI1)); System.out.println("deserialized = " + SerDeUtils.getJSONString(output, serdeOI2)); System.out.println("serialized = " + TestBinarySortableSerDe.hexString(bw)); - assertEquals(input, output); + assertEquals(t, output); } } } @@ -320,34 +282,21 @@ void testLongerSchemaDeserialization(Random r) throws Throwable { int num = 100; for (int itest = 0; itest < num; itest++) { - int randField = r.nextInt(12); - Byte b = randField > 0 ? null : Byte.valueOf((byte) r.nextInt()); - Short s = randField > 1 ? null : Short.valueOf((short) r.nextInt()); - Integer n = randField > 2 ? null : Integer.valueOf(r.nextInt()); - Long l = randField > 3 ? 
null : Long.valueOf(r.nextLong()); - Float f = randField > 4 ? null : Float.valueOf(r.nextFloat()); - Double d = randField > 5 ? null : Double.valueOf(r.nextDouble()); - String st = randField > 6 ? null : TestBinarySortableSerDe - .getRandString(r); - HiveDecimal bd = randField > 7 ? null : TestBinarySortableSerDe.getRandHiveDecimal(r); - Date date = randField > 8 ? null : TestBinarySortableSerDe.getRandDate(r); - MyTestInnerStruct is = randField > 9 ? null : new MyTestInnerStruct(r - .nextInt(5) - 2, r.nextInt(5) - 2); - List li = randField > 10 ? null : TestBinarySortableSerDe - .getRandIntegerArray(r); - byte[] ba = TestBinarySortableSerDe.getRandBA(r, itest); - MyTestClass input = new MyTestClass(b, s, n, l, f, d, st, bd, date, is, li,ba); - BytesWritable bw = (BytesWritable) serde1.serialize(input, rowOI1); + MyTestClass t = new MyTestClass(); + ExtraTypeInfo extraTypeInfo = new ExtraTypeInfo(); + t.randomFill(r, extraTypeInfo); + + BytesWritable bw = (BytesWritable) serde1.serialize(t, rowOI1); Object output = serde2.deserialize(bw); - if (0 != compareDiffSizedStructs(input, rowOI1, output, serdeOI2)) { + if (0 != compareDiffSizedStructs(t, rowOI1, output, serdeOI2)) { System.out.println("structs = " - + SerDeUtils.getJSONString(input, rowOI1)); + + SerDeUtils.getJSONString(t, rowOI1)); System.out.println("deserialized = " + SerDeUtils.getJSONString(output, serdeOI2)); System.out.println("serialized = " + TestBinarySortableSerDe.hexString(bw)); - assertEquals(input, output); + assertEquals(t, output); } } } @@ -377,33 +326,20 @@ void testLongerSchemaDeserialization1(Random r) throws Throwable { int num = 100; for (int itest = 0; itest < num; itest++) { - int randField = r.nextInt(9); - Byte b = randField > 0 ? null : Byte.valueOf((byte) r.nextInt()); - Short s = randField > 1 ? null : Short.valueOf((short) r.nextInt()); - Integer n = randField > 2 ? null : Integer.valueOf(r.nextInt()); - Long l = randField > 3 ? null : Long.valueOf(r.nextLong()); - Float f = randField > 4 ? null : Float.valueOf(r.nextFloat()); - Double d = randField > 5 ? null : Double.valueOf(r.nextDouble()); - String st = randField > 6 ? null : TestBinarySortableSerDe - .getRandString(r); - HiveDecimal bd = randField > 7 ? null : TestBinarySortableSerDe.getRandHiveDecimal(r); - Date date = randField > 7 ? null : TestBinarySortableSerDe.getRandDate(r); - MyTestInnerStruct is = randField > 7 ? 
null : new MyTestInnerStruct(r - .nextInt(5) - 2, r.nextInt(5) - 2); - - MyTestClassSmaller input = new MyTestClassSmaller(b, s, n, l, f, d, st, bd, date, - is); - BytesWritable bw = (BytesWritable) serde1.serialize(input, rowOI1); + MyTestClassSmaller t = new MyTestClassSmaller(); + ExtraTypeInfo extraTypeInfo = new ExtraTypeInfo(); + t.randomFill(r, extraTypeInfo); + BytesWritable bw = (BytesWritable) serde1.serialize(t, rowOI1); Object output = serde2.deserialize(bw); - if (0 != compareDiffSizedStructs(input, rowOI1, output, serdeOI2)) { + if (0 != compareDiffSizedStructs(t, rowOI1, output, serdeOI2)) { System.out.println("structs = " - + SerDeUtils.getJSONString(input, rowOI1)); + + SerDeUtils.getJSONString(t, rowOI1)); System.out.println("deserialized = " + SerDeUtils.getJSONString(output, serdeOI2)); System.out.println("serialized = " + TestBinarySortableSerDe.hexString(bw)); - assertEquals(input, output); + assertEquals(t, output); } } } @@ -421,13 +357,13 @@ void testLazyBinaryMap(Random r) throws Throwable { StructObjectInspector soi1 = (StructObjectInspector) serdeOI; List fields1 = soi1.getAllStructFieldRefs(); LazyBinaryMapObjectInspector lazympoi = (LazyBinaryMapObjectInspector) fields1 - .get(12).getFieldObjectInspector(); + .get(MyTestClassBigger.mapPos).getFieldObjectInspector(); ObjectInspector lazympkeyoi = lazympoi.getMapKeyObjectInspector(); ObjectInspector lazympvalueoi = lazympoi.getMapValueObjectInspector(); StructObjectInspector soi2 = rowOI; List fields2 = soi2.getAllStructFieldRefs(); - MapObjectInspector inputmpoi = (MapObjectInspector) fields2.get(12) + MapObjectInspector inputmpoi = (MapObjectInspector) fields2.get(MyTestClassBigger.mapPos) .getFieldObjectInspector(); ObjectInspector inputmpkeyoi = inputmpoi.getMapKeyObjectInspector(); ObjectInspector inputmpvalueoi = inputmpoi.getMapValueObjectInspector(); @@ -439,18 +375,19 @@ void testLazyBinaryMap(Random r) throws Throwable { int randFields = r.nextInt(10); for (int i = 0; i < randFields; i++) { - String key = TestBinarySortableSerDe.getRandString(r); + String key = MyTestPrimitiveClass.getRandString(r); int randField = r.nextInt(10); List value = randField > 4 ? null : getRandStructArray(r); mp.put(key, value); + } - MyTestClassBigger input = new MyTestClassBigger(null, null, null, null, - null, null, null, null, null, null, null, null, mp); - BytesWritable bw = (BytesWritable) serde.serialize(input, rowOI); + MyTestClassBigger t = new MyTestClassBigger(); + t.myMap = mp; + BytesWritable bw = (BytesWritable) serde.serialize(t, rowOI); Object output = serde.deserialize(bw); - Object lazyobj = soi1.getStructFieldData(output, fields1.get(12)); + Object lazyobj = soi1.getStructFieldData(output, fields1.get(MyTestClassBigger.mapPos)); Map outputmp = lazympoi.getMap(lazyobj); if (outputmp.size() != mp.size()) { @@ -496,24 +433,10 @@ public void testLazyBinarySerDe() throws Throwable { int num = 1000; Random r = new Random(1234); MyTestClass rows[] = new MyTestClass[num]; + ExtraTypeInfo extraTypeInfos[] = MyTestPrimitiveClass.createExtraTypeInfos(num); for (int i = 0; i < num; i++) { - int randField = r.nextInt(12); - Byte b = randField > 0 ? null : Byte.valueOf((byte) r.nextInt()); - Short s = randField > 1 ? null : Short.valueOf((short) r.nextInt()); - Integer n = randField > 2 ? null : Integer.valueOf(r.nextInt()); - Long l = randField > 3 ? null : Long.valueOf(r.nextLong()); - Float f = randField > 4 ? null : Float.valueOf(r.nextFloat()); - Double d = randField > 5 ? 
null : Double.valueOf(r.nextDouble()); - String st = randField > 6 ? null : TestBinarySortableSerDe - .getRandString(r); - HiveDecimal bd = randField > 7 ? null : TestBinarySortableSerDe.getRandHiveDecimal(r); - Date date = randField > 8 ? null : TestBinarySortableSerDe.getRandDate(r); - MyTestInnerStruct is = randField > 9 ? null : new MyTestInnerStruct(r - .nextInt(5) - 2, r.nextInt(5) - 2); - List li = randField > 10 ? null : TestBinarySortableSerDe - .getRandIntegerArray(r); - byte[] ba = TestBinarySortableSerDe.getRandBA(r, i); - MyTestClass t = new MyTestClass(b, s, n, l, f, d, st, bd, date, is, li, ba); + MyTestClass t = new MyTestClass(); + t.randomFill(r, extraTypeInfos[i]); rows[i] = t; }
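+ // Note: the randomFill() implementations added by this patch follow the same "randField > field++ ? null : <random value>" idiom as the explicit per-field code they replace, so the first randField columns of a generated row are null and the remaining columns receive random values.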