diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/BytesBytesMultiHashMap.java ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/BytesBytesMultiHashMap.java index 51acae0..a15b664 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/BytesBytesMultiHashMap.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/BytesBytesMultiHashMap.java @@ -298,8 +298,8 @@ public void set(BytesBytesMultiHashMap hashMap, long firstOffset, boolean hasLis } /** - * Read the current value. - * + * Read the current value. + * * @return * The ByteSegmentRef to the current value read. */ @@ -380,29 +380,6 @@ public void set(BytesBytesMultiHashMap hashMap, long firstOffset, boolean hasLis } /** - * @return Whether we have read all the values or not. - */ - public boolean isEof() { - // LOG.info("BytesBytesMultiHashMap isEof hasRows " + hasRows + " hasList " + hasList + " readIndex " + readIndex + " nextTailOffset " + nextTailOffset); - if (!hasRows) { - return true; - } - - if (!hasList) { - return (readIndex > 0); - } else { - // Multiple values. - if (readIndex <= 1) { - // Careful: We have not read the list record and 2nd value yet, so nextTailOffset - // is not valid yet. - return false; - } else { - return (nextTailOffset <= 0); - } - } - } - - /** * Lets go of any references to a hash map. */ public void forget() { @@ -741,7 +718,7 @@ private void expandAndRehash() { long capacity = refs.length << 1; expandAndRehashImpl(capacity); } - + private void expandAndRehashImpl(long capacity) { long expandTime = System.currentTimeMillis(); final long[] oldRefs = refs; diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/tez/ReduceRecordSource.java ql/src/java/org/apache/hadoop/hive/ql/exec/tez/ReduceRecordSource.java index e966ff1..d66f815 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/tez/ReduceRecordSource.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/tez/ReduceRecordSource.java @@ -22,6 +22,7 @@ import java.util.Arrays; import java.util.Iterator; import java.util.List; + import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.hive.ql.exec.CommonMergeJoinOperator; @@ -413,7 +414,14 @@ private void processVectorGroup(BytesWritable keyWritable, // VectorizedBatchUtil.displayBytes(keyBytes, 0, keyLength)); keyBinarySortableDeserializeToRow.setBytes(keyBytes, 0, keyLength); - keyBinarySortableDeserializeToRow.deserialize(batch, 0); + try { + keyBinarySortableDeserializeToRow.deserialize(batch, 0); + } catch (Exception e) { + throw new HiveException( + "DeserializeRead details: " + + keyBinarySortableDeserializeToRow.getDetailedReadPositionString() + "\n", + e); + } for(int i = 0; i < firstValueColumnOffset; i++) { VectorizedBatchUtil.setRepeatingColumn(batch, i); } diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorDeserializeRow.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorDeserializeRow.java index 2e8331a..7fb79fd 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorDeserializeRow.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorDeserializeRow.java @@ -644,38 +644,26 @@ public void setBytes(byte[] bytes, int offset, int length) { /** * Deserialize a row from the range of bytes specified by setBytes. * + * Use getDetailedReadPositionString to get detailed read position information to help + * diagnose exceptions that are thrown... 
+ * * @param batch * @param batchIndex * @throws IOException */ public void deserialize(VectorizedRowBatch batch, int batchIndex) throws IOException { final int count = isConvert.length; - int i = 0; - try { - while (i < count) { - if (isConvert[i]) { - deserializeConvertRowColumn(batch, batchIndex, i); - } else { - deserializeRowColumn(batch, batchIndex, i); - } - i++; // Increment after the apply which could throw an exception. + for (int i = 0; i < count; i++) { + if (isConvert[i]) { + deserializeConvertRowColumn(batch, batchIndex, i); + } else { + deserializeRowColumn(batch, batchIndex, i); } - } catch (EOFException e) { - throwMoreDetailedException(e, i); } deserializeRead.extraFieldsCheck(); } - private void throwMoreDetailedException(IOException e, int index) throws EOFException { - StringBuilder sb = new StringBuilder(); - sb.append("Detail: \"" + e.toString() + "\" occured for field " + index + " of " + sourceTypeInfos.length + " fields ("); - for (int i = 0; i < sourceTypeInfos.length; i++) { - if (i > 0) { - sb.append(", "); - } - sb.append(((PrimitiveTypeInfo) sourceTypeInfos[i]).getPrimitiveCategory().name()); - } - sb.append(")"); - throw new EOFException(sb.toString()); + public String getDetailedReadPositionString() { + return deserializeRead.getDetailedReadPositionString(); } } diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorMapOperator.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorMapOperator.java index 6979956..771386c 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorMapOperator.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorMapOperator.java @@ -811,7 +811,13 @@ public void process(Writable value) throws HiveException { currentDeserializeRead.set(binComp.getBytes(), 0, binComp.getLength()); // Deserialize and append new row using the current batch size as the index. 
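// Illustrative sketch, not part of this patch: the wrap-and-rethrow pattern introduced
// below (and in ReduceRecordSource / VectorMapJoinGenerateResultOperator), factored into a
// hypothetical helper. The class and method names here are invented for illustration; the
// Hive types are the ones already used in these files.
import org.apache.hadoop.hive.ql.exec.vector.VectorDeserializeRow;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.hadoop.hive.ql.metadata.HiveException;

final class DeserializeDiagnostics {
  private DeserializeDiagnostics() {
  }

  static void deserializeWithDetail(VectorDeserializeRow deserializeRow,
      VectorizedRowBatch batch, int batchIndex) throws HiveException {
    try {
      deserializeRow.deserialize(batch, batchIndex);
    } catch (Exception e) {
      // Report the exact field and buffer position the reader had reached when it failed.
      throw new HiveException(
          "DeserializeRead details: " + deserializeRow.getDetailedReadPositionString(), e);
    }
  }
}
// End of sketch.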
- currentVectorDeserializeRow.deserialize(deserializerBatch, deserializerBatch.size++); + try { + currentVectorDeserializeRow.deserialize( + deserializerBatch, deserializerBatch.size++); + } catch (Exception e) { + throw new HiveException( + currentVectorDeserializeRow.getDetailedReadPositionString(), e); + } } break; diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinGenerateResultOperator.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinGenerateResultOperator.java index 6a3d64b..6d9e40c 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinGenerateResultOperator.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinGenerateResultOperator.java @@ -134,6 +134,27 @@ protected void performValueExpressions(VectorizedRowBatch batch, batch.selectedInUse = saveSelectedInUse; } + protected void doSmallTableDeserializeRow(VectorizedRowBatch batch, int batchIndex, + ByteSegmentRef byteSegmentRef, VectorMapJoinHashMapResult hashMapResult) + throws HiveException { + + byte[] bytes = byteSegmentRef.getBytes(); + int offset = (int) byteSegmentRef.getOffset(); + int length = byteSegmentRef.getLength(); + smallTableVectorDeserializeRow.setBytes(bytes, offset, length); + + try { + smallTableVectorDeserializeRow.deserialize(batch, batchIndex); + } catch (Exception e) { + throw new HiveException( + "HashMapResult detail: " + + hashMapResult.getDetailedHashMapResultPositionString() + "\n" + + "DeserializeRead detail: " + + smallTableVectorDeserializeRow.getDetailedReadPositionString() + "\n", + e); + } + } + //------------------------------------------------------------------------------------------------ /* @@ -180,13 +201,8 @@ protected int generateHashMapResultSingleValue(VectorizedRowBatch batch, } if (smallTableVectorDeserializeRow != null) { - - byte[] bytes = byteSegmentRef.getBytes(); - int offset = (int) byteSegmentRef.getOffset(); - int length = byteSegmentRef.getLength(); - smallTableVectorDeserializeRow.setBytes(bytes, offset, length); - - smallTableVectorDeserializeRow.deserialize(batch, batchIndex); + doSmallTableDeserializeRow(batch, batchIndex, + byteSegmentRef, hashMapResult); } // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, "generateHashMapResultSingleValue big table"); @@ -248,12 +264,8 @@ protected void generateHashMapResultMultiValue(VectorizedRowBatch batch, if (smallTableVectorDeserializeRow != null) { - byte[] bytes = byteSegmentRef.getBytes(); - int offset = (int) byteSegmentRef.getOffset(); - int length = byteSegmentRef.getLength(); - smallTableVectorDeserializeRow.setBytes(bytes, offset, length); - - smallTableVectorDeserializeRow.deserialize(overflowBatch, overflowBatch.size); + doSmallTableDeserializeRow(overflowBatch, overflowBatch.size, + byteSegmentRef, hashMapResult); } // VectorizedBatchUtil.debugDisplayOneRow(overflowBatch, overflowBatch.size, "generateHashMapResultMultiValue overflow"); @@ -298,13 +310,8 @@ private void generateHashMapResultLargeMultiValue(VectorizedRowBatch batch, while (byteSegmentRef != null) { if (smallTableVectorDeserializeRow != null) { - - byte[] bytes = byteSegmentRef.getBytes(); - int offset = (int) byteSegmentRef.getOffset(); - int length = byteSegmentRef.getLength(); - smallTableVectorDeserializeRow.setBytes(bytes, offset, length); - - smallTableVectorDeserializeRow.deserialize(overflowBatch, overflowBatch.DEFAULT_SIZE); + doSmallTableDeserializeRow(overflowBatch, overflowBatch.size, + byteSegmentRef, hashMapResult); } 
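// Illustrative sketch, not part of this patch: with isEof() removed from the hash map
// result classes (see the hunks that follow), end-of-values is signaled by next()
// returning null, so a value reader loops like this. countValues is an invented name.
import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashMapResult;
import org.apache.hadoop.hive.serde2.WriteBuffers;

final class HashMapResultValues {
  static int countValues(VectorMapJoinHashMapResult hashMapResult) {
    int count = 0;
    // first() positions at the first value; next() returns null once all values are consumed.
    for (WriteBuffers.ByteSegmentRef ref = hashMapResult.first();
        ref != null;
        ref = hashMapResult.next()) {
      count++;
    }
    return count;
  }
}
// End of sketch.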
overflowBatch.size++; @@ -348,10 +355,10 @@ private void generateHashMapResultLargeMultiValue(VectorizedRowBatch batch, } } - if (hashMapResult.isEof()) { + byteSegmentRef = hashMapResult.next(); + if (byteSegmentRef == null) { break; } - byteSegmentRef = hashMapResult.next(); // Get ready for a another round of small table values. overflowBatch.reset(); @@ -542,14 +549,16 @@ protected void reProcessBigTable(int partitionId) int offset = bigTable.currentOffset(); int length = bigTable.currentLength(); -// LOG.debug(CLASS_NAME + " reProcessBigTable serialized row #" + rowCount + ", offset " + offset + ", length " + length); - bigTableVectorDeserializeRow.setBytes(bytes, offset, length); - bigTableVectorDeserializeRow.deserialize(spillReplayBatch, spillReplayBatch.size); + try { + bigTableVectorDeserializeRow.deserialize(spillReplayBatch, spillReplayBatch.size); + } catch (Exception e) { + throw new HiveException( + bigTableVectorDeserializeRow.getDetailedReadPositionString(), e); + } spillReplayBatch.size++; if (spillReplayBatch.size == VectorizedRowBatch.DEFAULT_SIZE) { - // LOG.debug("reProcessBigTable going to call process with spillReplayBatch.size " + spillReplayBatch.size + " rows"); process(spillReplayBatch, posBigTable); // call process once we have a full batch spillReplayBatch.reset(); batchCount++; @@ -557,7 +566,6 @@ protected void reProcessBigTable(int partitionId) } // Process the row batch that has less than DEFAULT_SIZE rows if (spillReplayBatch.size > 0) { - // LOG.debug("reProcessBigTable going to call process with spillReplayBatch.size " + spillReplayBatch.size + " rows"); process(spillReplayBatch, posBigTable); spillReplayBatch.reset(); batchCount++; diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastLongHashTable.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastLongHashTable.java index 0a502e0..811f884 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastLongHashTable.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastLongHashTable.java @@ -78,8 +78,15 @@ public void putRow(BytesWritable currentKey, BytesWritable currentValue) throws byte[] keyBytes = currentKey.getBytes(); int keyLength = currentKey.getLength(); keyBinarySortableDeserializeRead.set(keyBytes, 0, keyLength); - if (keyBinarySortableDeserializeRead.readCheckNull()) { - return; + try { + if (keyBinarySortableDeserializeRead.readCheckNull()) { + return; + } + } catch (Exception e) { + throw new HiveException( + "DeserializeRead details: " + + keyBinarySortableDeserializeRead.getDetailedReadPositionString() + "\n", + e); } long key = VectorMapJoinFastLongHashUtil.deserializeLongKey( @@ -260,7 +267,8 @@ public VectorMapJoinFastLongHashTable( this.isOuterJoin = isOuterJoin; this.hashTableKeyType = hashTableKeyType; PrimitiveTypeInfo[] primitiveTypeInfos = { TypeInfoFactory.longTypeInfo }; - keyBinarySortableDeserializeRead = new BinarySortableDeserializeRead(primitiveTypeInfos); + keyBinarySortableDeserializeRead = + new BinarySortableDeserializeRead(primitiveTypeInfos); allocateBucketArray(); useMinMax = minMaxEnabled; min = Long.MAX_VALUE; diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastStringCommon.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastStringCommon.java index 985fb1c..97654b1 100644 --- 
ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastStringCommon.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastStringCommon.java @@ -45,8 +45,15 @@ public void adaptPutRow(VectorMapJoinFastBytesHashTable hashTable, byte[] keyBytes = currentKey.getBytes(); int keyLength = currentKey.getLength(); keyBinarySortableDeserializeRead.set(keyBytes, 0, keyLength); - if (keyBinarySortableDeserializeRead.readCheckNull()) { - return; + try { + if (keyBinarySortableDeserializeRead.readCheckNull()) { + return; + } + } catch (Exception e) { + throw new HiveException( + "DeserializeRead details: " + + keyBinarySortableDeserializeRead.getDetailedReadPositionString() + "\n", + e); } hashTable.add( @@ -59,6 +66,7 @@ public void adaptPutRow(VectorMapJoinFastBytesHashTable hashTable, public VectorMapJoinFastStringCommon(boolean isOuterJoin) { this.isOuterJoin = isOuterJoin; PrimitiveTypeInfo[] primitiveTypeInfos = { TypeInfoFactory.stringTypeInfo }; - keyBinarySortableDeserializeRead = new BinarySortableDeserializeRead(primitiveTypeInfos); + keyBinarySortableDeserializeRead = + new BinarySortableDeserializeRead(primitiveTypeInfos); } } \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastValueStore.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastValueStore.java index f96e32b..f9c5b34 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastValueStore.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastValueStore.java @@ -122,9 +122,7 @@ public WriteBuffers writeBuffers() { private boolean isSingleRow; private int cappedCount; - private boolean haveReadCurrent; private int readIndex; - private boolean isEof; private boolean isNextEof; private boolean isNextLast; @@ -153,9 +151,48 @@ public void set(VectorMapJoinFastValueStore valueStore, long valueRefWord) { cappedCount = (int) ((valueRefWord & CappedCount.bitMask) >> CappedCount.bitShift); // Position to beginning. - haveReadCurrent = false; readIndex = 0; - isEof = false; + } + + /** + * Get detailed HashMap result position information to help diagnose exceptions. + */ + @Override + public String getDetailedHashMapResultPositionString() { + StringBuffer sb = new StringBuffer(); + + sb.append("Read index "); + sb.append(readIndex); + if (isSingleRow) { + sb.append(" single row"); + } else { + sb.append(" capped count "); + sb.append(cappedCount); + } + + if (readIndex > 0) { + sb.append(" byteSegmentRef is byte[] of length "); + sb.append(byteSegmentRef.getBytes().length); + sb.append(" at offset "); + sb.append(byteSegmentRef.getOffset()); + sb.append(" for length "); + sb.append(byteSegmentRef.getLength()); + if (!isSingleRow) { + sb.append(" (isNextEof "); + sb.append(isNextEof); + sb.append(" isNextLast "); + sb.append(isNextLast); + sb.append(" nextAbsoluteValueOffset "); + sb.append(nextAbsoluteValueOffset); + sb.append(" isNextValueLengthSmall "); + sb.append(isNextValueLengthSmall); + sb.append(" nextSmallValueLength "); + sb.append(nextSmallValueLength); + sb.append(")"); + } + } + + return sb.toString(); } @Override @@ -193,9 +230,7 @@ public ByteSegmentRef first() { } // Position to beginning. 
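// Illustrative sketch, not part of this patch: the key-side pattern used by the fast
// hash tables above (putRow / adaptPutRow) — point a BinarySortableDeserializeRead at the
// serialized key, skip the row when the key is NULL, and surface the detailed read
// position if deserialization fails. Class and method names are invented.
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.serde2.binarysortable.fast.BinarySortableDeserializeRead;
import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;

final class KeyReadSketch {
  private final BinarySortableDeserializeRead keyRead =
      new BinarySortableDeserializeRead(
          new PrimitiveTypeInfo[] { TypeInfoFactory.stringTypeInfo });

  // Returns true when the key deserializes to NULL and the row should be skipped.
  boolean isNullKey(byte[] keyBytes, int keyLength) throws HiveException {
    keyRead.set(keyBytes, 0, keyLength);
    try {
      return keyRead.readCheckNull();
    } catch (Exception e) {
      throw new HiveException(
          "DeserializeRead details: " + keyRead.getDetailedReadPositionString(), e);
    }
  }
}
// End of sketch.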
- haveReadCurrent = false; readIndex = 0; - isEof = false; return internalRead(); } @@ -363,18 +398,9 @@ public ByteSegmentRef internalRead() { } @Override - public boolean isEof() { - if (!hasRows) { - return true; - } - return isEof; - } - - @Override public void forget() { } - @Override public String toString() { StringBuilder sb = new StringBuilder(); diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/hashtable/VectorMapJoinHashMapResult.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/hashtable/VectorMapJoinHashMapResult.java index fa6dedb..a5dfba8 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/hashtable/VectorMapJoinHashMapResult.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/hashtable/VectorMapJoinHashMapResult.java @@ -57,7 +57,7 @@ public abstract ByteSegmentRef next(); /** - * @return Whether reading is at the end. + * Get detailed HashMap result position information to help diagnose exceptions. */ - public abstract boolean isEof(); + public abstract String getDetailedHashMapResultPositionString(); } diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/optimized/VectorMapJoinOptimizedHashMap.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/optimized/VectorMapJoinOptimizedHashMap.java index e56c821..0dc8aee 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/optimized/VectorMapJoinOptimizedHashMap.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/optimized/VectorMapJoinOptimizedHashMap.java @@ -88,11 +88,6 @@ public ByteSegmentRef next() { } @Override - public boolean isEof() { - return bytesBytesMultiHashMapResult.isEof(); - } - - @Override public void forget() { bytesBytesMultiHashMapResult.forget(); super.forget(); @@ -105,6 +100,11 @@ public String toString() { sb.append("isSingleRow " + (joinResult() == JoinUtil.JoinResult.MATCH ? isSingleRow() : "") + ")"); return sb.toString(); } + + @Override + public String getDetailedHashMapResultPositionString() { + return "(Not supported yet)"; + } } @Override diff --git ql/src/test/org/apache/hadoop/hive/ql/exec/persistence/TestBytesBytesMultiHashMap.java ql/src/test/org/apache/hadoop/hive/ql/exec/persistence/TestBytesBytesMultiHashMap.java index aed9214..c1d7c72 100644 --- ql/src/test/org/apache/hadoop/hive/ql/exec/persistence/TestBytesBytesMultiHashMap.java +++ ql/src/test/org/apache/hadoop/hive/ql/exec/persistence/TestBytesBytesMultiHashMap.java @@ -133,8 +133,6 @@ private void verifyHashMapResult(BytesBytesMultiHashMap map, byte[] key, byte[]. 
hs.add(ref.copy()); ref = hashMapResult.next(); } - } else { - assertTrue(hashMapResult.isEof()); } assertEquals(state, count); assertEquals(values.length, count); diff --git ql/src/test/org/apache/hadoop/hive/ql/exec/vector/TestVectorRowObject.java ql/src/test/org/apache/hadoop/hive/ql/exec/vector/TestVectorRowObject.java index c55d951..9c4a973 100644 --- ql/src/test/org/apache/hadoop/hive/ql/exec/vector/TestVectorRowObject.java +++ ql/src/test/org/apache/hadoop/hive/ql/exec/vector/TestVectorRowObject.java @@ -22,7 +22,6 @@ import java.util.Random; import org.apache.hadoop.hive.ql.metadata.HiveException; -import org.apache.hadoop.hive.serde2.fast.RandomRowObjectSource; import junit.framework.TestCase; @@ -51,7 +50,7 @@ void testVectorRowObject(int caseNum, boolean sort, Random r) throws HiveExcepti String[] emptyScratchTypeNames = new String[0]; - RandomRowObjectSource source = new RandomRowObjectSource(); + VectorRandomRowSource source = new VectorRandomRowSource(); source.init(r); VectorizedRowBatchCtx batchContext = new VectorizedRowBatchCtx(); diff --git ql/src/test/org/apache/hadoop/hive/ql/exec/vector/TestVectorSerDeRow.java ql/src/test/org/apache/hadoop/hive/ql/exec/vector/TestVectorSerDeRow.java index da69ee3..c6704f9 100644 --- ql/src/test/org/apache/hadoop/hive/ql/exec/vector/TestVectorSerDeRow.java +++ ql/src/test/org/apache/hadoop/hive/ql/exec/vector/TestVectorSerDeRow.java @@ -48,7 +48,6 @@ import org.apache.hadoop.hive.serde2.binarysortable.fast.BinarySortableDeserializeRead; import org.apache.hadoop.hive.serde2.binarysortable.fast.BinarySortableSerializeWrite; import org.apache.hadoop.hive.serde2.fast.DeserializeRead; -import org.apache.hadoop.hive.serde2.fast.RandomRowObjectSource; import org.apache.hadoop.hive.serde2.lazy.LazySerDeParameters; import org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe; import org.apache.hadoop.hive.serde2.lazy.fast.LazySimpleDeserializeRead; @@ -85,7 +84,7 @@ } void deserializeAndVerify(Output output, DeserializeRead deserializeRead, - RandomRowObjectSource source, Object[] expectedRow) + VectorRandomRowSource source, Object[] expectedRow) throws HiveException, IOException { deserializeRead.set(output.getData(), 0, output.getLength()); PrimitiveCategory[] primitiveCategories = source.primitiveCategories(); @@ -281,12 +280,11 @@ void deserializeAndVerify(Output output, DeserializeRead deserializeRead, } deserializeRead.extraFieldsCheck(); TestCase.assertTrue(!deserializeRead.readBeyondConfiguredFieldsWarned()); - TestCase.assertTrue(!deserializeRead.readBeyondBufferRangeWarned()); TestCase.assertTrue(!deserializeRead.bufferRangeHasExtraDataWarned()); } void serializeBatch(VectorizedRowBatch batch, VectorSerializeRow vectorSerializeRow, - DeserializeRead deserializeRead, RandomRowObjectSource source, Object[][] randomRows, + DeserializeRead deserializeRead, VectorRandomRowSource source, Object[][] randomRows, int firstRandomRowIndex) throws HiveException, IOException { Output output = new Output(); @@ -311,7 +309,7 @@ void testVectorSerializeRow(int caseNum, Random r, SerializationType serializati String[] emptyScratchTypeNames = new String[0]; - RandomRowObjectSource source = new RandomRowObjectSource(); + VectorRandomRowSource source = new VectorRandomRowSource(); source.init(r); VectorizedRowBatchCtx batchContext = new VectorizedRowBatchCtx(); @@ -389,7 +387,7 @@ void examineBatch(VectorizedRowBatch batch, VectorExtractRow vectorExtractRow, } } - private Output serializeRow(Object[] row, RandomRowObjectSource source, SerializeWrite 
serializeWrite) throws HiveException, IOException { + private Output serializeRow(Object[] row, VectorRandomRowSource source, SerializeWrite serializeWrite) throws HiveException, IOException { Output output = new Output(); serializeWrite.set(output); PrimitiveTypeInfo[] primitiveTypeInfos = source.primitiveTypeInfos(); @@ -542,7 +540,7 @@ void testVectorDeserializeRow(int caseNum, Random r, SerializationType serializa String[] emptyScratchTypeNames = new String[0]; - RandomRowObjectSource source = new RandomRowObjectSource(); + VectorRandomRowSource source = new VectorRandomRowSource(); source.init(r); VectorizedRowBatchCtx batchContext = new VectorizedRowBatchCtx(); diff --git ql/src/test/org/apache/hadoop/hive/ql/exec/vector/VectorRandomRowSource.java ql/src/test/org/apache/hadoop/hive/ql/exec/vector/VectorRandomRowSource.java new file mode 100644 index 0000000..70ce5fd --- /dev/null +++ ql/src/test/org/apache/hadoop/hive/ql/exec/vector/VectorRandomRowSource.java @@ -0,0 +1,423 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.hive.ql.exec.vector; + +import java.sql.Date; +import java.sql.Timestamp; +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; +import java.util.Random; + +import org.apache.commons.lang.ArrayUtils; +import org.apache.commons.lang.StringUtils; +import org.apache.hadoop.hive.common.type.HiveChar; +import org.apache.hadoop.hive.common.type.HiveDecimal; +import org.apache.hadoop.hive.common.type.HiveIntervalDayTime; +import org.apache.hadoop.hive.common.type.HiveIntervalYearMonth; +import org.apache.hadoop.hive.common.type.HiveVarchar; +import org.apache.hadoop.hive.common.type.RandomTypeUtil; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils; +import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory; +import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableBooleanObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableByteObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableDateObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableDoubleObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableFloatObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableHiveCharObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableHiveDecimalObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableHiveIntervalDayTimeObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableHiveIntervalYearMonthObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableHiveVarcharObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableIntObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableLongObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableShortObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableStringObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableTimestampObjectInspector; +import org.apache.hadoop.hive.serde2.typeinfo.CharTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.DecimalTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils; +import org.apache.hadoop.hive.serde2.typeinfo.VarcharTypeInfo; +import org.apache.hive.common.util.DateUtils; + +/** + * Generate object inspector and random row object[]. 
+ */ +public class VectorRandomRowSource { + + private Random r; + + private int columnCount; + + private List typeNames; + + private PrimitiveCategory[] primitiveCategories; + + private PrimitiveTypeInfo[] primitiveTypeInfos; + + private List primitiveObjectInspectorList; + + private StructObjectInspector rowStructObjectInspector; + + public List typeNames() { + return typeNames; + } + + public PrimitiveCategory[] primitiveCategories() { + return primitiveCategories; + } + + public PrimitiveTypeInfo[] primitiveTypeInfos() { + return primitiveTypeInfos; + } + + public StructObjectInspector rowStructObjectInspector() { + return rowStructObjectInspector; + } + + public StructObjectInspector partialRowStructObjectInspector(int partialFieldCount) { + ArrayList partialPrimitiveObjectInspectorList = + new ArrayList(partialFieldCount); + List columnNames = new ArrayList(partialFieldCount); + for (int i = 0; i < partialFieldCount; i++) { + columnNames.add(String.format("partial%d", i)); + partialPrimitiveObjectInspectorList.add( + PrimitiveObjectInspectorFactory.getPrimitiveWritableObjectInspector( + primitiveTypeInfos[i])); + } + + return ObjectInspectorFactory.getStandardStructObjectInspector( + columnNames, primitiveObjectInspectorList); + } + + public void init(Random r) { + this.r = r; + chooseSchema(); + } + + /* + * For now, exclude CHAR until we determine why there is a difference (blank padding) + * serializing with LazyBinarySerializeWrite and the regular SerDe... + */ + private static String[] possibleHiveTypeNames = { + "boolean", + "tinyint", + "smallint", + "int", + "bigint", + "date", + "float", + "double", + "string", +// "char", + "varchar", + "binary", + "date", + "timestamp", + "interval_year_month", + "interval_day_time", + "decimal" + }; + + private void chooseSchema() { + HashSet hashSet = null; + boolean allTypes; + boolean onlyOne = (r.nextInt(100) == 7); + if (onlyOne) { + columnCount = 1; + allTypes = false; + } else { + allTypes = r.nextBoolean(); + if (allTypes) { + // One of each type. 
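// Illustrative sketch, not part of this patch: how the new tests drive this class
// (see TestVectorMapJoinFastRowHashMap below). The seed and row count are arbitrary.
import java.util.Random;
import org.apache.hadoop.hive.ql.exec.vector.VectorRandomRowSource;
import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;

final class VectorRandomRowSourceExample {
  public static void main(String[] args) {
    Random r = new Random(12345);
    VectorRandomRowSource source = new VectorRandomRowSource();
    source.init(r);                                   // chooses a random schema
    PrimitiveTypeInfo[] typeInfos = source.primitiveTypeInfos();
    Object[][] rows = source.randomRows(100);         // writable objects matching that schema
    System.out.println(typeInfos.length + " columns, " + rows.length + " rows");
  }
}
// End of sketch.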
+ columnCount = possibleHiveTypeNames.length; + hashSet = new HashSet(); + } else { + columnCount = 1 + r.nextInt(20); + } + } + typeNames = new ArrayList(columnCount); + primitiveCategories = new PrimitiveCategory[columnCount]; + primitiveTypeInfos = new PrimitiveTypeInfo[columnCount]; + primitiveObjectInspectorList = new ArrayList(columnCount); + List columnNames = new ArrayList(columnCount); + for (int c = 0; c < columnCount; c++) { + columnNames.add(String.format("col%d", c)); + String typeName; + + if (onlyOne) { + typeName = possibleHiveTypeNames[r.nextInt(possibleHiveTypeNames.length)]; + } else { + int typeNum; + if (allTypes) { + while (true) { + typeNum = r.nextInt(possibleHiveTypeNames.length); + Integer typeNumInteger = new Integer(typeNum); + if (!hashSet.contains(typeNumInteger)) { + hashSet.add(typeNumInteger); + break; + } + } + } else { + typeNum = r.nextInt(possibleHiveTypeNames.length); + } + typeName = possibleHiveTypeNames[typeNum]; + } + if (typeName.equals("char")) { + int maxLength = 1 + r.nextInt(100); + typeName = String.format("char(%d)", maxLength); + } else if (typeName.equals("varchar")) { + int maxLength = 1 + r.nextInt(100); + typeName = String.format("varchar(%d)", maxLength); + } else if (typeName.equals("decimal")) { + typeName = String.format("decimal(%d,%d)", HiveDecimal.SYSTEM_DEFAULT_PRECISION, HiveDecimal.SYSTEM_DEFAULT_SCALE); + } + PrimitiveTypeInfo primitiveTypeInfo = (PrimitiveTypeInfo) TypeInfoUtils.getTypeInfoFromTypeString(typeName); + primitiveTypeInfos[c] = primitiveTypeInfo; + PrimitiveCategory primitiveCategory = primitiveTypeInfo.getPrimitiveCategory(); + primitiveCategories[c] = primitiveCategory; + primitiveObjectInspectorList.add(PrimitiveObjectInspectorFactory.getPrimitiveWritableObjectInspector(primitiveTypeInfo)); + typeNames.add(typeName); + } + rowStructObjectInspector = ObjectInspectorFactory.getStandardStructObjectInspector(columnNames, primitiveObjectInspectorList); + } + + public Object[][] randomRows(int n) { + Object[][] result = new Object[n][]; + for (int i = 0; i < n; i++) { + result[i] = randomRow(); + } + return result; + } + + public Object[] randomRow() { + Object row[] = new Object[columnCount]; + for (int c = 0; c < columnCount; c++) { + Object object = randomObject(c); + if (object == null) { + throw new Error("Unexpected null for column " + c); + } + row[c] = getWritableObject(c, object); + if (row[c] == null) { + throw new Error("Unexpected null for writable for column " + c); + } + } + return row; + } + + public static void sort(Object[][] rows, ObjectInspector oi) { + for (int i = 0; i < rows.length; i++) { + for (int j = i + 1; j < rows.length; j++) { + if (ObjectInspectorUtils.compare(rows[i], oi, rows[j], oi) > 0) { + Object[] t = rows[i]; + rows[i] = rows[j]; + rows[j] = t; + } + } + } + } + + public void sort(Object[][] rows) { + VectorRandomRowSource.sort(rows, rowStructObjectInspector); + } + + public Object getWritableObject(int column, Object object) { + ObjectInspector objectInspector = primitiveObjectInspectorList.get(column); + PrimitiveCategory primitiveCategory = primitiveCategories[column]; + PrimitiveTypeInfo primitiveTypeInfo = primitiveTypeInfos[column]; + switch (primitiveCategory) { + case BOOLEAN: + return ((WritableBooleanObjectInspector) objectInspector).create((boolean) object); + case BYTE: + return ((WritableByteObjectInspector) objectInspector).create((byte) object); + case SHORT: + return ((WritableShortObjectInspector) objectInspector).create((short) object); + case INT: + return 
((WritableIntObjectInspector) objectInspector).create((int) object); + case LONG: + return ((WritableLongObjectInspector) objectInspector).create((long) object); + case DATE: + return ((WritableDateObjectInspector) objectInspector).create((Date) object); + case FLOAT: + return ((WritableFloatObjectInspector) objectInspector).create((float) object); + case DOUBLE: + return ((WritableDoubleObjectInspector) objectInspector).create((double) object); + case STRING: + return ((WritableStringObjectInspector) objectInspector).create((String) object); + case CHAR: + { + WritableHiveCharObjectInspector writableCharObjectInspector = + new WritableHiveCharObjectInspector( (CharTypeInfo) primitiveTypeInfo); + return writableCharObjectInspector.create(new HiveChar(StringUtils.EMPTY, -1)); + } + case VARCHAR: + { + WritableHiveVarcharObjectInspector writableVarcharObjectInspector = + new WritableHiveVarcharObjectInspector( (VarcharTypeInfo) primitiveTypeInfo); + return writableVarcharObjectInspector.create(new HiveVarchar(StringUtils.EMPTY, -1)); + } + case BINARY: + return PrimitiveObjectInspectorFactory.writableBinaryObjectInspector.create(ArrayUtils.EMPTY_BYTE_ARRAY); + case TIMESTAMP: + return ((WritableTimestampObjectInspector) objectInspector).create(new Timestamp(0)); + case INTERVAL_YEAR_MONTH: + return ((WritableHiveIntervalYearMonthObjectInspector) objectInspector).create(new HiveIntervalYearMonth(0)); + case INTERVAL_DAY_TIME: + return ((WritableHiveIntervalDayTimeObjectInspector) objectInspector).create(new HiveIntervalDayTime(0, 0)); + case DECIMAL: + { + WritableHiveDecimalObjectInspector writableDecimalObjectInspector = + new WritableHiveDecimalObjectInspector((DecimalTypeInfo) primitiveTypeInfo); + return writableDecimalObjectInspector.create(HiveDecimal.ZERO); + } + default: + throw new Error("Unknown primitive category " + primitiveCategory); + } + } + + public Object randomObject(int column) { + PrimitiveCategory primitiveCategory = primitiveCategories[column]; + PrimitiveTypeInfo primitiveTypeInfo = primitiveTypeInfos[column]; + switch (primitiveCategory) { + case BOOLEAN: + return Boolean.valueOf(r.nextInt(1) == 1); + case BYTE: + return Byte.valueOf((byte) r.nextInt()); + case SHORT: + return Short.valueOf((short) r.nextInt()); + case INT: + return Integer.valueOf(r.nextInt()); + case LONG: + return Long.valueOf(r.nextLong()); + case DATE: + return RandomTypeUtil.getRandDate(r); + case FLOAT: + return Float.valueOf(r.nextFloat() * 10 - 5); + case DOUBLE: + return Double.valueOf(r.nextDouble() * 10 - 5); + case STRING: + return RandomTypeUtil.getRandString(r); + case CHAR: + return getRandHiveChar(r, (CharTypeInfo) primitiveTypeInfo); + case VARCHAR: + return getRandHiveVarchar(r, (VarcharTypeInfo) primitiveTypeInfo); + case BINARY: + return getRandBinary(r, 1 + r.nextInt(100)); + case TIMESTAMP: + return RandomTypeUtil.getRandTimestamp(r); + case INTERVAL_YEAR_MONTH: + return getRandIntervalYearMonth(r); + case INTERVAL_DAY_TIME: + return getRandIntervalDayTime(r); + case DECIMAL: + return getRandHiveDecimal(r, (DecimalTypeInfo) primitiveTypeInfo); + default: + throw new Error("Unknown primitive category " + primitiveCategory); + } + } + + public static HiveChar getRandHiveChar(Random r, CharTypeInfo charTypeInfo) { + int maxLength = 1 + r.nextInt(charTypeInfo.getLength()); + String randomString = RandomTypeUtil.getRandString(r, "abcdefghijklmnopqrstuvwxyz", 100); + HiveChar hiveChar = new HiveChar(randomString, maxLength); + return hiveChar; + } + + public static HiveVarchar 
getRandHiveVarchar(Random r, VarcharTypeInfo varcharTypeInfo) { + int maxLength = 1 + r.nextInt(varcharTypeInfo.getLength()); + String randomString = RandomTypeUtil.getRandString(r, "abcdefghijklmnopqrstuvwxyz", 100); + HiveVarchar hiveVarchar = new HiveVarchar(randomString, maxLength); + return hiveVarchar; + } + + public static byte[] getRandBinary(Random r, int len){ + byte[] bytes = new byte[len]; + for (int j = 0; j < len; j++){ + bytes[j] = Byte.valueOf((byte) r.nextInt()); + } + return bytes; + } + + private static final String DECIMAL_CHARS = "0123456789"; + + public static HiveDecimal getRandHiveDecimal(Random r, DecimalTypeInfo decimalTypeInfo) { + while (true) { + StringBuilder sb = new StringBuilder(); + int precision = 1 + r.nextInt(18); + int scale = 0 + r.nextInt(precision + 1); + + int integerDigits = precision - scale; + + if (r.nextBoolean()) { + sb.append("-"); + } + + if (integerDigits == 0) { + sb.append("0"); + } else { + sb.append(RandomTypeUtil.getRandString(r, DECIMAL_CHARS, integerDigits)); + } + if (scale != 0) { + sb.append("."); + sb.append(RandomTypeUtil.getRandString(r, DECIMAL_CHARS, scale)); + } + + HiveDecimal bd = HiveDecimal.create(sb.toString()); + if (bd.scale() > bd.precision()) { + // Sometimes weird decimals are produced? + continue; + } + + return bd; + } + } + + public static HiveIntervalYearMonth getRandIntervalYearMonth(Random r) { + String yearMonthSignStr = r.nextInt(2) == 0 ? "" : "-"; + String intervalYearMonthStr = String.format("%s%d-%d", + yearMonthSignStr, + Integer.valueOf(1800 + r.nextInt(500)), // year + Integer.valueOf(0 + r.nextInt(12))); // month + HiveIntervalYearMonth intervalYearMonthVal = HiveIntervalYearMonth.valueOf(intervalYearMonthStr); + return intervalYearMonthVal; + } + + public static HiveIntervalDayTime getRandIntervalDayTime(Random r) { + String optionalNanos = ""; + if (r.nextInt(2) == 1) { + optionalNanos = String.format(".%09d", + Integer.valueOf(0 + r.nextInt(DateUtils.NANOS_PER_SEC))); + } + String yearMonthSignStr = r.nextInt(2) == 0 ? 
"" : "-"; + String dayTimeStr = String.format("%s%d %02d:%02d:%02d%s", + yearMonthSignStr, + Integer.valueOf(1 + r.nextInt(28)), // day + Integer.valueOf(0 + r.nextInt(24)), // hour + Integer.valueOf(0 + r.nextInt(60)), // minute + Integer.valueOf(0 + r.nextInt(60)), // second + optionalNanos); + HiveIntervalDayTime intervalDayTimeVal = HiveIntervalDayTime.valueOf(dayTimeStr); + return intervalDayTimeVal; + } +} diff --git ql/src/test/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/CheckFastHashTable.java ql/src/test/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/CheckFastHashTable.java index 3a23584..4768f35 100644 --- ql/src/test/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/CheckFastHashTable.java +++ ql/src/test/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/CheckFastHashTable.java @@ -43,12 +43,14 @@ public class CheckFastHashTable { - public static boolean findMatch(byte[] valueBytes, List actualValues, int actualCount, boolean[] taken) { + public static boolean findMatch(int valueIndex, byte[] valueBytes, List actualValues, + int actualCount, boolean[] actualTaken, int[] actualToValueMap) { for (int i = 0; i < actualCount; i++) { - if (!taken[i]) { + if (!actualTaken[i]) { byte[] actualBytes = actualValues.get(i); if (StringExpr.compare(valueBytes, 0, valueBytes.length, actualBytes, 0, actualBytes.length) == 0) { - taken[i] = true; + actualToValueMap[i] = valueIndex; + actualTaken[i] = true; return true; } } @@ -56,7 +58,7 @@ public static boolean findMatch(byte[] valueBytes, List actualValues, in return false; } - public static void verifyHashMapValues(VectorMapJoinHashMapResult hashMapResult, + public static int[] verifyHashMapValues(VectorMapJoinHashMapResult hashMapResult, List values) { int valueCount = values.size(); @@ -87,15 +89,16 @@ public static void verifyHashMapValues(VectorMapJoinHashMapResult hashMapResult, TestCase.fail("values.size() " + valueCount + " does not match actualCount " + actualCount); } - boolean[] taken = new boolean[actualCount]; + boolean[] actualTaken = new boolean[actualCount]; + int[] actualToValueMap = new int[actualCount]; for (int i = 0; i < actualCount; i++) { byte[] valueBytes = values.get(i); - if (!findMatch(valueBytes, actualValues, actualCount, taken)) { + if (!findMatch(i, valueBytes, actualValues, actualCount, actualTaken, actualToValueMap)) { List availableLengths = new ArrayList(); for (int a = 0; a < actualCount; a++) { - if (!taken[a]) { + if (!actualTaken[a]) { availableLengths.add(actualValues.get(a).length); } } @@ -103,6 +106,7 @@ public static void verifyHashMapValues(VectorMapJoinHashMapResult hashMapResult, ", availableLengths " + availableLengths.toString() + " of " + actualCount + " total)"); } } + return actualToValueMap; } /* diff --git ql/src/test/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/CheckFastRowHashMap.java ql/src/test/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/CheckFastRowHashMap.java new file mode 100644 index 0000000..ea5e8b3 --- /dev/null +++ ql/src/test/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/CheckFastRowHashMap.java @@ -0,0 +1,309 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.vector.mapjoin.fast; + +import static org.junit.Assert.assertTrue; + +import java.io.EOFException; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Random; +import java.util.TreeMap; + +import junit.framework.TestCase; + +import org.apache.hadoop.hive.ql.exec.JoinUtil; +import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashMapResult; +import org.apache.hadoop.hive.serde2.WriteBuffers; +import org.apache.hadoop.hive.serde2.lazybinary.fast.LazyBinaryDeserializeRead; +import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; +import org.apache.hadoop.io.BytesWritable; +import org.apache.hadoop.io.Writable; + +import com.google.common.base.Preconditions; + +public class CheckFastRowHashMap extends CheckFastHashTable { + + public static void verifyHashMapRows(List rows, int[] actualToValueMap, + VectorMapJoinHashMapResult hashMapResult, TypeInfo[] typeInfos) throws IOException { + + final int count = rows.size(); + final int columnCount = typeInfos.length; + + WriteBuffers.ByteSegmentRef ref = hashMapResult.first(); + + for (int a = 0; a < count; a++) { + + int valueIndex = actualToValueMap[a]; + + Object[] row = rows.get(valueIndex); + + byte[] bytes = ref.getBytes(); + int offset = (int) ref.getOffset(); + int length = ref.getLength(); + + LazyBinaryDeserializeRead lazyBinaryDeserializeRead = + new LazyBinaryDeserializeRead(typeInfos); + + lazyBinaryDeserializeRead.set(bytes, offset, length); + + for (int index = 0; index < columnCount; index++) { + Writable writable = (Writable) row[index]; + VerifyFastRow.verifyDeserializeRead(lazyBinaryDeserializeRead, (PrimitiveTypeInfo) typeInfos[index], writable); + } + lazyBinaryDeserializeRead.extraFieldsCheck(); + TestCase.assertTrue(!lazyBinaryDeserializeRead.readBeyondConfiguredFieldsWarned()); + + TestCase.assertTrue(!lazyBinaryDeserializeRead.bufferRangeHasExtraDataWarned()); + + ref = hashMapResult.next(); + if (a == count - 1) { + TestCase.assertTrue (ref == null); + } else { + TestCase.assertTrue (ref != null); + } + } + } + + private static String debugDetailedReadPositionString; + + private static String debugDetailedHashMapResultPositionString; + + private static String debugExceptionMessage; + private static StackTraceElement[] debugStackTrace; + + public static void verifyHashMapRowsMore(List rows, int[] actualToValueMap, + VectorMapJoinHashMapResult hashMapResult, TypeInfo[] typeInfos, + int clipIndex, boolean useExactBytes) throws IOException { + + final int count = rows.size(); + final int columnCount = typeInfos.length; + + WriteBuffers.ByteSegmentRef ref = hashMapResult.first(); + + for (int a = 0; a < count; a++) { + + int valueIndex = actualToValueMap[a]; + + Object[] row = rows.get(valueIndex); + + byte[] bytes = ref.getBytes(); + int offset = (int) ref.getOffset(); + int length = ref.getLength(); + if (a == clipIndex) { + length--; + } + + if (useExactBytes) { + // Use exact byte array which might generate 
array out of bounds... + bytes = Arrays.copyOfRange(bytes, offset, offset + length); + offset = 0; + } + + LazyBinaryDeserializeRead lazyBinaryDeserializeRead = + new LazyBinaryDeserializeRead(typeInfos); + + lazyBinaryDeserializeRead.set(bytes, offset, length); + + boolean thrown = false; + Exception saveException = null; + try { + for (int index = 0; index < columnCount; index++) { + Writable writable = (Writable) row[index]; + VerifyFastRow.verifyDeserializeRead(lazyBinaryDeserializeRead, (PrimitiveTypeInfo) typeInfos[index], writable); + } + } catch (Exception e) { + thrown = true; + saveException = e; + debugDetailedReadPositionString = lazyBinaryDeserializeRead.getDetailedReadPositionString(); + + debugDetailedHashMapResultPositionString = hashMapResult.getDetailedHashMapResultPositionString(); + + debugExceptionMessage = saveException.getMessage(); + debugStackTrace = saveException.getStackTrace(); + } + if (a == clipIndex) { + if (!thrown) { + TestCase.fail("Expecting an exception to be thrown for the clipped case..."); + } else { + TestCase.assertTrue(saveException != null); + if (!(saveException instanceof EOFException)) { + TestCase.fail("Expecting an EOFException to be thrown for the clipped case..."); + } + } + } else { + if (thrown) { + TestCase.fail("Not expecting an exception to be thrown for the non-clipped case..."); + } + lazyBinaryDeserializeRead.extraFieldsCheck(); + TestCase.assertTrue(!lazyBinaryDeserializeRead.readBeyondConfiguredFieldsWarned()); + + TestCase.assertTrue(!lazyBinaryDeserializeRead.bufferRangeHasExtraDataWarned()); + } + + ref = hashMapResult.next(); + if (a == count - 1) { + TestCase.assertTrue (ref == null); + } else { + TestCase.assertTrue (ref != null); + } + } + } + + /* + * Element for Key: row and byte[] x Hash Table: HashMap + */ + public static class FastRowHashMapElement { + private byte[] key; + private List values; + private List rows; + + public FastRowHashMapElement(byte[] key, byte[] firstValue, Object[] row) { + this.key = key; + values = new ArrayList(); + values.add(firstValue); + rows = new ArrayList(); + rows.add(row); + } + + public byte[] getKey() { + return key; + } + + public int getCount() { + return values.size(); + } + + public List getValues() { + return values; + } + + public List getRows() { + return rows; + } + + public void add(byte[] value, Object[] row) { + values.add(value); + rows.add(row); + } + } + + /* + * Verify table for Key: row and byte[] x Hash Table: HashMap + */ + public static class VerifyFastRowHashMap { + + private int count; + + private FastRowHashMapElement[] array; + + private TreeMap keyValueMap; + + public VerifyFastRowHashMap() { + count = 0; + array = new FastRowHashMapElement[50]; + + // We use BytesWritable because it supports Comparable for our TreeMap. + keyValueMap = new TreeMap(); + } + + public int getCount() { + return count; + } + + public boolean contains(byte[] key) { + BytesWritable keyBytesWritable = new BytesWritable(key, key.length); + return keyValueMap.containsKey(keyBytesWritable); + } + + public void add(byte[] key, byte[] value, Object[] row) { + BytesWritable keyBytesWritable = new BytesWritable(key, key.length); + if (keyValueMap.containsKey(keyBytesWritable)) { + int index = keyValueMap.get(keyBytesWritable); + array[index].add(value, row); + } else { + if (count >= array.length) { + // Grow. 
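// Illustrative sketch, not part of this patch: byte[] has no value-based equals/compareTo,
// so the verify table above wraps keys in BytesWritable, which compares by content and is
// usable as a TreeMap key. Minimal illustration; the class name is invented.
import java.util.TreeMap;
import org.apache.hadoop.io.BytesWritable;

final class BytesKeyedIndex {
  private final TreeMap<BytesWritable, Integer> index = new TreeMap<BytesWritable, Integer>();

  void put(byte[] key, int value) {
    index.put(new BytesWritable(key, key.length), Integer.valueOf(value));
  }

  Integer get(byte[] key) {
    return index.get(new BytesWritable(key, key.length));
  }
}
// End of sketch.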
+ FastRowHashMapElement[] newArray = new FastRowHashMapElement[array.length * 2]; + System.arraycopy(array, 0, newArray, 0, count); + array = newArray; + } + array[count] = new FastRowHashMapElement(key, value, row); + keyValueMap.put(keyBytesWritable, count); + count++; + } + } + + public byte[] addRandomExisting(byte[] value, Object[] row, Random r) { + Preconditions.checkState(count > 0); + int index = r.nextInt(count); + array[index].add(value, row); + return array[index].getKey(); + } + + public byte[] getKey(int index) { + return array[index].getKey(); + } + + public List getValues(int index) { + return array[index].getValues(); + } + + public void verify(VectorMapJoinFastBytesHashMap map, PrimitiveTypeInfo[] primitiveTypeInfos, + boolean doClipping, boolean useExactBytes, Random r) throws IOException { + int mapSize = map.size(); + if (mapSize != count) { + TestCase.fail("map.size() does not match expected count"); + } + + for (int index = 0; index < count; index++) { + FastRowHashMapElement element = array[index]; + byte[] key = element.getKey(); + List values = element.getValues(); + + VectorMapJoinHashMapResult hashMapResult = map.createHashMapResult(); + JoinUtil.JoinResult joinResult = map.lookup(key, 0, key.length, hashMapResult); + if (joinResult != JoinUtil.JoinResult.MATCH) { + assertTrue(false); + } + + int[] actualToValueMap = verifyHashMapValues(hashMapResult, values); + + // Start with a fresh one. + hashMapResult = map.createHashMapResult(); + joinResult = map.lookup(key, 0, key.length, hashMapResult); + if (joinResult != JoinUtil.JoinResult.MATCH) { + assertTrue(false); + } + + List rows = element.getRows(); + if (!doClipping && !useExactBytes) { + verifyHashMapRows(rows, actualToValueMap, hashMapResult, primitiveTypeInfos); + } else { + int clipIndex = r.nextInt(rows.size()); + verifyHashMapRowsMore(rows, actualToValueMap, hashMapResult, primitiveTypeInfos, + clipIndex, useExactBytes); + } + } + } + } +} \ No newline at end of file diff --git ql/src/test/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/TestVectorMapJoinFastBytesHashMap.java ql/src/test/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/TestVectorMapJoinFastBytesHashMap.java index bbfa65f..8525e99 100644 --- ql/src/test/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/TestVectorMapJoinFastBytesHashMap.java +++ ql/src/test/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/TestVectorMapJoinFastBytesHashMap.java @@ -269,4 +269,19 @@ public void testLargeAndExpand() throws Exception { int keyCount = 1000; addAndVerifyMultipleKeyMultipleValue(keyCount, map, verifyTable); } + + @Test + public void testReallyBig() throws Exception { + random = new Random(42662); + + // Use a large capacity that doesn't require expansion, yet. 
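// Illustrative sketch, not part of this patch: the essence of the "clipped" verification in
// CheckFastRowHashMap above — serialize one bigint, drop the final byte, and the read side
// is expected to fail (typically with EOFException) while still reporting a detailed read
// position. Assumes SerializeWrite.writeLong as in the serde2 fast interfaces.
import java.util.Arrays;
import org.apache.hadoop.hive.serde2.ByteStream.Output;
import org.apache.hadoop.hive.serde2.lazybinary.fast.LazyBinaryDeserializeRead;
import org.apache.hadoop.hive.serde2.lazybinary.fast.LazyBinarySerializeWrite;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;

final class ClippedReadExample {
  public static void main(String[] args) throws Exception {
    LazyBinarySerializeWrite write = new LazyBinarySerializeWrite(1);
    Output output = new Output();
    write.set(output);
    write.writeLong(42L);

    // Clip one byte off the serialized value, as verifyHashMapRowsMore does.
    byte[] clipped = Arrays.copyOf(output.getData(), output.getLength() - 1);

    LazyBinaryDeserializeRead read =
        new LazyBinaryDeserializeRead(new TypeInfo[] { TypeInfoFactory.longTypeInfo });
    read.set(clipped, 0, clipped.length);
    try {
      boolean isNull = read.readCheckNull();
      System.out.println(isNull ? "NULL" : "read value " + read.currentLong);
    } catch (Exception e) {
      // The detailed position string pinpoints where the read stopped.
      System.out.println(e + " at: " + read.getDetailedReadPositionString());
    }
  }
}
// End of sketch.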
+ VectorMapJoinFastMultiKeyHashMap map = + new VectorMapJoinFastMultiKeyHashMap( + false,LARGE_CAPACITY, LOAD_FACTOR, MODERATE_WB_SIZE); + + VerifyFastBytesHashMap verifyTable = new VerifyFastBytesHashMap(); + + int keyCount = 1000000; + addAndVerifyMultipleKeyMultipleValue(keyCount, map, verifyTable); + } } diff --git ql/src/test/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/TestVectorMapJoinFastRowHashMap.java ql/src/test/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/TestVectorMapJoinFastRowHashMap.java new file mode 100644 index 0000000..ce7aa11 --- /dev/null +++ ql/src/test/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/TestVectorMapJoinFastRowHashMap.java @@ -0,0 +1,169 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.vector.mapjoin.fast; + +import java.io.IOException; +import java.util.Arrays; +import java.util.Random; + +import org.apache.hadoop.hive.ql.exec.vector.VectorRandomRowSource; +import org.apache.hadoop.hive.ql.exec.vector.mapjoin.fast.CheckFastRowHashMap.VerifyFastRowHashMap; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.serde2.ByteStream.Output; +import org.apache.hadoop.hive.serde2.lazybinary.fast.LazyBinarySerializeWrite; +import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; +import org.apache.hadoop.io.Writable; +import org.junit.Test; + +/* + * An multi-key value hash map optimized for vector map join. + * + * The key is uninterpreted bytes. + */ +public class TestVectorMapJoinFastRowHashMap extends CommonFastHashTable { + + public void addAndVerifyRows(VectorRandomRowSource source, Object[][] rows, + VectorMapJoinFastMultiKeyHashMap map, VerifyFastRowHashMap verifyTable, + boolean doClipping, boolean useExactBytes) throws HiveException, IOException { + + PrimitiveTypeInfo[] primitiveTypeInfos = source.primitiveTypeInfos(); + final int columnCount = primitiveTypeInfos.length; + + LazyBinarySerializeWrite lazyBinarySerializeWrite = new LazyBinarySerializeWrite(columnCount); + + final int count = rows.length; + for (int i = 0; i < count; i++) { + + Object[] row = rows[i]; + Output output = new Output(); + lazyBinarySerializeWrite.set(output); + + for (int index = 0; index < columnCount; index++) { + + Writable writable = (Writable) row[index]; + + VerifyFastRow.serializeWrite(lazyBinarySerializeWrite, primitiveTypeInfos[index], writable); + } + + byte[] value = Arrays.copyOf(output.getData(), output.getLength()); + + // Add a new key or add a value to an existing key? + if (random.nextBoolean() || verifyTable.getCount() == 0) { + byte[] key; + while (true) { + key = new byte[random.nextInt(MAX_KEY_LENGTH)]; + random.nextBytes(key); + if (!verifyTable.contains(key)) { + // Unique keys for this test. 
+ break; + } + } + + map.testPutRow(key, value); + verifyTable.add(key, value, row); + // verifyTable.verify(map); + } else { + byte[] randomExistingKey = verifyTable.addRandomExisting(value, row, random); + map.testPutRow(randomExistingKey, value); + // verifyTable.verify(map); + } + } + verifyTable.verify(map, primitiveTypeInfos, doClipping, useExactBytes, random); + } + + @Test + public void testRows() throws Exception { + random = new Random(927337); + + // Use a large capacity that doesn't require expansion, yet. + VectorMapJoinFastMultiKeyHashMap map = + new VectorMapJoinFastMultiKeyHashMap( + false,LARGE_CAPACITY, LOAD_FACTOR, LARGE_WB_SIZE); + + VerifyFastRowHashMap verifyTable = new VerifyFastRowHashMap(); + + VectorRandomRowSource source = new VectorRandomRowSource(); + source.init(random); + + int rowCount = 10000; + Object[][] rows = source.randomRows(rowCount); + + addAndVerifyRows(source, rows, map, verifyTable, false, false); + } + + @Test + public void testRowsClipped() throws Exception { + random = new Random(326232); + + // Use a large capacity that doesn't require expansion, yet. + VectorMapJoinFastMultiKeyHashMap map = + new VectorMapJoinFastMultiKeyHashMap( + false,LARGE_CAPACITY, LOAD_FACTOR, LARGE_WB_SIZE); + + VerifyFastRowHashMap verifyTable = new VerifyFastRowHashMap(); + + VectorRandomRowSource source = new VectorRandomRowSource(); + source.init(random); + + int rowCount = 10000; + Object[][] rows = source.randomRows(rowCount); + + addAndVerifyRows(source, rows, map, verifyTable, true, false); + } + + @Test + public void testRowsExact() throws Exception { + random = new Random(2454); + + // Use a large capacity that doesn't require expansion, yet. + VectorMapJoinFastMultiKeyHashMap map = + new VectorMapJoinFastMultiKeyHashMap( + false,LARGE_CAPACITY, LOAD_FACTOR, LARGE_WB_SIZE); + + VerifyFastRowHashMap verifyTable = new VerifyFastRowHashMap(); + + VectorRandomRowSource source = new VectorRandomRowSource(); + source.init(random); + + int rowCount = 10000; + Object[][] rows = source.randomRows(rowCount); + + addAndVerifyRows(source, rows, map, verifyTable, false, true); + } + + @Test + public void testRowsClippedExact() throws Exception { + random = new Random(2454); + + // Use a large capacity that doesn't require expansion, yet. + VectorMapJoinFastMultiKeyHashMap map = + new VectorMapJoinFastMultiKeyHashMap( + false,LARGE_CAPACITY, LOAD_FACTOR, LARGE_WB_SIZE); + + VerifyFastRowHashMap verifyTable = new VerifyFastRowHashMap(); + + VectorRandomRowSource source = new VectorRandomRowSource(); + source.init(random); + + int rowCount = 10000; + Object[][] rows = source.randomRows(rowCount); + + addAndVerifyRows(source, rows, map, verifyTable, true, true); + } +} diff --git ql/src/test/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VerifyFastRow.java ql/src/test/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VerifyFastRow.java new file mode 100644 index 0000000..118e9e2 --- /dev/null +++ ql/src/test/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VerifyFastRow.java @@ -0,0 +1,397 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.ql.exec.vector.mapjoin.fast; + +import java.io.IOException; +import java.sql.Date; +import java.sql.Timestamp; +import java.util.Arrays; + +import junit.framework.TestCase; + +import org.apache.hadoop.hive.common.type.HiveChar; +import org.apache.hadoop.hive.common.type.HiveDecimal; +import org.apache.hadoop.hive.common.type.HiveIntervalDayTime; +import org.apache.hadoop.hive.common.type.HiveIntervalYearMonth; +import org.apache.hadoop.hive.common.type.HiveVarchar; +import org.apache.hadoop.hive.serde2.fast.DeserializeRead; +import org.apache.hadoop.hive.serde2.fast.SerializeWrite; +import org.apache.hadoop.hive.serde2.io.ByteWritable; +import org.apache.hadoop.hive.serde2.io.DateWritable; +import org.apache.hadoop.hive.serde2.io.DoubleWritable; +import org.apache.hadoop.hive.serde2.io.HiveCharWritable; +import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable; +import org.apache.hadoop.hive.serde2.io.HiveIntervalDayTimeWritable; +import org.apache.hadoop.hive.serde2.io.HiveIntervalYearMonthWritable; +import org.apache.hadoop.hive.serde2.io.HiveVarcharWritable; +import org.apache.hadoop.hive.serde2.io.ShortWritable; +import org.apache.hadoop.hive.serde2.io.TimestampWritable; +import org.apache.hadoop.hive.serde2.typeinfo.CharTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.DecimalTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.VarcharTypeInfo; +import org.apache.hadoop.io.BooleanWritable; +import org.apache.hadoop.io.BytesWritable; +import org.apache.hadoop.io.FloatWritable; +import org.apache.hadoop.io.IntWritable; +import org.apache.hadoop.io.LongWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.Writable; + +/** + * (Copy of VerifyFast from serde). 
+ * + */ +public class VerifyFastRow { + + public static void verifyDeserializeRead(DeserializeRead deserializeRead, + PrimitiveTypeInfo primitiveTypeInfo, Writable writable) throws IOException { + + boolean isNull; + + isNull = deserializeRead.readCheckNull(); + if (isNull) { + if (writable != null) { + TestCase.fail("Field reports null but object is not null"); + } + return; + } else if (writable == null) { + TestCase.fail("Field report not null but object is null"); + } + switch (primitiveTypeInfo.getPrimitiveCategory()) { + case BOOLEAN: + { + boolean value = deserializeRead.currentBoolean; + if (!(writable instanceof BooleanWritable)) { + TestCase.fail("Boolean expected writable not Boolean"); + } + boolean expected = ((BooleanWritable) writable).get(); + if (value != expected) { + TestCase.fail("Boolean field mismatch (expected " + expected + " found " + value + ")"); + } + } + break; + case BYTE: + { + byte value = deserializeRead.currentByte; + if (!(writable instanceof ByteWritable)) { + TestCase.fail("Byte expected writable not Byte"); + } + byte expected = ((ByteWritable) writable).get(); + if (value != expected) { + TestCase.fail("Byte field mismatch (expected " + (int) expected + " found " + (int) value + ")"); + } + } + break; + case SHORT: + { + short value = deserializeRead.currentShort; + if (!(writable instanceof ShortWritable)) { + TestCase.fail("Short expected writable not Short"); + } + short expected = ((ShortWritable) writable).get(); + if (value != expected) { + TestCase.fail("Short field mismatch (expected " + expected + " found " + value + ")"); + } + } + break; + case INT: + { + int value = deserializeRead.currentInt; + if (!(writable instanceof IntWritable)) { + TestCase.fail("Integer expected writable not Integer"); + } + int expected = ((IntWritable) writable).get(); + if (value != expected) { + TestCase.fail("Int field mismatch (expected " + expected + " found " + value + ")"); + } + } + break; + case LONG: + { + long value = deserializeRead.currentLong; + if (!(writable instanceof LongWritable)) { + TestCase.fail("Long expected writable not Long"); + } + Long expected = ((LongWritable) writable).get(); + if (value != expected) { + TestCase.fail("Long field mismatch (expected " + expected + " found " + value + ")"); + } + } + break; + case FLOAT: + { + float value = deserializeRead.currentFloat; + if (!(writable instanceof FloatWritable)) { + TestCase.fail("Float expected writable not Float"); + } + float expected = ((FloatWritable) writable).get(); + if (value != expected) { + TestCase.fail("Float field mismatch (expected " + expected + " found " + value + ")"); + } + } + break; + case DOUBLE: + { + double value = deserializeRead.currentDouble; + if (!(writable instanceof DoubleWritable)) { + TestCase.fail("Double expected writable not Double"); + } + double expected = ((DoubleWritable) writable).get(); + if (value != expected) { + TestCase.fail("Double field mismatch (expected " + expected + " found " + value + ")"); + } + } + break; + case STRING: + { + byte[] stringBytes = Arrays.copyOfRange( + deserializeRead.currentBytes, + deserializeRead.currentBytesStart, + deserializeRead.currentBytesStart + deserializeRead.currentBytesLength); + Text text = new Text(stringBytes); + String string = text.toString(); + String expected = ((Text) writable).toString(); + if (!string.equals(expected)) { + TestCase.fail("String field mismatch (expected '" + expected + "' found '" + string + "')"); + } + } + break; + case CHAR: + { + byte[] stringBytes = Arrays.copyOfRange( 
+ deserializeRead.currentBytes, + deserializeRead.currentBytesStart, + deserializeRead.currentBytesStart + deserializeRead.currentBytesLength); + Text text = new Text(stringBytes); + String string = text.toString(); + + HiveChar hiveChar = new HiveChar(string, ((CharTypeInfo) primitiveTypeInfo).getLength()); + + HiveChar expected = ((HiveCharWritable) writable).getHiveChar(); + if (!hiveChar.equals(expected)) { + TestCase.fail("Char field mismatch (expected '" + expected + "' found '" + hiveChar + "')"); + } + } + break; + case VARCHAR: + { + byte[] stringBytes = Arrays.copyOfRange( + deserializeRead.currentBytes, + deserializeRead.currentBytesStart, + deserializeRead.currentBytesStart + deserializeRead.currentBytesLength); + Text text = new Text(stringBytes); + String string = text.toString(); + + HiveVarchar hiveVarchar = new HiveVarchar(string, ((VarcharTypeInfo) primitiveTypeInfo).getLength()); + + HiveVarchar expected = ((HiveVarcharWritable) writable).getHiveVarchar(); + if (!hiveVarchar.equals(expected)) { + TestCase.fail("Varchar field mismatch (expected '" + expected + "' found '" + hiveVarchar + "')"); + } + } + break; + case DECIMAL: + { + HiveDecimal value = deserializeRead.currentHiveDecimalWritable.getHiveDecimal(); + if (value == null) { + TestCase.fail("Decimal field evaluated to NULL"); + } + HiveDecimal expected = ((HiveDecimalWritable) writable).getHiveDecimal(); + if (!value.equals(expected)) { + DecimalTypeInfo decimalTypeInfo = (DecimalTypeInfo) primitiveTypeInfo; + int precision = decimalTypeInfo.getPrecision(); + int scale = decimalTypeInfo.getScale(); + TestCase.fail("Decimal field mismatch (expected " + expected.toString() + " found " + value.toString() + ") precision " + precision + ", scale " + scale); + } + } + break; + case DATE: + { + Date value = deserializeRead.currentDateWritable.get(); + Date expected = ((DateWritable) writable).get(); + if (!value.equals(expected)) { + TestCase.fail("Date field mismatch (expected " + expected.toString() + " found " + value.toString() + ")"); + } + } + break; + case TIMESTAMP: + { + Timestamp value = deserializeRead.currentTimestampWritable.getTimestamp(); + Timestamp expected = ((TimestampWritable) writable).getTimestamp(); + if (!value.equals(expected)) { + TestCase.fail("Timestamp field mismatch (expected " + expected.toString() + " found " + value.toString() + ")"); + } + } + break; + case INTERVAL_YEAR_MONTH: + { + HiveIntervalYearMonth value = deserializeRead.currentHiveIntervalYearMonthWritable.getHiveIntervalYearMonth(); + HiveIntervalYearMonth expected = ((HiveIntervalYearMonthWritable) writable).getHiveIntervalYearMonth(); + if (!value.equals(expected)) { + TestCase.fail("HiveIntervalYearMonth field mismatch (expected " + expected.toString() + " found " + value.toString() + ")"); + } + } + break; + case INTERVAL_DAY_TIME: + { + HiveIntervalDayTime value = deserializeRead.currentHiveIntervalDayTimeWritable.getHiveIntervalDayTime(); + HiveIntervalDayTime expected = ((HiveIntervalDayTimeWritable) writable).getHiveIntervalDayTime(); + if (!value.equals(expected)) { + TestCase.fail("HiveIntervalDayTime field mismatch (expected " + expected.toString() + " found " + value.toString() + ")"); + } + } + break; + case BINARY: + { + byte[] byteArray = Arrays.copyOfRange( + deserializeRead.currentBytes, + deserializeRead.currentBytesStart, + deserializeRead.currentBytesStart + deserializeRead.currentBytesLength); + BytesWritable bytesWritable = (BytesWritable) writable; + byte[] expected = 
Arrays.copyOfRange(bytesWritable.getBytes(), 0, bytesWritable.getLength()); + if (byteArray.length != expected.length){ + TestCase.fail("Byte Array field mismatch (expected " + Arrays.toString(expected) + + " found " + Arrays.toString(byteArray) + ")"); + } + for (int b = 0; b < byteArray.length; b++) { + if (byteArray[b] != expected[b]) { + TestCase.fail("Byte Array field mismatch (expected " + Arrays.toString(expected) + + " found " + Arrays.toString(byteArray) + ")"); + } + } + } + break; + default: + throw new Error("Unknown primitive category " + primitiveTypeInfo.getPrimitiveCategory()); + } + } + + public static void serializeWrite(SerializeWrite serializeWrite, + PrimitiveTypeInfo primitiveTypeInfo, Writable writable) throws IOException { + if (writable == null) { + serializeWrite.writeNull(); + return; + } + switch (primitiveTypeInfo.getPrimitiveCategory()) { + case BOOLEAN: + { + boolean value = ((BooleanWritable) writable).get(); + serializeWrite.writeBoolean(value); + } + break; + case BYTE: + { + byte value = ((ByteWritable) writable).get(); + serializeWrite.writeByte(value); + } + break; + case SHORT: + { + short value = ((ShortWritable) writable).get(); + serializeWrite.writeShort(value); + } + break; + case INT: + { + int value = ((IntWritable) writable).get(); + serializeWrite.writeInt(value); + } + break; + case LONG: + { + long value = ((LongWritable) writable).get(); + serializeWrite.writeLong(value); + } + break; + case FLOAT: + { + float value = ((FloatWritable) writable).get(); + serializeWrite.writeFloat(value); + } + break; + case DOUBLE: + { + double value = ((DoubleWritable) writable).get(); + serializeWrite.writeDouble(value); + } + break; + case STRING: + { + Text value = (Text) writable; + byte[] stringBytes = value.getBytes(); + int stringLength = stringBytes.length; + serializeWrite.writeString(stringBytes, 0, stringLength); + } + break; + case CHAR: + { + HiveChar value = ((HiveCharWritable) writable).getHiveChar(); + serializeWrite.writeHiveChar(value); + } + break; + case VARCHAR: + { + HiveVarchar value = ((HiveVarcharWritable) writable).getHiveVarchar(); + serializeWrite.writeHiveVarchar(value); + } + break; + case DECIMAL: + { + HiveDecimal value = ((HiveDecimalWritable) writable).getHiveDecimal(); + DecimalTypeInfo decTypeInfo = (DecimalTypeInfo)primitiveTypeInfo; + serializeWrite.writeHiveDecimal(value, decTypeInfo.scale()); + } + break; + case DATE: + { + Date value = ((DateWritable) writable).get(); + serializeWrite.writeDate(value); + } + break; + case TIMESTAMP: + { + Timestamp value = ((TimestampWritable) writable).getTimestamp(); + serializeWrite.writeTimestamp(value); + } + break; + case INTERVAL_YEAR_MONTH: + { + HiveIntervalYearMonth value = ((HiveIntervalYearMonthWritable) writable).getHiveIntervalYearMonth(); + serializeWrite.writeHiveIntervalYearMonth(value); + } + break; + case INTERVAL_DAY_TIME: + { + HiveIntervalDayTime value = ((HiveIntervalDayTimeWritable) writable).getHiveIntervalDayTime(); + serializeWrite.writeHiveIntervalDayTime(value); + } + break; + case BINARY: + { + BytesWritable byteWritable = (BytesWritable) writable; + byte[] binaryBytes = byteWritable.getBytes(); + int length = byteWritable.getLength(); + serializeWrite.writeBinary(binaryBytes, 0, length); + } + break; + default: + throw new Error("Unknown primitive category " + primitiveTypeInfo.getPrimitiveCategory().name()); + } + } +} \ No newline at end of file diff --git 
serde/src/java/org/apache/hadoop/hive/serde2/binarysortable/fast/BinarySortableDeserializeRead.java serde/src/java/org/apache/hadoop/hive/serde2/binarysortable/fast/BinarySortableDeserializeRead.java index be36ba4..003a2d4 100644 --- serde/src/java/org/apache/hadoop/hive/serde2/binarysortable/fast/BinarySortableDeserializeRead.java +++ serde/src/java/org/apache/hadoop/hive/serde2/binarysortable/fast/BinarySortableDeserializeRead.java @@ -42,7 +42,7 @@ * * Reading some fields require a results object to receive value information. A separate * results object is created by the caller at initialization per different field even for the same - * type. + * type. * * Some type values are by reference to either bytes in the deserialization buffer or to * other type specific buffers. So, those references are only valid until the next time set is @@ -61,6 +61,8 @@ private int fieldCount; private int start; + private int end; + private int fieldStart; private byte[] tempTimestampBytes; private Text tempText; @@ -68,7 +70,6 @@ private byte[] tempDecimalBuffer; private boolean readBeyondConfiguredFieldsWarned; - private boolean readBeyondBufferRangeWarned; private boolean bufferRangeHasExtraDataWarned; private InputByteBuffer inputByteBuffer = new InputByteBuffer(); @@ -92,7 +93,6 @@ public BinarySortableDeserializeRead(TypeInfo[] typeInfos, } inputByteBuffer = new InputByteBuffer(); readBeyondConfiguredFieldsWarned = false; - readBeyondBufferRangeWarned = false; bufferRangeHasExtraDataWarned = false; } @@ -107,8 +107,40 @@ private BinarySortableDeserializeRead() { @Override public void set(byte[] bytes, int offset, int length) { fieldIndex = -1; - inputByteBuffer.reset(bytes, offset, offset + length); start = offset; + end = offset + length; + inputByteBuffer.reset(bytes, start, end); + } + + /* + * Get detailed read position information to help diagnose exceptions. + */ + public String getDetailedReadPositionString() { + StringBuffer sb = new StringBuffer(); + + sb.append("Reading inputByteBuffer of length "); + sb.append(inputByteBuffer.getEnd()); + sb.append(" at start offset "); + sb.append(start); + sb.append(" for length "); + sb.append(end - start); + sb.append(" to read "); + sb.append(fieldCount); + sb.append(" fields with types "); + sb.append(Arrays.toString(typeInfos)); + sb.append(". "); + if (fieldIndex == -1) { + sb.append("Before first field?"); + } else { + sb.append("Read field #"); + sb.append(fieldIndex); + sb.append(" at field start position "); + sb.append(fieldStart); + sb.append(" current read offset "); + sb.append(inputByteBuffer.tell()); + } + + return sb.toString(); } /* @@ -133,12 +165,11 @@ public boolean readCheckNull() throws IOException { } if (inputByteBuffer.isEof()) { // Also, reading beyond our byte range produces NULL. - if (!readBeyondBufferRangeWarned) { - doReadBeyondBufferRangeWarned(); - } - // We cannot read beyond so we must return NULL here. return true; } + + fieldStart = inputByteBuffer.tell(); + byte isNullByte = inputByteBuffer.read(columnSortOrderIsDesc[fieldIndex]); if (isNullByte == 0) { @@ -298,7 +329,7 @@ public boolean readCheckNull() throws IOException { factor = -factor; } - int start = inputByteBuffer.tell(); + int decimalStart = inputByteBuffer.tell(); int length = 0; do { @@ -317,7 +348,7 @@ public boolean readCheckNull() throws IOException { tempDecimalBuffer = new byte[length]; } - inputByteBuffer.seek(start); + inputByteBuffer.seek(decimalStart); for (int i = 0; i < length; ++i) { tempDecimalBuffer[i] = inputByteBuffer.read(positive ? 
invert : !invert); } @@ -392,10 +423,6 @@ public boolean readBeyondConfiguredFieldsWarned() { return readBeyondConfiguredFieldsWarned; } @Override - public boolean readBeyondBufferRangeWarned() { - return readBeyondBufferRangeWarned; - } - @Override public boolean bufferRangeHasExtraDataWarned() { return bufferRangeHasExtraDataWarned; } @@ -410,14 +437,4 @@ private void doReadBeyondConfiguredFieldsWarned() { + " reading more (NULLs returned). Ignoring similar problems."); readBeyondConfiguredFieldsWarned = true; } - - private void doReadBeyondBufferRangeWarned() { - // Warn only once. - int length = inputByteBuffer.tell() - start; - LOG.info("Reading beyond buffer range! Buffer range " + start - + " for length " + length + " but reading more... " - + "(total buffer length " + inputByteBuffer.getData().length + ")" - + " Ignoring similar problems."); - readBeyondBufferRangeWarned = true; - } } diff --git serde/src/java/org/apache/hadoop/hive/serde2/fast/DeserializeRead.java serde/src/java/org/apache/hadoop/hive/serde2/fast/DeserializeRead.java index 2fad2af..8f3e771 100644 --- serde/src/java/org/apache/hadoop/hive/serde2/fast/DeserializeRead.java +++ serde/src/java/org/apache/hadoop/hive/serde2/fast/DeserializeRead.java @@ -143,10 +143,14 @@ public void setColumnsToInclude(boolean[] columnsToInclude) { * Read integrity warning flags. */ public abstract boolean readBeyondConfiguredFieldsWarned(); - public abstract boolean readBeyondBufferRangeWarned(); public abstract boolean bufferRangeHasExtraDataWarned(); /* + * Get detailed read position information to help diagnose exceptions. + */ + public abstract String getDetailedReadPositionString(); + + /* * These members hold the current value that was read when readCheckNull return false. */ diff --git serde/src/java/org/apache/hadoop/hive/serde2/fast/RandomRowObjectSource.java serde/src/java/org/apache/hadoop/hive/serde2/fast/RandomRowObjectSource.java deleted file mode 100644 index 1bb990c..0000000 --- serde/src/java/org/apache/hadoop/hive/serde2/fast/RandomRowObjectSource.java +++ /dev/null @@ -1,423 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.hadoop.hive.serde2.fast; - -import java.sql.Date; -import java.sql.Timestamp; -import java.util.ArrayList; -import java.util.HashSet; -import java.util.List; -import java.util.Random; - -import org.apache.commons.lang.ArrayUtils; -import org.apache.commons.lang.StringUtils; -import org.apache.hadoop.hive.common.type.HiveChar; -import org.apache.hadoop.hive.common.type.HiveDecimal; -import org.apache.hadoop.hive.common.type.HiveIntervalDayTime; -import org.apache.hadoop.hive.common.type.HiveIntervalYearMonth; -import org.apache.hadoop.hive.common.type.HiveVarchar; -import org.apache.hadoop.hive.common.type.RandomTypeUtil; -import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; -import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils; -import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory; -import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; -import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableBooleanObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableByteObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableDateObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableDoubleObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableFloatObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableHiveCharObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableHiveDecimalObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableHiveIntervalDayTimeObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableHiveIntervalYearMonthObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableHiveVarcharObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableIntObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableLongObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableShortObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableStringObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableTimestampObjectInspector; -import org.apache.hadoop.hive.serde2.typeinfo.CharTypeInfo; -import org.apache.hadoop.hive.serde2.typeinfo.DecimalTypeInfo; -import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; -import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils; -import org.apache.hadoop.hive.serde2.typeinfo.VarcharTypeInfo; -import org.apache.hive.common.util.DateUtils; - -/** - * Generate object inspector and random row object[]. 
- */ -public class RandomRowObjectSource { - - private Random r; - - private int columnCount; - - private List typeNames; - - private PrimitiveCategory[] primitiveCategories; - - private PrimitiveTypeInfo[] primitiveTypeInfos; - - private List primitiveObjectInspectorList; - - private StructObjectInspector rowStructObjectInspector; - - public List typeNames() { - return typeNames; - } - - public PrimitiveCategory[] primitiveCategories() { - return primitiveCategories; - } - - public PrimitiveTypeInfo[] primitiveTypeInfos() { - return primitiveTypeInfos; - } - - public StructObjectInspector rowStructObjectInspector() { - return rowStructObjectInspector; - } - - public StructObjectInspector partialRowStructObjectInspector(int partialFieldCount) { - ArrayList partialPrimitiveObjectInspectorList = - new ArrayList(partialFieldCount); - List columnNames = new ArrayList(partialFieldCount); - for (int i = 0; i < partialFieldCount; i++) { - columnNames.add(String.format("partial%d", i)); - partialPrimitiveObjectInspectorList.add( - PrimitiveObjectInspectorFactory.getPrimitiveWritableObjectInspector( - primitiveTypeInfos[i])); - } - - return ObjectInspectorFactory.getStandardStructObjectInspector( - columnNames, primitiveObjectInspectorList); - } - - public void init(Random r) { - this.r = r; - chooseSchema(); - } - - /* - * For now, exclude CHAR until we determine why there is a difference (blank padding) - * serializing with LazyBinarySerializeWrite and the regular SerDe... - */ - private static String[] possibleHiveTypeNames = { - "boolean", - "tinyint", - "smallint", - "int", - "bigint", - "date", - "float", - "double", - "string", -// "char", - "varchar", - "binary", - "date", - "timestamp", - "interval_year_month", - "interval_day_time", - "decimal" - }; - - private void chooseSchema() { - HashSet hashSet = null; - boolean allTypes; - boolean onlyOne = (r.nextInt(100) == 7); - if (onlyOne) { - columnCount = 1; - allTypes = false; - } else { - allTypes = r.nextBoolean(); - if (allTypes) { - // One of each type. 
- columnCount = possibleHiveTypeNames.length; - hashSet = new HashSet(); - } else { - columnCount = 1 + r.nextInt(20); - } - } - typeNames = new ArrayList(columnCount); - primitiveCategories = new PrimitiveCategory[columnCount]; - primitiveTypeInfos = new PrimitiveTypeInfo[columnCount]; - primitiveObjectInspectorList = new ArrayList(columnCount); - List columnNames = new ArrayList(columnCount); - for (int c = 0; c < columnCount; c++) { - columnNames.add(String.format("col%d", c)); - String typeName; - - if (onlyOne) { - typeName = possibleHiveTypeNames[r.nextInt(possibleHiveTypeNames.length)]; - } else { - int typeNum; - if (allTypes) { - while (true) { - typeNum = r.nextInt(possibleHiveTypeNames.length); - Integer typeNumInteger = new Integer(typeNum); - if (!hashSet.contains(typeNumInteger)) { - hashSet.add(typeNumInteger); - break; - } - } - } else { - typeNum = r.nextInt(possibleHiveTypeNames.length); - } - typeName = possibleHiveTypeNames[typeNum]; - } - if (typeName.equals("char")) { - int maxLength = 1 + r.nextInt(100); - typeName = String.format("char(%d)", maxLength); - } else if (typeName.equals("varchar")) { - int maxLength = 1 + r.nextInt(100); - typeName = String.format("varchar(%d)", maxLength); - } else if (typeName.equals("decimal")) { - typeName = String.format("decimal(%d,%d)", HiveDecimal.SYSTEM_DEFAULT_PRECISION, HiveDecimal.SYSTEM_DEFAULT_SCALE); - } - PrimitiveTypeInfo primitiveTypeInfo = (PrimitiveTypeInfo) TypeInfoUtils.getTypeInfoFromTypeString(typeName); - primitiveTypeInfos[c] = primitiveTypeInfo; - PrimitiveCategory primitiveCategory = primitiveTypeInfo.getPrimitiveCategory(); - primitiveCategories[c] = primitiveCategory; - primitiveObjectInspectorList.add(PrimitiveObjectInspectorFactory.getPrimitiveWritableObjectInspector(primitiveTypeInfo)); - typeNames.add(typeName); - } - rowStructObjectInspector = ObjectInspectorFactory.getStandardStructObjectInspector(columnNames, primitiveObjectInspectorList); - } - - public Object[][] randomRows(int n) { - Object[][] result = new Object[n][]; - for (int i = 0; i < n; i++) { - result[i] = randomRow(); - } - return result; - } - - public Object[] randomRow() { - Object row[] = new Object[columnCount]; - for (int c = 0; c < columnCount; c++) { - Object object = randomObject(c); - if (object == null) { - throw new Error("Unexpected null for column " + c); - } - row[c] = getWritableObject(c, object); - if (row[c] == null) { - throw new Error("Unexpected null for writable for column " + c); - } - } - return row; - } - - public static void sort(Object[][] rows, ObjectInspector oi) { - for (int i = 0; i < rows.length; i++) { - for (int j = i + 1; j < rows.length; j++) { - if (ObjectInspectorUtils.compare(rows[i], oi, rows[j], oi) > 0) { - Object[] t = rows[i]; - rows[i] = rows[j]; - rows[j] = t; - } - } - } - } - - public void sort(Object[][] rows) { - RandomRowObjectSource.sort(rows, rowStructObjectInspector); - } - - public Object getWritableObject(int column, Object object) { - ObjectInspector objectInspector = primitiveObjectInspectorList.get(column); - PrimitiveCategory primitiveCategory = primitiveCategories[column]; - PrimitiveTypeInfo primitiveTypeInfo = primitiveTypeInfos[column]; - switch (primitiveCategory) { - case BOOLEAN: - return ((WritableBooleanObjectInspector) objectInspector).create((boolean) object); - case BYTE: - return ((WritableByteObjectInspector) objectInspector).create((byte) object); - case SHORT: - return ((WritableShortObjectInspector) objectInspector).create((short) object); - case INT: - return 
((WritableIntObjectInspector) objectInspector).create((int) object); - case LONG: - return ((WritableLongObjectInspector) objectInspector).create((long) object); - case DATE: - return ((WritableDateObjectInspector) objectInspector).create((Date) object); - case FLOAT: - return ((WritableFloatObjectInspector) objectInspector).create((float) object); - case DOUBLE: - return ((WritableDoubleObjectInspector) objectInspector).create((double) object); - case STRING: - return ((WritableStringObjectInspector) objectInspector).create((String) object); - case CHAR: - { - WritableHiveCharObjectInspector writableCharObjectInspector = - new WritableHiveCharObjectInspector( (CharTypeInfo) primitiveTypeInfo); - return writableCharObjectInspector.create(new HiveChar(StringUtils.EMPTY, -1)); - } - case VARCHAR: - { - WritableHiveVarcharObjectInspector writableVarcharObjectInspector = - new WritableHiveVarcharObjectInspector( (VarcharTypeInfo) primitiveTypeInfo); - return writableVarcharObjectInspector.create(new HiveVarchar(StringUtils.EMPTY, -1)); - } - case BINARY: - return PrimitiveObjectInspectorFactory.writableBinaryObjectInspector.create(ArrayUtils.EMPTY_BYTE_ARRAY); - case TIMESTAMP: - return ((WritableTimestampObjectInspector) objectInspector).create(new Timestamp(0)); - case INTERVAL_YEAR_MONTH: - return ((WritableHiveIntervalYearMonthObjectInspector) objectInspector).create(new HiveIntervalYearMonth(0)); - case INTERVAL_DAY_TIME: - return ((WritableHiveIntervalDayTimeObjectInspector) objectInspector).create(new HiveIntervalDayTime(0, 0)); - case DECIMAL: - { - WritableHiveDecimalObjectInspector writableDecimalObjectInspector = - new WritableHiveDecimalObjectInspector((DecimalTypeInfo) primitiveTypeInfo); - return writableDecimalObjectInspector.create(HiveDecimal.ZERO); - } - default: - throw new Error("Unknown primitive category " + primitiveCategory); - } - } - - public Object randomObject(int column) { - PrimitiveCategory primitiveCategory = primitiveCategories[column]; - PrimitiveTypeInfo primitiveTypeInfo = primitiveTypeInfos[column]; - switch (primitiveCategory) { - case BOOLEAN: - return Boolean.valueOf(r.nextInt(1) == 1); - case BYTE: - return Byte.valueOf((byte) r.nextInt()); - case SHORT: - return Short.valueOf((short) r.nextInt()); - case INT: - return Integer.valueOf(r.nextInt()); - case LONG: - return Long.valueOf(r.nextLong()); - case DATE: - return RandomTypeUtil.getRandDate(r); - case FLOAT: - return Float.valueOf(r.nextFloat() * 10 - 5); - case DOUBLE: - return Double.valueOf(r.nextDouble() * 10 - 5); - case STRING: - return RandomTypeUtil.getRandString(r); - case CHAR: - return getRandHiveChar(r, (CharTypeInfo) primitiveTypeInfo); - case VARCHAR: - return getRandHiveVarchar(r, (VarcharTypeInfo) primitiveTypeInfo); - case BINARY: - return getRandBinary(r, 1 + r.nextInt(100)); - case TIMESTAMP: - return RandomTypeUtil.getRandTimestamp(r); - case INTERVAL_YEAR_MONTH: - return getRandIntervalYearMonth(r); - case INTERVAL_DAY_TIME: - return getRandIntervalDayTime(r); - case DECIMAL: - return getRandHiveDecimal(r, (DecimalTypeInfo) primitiveTypeInfo); - default: - throw new Error("Unknown primitive category " + primitiveCategory); - } - } - - public static HiveChar getRandHiveChar(Random r, CharTypeInfo charTypeInfo) { - int maxLength = 1 + r.nextInt(charTypeInfo.getLength()); - String randomString = RandomTypeUtil.getRandString(r, "abcdefghijklmnopqrstuvwxyz", 100); - HiveChar hiveChar = new HiveChar(randomString, maxLength); - return hiveChar; - } - - public static HiveVarchar 
getRandHiveVarchar(Random r, VarcharTypeInfo varcharTypeInfo) { - int maxLength = 1 + r.nextInt(varcharTypeInfo.getLength()); - String randomString = RandomTypeUtil.getRandString(r, "abcdefghijklmnopqrstuvwxyz", 100); - HiveVarchar hiveVarchar = new HiveVarchar(randomString, maxLength); - return hiveVarchar; - } - - public static byte[] getRandBinary(Random r, int len){ - byte[] bytes = new byte[len]; - for (int j = 0; j < len; j++){ - bytes[j] = Byte.valueOf((byte) r.nextInt()); - } - return bytes; - } - - private static final String DECIMAL_CHARS = "0123456789"; - - public static HiveDecimal getRandHiveDecimal(Random r, DecimalTypeInfo decimalTypeInfo) { - while (true) { - StringBuilder sb = new StringBuilder(); - int precision = 1 + r.nextInt(18); - int scale = 0 + r.nextInt(precision + 1); - - int integerDigits = precision - scale; - - if (r.nextBoolean()) { - sb.append("-"); - } - - if (integerDigits == 0) { - sb.append("0"); - } else { - sb.append(RandomTypeUtil.getRandString(r, DECIMAL_CHARS, integerDigits)); - } - if (scale != 0) { - sb.append("."); - sb.append(RandomTypeUtil.getRandString(r, DECIMAL_CHARS, scale)); - } - - HiveDecimal bd = HiveDecimal.create(sb.toString()); - if (bd.scale() > bd.precision()) { - // Sometimes weird decimals are produced? - continue; - } - - return bd; - } - } - - public static HiveIntervalYearMonth getRandIntervalYearMonth(Random r) { - String yearMonthSignStr = r.nextInt(2) == 0 ? "" : "-"; - String intervalYearMonthStr = String.format("%s%d-%d", - yearMonthSignStr, - Integer.valueOf(1800 + r.nextInt(500)), // year - Integer.valueOf(0 + r.nextInt(12))); // month - HiveIntervalYearMonth intervalYearMonthVal = HiveIntervalYearMonth.valueOf(intervalYearMonthStr); - return intervalYearMonthVal; - } - - public static HiveIntervalDayTime getRandIntervalDayTime(Random r) { - String optionalNanos = ""; - if (r.nextInt(2) == 1) { - optionalNanos = String.format(".%09d", - Integer.valueOf(0 + r.nextInt(DateUtils.NANOS_PER_SEC))); - } - String yearMonthSignStr = r.nextInt(2) == 0 ? "" : "-"; - String dayTimeStr = String.format("%s%d %02d:%02d:%02d%s", - yearMonthSignStr, - Integer.valueOf(1 + r.nextInt(28)), // day - Integer.valueOf(0 + r.nextInt(24)), // hour - Integer.valueOf(0 + r.nextInt(60)), // minute - Integer.valueOf(0 + r.nextInt(60)), // second - optionalNanos); - HiveIntervalDayTime intervalDayTimeVal = HiveIntervalDayTime.valueOf(dayTimeStr); - return intervalDayTimeVal; - } -} diff --git serde/src/java/org/apache/hadoop/hive/serde2/lazy/fast/LazySimpleDeserializeRead.java serde/src/java/org/apache/hadoop/hive/serde2/lazy/fast/LazySimpleDeserializeRead.java index b375d26..9869452 100644 --- serde/src/java/org/apache/hadoop/hive/serde2/lazy/fast/LazySimpleDeserializeRead.java +++ serde/src/java/org/apache/hadoop/hive/serde2/lazy/fast/LazySimpleDeserializeRead.java @@ -21,6 +21,8 @@ import java.io.UnsupportedEncodingException; import java.nio.charset.CharacterCodingException; import java.sql.Date; +import java.util.Arrays; + import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.hive.common.type.HiveDecimal; @@ -72,6 +74,7 @@ private int end; private int fieldCount; private int fieldIndex; + private int parseFieldIndex; private int fieldStart; private int fieldLength; @@ -123,6 +126,41 @@ public void set(byte[] bytes, int offset, int length) { fieldIndex = -1; } + /* + * Get detailed read position information to help diagnose exceptions. 
+ */ + public String getDetailedReadPositionString() { + StringBuffer sb = new StringBuffer(); + + sb.append("Reading byte[] of length "); + sb.append(bytes.length); + sb.append(" at start offset "); + sb.append(start); + sb.append(" for length "); + sb.append(end - start); + sb.append(" to read "); + sb.append(fieldCount); + sb.append(" fields with types "); + sb.append(Arrays.toString(typeInfos)); + sb.append(". "); + if (fieldIndex == -1) { + sb.append("Error during field delimitor parsing of field #"); + sb.append(parseFieldIndex); + } else { + sb.append("Read field #"); + sb.append(fieldIndex); + sb.append(" at field start position "); + sb.append(startPosition[fieldIndex]); + int currentFieldLength = startPosition[fieldIndex + 1] - startPosition[fieldIndex] - 1; + sb.append(" for field length "); + sb.append(currentFieldLength); + sb.append(" current read offset "); + sb.append(offset); + } + + return sb.toString(); + } + /** * Parse the byte[] and fill each field. * @@ -132,27 +170,29 @@ public void set(byte[] bytes, int offset, int length) { private void parse() { int structByteEnd = end; - int fieldId = 0; int fieldByteBegin = start; int fieldByteEnd = start; + // Kept as a member variable to support getDetailedReadPositionString. + parseFieldIndex = 0; + // Go through all bytes in the byte[] while (fieldByteEnd <= structByteEnd) { if (fieldByteEnd == structByteEnd || bytes[fieldByteEnd] == separator) { // Reached the end of a field? - if (lastColumnTakesRest && fieldId == fieldCount - 1) { + if (lastColumnTakesRest && parseFieldIndex == fieldCount - 1) { fieldByteEnd = structByteEnd; } - startPosition[fieldId] = fieldByteBegin; - fieldId++; - if (fieldId == fieldCount || fieldByteEnd == structByteEnd) { + startPosition[parseFieldIndex] = fieldByteBegin; + parseFieldIndex++; + if (parseFieldIndex == fieldCount || fieldByteEnd == structByteEnd) { // All fields have been parsed, or bytes have been parsed. // We need to set the startPosition of fields.length to ensure we // can use the same formula to calculate the length of each field. // For missing fields, their starting positions will all be the same, // which will make their lengths to be -1 and uncheckedGetField will // return these fields as NULLs. - for (int i = fieldId; i <= fieldCount; i++) { + for (int i = parseFieldIndex; i <= fieldCount; i++) { startPosition[i] = fieldByteEnd + 1; } break; @@ -176,8 +216,8 @@ private void parse() { } // Missing fields? - if (!missingFieldWarned && fieldId < fieldCount) { - doMissingFieldWarned(fieldId); + if (!missingFieldWarned && parseFieldIndex < fieldCount) { + doMissingFieldWarned(parseFieldIndex); } } @@ -484,12 +524,8 @@ public boolean readBeyondConfiguredFieldsWarned() { return missingFieldWarned; } @Override - public boolean readBeyondBufferRangeWarned() { - return extraFieldWarned; - } - @Override public boolean bufferRangeHasExtraDataWarned() { - return false; // UNDONE: Get rid of... 
+ return false; } private void doExtraFieldWarned() { diff --git serde/src/java/org/apache/hadoop/hive/serde2/lazybinary/fast/LazyBinaryDeserializeRead.java serde/src/java/org/apache/hadoop/hive/serde2/lazybinary/fast/LazyBinaryDeserializeRead.java index bbb35c7..9ff90fc 100644 --- serde/src/java/org/apache/hadoop/hive/serde2/lazybinary/fast/LazyBinaryDeserializeRead.java +++ serde/src/java/org/apache/hadoop/hive/serde2/lazybinary/fast/LazyBinaryDeserializeRead.java @@ -20,6 +20,8 @@ import java.io.EOFException; import java.io.IOException; +import java.util.Arrays; + import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.hive.common.type.HiveDecimal; @@ -54,6 +56,7 @@ private int offset; private int end; private int fieldCount; + private int fieldStart; private int fieldIndex; private byte nullByte; @@ -62,7 +65,6 @@ private VLong tempVLong; private boolean readBeyondConfiguredFieldsWarned; - private boolean readBeyondBufferRangeWarned; private boolean bufferRangeHasExtraDataWarned; public LazyBinaryDeserializeRead(TypeInfo[] typeInfos) { @@ -71,7 +73,6 @@ public LazyBinaryDeserializeRead(TypeInfo[] typeInfos) { tempVInt = new VInt(); tempVLong = new VLong(); readBeyondConfiguredFieldsWarned = false; - readBeyondBufferRangeWarned = false; bufferRangeHasExtraDataWarned = false; } @@ -93,6 +94,32 @@ public void set(byte[] bytes, int offset, int length) { } /* + * Get detailed read position information to help diagnose exceptions. + */ + public String getDetailedReadPositionString() { + StringBuffer sb = new StringBuffer(); + + sb.append("Reading byte[] of length "); + sb.append(bytes.length); + sb.append(" at start offset "); + sb.append(start); + sb.append(" for length "); + sb.append(end - start); + sb.append(" to read "); + sb.append(fieldCount); + sb.append(" fields with types "); + sb.append(Arrays.toString(typeInfos)); + sb.append(". Read field #"); + sb.append(fieldIndex); + sb.append(" at field start position "); + sb.append(fieldStart); + sb.append(" current read offset "); + sb.append(offset); + + return sb.toString(); + } + + /* * Reads the NULL information for a field. * * @return Returns true when the field is NULL; reading is positioned to the next field. @@ -111,11 +138,13 @@ public boolean readCheckNull() throws IOException { return true; } + fieldStart = offset; + if (fieldIndex == 0) { // The rest of the range check for fields after the first is below after checking // the NULL byte. if (offset >= end) { - warnBeyondEof(); + throw new EOFException(); } nullByte = bytes[offset++]; } @@ -129,9 +158,7 @@ public boolean readCheckNull() throws IOException { // Make sure there is at least one byte that can be read for a value. if (offset >= end) { - // Careful: since we may be dealing with NULLs in the final NULL byte, we check after - // the NULL byte check.. - warnBeyondEof(); + throw new EOFException(); } /* @@ -149,7 +176,7 @@ public boolean readCheckNull() throws IOException { case SHORT: // Last item -- ok to be at end. if (offset + 2 > end) { - warnBeyondEof(); + throw new EOFException(); } currentShort = LazyBinaryUtils.byteArrayToShort(bytes, offset); offset += 2; @@ -159,7 +186,7 @@ public boolean readCheckNull() throws IOException { offset += tempVInt.length; // Last item -- ok to be at end. if (offset > end) { - warnBeyondEof(); + throw new EOFException(); } currentInt = tempVInt.value; break; @@ -168,14 +195,14 @@ public boolean readCheckNull() throws IOException { offset += tempVLong.length; // Last item -- ok to be at end. 
if (offset > end) { - warnBeyondEof(); + throw new EOFException(); } currentLong = tempVLong.value; break; case FLOAT: // Last item -- ok to be at end. if (offset + 4 > end) { - warnBeyondEof(); + throw new EOFException(); } currentFloat = Float.intBitsToFloat(LazyBinaryUtils.byteArrayToInt(bytes, offset)); offset += 4; @@ -183,7 +210,7 @@ public boolean readCheckNull() throws IOException { case DOUBLE: // Last item -- ok to be at end. if (offset + 8 > end) { - warnBeyondEof(); + throw new EOFException(); } currentDouble = Double.longBitsToDouble(LazyBinaryUtils.byteArrayToLong(bytes, offset)); offset += 8; @@ -199,14 +226,14 @@ public boolean readCheckNull() throws IOException { offset += tempVInt.length; // Could be last item for empty string -- ok to be at end. if (offset > end) { - warnBeyondEof(); + throw new EOFException(); } int saveStart = offset; int length = tempVInt.value; offset += length; // Last item -- ok to be at end. if (offset > end) { - warnBeyondEof(); + throw new EOFException(); } currentBytes = bytes; @@ -219,7 +246,7 @@ public boolean readCheckNull() throws IOException { offset += tempVInt.length; // Last item -- ok to be at end. if (offset > end) { - warnBeyondEof(); + throw new EOFException(); } currentDateWritable.set(tempVInt.value); @@ -231,7 +258,7 @@ public boolean readCheckNull() throws IOException { offset += length; // Last item -- ok to be at end. if (offset > end) { - warnBeyondEof(); + throw new EOFException(); } currentTimestampWritable.set(bytes, saveStart); @@ -242,7 +269,7 @@ public boolean readCheckNull() throws IOException { offset += tempVInt.length; // Last item -- ok to be at end. if (offset > end) { - warnBeyondEof(); + throw new EOFException(); } currentHiveIntervalYearMonthWritable.set(tempVInt.value); break; @@ -251,13 +278,13 @@ public boolean readCheckNull() throws IOException { offset += tempVLong.length; if (offset >= end) { // Overshoot or not enough for next item. - warnBeyondEof(); + throw new EOFException(); } LazyBinaryUtils.readVInt(bytes, offset, tempVInt); offset += tempVInt.length; // Last item -- ok to be at end. if (offset > end) { - warnBeyondEof(); + throw new EOFException(); } currentHiveIntervalDayTimeWritable.set(tempVLong.value, tempVInt.value); @@ -274,18 +301,18 @@ public boolean readCheckNull() throws IOException { offset += tempVInt.length; if (offset >= end) { // Overshoot or not enough for next item. - warnBeyondEof(); + throw new EOFException(); } LazyBinaryUtils.readVInt(bytes, offset, tempVInt); offset += tempVInt.length; if (offset >= end) { // Overshoot or not enough for next item. - warnBeyondEof(); + throw new EOFException(); } offset += tempVInt.value; // Last item -- ok to be at end. if (offset > end) { - warnBeyondEof(); + throw new EOFException(); } int length = offset - saveStart; @@ -327,7 +354,7 @@ public boolean readCheckNull() throws IOException { if ((fieldIndex % 8) == 0) { // Get next null byte. if (offset >= end) { - warnBeyondEof(); + throw new EOFException(); } nullByte = bytes[offset++]; } @@ -363,23 +390,7 @@ public boolean readBeyondConfiguredFieldsWarned() { return readBeyondConfiguredFieldsWarned; } @Override - public boolean readBeyondBufferRangeWarned() { - return readBeyondBufferRangeWarned; - } - @Override public boolean bufferRangeHasExtraDataWarned() { return bufferRangeHasExtraDataWarned; } - - private void warnBeyondEof() throws EOFException { - if (!readBeyondBufferRangeWarned) { - // Warn only once. - int length = end - start; - LOG.info("Reading beyond buffer range! 
Buffer range " + start - + " for length " + length + " but reading more... " - + "(total buffer length " + bytes.length + ")" - + " Ignoring similar problems."); - readBeyondBufferRangeWarned = true; - } - } } diff --git serde/src/test/org/apache/hadoop/hive/serde2/SerdeRandomRowSource.java serde/src/test/org/apache/hadoop/hive/serde2/SerdeRandomRowSource.java new file mode 100644 index 0000000..f08a075 --- /dev/null +++ serde/src/test/org/apache/hadoop/hive/serde2/SerdeRandomRowSource.java @@ -0,0 +1,423 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.serde2; + +import java.sql.Date; +import java.sql.Timestamp; +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; +import java.util.Random; + +import org.apache.commons.lang.ArrayUtils; +import org.apache.commons.lang.StringUtils; +import org.apache.hadoop.hive.common.type.HiveChar; +import org.apache.hadoop.hive.common.type.HiveDecimal; +import org.apache.hadoop.hive.common.type.HiveIntervalDayTime; +import org.apache.hadoop.hive.common.type.HiveIntervalYearMonth; +import org.apache.hadoop.hive.common.type.HiveVarchar; +import org.apache.hadoop.hive.common.type.RandomTypeUtil; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils; +import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory; +import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableBooleanObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableByteObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableDateObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableDoubleObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableFloatObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableHiveCharObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableHiveDecimalObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableHiveIntervalDayTimeObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableHiveIntervalYearMonthObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableHiveVarcharObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableIntObjectInspector; 
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableLongObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableShortObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableStringObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableTimestampObjectInspector; +import org.apache.hadoop.hive.serde2.typeinfo.CharTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.DecimalTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils; +import org.apache.hadoop.hive.serde2.typeinfo.VarcharTypeInfo; +import org.apache.hive.common.util.DateUtils; + +/** + * Generate object inspector and random row object[]. + */ +public class SerdeRandomRowSource { + + private Random r; + + private int columnCount; + + private List typeNames; + + private PrimitiveCategory[] primitiveCategories; + + private PrimitiveTypeInfo[] primitiveTypeInfos; + + private List primitiveObjectInspectorList; + + private StructObjectInspector rowStructObjectInspector; + + public List typeNames() { + return typeNames; + } + + public PrimitiveCategory[] primitiveCategories() { + return primitiveCategories; + } + + public PrimitiveTypeInfo[] primitiveTypeInfos() { + return primitiveTypeInfos; + } + + public StructObjectInspector rowStructObjectInspector() { + return rowStructObjectInspector; + } + + public StructObjectInspector partialRowStructObjectInspector(int partialFieldCount) { + ArrayList partialPrimitiveObjectInspectorList = + new ArrayList(partialFieldCount); + List columnNames = new ArrayList(partialFieldCount); + for (int i = 0; i < partialFieldCount; i++) { + columnNames.add(String.format("partial%d", i)); + partialPrimitiveObjectInspectorList.add( + PrimitiveObjectInspectorFactory.getPrimitiveWritableObjectInspector( + primitiveTypeInfos[i])); + } + + return ObjectInspectorFactory.getStandardStructObjectInspector( + columnNames, primitiveObjectInspectorList); + } + + public void init(Random r) { + this.r = r; + chooseSchema(); + } + + /* + * For now, exclude CHAR until we determine why there is a difference (blank padding) + * serializing with LazyBinarySerializeWrite and the regular SerDe... + */ + private static String[] possibleHiveTypeNames = { + "boolean", + "tinyint", + "smallint", + "int", + "bigint", + "date", + "float", + "double", + "string", +// "char", + "varchar", + "binary", + "date", + "timestamp", + "interval_year_month", + "interval_day_time", + "decimal" + }; + + private void chooseSchema() { + HashSet hashSet = null; + boolean allTypes; + boolean onlyOne = (r.nextInt(100) == 7); + if (onlyOne) { + columnCount = 1; + allTypes = false; + } else { + allTypes = r.nextBoolean(); + if (allTypes) { + // One of each type. 
+ columnCount = possibleHiveTypeNames.length; + hashSet = new HashSet(); + } else { + columnCount = 1 + r.nextInt(20); + } + } + typeNames = new ArrayList(columnCount); + primitiveCategories = new PrimitiveCategory[columnCount]; + primitiveTypeInfos = new PrimitiveTypeInfo[columnCount]; + primitiveObjectInspectorList = new ArrayList(columnCount); + List columnNames = new ArrayList(columnCount); + for (int c = 0; c < columnCount; c++) { + columnNames.add(String.format("col%d", c)); + String typeName; + + if (onlyOne) { + typeName = possibleHiveTypeNames[r.nextInt(possibleHiveTypeNames.length)]; + } else { + int typeNum; + if (allTypes) { + while (true) { + typeNum = r.nextInt(possibleHiveTypeNames.length); + Integer typeNumInteger = new Integer(typeNum); + if (!hashSet.contains(typeNumInteger)) { + hashSet.add(typeNumInteger); + break; + } + } + } else { + typeNum = r.nextInt(possibleHiveTypeNames.length); + } + typeName = possibleHiveTypeNames[typeNum]; + } + if (typeName.equals("char")) { + int maxLength = 1 + r.nextInt(100); + typeName = String.format("char(%d)", maxLength); + } else if (typeName.equals("varchar")) { + int maxLength = 1 + r.nextInt(100); + typeName = String.format("varchar(%d)", maxLength); + } else if (typeName.equals("decimal")) { + typeName = String.format("decimal(%d,%d)", HiveDecimal.SYSTEM_DEFAULT_PRECISION, HiveDecimal.SYSTEM_DEFAULT_SCALE); + } + PrimitiveTypeInfo primitiveTypeInfo = (PrimitiveTypeInfo) TypeInfoUtils.getTypeInfoFromTypeString(typeName); + primitiveTypeInfos[c] = primitiveTypeInfo; + PrimitiveCategory primitiveCategory = primitiveTypeInfo.getPrimitiveCategory(); + primitiveCategories[c] = primitiveCategory; + primitiveObjectInspectorList.add(PrimitiveObjectInspectorFactory.getPrimitiveWritableObjectInspector(primitiveTypeInfo)); + typeNames.add(typeName); + } + rowStructObjectInspector = ObjectInspectorFactory.getStandardStructObjectInspector(columnNames, primitiveObjectInspectorList); + } + + public Object[][] randomRows(int n) { + Object[][] result = new Object[n][]; + for (int i = 0; i < n; i++) { + result[i] = randomRow(); + } + return result; + } + + public Object[] randomRow() { + Object row[] = new Object[columnCount]; + for (int c = 0; c < columnCount; c++) { + Object object = randomObject(c); + if (object == null) { + throw new Error("Unexpected null for column " + c); + } + row[c] = getWritableObject(c, object); + if (row[c] == null) { + throw new Error("Unexpected null for writable for column " + c); + } + } + return row; + } + + public static void sort(Object[][] rows, ObjectInspector oi) { + for (int i = 0; i < rows.length; i++) { + for (int j = i + 1; j < rows.length; j++) { + if (ObjectInspectorUtils.compare(rows[i], oi, rows[j], oi) > 0) { + Object[] t = rows[i]; + rows[i] = rows[j]; + rows[j] = t; + } + } + } + } + + public void sort(Object[][] rows) { + SerdeRandomRowSource.sort(rows, rowStructObjectInspector); + } + + public Object getWritableObject(int column, Object object) { + ObjectInspector objectInspector = primitiveObjectInspectorList.get(column); + PrimitiveCategory primitiveCategory = primitiveCategories[column]; + PrimitiveTypeInfo primitiveTypeInfo = primitiveTypeInfos[column]; + switch (primitiveCategory) { + case BOOLEAN: + return ((WritableBooleanObjectInspector) objectInspector).create((boolean) object); + case BYTE: + return ((WritableByteObjectInspector) objectInspector).create((byte) object); + case SHORT: + return ((WritableShortObjectInspector) objectInspector).create((short) object); + case INT: + return 
((WritableIntObjectInspector) objectInspector).create((int) object); + case LONG: + return ((WritableLongObjectInspector) objectInspector).create((long) object); + case DATE: + return ((WritableDateObjectInspector) objectInspector).create((Date) object); + case FLOAT: + return ((WritableFloatObjectInspector) objectInspector).create((float) object); + case DOUBLE: + return ((WritableDoubleObjectInspector) objectInspector).create((double) object); + case STRING: + return ((WritableStringObjectInspector) objectInspector).create((String) object); + case CHAR: + { + WritableHiveCharObjectInspector writableCharObjectInspector = + new WritableHiveCharObjectInspector( (CharTypeInfo) primitiveTypeInfo); + return writableCharObjectInspector.create(new HiveChar(StringUtils.EMPTY, -1)); + } + case VARCHAR: + { + WritableHiveVarcharObjectInspector writableVarcharObjectInspector = + new WritableHiveVarcharObjectInspector( (VarcharTypeInfo) primitiveTypeInfo); + return writableVarcharObjectInspector.create(new HiveVarchar(StringUtils.EMPTY, -1)); + } + case BINARY: + return PrimitiveObjectInspectorFactory.writableBinaryObjectInspector.create(ArrayUtils.EMPTY_BYTE_ARRAY); + case TIMESTAMP: + return ((WritableTimestampObjectInspector) objectInspector).create(new Timestamp(0)); + case INTERVAL_YEAR_MONTH: + return ((WritableHiveIntervalYearMonthObjectInspector) objectInspector).create(new HiveIntervalYearMonth(0)); + case INTERVAL_DAY_TIME: + return ((WritableHiveIntervalDayTimeObjectInspector) objectInspector).create(new HiveIntervalDayTime(0, 0)); + case DECIMAL: + { + WritableHiveDecimalObjectInspector writableDecimalObjectInspector = + new WritableHiveDecimalObjectInspector((DecimalTypeInfo) primitiveTypeInfo); + return writableDecimalObjectInspector.create(HiveDecimal.ZERO); + } + default: + throw new Error("Unknown primitive category " + primitiveCategory); + } + } + + public Object randomObject(int column) { + PrimitiveCategory primitiveCategory = primitiveCategories[column]; + PrimitiveTypeInfo primitiveTypeInfo = primitiveTypeInfos[column]; + switch (primitiveCategory) { + case BOOLEAN: + return Boolean.valueOf(r.nextInt(1) == 1); + case BYTE: + return Byte.valueOf((byte) r.nextInt()); + case SHORT: + return Short.valueOf((short) r.nextInt()); + case INT: + return Integer.valueOf(r.nextInt()); + case LONG: + return Long.valueOf(r.nextLong()); + case DATE: + return RandomTypeUtil.getRandDate(r); + case FLOAT: + return Float.valueOf(r.nextFloat() * 10 - 5); + case DOUBLE: + return Double.valueOf(r.nextDouble() * 10 - 5); + case STRING: + return RandomTypeUtil.getRandString(r); + case CHAR: + return getRandHiveChar(r, (CharTypeInfo) primitiveTypeInfo); + case VARCHAR: + return getRandHiveVarchar(r, (VarcharTypeInfo) primitiveTypeInfo); + case BINARY: + return getRandBinary(r, 1 + r.nextInt(100)); + case TIMESTAMP: + return RandomTypeUtil.getRandTimestamp(r); + case INTERVAL_YEAR_MONTH: + return getRandIntervalYearMonth(r); + case INTERVAL_DAY_TIME: + return getRandIntervalDayTime(r); + case DECIMAL: + return getRandHiveDecimal(r, (DecimalTypeInfo) primitiveTypeInfo); + default: + throw new Error("Unknown primitive category " + primitiveCategory); + } + } + + public static HiveChar getRandHiveChar(Random r, CharTypeInfo charTypeInfo) { + int maxLength = 1 + r.nextInt(charTypeInfo.getLength()); + String randomString = RandomTypeUtil.getRandString(r, "abcdefghijklmnopqrstuvwxyz", 100); + HiveChar hiveChar = new HiveChar(randomString, maxLength); + return hiveChar; + } + + public static HiveVarchar 
+  public static HiveVarchar getRandHiveVarchar(Random r, VarcharTypeInfo varcharTypeInfo) {
+    int maxLength = 1 + r.nextInt(varcharTypeInfo.getLength());
+    String randomString = RandomTypeUtil.getRandString(r, "abcdefghijklmnopqrstuvwxyz", 100);
+    HiveVarchar hiveVarchar = new HiveVarchar(randomString, maxLength);
+    return hiveVarchar;
+  }
+
+  public static byte[] getRandBinary(Random r, int len){
+    byte[] bytes = new byte[len];
+    for (int j = 0; j < len; j++){
+      bytes[j] = Byte.valueOf((byte) r.nextInt());
+    }
+    return bytes;
+  }
+
+  private static final String DECIMAL_CHARS = "0123456789";
+
+  public static HiveDecimal getRandHiveDecimal(Random r, DecimalTypeInfo decimalTypeInfo) {
+    while (true) {
+      StringBuilder sb = new StringBuilder();
+      int precision = 1 + r.nextInt(18);
+      int scale = 0 + r.nextInt(precision + 1);
+
+      int integerDigits = precision - scale;
+
+      if (r.nextBoolean()) {
+        sb.append("-");
+      }
+
+      if (integerDigits == 0) {
+        sb.append("0");
+      } else {
+        sb.append(RandomTypeUtil.getRandString(r, DECIMAL_CHARS, integerDigits));
+      }
+      if (scale != 0) {
+        sb.append(".");
+        sb.append(RandomTypeUtil.getRandString(r, DECIMAL_CHARS, scale));
+      }
+
+      HiveDecimal bd = HiveDecimal.create(sb.toString());
+      if (bd.scale() > bd.precision()) {
+        // Sometimes weird decimals are produced?
+        continue;
+      }
+
+      return bd;
+    }
+  }
+
+  public static HiveIntervalYearMonth getRandIntervalYearMonth(Random r) {
+    String yearMonthSignStr = r.nextInt(2) == 0 ? "" : "-";
+    String intervalYearMonthStr = String.format("%s%d-%d",
+        yearMonthSignStr,
+        Integer.valueOf(1800 + r.nextInt(500)),  // year
+        Integer.valueOf(0 + r.nextInt(12)));     // month
+    HiveIntervalYearMonth intervalYearMonthVal = HiveIntervalYearMonth.valueOf(intervalYearMonthStr);
+    return intervalYearMonthVal;
+  }
+
+  public static HiveIntervalDayTime getRandIntervalDayTime(Random r) {
+    String optionalNanos = "";
+    if (r.nextInt(2) == 1) {
+      optionalNanos = String.format(".%09d",
+          Integer.valueOf(0 + r.nextInt(DateUtils.NANOS_PER_SEC)));
+    }
+    String yearMonthSignStr = r.nextInt(2) == 0 ? "" : "-";
"" : "-"; + String dayTimeStr = String.format("%s%d %02d:%02d:%02d%s", + yearMonthSignStr, + Integer.valueOf(1 + r.nextInt(28)), // day + Integer.valueOf(0 + r.nextInt(24)), // hour + Integer.valueOf(0 + r.nextInt(60)), // minute + Integer.valueOf(0 + r.nextInt(60)), // second + optionalNanos); + HiveIntervalDayTime intervalDayTimeVal = HiveIntervalDayTime.valueOf(dayTimeStr); + return intervalDayTimeVal; + } +} diff --git serde/src/test/org/apache/hadoop/hive/serde2/binarysortable/TestBinarySortableFast.java serde/src/test/org/apache/hadoop/hive/serde2/binarysortable/TestBinarySortableFast.java index 58937db..7babf7a 100644 --- serde/src/test/org/apache/hadoop/hive/serde2/binarysortable/TestBinarySortableFast.java +++ serde/src/test/org/apache/hadoop/hive/serde2/binarysortable/TestBinarySortableFast.java @@ -25,10 +25,10 @@ import org.apache.commons.lang.StringUtils; import org.apache.hadoop.hive.serde2.ByteStream.Output; import org.apache.hadoop.hive.serde2.SerDe; +import org.apache.hadoop.hive.serde2.SerdeRandomRowSource; import org.apache.hadoop.hive.serde2.VerifyFast; import org.apache.hadoop.hive.serde2.binarysortable.fast.BinarySortableDeserializeRead; import org.apache.hadoop.hive.serde2.binarysortable.fast.BinarySortableSerializeWrite; -import org.apache.hadoop.hive.serde2.fast.RandomRowObjectSource; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils; import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; @@ -39,8 +39,11 @@ public class TestBinarySortableFast extends TestCase { + private static String debugDetailedReadPositionString; + private static StackTraceElement[] debugStackTrace; + private void testBinarySortableFast( - RandomRowObjectSource source, Object[][] rows, + SerdeRandomRowSource source, Object[][] rows, boolean[] columnSortOrderIsDesc, byte[] columnNullMarker, byte[] columnNotNullMarker, SerDe serde, StructObjectInspector rowOI, SerDe serde_fewer, StructObjectInspector writeRowOI, @@ -134,11 +137,6 @@ private void testBinarySortableFast( } binarySortableDeserializeRead.extraFieldsCheck(); TestCase.assertTrue(!binarySortableDeserializeRead.readBeyondConfiguredFieldsWarned()); - if (doWriteFewerColumns) { - TestCase.assertTrue(binarySortableDeserializeRead.readBeyondBufferRangeWarned()); - } else { - TestCase.assertTrue(!binarySortableDeserializeRead.readBeyondBufferRangeWarned()); - } TestCase.assertTrue(!binarySortableDeserializeRead.bufferRangeHasExtraDataWarned()); /* @@ -161,6 +159,8 @@ private void testBinarySortableFast( try { VerifyFast.verifyDeserializeRead(binarySortableDeserializeRead2, primitiveTypeInfos[index], writable); } catch (EOFException e) { +// debugDetailedReadPositionString = binarySortableDeserializeRead2.getDetailedReadPositionString(); +// debugStackTrace = e.getStackTrace(); threw = true; } TestCase.assertTrue(threw); @@ -268,11 +268,6 @@ private void testBinarySortableFast( } binarySortableDeserializeRead.extraFieldsCheck(); TestCase.assertTrue(!binarySortableDeserializeRead.readBeyondConfiguredFieldsWarned()); - if (doWriteFewerColumns) { - TestCase.assertTrue(binarySortableDeserializeRead.readBeyondBufferRangeWarned()); - } else { - TestCase.assertTrue(!binarySortableDeserializeRead.readBeyondBufferRangeWarned()); - } TestCase.assertTrue(!binarySortableDeserializeRead.bufferRangeHasExtraDataWarned()); } } @@ -280,7 +275,7 @@ private void testBinarySortableFast( private void testBinarySortableFastCase(int caseNum, boolean 
diff --git serde/src/test/org/apache/hadoop/hive/serde2/binarysortable/TestBinarySortableFast.java serde/src/test/org/apache/hadoop/hive/serde2/binarysortable/TestBinarySortableFast.java
index 58937db..7babf7a 100644
--- serde/src/test/org/apache/hadoop/hive/serde2/binarysortable/TestBinarySortableFast.java
+++ serde/src/test/org/apache/hadoop/hive/serde2/binarysortable/TestBinarySortableFast.java
@@ -25,10 +25,10 @@
 import org.apache.commons.lang.StringUtils;
 import org.apache.hadoop.hive.serde2.ByteStream.Output;
 import org.apache.hadoop.hive.serde2.SerDe;
+import org.apache.hadoop.hive.serde2.SerdeRandomRowSource;
 import org.apache.hadoop.hive.serde2.VerifyFast;
 import org.apache.hadoop.hive.serde2.binarysortable.fast.BinarySortableDeserializeRead;
 import org.apache.hadoop.hive.serde2.binarysortable.fast.BinarySortableSerializeWrite;
-import org.apache.hadoop.hive.serde2.fast.RandomRowObjectSource;
 import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils;
 import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
 import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
@@ -39,8 +39,11 @@
 public class TestBinarySortableFast extends TestCase {
 
+  private static String debugDetailedReadPositionString;
+  private static StackTraceElement[] debugStackTrace;
+
   private void testBinarySortableFast(
-      RandomRowObjectSource source, Object[][] rows,
+      SerdeRandomRowSource source, Object[][] rows,
       boolean[] columnSortOrderIsDesc, byte[] columnNullMarker, byte[] columnNotNullMarker,
       SerDe serde, StructObjectInspector rowOI,
       SerDe serde_fewer, StructObjectInspector writeRowOI,
@@ -134,11 +137,6 @@ private void testBinarySortableFast(
       }
       binarySortableDeserializeRead.extraFieldsCheck();
       TestCase.assertTrue(!binarySortableDeserializeRead.readBeyondConfiguredFieldsWarned());
-      if (doWriteFewerColumns) {
-        TestCase.assertTrue(binarySortableDeserializeRead.readBeyondBufferRangeWarned());
-      } else {
-        TestCase.assertTrue(!binarySortableDeserializeRead.readBeyondBufferRangeWarned());
-      }
       TestCase.assertTrue(!binarySortableDeserializeRead.bufferRangeHasExtraDataWarned());
 
       /*
@@ -161,6 +159,8 @@ private void testBinarySortableFast(
         try {
           VerifyFast.verifyDeserializeRead(binarySortableDeserializeRead2, primitiveTypeInfos[index], writable);
         } catch (EOFException e) {
+//        debugDetailedReadPositionString = binarySortableDeserializeRead2.getDetailedReadPositionString();
+//        debugStackTrace = e.getStackTrace();
          threw = true;
         }
         TestCase.assertTrue(threw);
@@ -268,11 +268,6 @@ private void testBinarySortableFast(
       }
       binarySortableDeserializeRead.extraFieldsCheck();
       TestCase.assertTrue(!binarySortableDeserializeRead.readBeyondConfiguredFieldsWarned());
-      if (doWriteFewerColumns) {
-        TestCase.assertTrue(binarySortableDeserializeRead.readBeyondBufferRangeWarned());
-      } else {
-        TestCase.assertTrue(!binarySortableDeserializeRead.readBeyondBufferRangeWarned());
-      }
       TestCase.assertTrue(!binarySortableDeserializeRead.bufferRangeHasExtraDataWarned());
     }
   }
@@ -280,7 +275,7 @@ private void testBinarySortableFast(
 
   private void testBinarySortableFastCase(int caseNum, boolean doNonRandomFill, Random r)
       throws Throwable {
-    RandomRowObjectSource source = new RandomRowObjectSource();
+    SerdeRandomRowSource source = new SerdeRandomRowSource();
     source.init(r);
 
     int rowCount = 1000;
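
For illustration only (not part of the patch): the commented-out hooks above mark where the new
getDetailedReadPositionString() output could be captured when the negative-path verify fails. A
minimal sketch of that pattern, assuming binarySortableDeserializeRead2, primitiveTypeInfos, index,
and writable are set up exactly as in testBinarySortableFast:

    boolean threw = false;
    try {
      VerifyFast.verifyDeserializeRead(binarySortableDeserializeRead2, primitiveTypeInfos[index], writable);
    } catch (EOFException e) {
      // Surface the detailed read position instead of only recording that an EOF occurred.
      System.err.println("DeserializeRead details: "
          + binarySortableDeserializeRead2.getDetailedReadPositionString());
      threw = true;
    }
    TestCase.assertTrue(threw);
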
diff --git serde/src/test/org/apache/hadoop/hive/serde2/lazy/TestLazySimpleFast.java serde/src/test/org/apache/hadoop/hive/serde2/lazy/TestLazySimpleFast.java
index 76b93c6..66c6203 100644
--- serde/src/test/org/apache/hadoop/hive/serde2/lazy/TestLazySimpleFast.java
+++ serde/src/test/org/apache/hadoop/hive/serde2/lazy/TestLazySimpleFast.java
@@ -17,7 +17,6 @@
  */
 package org.apache.hadoop.hive.serde2.lazy;
 
-import java.io.EOFException;
 import java.util.Arrays;
 import java.util.Properties;
 import java.util.Random;
@@ -25,19 +24,14 @@
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.hive.serde.serdeConstants;
 import org.apache.hadoop.hive.serde2.ByteStream.Output;
-import org.apache.hadoop.hive.serde2.SerDe;
 import org.apache.hadoop.hive.serde2.SerDeException;
 import org.apache.hadoop.hive.serde2.SerDeUtils;
+import org.apache.hadoop.hive.serde2.SerdeRandomRowSource;
 import org.apache.hadoop.hive.serde2.VerifyFast;
 import org.apache.hadoop.hive.serde2.binarysortable.MyTestClass;
-import org.apache.hadoop.hive.serde2.binarysortable.MyTestPrimitiveClass;
-import org.apache.hadoop.hive.serde2.binarysortable.MyTestPrimitiveClass.ExtraTypeInfo;
-import org.apache.hadoop.hive.serde2.binarysortable.fast.BinarySortableDeserializeRead;
-import org.apache.hadoop.hive.serde2.fast.RandomRowObjectSource;
 import org.apache.hadoop.hive.serde2.lazy.fast.LazySimpleDeserializeRead;
 import org.apache.hadoop.hive.serde2.lazy.fast.LazySimpleSerializeWrite;
 import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils;
-import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory;
 import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
 import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
 import org.apache.hadoop.io.BytesWritable;
@@ -49,7 +43,7 @@
 public class TestLazySimpleFast extends TestCase {
 
   private void testLazySimpleFast(
-      RandomRowObjectSource source, Object[][] rows,
+      SerdeRandomRowSource source, Object[][] rows,
       LazySimpleSerDe serde, StructObjectInspector rowOI,
       LazySimpleSerDe serde_fewer, StructObjectInspector writeRowOI,
       byte separator, LazySerDeParameters serdeParams, LazySerDeParameters serdeParams_fewer,
@@ -131,11 +125,6 @@ private void testLazySimpleFast(
       }
       lazySimpleDeserializeRead.extraFieldsCheck();
       TestCase.assertTrue(!lazySimpleDeserializeRead.readBeyondConfiguredFieldsWarned());
-      if (doWriteFewerColumns) {
-        TestCase.assertTrue(lazySimpleDeserializeRead.readBeyondBufferRangeWarned());
-      } else {
-        TestCase.assertTrue(!lazySimpleDeserializeRead.readBeyondBufferRangeWarned());
-      }
       TestCase.assertTrue(!lazySimpleDeserializeRead.bufferRangeHasExtraDataWarned());
     }
 
@@ -219,11 +208,6 @@ private void testLazySimpleFast(
       }
       lazySimpleDeserializeRead.extraFieldsCheck();
       TestCase.assertTrue(!lazySimpleDeserializeRead.readBeyondConfiguredFieldsWarned());
-      if (doWriteFewerColumns) {
-        TestCase.assertTrue(lazySimpleDeserializeRead.readBeyondBufferRangeWarned());
-      } else {
-        TestCase.assertTrue(!lazySimpleDeserializeRead.readBeyondBufferRangeWarned());
-      }
       TestCase.assertTrue(!lazySimpleDeserializeRead.bufferRangeHasExtraDataWarned());
     }
   }
@@ -266,7 +250,7 @@ private LazySerDeParameters getSerDeParams(String fieldNames, String fieldTypes)
 
   public void testLazySimpleFastCase(int caseNum, boolean doNonRandomFill, Random r)
       throws Throwable {
-    RandomRowObjectSource source = new RandomRowObjectSource();
+    SerdeRandomRowSource source = new SerdeRandomRowSource();
     source.init(r);
 
     int rowCount = 1000;
diff --git serde/src/test/org/apache/hadoop/hive/serde2/lazybinary/TestLazyBinaryFast.java serde/src/test/org/apache/hadoop/hive/serde2/lazybinary/TestLazyBinaryFast.java
index d7c4999..5af11cd 100644
--- serde/src/test/org/apache/hadoop/hive/serde2/lazybinary/TestLazyBinaryFast.java
+++ serde/src/test/org/apache/hadoop/hive/serde2/lazybinary/TestLazyBinaryFast.java
@@ -17,43 +17,28 @@
  */
 package org.apache.hadoop.hive.serde2.lazybinary;
 
-import java.io.EOFException;
-import java.util.ArrayList;
 import java.util.Arrays;
-import java.util.List;
 import java.util.Random;
 
 import junit.framework.TestCase;
 
 import org.apache.hadoop.hive.serde2.ByteStream.Output;
 import org.apache.hadoop.hive.serde2.SerDe;
+import org.apache.hadoop.hive.serde2.SerdeRandomRowSource;
 import org.apache.hadoop.hive.serde2.VerifyFast;
 import org.apache.hadoop.hive.serde2.binarysortable.MyTestClass;
-import org.apache.hadoop.hive.serde2.binarysortable.MyTestPrimitiveClass;
-import org.apache.hadoop.hive.serde2.binarysortable.MyTestPrimitiveClass.ExtraTypeInfo;
-import org.apache.hadoop.hive.serde2.binarysortable.fast.BinarySortableDeserializeRead;
-import org.apache.hadoop.hive.serde2.fast.RandomRowObjectSource;
-import org.apache.hadoop.hive.serde2.lazy.LazySerDeParameters;
-import org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe;
 import org.apache.hadoop.hive.serde2.lazybinary.fast.LazyBinaryDeserializeRead;
 import org.apache.hadoop.hive.serde2.lazybinary.fast.LazyBinarySerializeWrite;
-import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
-import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
 import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils;
-import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory.ObjectInspectorOptions;
-import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory;
-import org.apache.hadoop.hive.serde2.objectinspector.StandardStructObjectInspector;
-import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
 import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
 import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
-import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;
 import org.apache.hadoop.io.BytesWritable;
 import org.apache.hadoop.io.Writable;
 
 public class TestLazyBinaryFast extends TestCase {
 
   private void testLazyBinaryFast(
-      RandomRowObjectSource source, Object[][] rows,
+      SerdeRandomRowSource source, Object[][] rows,
       SerDe serde, StructObjectInspector rowOI,
       SerDe serde_fewer, StructObjectInspector writeRowOI,
       PrimitiveTypeInfo[] primitiveTypeInfos,
@@ -131,7 +116,6 @@ private void testLazyBinaryFast(
       } else {
        TestCase.assertTrue(!lazyBinaryDeserializeRead.readBeyondConfiguredFieldsWarned());
       }
-      TestCase.assertTrue(!lazyBinaryDeserializeRead.readBeyondBufferRangeWarned());
       TestCase.assertTrue(!lazyBinaryDeserializeRead.bufferRangeHasExtraDataWarned());
     }
 
@@ -228,19 +212,13 @@ private void testLazyBinaryFast(
       }
       lazyBinaryDeserializeRead.extraFieldsCheck();
       TestCase.assertTrue(!lazyBinaryDeserializeRead.readBeyondConfiguredFieldsWarned());
-      if (doWriteFewerColumns) {
-        // The nullByte may cause this to not be true...
-        // TestCase.assertTrue(lazyBinaryDeserializeRead.readBeyondBufferRangeWarned());
-      } else {
-        TestCase.assertTrue(!lazyBinaryDeserializeRead.readBeyondBufferRangeWarned());
-      }
       TestCase.assertTrue(!lazyBinaryDeserializeRead.bufferRangeHasExtraDataWarned());
     }
   }
 
   public void testLazyBinaryFastCase(int caseNum, boolean doNonRandomFill, Random r)
       throws Throwable {
-    RandomRowObjectSource source = new RandomRowObjectSource();
+    SerdeRandomRowSource source = new SerdeRandomRowSource();
     source.init(r);
 
     int rowCount = 1000;
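
For illustration only (not part of the patch): a small hypothetical driver exercising the static
generators added in SerdeRandomRowSource above, to show the shape of the values the round-trip
tests feed through the fast serializers. The class name, seed, and type strings are assumptions.

    import java.util.Random;

    import org.apache.hadoop.hive.common.type.HiveChar;
    import org.apache.hadoop.hive.common.type.HiveDecimal;
    import org.apache.hadoop.hive.serde2.SerdeRandomRowSource;
    import org.apache.hadoop.hive.serde2.typeinfo.CharTypeInfo;
    import org.apache.hadoop.hive.serde2.typeinfo.DecimalTypeInfo;
    import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;

    public class SerdeRandomRowSourceDemo {
      public static void main(String[] args) {
        Random r = new Random(12345);  // arbitrary seed

        DecimalTypeInfo decimalTypeInfo =
            (DecimalTypeInfo) TypeInfoUtils.getTypeInfoFromTypeString("decimal(38,18)");
        CharTypeInfo charTypeInfo =
            (CharTypeInfo) TypeInfoUtils.getTypeInfoFromTypeString("char(20)");

        // Decimals are built from a random digit string (e.g. something like "-8317.52"),
        // year-month intervals from strings such as "1843-7", day-time intervals from strings
        // such as "-17 03:24:55.000001234", and chars are random lower-case strings truncated
        // to a random maximum length.
        HiveDecimal dec = SerdeRandomRowSource.getRandHiveDecimal(r, decimalTypeInfo);
        HiveChar ch = SerdeRandomRowSource.getRandHiveChar(r, charTypeInfo);
        System.out.println(dec);
        System.out.println(SerdeRandomRowSource.getRandIntervalYearMonth(r));
        System.out.println(SerdeRandomRowSource.getRandIntervalDayTime(r));
        System.out.println(ch);
      }
    }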