diff --git common/src/java/org/apache/hive/common/util/HashCodeUtil.java common/src/java/org/apache/hive/common/util/HashCodeUtil.java index fa30273..90136b8 100644 --- common/src/java/org/apache/hive/common/util/HashCodeUtil.java +++ common/src/java/org/apache/hive/common/util/HashCodeUtil.java @@ -69,6 +69,14 @@ public static int calculateBytesHashCode(byte[] keyBytes, int keyStart, int keyL } public static void calculateBytesArrayHashCodes(byte[][] bytesArrays, + int[] starts, int[] lengths, int[] hashCodes, final int count) { + + for (int i = 0; i < count; i++) { + hashCodes[i] = murmurHash(bytesArrays[i], starts[i], lengths[i]); + } + } + + public static void calculateBytesArrayHashCodes(byte[][] bytesArrays, int[] starts, int[] lengths, int[] valueSelected, int[] hashCodes, final int count) { for (int i = 0; i < count; i++) { diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/MapJoinOperator.java ql/src/java/org/apache/hadoop/hive/ql/exec/MapJoinOperator.java index e79fccd..21b8898 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/MapJoinOperator.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/MapJoinOperator.java @@ -35,12 +35,11 @@ import org.apache.hadoop.hive.ql.CompilationOpContext; import org.apache.hadoop.hive.ql.HashTableLoaderFactory; import org.apache.hadoop.hive.ql.exec.mr.ExecMapperContext; -import org.apache.hadoop.hive.ql.exec.persistence.BytesBytesMultiHashMap; +import org.apache.hadoop.hive.ql.exec.persistence.BytesBytesMultiHashMapFactory; import org.apache.hadoop.hive.ql.exec.persistence.HybridHashTableContainer; import org.apache.hadoop.hive.ql.exec.persistence.HybridHashTableContainer.HashPartition; import org.apache.hadoop.hive.ql.exec.persistence.KeyValueContainer; import org.apache.hadoop.hive.ql.exec.persistence.MapJoinBytesTableContainer; -import org.apache.hadoop.hive.ql.exec.persistence.MapJoinBytesTableContainer.KeyValueHelper; import org.apache.hadoop.hive.ql.exec.persistence.MapJoinKey; import org.apache.hadoop.hive.ql.exec.persistence.MapJoinObjectSerDeContext; import org.apache.hadoop.hive.ql.exec.persistence.MapJoinRowContainer; @@ -49,6 +48,9 @@ import org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainerSerDe; import org.apache.hadoop.hive.ql.exec.persistence.ObjectContainer; import org.apache.hadoop.hive.ql.exec.persistence.UnwrapRowContainer; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTable; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableManage.KeyValuePut; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableResult.MapJoinResult; import org.apache.hadoop.hive.ql.exec.spark.SparkUtilities; import org.apache.hadoop.hive.ql.io.HiveKey; import org.apache.hadoop.hive.ql.log.PerfLogger; @@ -91,7 +93,7 @@ protected transient ReusableGetAdaptor[] hashMapRowGetters; private UnwrapRowContainer[] unwrapContainer; - private transient Configuration hconf; + protected transient Configuration hconf; private transient boolean hybridMapJoinLeftover; // whether there's spilled data to be processed protected transient MapJoinBytesTableContainer[] spilledMapJoinTables; // used to hold restored // spilled small tables @@ -334,8 +336,8 @@ public void cleanUpInputFileChangedOp() throws HiveException { loadHashTable(getExecContext(), MapredContext.get()); } - protected JoinUtil.JoinResult setMapJoinKey( - ReusableGetAdaptor dest, Object row, byte alias) throws HiveException { + protected MapJoinResult setMapJoinKey( + ReusableGetAdaptor dest, 
Object row, byte alias) throws HiveException, IOException { return dest.setFromRow(row, joinKeys[alias], joinKeysObjectInspectors[alias]); } @@ -383,7 +385,7 @@ public void process(Object row, int tag) throws HiveException { boolean bigTableRowSpilled = false; for (byte pos = 0; pos < order.length; pos++) { if (pos != alias) { - JoinUtil.JoinResult joinResult; + MapJoinResult joinResult; ReusableGetAdaptor adaptor; if (firstSetKey == null) { adaptor = firstSetKey = hashMapRowGetters[pos]; @@ -394,7 +396,7 @@ public void process(Object row, int tag) throws HiveException { joinResult = adaptor.setFromOther(firstSetKey); } MapJoinRowContainer rowContainer = adaptor.getCurrentRows(); - if (joinResult != JoinUtil.JoinResult.MATCH) { + if (joinResult != MapJoinResult.MATCH) { assert (rowContainer == null || !rowContainer.hasRows()) : "Expecting an empty result set for no match"; } @@ -408,7 +410,7 @@ public void process(Object row, int tag) throws HiveException { // For Hybrid Grace Hash Join, during the 1st round processing, // we only keep the LEFT side if the row is not spilled if (!conf.isHybridHashJoin() || hybridMapJoinLeftover || - (joinResult != JoinUtil.JoinResult.SPILL && !bigTableRowSpilled)) { + (joinResult != MapJoinResult.SPILL && !bigTableRowSpilled)) { joinNeeded = true; storage[pos] = dummyObjVectors[pos]; } else { @@ -426,7 +428,7 @@ public void process(Object row, int tag) throws HiveException { // When the JoinResult is SPILL, it means the corresponding small table row may have been // spilled to disk (at least the partition that holds this row is on disk). So we need to // postpone the join processing for this pair by also spilling this big table row. - if (joinResult == JoinUtil.JoinResult.SPILL && + if (joinResult == MapJoinResult.SPILL && !bigTableRowSpilled) { // For n-way join, only spill big table rows once spillBigTableRow(mapJoinTables[pos], row); bigTableRowSpilled = true; @@ -512,8 +514,8 @@ public void closeOp(boolean abort) throws HiveException { if (!hashPartitions[i].isHashMapOnDisk()) { hybridHtContainer.setTotalInMemRowCount( hybridHtContainer.getTotalInMemRowCount() - - hashPartitions[i].getHashMapFromMemory().getNumValues()); - hashPartitions[i].getHashMapFromMemory().clear(); + hashPartitions[i].getHashTableFromMemory().getNumValues()); + hashPartitions[i].getHashTableFromMemory().clear(); } } assert hybridHtContainer.getTotalInMemRowCount() == 0; @@ -624,7 +626,7 @@ protected void reloadHashTable(byte pos, int partitionId) // as the initialCapacity which cannot be 0, we provide a reasonable // positive number here } - BytesBytesMultiHashMap restoredHashMap = partition.getHashMapFromDisk(rowCount); + MapJoinHashTable restoredHashMap = partition.getHashMapFromDisk(rowCount); rowCount += restoredHashMap.getNumValues(); LOG.info("Hybrid Grace Hash Join: Deserializing spilled hash partition..."); LOG.info("Hybrid Grace Hash Join: Number of rows in hashmap: " + rowCount); @@ -637,20 +639,22 @@ protected void reloadHashTable(byte pos, int partitionId) " will be greater than memory limit. 
Recursive spilling is currently not supported"); } - KeyValueHelper writeHelper = container.getWriteHelper(); + KeyValuePut writeHelper = container.getKeyValuePutHelper(); while (kvContainer.hasNext()) { ObjectPair pair = kvContainer.next(); Writable key = pair.getFirst(); Writable val = pair.getSecond(); writeHelper.setKeyValue(key, val); - restoredHashMap.put(writeHelper, -1); + restoredHashMap.put(writeHelper); } container.setTotalInMemRowCount(container.getTotalInMemRowCount() + restoredHashMap.getNumValues()); kvContainer.clear(); - spilledMapJoinTables[pos] = new MapJoinBytesTableContainer(restoredHashMap); + // Use the BytesBytesMultiHashMap hash table. + spilledMapJoinTables[pos] = new MapJoinBytesTableContainer( + new BytesBytesMultiHashMapFactory(), restoredHashMap); spilledMapJoinTables[pos].setInternalValueOi(container.getInternalValueOi()); spilledMapJoinTables[pos].setSortableSortOrders(container.getSortableSortOrders()); spilledMapJoinTables[pos].setNullMarkers(container.getNullMarkers()); diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/BytesBytesMultiHashMap.java ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/BytesBytesMultiHashMap.java index 51acae0..047459f 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/BytesBytesMultiHashMap.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/BytesBytesMultiHashMap.java @@ -18,6 +18,7 @@ package org.apache.hadoop.hive.ql.exec.persistence; +import java.io.IOException; import java.util.ArrayList; import java.util.List; import java.util.Map; @@ -27,9 +28,15 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.hive.ql.debug.Utils; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashMapResult; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashMultiSetResult; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashSetResult; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTable; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableResultImpl; import org.apache.hadoop.hive.serde2.ByteStream.RandomAccessOutput; import org.apache.hadoop.hive.serde2.SerDeException; import org.apache.hadoop.hive.serde2.WriteBuffers; +import org.apache.hive.common.util.HashCodeUtil; import com.google.common.annotations.VisibleForTesting; @@ -45,7 +52,7 @@ * Initially inspired by HPPC LongLongOpenHashMap; however, the code is almost completely reworked * and there's very little in common left save for quadratic probing (and that with some changes). */ -public final class BytesBytesMultiHashMap { +public final class BytesBytesMultiHashMap implements MapJoinHashTable { public static final Logger LOG = LoggerFactory.getLogger(BytesBytesMultiHashMap.class); /* @@ -193,8 +200,11 @@ public BytesBytesMultiHashMap(int initialCapacity, * The result of looking up a key in the multi-hash map. * * This object can read through the 0, 1, or more values found for the key. + * + * It implements the standard map join hash map result interface. + * */ - public static class Result { + public static class Result extends MapJoinHashTableResultImpl implements MapJoinHashMapResult { // Whether there are more than 0 rows. private boolean hasRows; @@ -221,6 +231,9 @@ public BytesBytesMultiHashMap(int initialCapacity, // A reference to the current row. private WriteBuffers.ByteSegmentRef byteSegmentRef; + // The associated alias filter value. 
+ private byte aliasFilter; + public Result() { hasRows = false; byteSegmentRef = new WriteBuffers.ByteSegmentRef(); @@ -262,13 +275,16 @@ public boolean isSingleRow() { * The offset of just after the key length in the list record. Or, 0 when single row. */ public void set(BytesBytesMultiHashMap hashMap, long firstOffset, boolean hasList, - long offsetAfterListRecordKeyLen) { + long offsetAfterListRecordKeyLen, byte aliasFilter) { + + this.mapJoinResult = MapJoinResult.MATCH; this.hashMap = hashMap; this.firstOffset = firstOffset; this.hasList = hasList; this.offsetAfterListRecordKeyLen = offsetAfterListRecordKeyLen; + this.aliasFilter = aliasFilter; // Position at first row. readIndex = 0; @@ -412,22 +428,26 @@ public void forget() { readIndex = 0; nextTailOffset = -1; } - } - /** The source of keys and values to put into hashtable; avoids byte copying. */ - public static interface KvSource { - /** Write key into output. */ - public void writeKey(RandomAccessOutput dest) throws SerDeException; + @Override + public int cappedCount() { + return 0; + } + + @Override + public boolean isCappedCountAvailable() { + return false; + } - /** Write value into output. */ - public void writeValue(RandomAccessOutput dest) throws SerDeException; + @Override + public boolean isAliasFilterAvailable() { + return true; + } - /** - * Provide updated value for state byte for a key. - * @param previousValue Previous value; null if this is the first call per key. - * @return The updated value. - */ - public byte updateStateByte(Byte previousValue); + @Override + public byte aliasFilter() { + return aliasFilter; + } } /** @@ -435,11 +455,15 @@ public void forget() { * @param kv Keyvalue writer. Each method will be called at most once. */ private static final byte[] FOUR_ZEROES = new byte[] { 0, 0, 0, 0 }; - public void put(KvSource kv, int keyHashCode) throws SerDeException { + + @Override + public void put(KeyValuePut keyValuePut) throws SerDeException { if (resizeThreshold <= keysAssigned) { expandAndRehash(); } + KeyValuePutWriter keyValuePutWriter = (KeyValuePutWriter) keyValuePut; + // Reserve 4 bytes for the hash (don't just reserve, there may be junk there) writeBuffers.write(FOUR_ZEROES); @@ -447,9 +471,11 @@ public void put(KvSource kv, int keyHashCode) throws SerDeException { // become part of the record; otherwise, we will just write over it later. long keyOffset = writeBuffers.getWritePoint(); - kv.writeKey(writeBuffers); + keyValuePutWriter.writeKey(writeBuffers); int keyLength = (int)(writeBuffers.getWritePoint() - keyOffset); - int hashCode = (keyHashCode == -1) ? writeBuffers.hashCode(keyOffset, keyLength) : keyHashCode; + int hashCode = (keyValuePut.hasHashCode()) ? + keyValuePut.getKeyHashCode() : + writeBuffers.hashCode(keyOffset, keyLength); int slot = findKeySlotToWrite(keyOffset, keyLength, hashCode); // LOG.info("Write hash code is " + Integer.toBinaryString(hashCode) + " - " + slot); @@ -457,18 +483,18 @@ public void put(KvSource kv, int keyHashCode) throws SerDeException { long ref = refs[slot]; if (ref == 0) { // This is a new key, keep writing the first record. 
- long tailOffset = writeFirstValueRecord(kv, keyOffset, keyLength, hashCode); - byte stateByte = kv.updateStateByte(null); + long tailOffset = writeFirstValueRecord(keyValuePutWriter, keyOffset, keyLength, hashCode); + byte stateByte = keyValuePutWriter.updateStateByte(null); refs[slot] = Ref.makeFirstRef(tailOffset, stateByte, hashCode, startingHashBitCount); ++keysAssigned; } else { // This is not a new key; we'll overwrite the key and hash bytes - not needed anymore. writeBuffers.setWritePoint(keyOffset - 4); long lrPtrOffset = createOrGetListRecord(ref); - long tailOffset = writeValueAndLength(kv); + long tailOffset = writeValueAndLength(keyValuePutWriter); addRecordToList(lrPtrOffset, tailOffset); byte oldStateByte = Ref.getStateByte(ref); - byte stateByte = kv.updateStateByte(oldStateByte); + byte stateByte = keyValuePutWriter.updateStateByte(oldStateByte); if (oldStateByte != stateByte) { ref = Ref.setStateByte(ref, stateByte); } @@ -486,18 +512,22 @@ public void put(KvSource kv, int keyHashCode) throws SerDeException { * @param key Key buffer. * @param offset the offset to the key in the buffer * @param hashMapResult The object to fill in that can read the values. - * @return The state byte. */ - public byte getValueResult(byte[] key, int offset, int length, Result hashMapResult) { + @Override + public void hashMapLookup(byte[] keyBytes, int keyStart, int keyLength, + int hashCode, MapJoinHashMapResult hashMapResult) { hashMapResult.forget(); - WriteBuffers.Position readPos = hashMapResult.getReadPos(); + Result internalHashMapResult = (Result) hashMapResult; + + WriteBuffers.Position readPos = internalHashMapResult.getReadPos(); // First, find first record for the key. - long ref = findKeyRefToRead(key, offset, length, readPos); + long ref = findKeyRefToRead(keyBytes, keyStart, keyLength, readPos); if (ref == 0) { - return 0; + hashMapResult.setNoMatch(); + return; } boolean hasList = Ref.hasList(ref); @@ -505,9 +535,8 @@ public byte getValueResult(byte[] key, int offset, int length, Result hashMapRes // This relies on findKeyRefToRead doing key equality check and leaving read ptr where needed. long offsetAfterListRecordKeyLen = hasList ? 
writeBuffers.getReadPoint(readPos) : 0; - hashMapResult.set(this, Ref.getOffset(ref), hasList, offsetAfterListRecordKeyLen); - - return Ref.getStateByte(ref); + internalHashMapResult.set(this, Ref.getOffset(ref), hasList, offsetAfterListRecordKeyLen, + Ref.getStateByte(ref)); } /** @@ -523,6 +552,7 @@ public void populateValue(WriteBuffers.ByteSegmentRef valueRef) { * Number of keys in the hashmap * @return number of keys */ + @Override public int size() { return keysAssigned; } @@ -532,6 +562,7 @@ public int size() { * This is equal to or bigger than number of keys, since some values may share the same key * @return number of values */ + @Override public int getNumValues() { return numValues; } @@ -542,6 +573,7 @@ public int getNumValues() { * Others include instance fields: 100 * @return number of bytes */ + @Override public long memorySize() { return writeBuffers.size() + refs.length * 8 + 100; } @@ -558,6 +590,7 @@ public void clear() { this.numValues = 0; } + @Override public void expandAndRehashToTarget(int estimateNewRowCount) { int oldRefsCount = refs.length; int newRefsCount = oldRefsCount + estimateNewRowCount; @@ -741,7 +774,7 @@ private void expandAndRehash() { long capacity = refs.length << 1; expandAndRehashImpl(capacity); } - + private void expandAndRehashImpl(long capacity) { long expandTime = System.currentTimeMillis(); final long[] oldRefs = refs; @@ -834,9 +867,9 @@ private void addRecordToList(long lrPtrOffset, long tailOffset) { * @return The offset of the new record. */ private long writeFirstValueRecord( - KvSource kv, long keyOffset, int keyLength, int hashCode) throws SerDeException { + KeyValuePutWriter keyValuePutWriter, long keyOffset, int keyLength, int hashCode) throws SerDeException { long valueOffset = writeBuffers.getWritePoint(); - kv.writeValue(writeBuffers); + keyValuePutWriter.writeValue(writeBuffers); long tailOffset = writeBuffers.getWritePoint(); int valueLength = (int)(tailOffset - valueOffset); // LOG.info("Writing value at " + valueOffset + " length " + valueLength); @@ -863,9 +896,9 @@ private long writeFirstValueRecord( * @param kv Key-value writer. * @return The offset of the new record. 
*/ - private long writeValueAndLength(KvSource kv) throws SerDeException { + private long writeValueAndLength(KeyValuePutWriter keyValuePutWriter) throws SerDeException { long valueOffset = writeBuffers.getWritePoint(); - kv.writeValue(writeBuffers); + keyValuePutWriter.writeValue(writeBuffers); long tailOffset = writeBuffers.getWritePoint(); writeBuffers.writeVLong(tailOffset - valueOffset); // LOG.info("Writing value at " + valueOffset + " length " + (tailOffset - valueOffset)); @@ -901,7 +934,8 @@ public void debugDumpTable() { dump.append(Utils.toStringBinary(key, 0, key.length)).append(" ref [").append(dumpRef(ref)) .append("]: "); Result hashMapResult = new Result(); - getValueResult(key, 0, key.length, hashMapResult); + int hashCode = HashCodeUtil.calculateBytesHashCode(key, 0, key.length); + hashMapLookup(key, 0, key.length, hashCode, hashMapResult); List results = new ArrayList(); WriteBuffers.ByteSegmentRef byteSegmentRef = hashMapResult.first(); while (byteSegmentRef != null) { @@ -998,6 +1032,7 @@ private static String dumpRef(long ref) { + " h=" + Long.toBinaryString(Ref.getHashBits(ref)); } + @Override public void debugDumpMetrics() { LOG.info("Map metrics: keys allocated " + this.refs.length +", keys assigned " + keysAssigned + ", write conflict " + metricPutConflict + ", write max dist " + largestNumberOfSteps @@ -1024,4 +1059,51 @@ private void debugDumpKeyProbe(long keyOffset, int keyLength, int hashCode, int } LOG.info(sb.toString()); } + + // New methods for Native Vector Map Join not implemented here. + @Override + public boolean useMinMax() { + return false; + } + + @Override + public long min() { + throw new RuntimeException("Not supported"); + } + + @Override + public long max() { + throw new RuntimeException("Not supported"); + } + + @Override + public void hashMapLookup(long key, int hashCode, + MapJoinHashMapResult hashMapResult) throws IOException { + throw new RuntimeException("Not supported"); + } + + @Override + public void hashMultiSetContains(byte[] keyBytes, int keyStart, + int keyLength, int hashCode, MapJoinHashMultiSetResult hashMultiSetResult) + throws IOException { + throw new RuntimeException("Not supported"); + } + + @Override + public void hashMultiSetContains(long key, int hashCode, + MapJoinHashMultiSetResult hashMultiSetResult) throws IOException { + throw new RuntimeException("Not supported"); + } + + @Override + public void hashSetContains(byte[] keyBytes, int keyStart, int keyLength, + int hashCode, MapJoinHashSetResult hashSetResult) throws IOException { + throw new RuntimeException("Not supported"); + } + + @Override + public void hashSetContains(long key, int hashCode, + MapJoinHashSetResult hashSetResult) throws IOException { + throw new RuntimeException("Not supported"); + } } diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/BytesBytesMultiHashMapFactory.java ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/BytesBytesMultiHashMapFactory.java new file mode 100644 index 0000000..1cc1c97 --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/BytesBytesMultiHashMapFactory.java @@ -0,0 +1,94 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.persistence; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashMapResult; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashMultiSetResult; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashSetResult; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTable; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableFactory; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableManage.KeyValuePut; + +/* + * Factory for creating BytesBytesMultiHashMap. + */ +public class BytesBytesMultiHashMapFactory implements MapJoinHashTableFactory { + + public static final Log LOG = LogFactory.getLog(BytesBytesMultiHashMapFactory.class); + + @Override + public MapJoinHashTable createHashTable(int initialCapacity, float loadFactor, + int writeBuffersSize, long memUsage) { + return new BytesBytesMultiHashMap(initialCapacity, loadFactor, writeBuffersSize, memUsage); + } + + /* + * @return A new hash map result implementation specific object. + * + * The object can be used to access the values when there is a match, or + * access spill information when the partition with the key is currently spilled. + */ + @Override + public MapJoinHashMapResult createHashMapResult() { + return (MapJoinHashMapResult) new BytesBytesMultiHashMap.Result(); + } + + /* + * @return A new hash multi-set result implementation specific object. + * + * The object can be used to access the *count* of values when the key is contained in the + * multi-set, or access spill information when the partition with the key is currently spilled. + */ + @Override + public MapJoinHashMultiSetResult createHashMultiSetResult() { + throw new RuntimeException("Not supported"); + } + + /* + * @return A new hash set result implementation specific object. + * + * The object can be used to access spill information when the partition with the key + * is currently spilled. + */ + @Override + public MapJoinHashSetResult createHashSetResult() { + throw new RuntimeException("Not supported"); + } + + @Override + public boolean keyValuePutHelperIsExternal() { + // MapJoinBytesTableContainer will implement a KeyValuePutWriter for BytesBytesMultiHashMap + // to use. + return false; + } + + @Override + public KeyValuePut createKeyValuePut() { + // Not supplied when keyValuePutHelperIsExternal is false. + return null; + } + + @Override + public boolean useMinMax() { + // Min/Max not supported for BytesBytesMultiHashMap. 
+ return false; + } +} diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/HashMapWrapper.java ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/HashMapWrapper.java index a3bccc6..67a9f0e 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/HashMapWrapper.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/HashMapWrapper.java @@ -32,6 +32,9 @@ import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.ql.exec.ExprNodeEvaluator; import org.apache.hadoop.hive.ql.exec.JoinUtil; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableFactory; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableFind; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableResult.MapJoinResult; import org.apache.hadoop.hive.ql.exec.vector.VectorHashKeyWrapper; import org.apache.hadoop.hive.ql.exec.vector.VectorHashKeyWrapperBatch; import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpressionWriter; @@ -155,7 +158,7 @@ public GetAdaptor(MapJoinKey key) { } @Override - public JoinUtil.JoinResult setFromVector(VectorHashKeyWrapper kw, + public MapJoinResult setFromVector(VectorHashKeyWrapper kw, VectorExpressionWriter[] keyOutputWriters, VectorHashKeyWrapperBatch keyWrapperBatch) throws HiveException { if (currentKey == null) { @@ -172,15 +175,15 @@ public GetAdaptor(MapJoinKey key) { isFirstKey = false; this.currentValue = mHash.get(key); if (this.currentValue == null) { - return JoinUtil.JoinResult.NOMATCH; + return MapJoinResult.NO_MATCH; } else { - return JoinUtil.JoinResult.MATCH; + return MapJoinResult.MATCH; } } @Override - public JoinUtil.JoinResult setFromRow(Object row, List fields, + public MapJoinResult setFromRow(Object row, List fields, List ois) throws HiveException { if (currentKey == null) { currentKey = new Object[fields.size()]; @@ -192,25 +195,25 @@ public GetAdaptor(MapJoinKey key) { isFirstKey = false; this.currentValue = mHash.get(key); if (this.currentValue == null) { - return JoinUtil.JoinResult.NOMATCH; + return MapJoinResult.NO_MATCH; } else { - return JoinUtil.JoinResult.MATCH; + return MapJoinResult.MATCH; } } @Override - public JoinUtil.JoinResult setFromOther(ReusableGetAdaptor other) { + public MapJoinResult setFromOther(ReusableGetAdaptor other) { assert other instanceof GetAdaptor; GetAdaptor other2 = (GetAdaptor)other; this.key = other2.key; this.isFirstKey = other2.isFirstKey; this.currentValue = mHash.get(key); if (this.currentValue == null) { - return JoinUtil.JoinResult.NOMATCH; + return MapJoinResult.NO_MATCH; } else { - return JoinUtil.JoinResult.MATCH; + return MapJoinResult.MATCH; } } @@ -250,10 +253,19 @@ public boolean hasSpill() { return false; } - @Override public void setSerde(MapJoinObjectSerDeContext keyCtx, MapJoinObjectSerDeContext valCtx) throws SerDeException { this.keyContext = keyCtx; this.valueContext = valCtx; } + + @Override + public MapJoinHashTableFind getMapJoinHashTableFind() { + throw new RuntimeException("Not supported"); + } + + @Override + public MapJoinHashTableFactory getMapJoinHashTableFactory() { + throw new RuntimeException("Not supported"); + } } diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/HybridHashTableContainer.java ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/HybridHashTableContainer.java index f5da5a4..1906fc2 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/HybridHashTableContainer.java +++ 
ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/HybridHashTableContainer.java @@ -35,10 +35,15 @@ import org.apache.hadoop.hive.common.FileUtils; import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.ql.exec.ExprNodeEvaluator; -import org.apache.hadoop.hive.ql.exec.JoinUtil; -import org.apache.hadoop.hive.ql.exec.JoinUtil.JoinResult; import org.apache.hadoop.hive.ql.exec.SerializationUtilities; -import org.apache.hadoop.hive.ql.exec.persistence.MapJoinBytesTableContainer.KeyValueHelper; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashMapResult; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashMultiSetResult; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashSetResult; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTable; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableFactory; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableFind; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableManage.KeyValuePut; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableResult.MapJoinResult; import org.apache.hadoop.hive.ql.exec.vector.VectorHashKeyWrapper; import org.apache.hadoop.hive.ql.exec.vector.VectorHashKeyWrapperBatch; import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpressionWriter; @@ -73,11 +78,23 @@ * * Partitions that can fit in memory will be processed first, and then every spilled partition will * be restored and processed one by one. + * + * It implements the standard map join hash map find interface. + * */ public class HybridHashTableContainer - implements MapJoinTableContainer, MapJoinTableContainerDirectAccess { + implements MapJoinTableContainer, MapJoinHashTableFind { private static final Logger LOG = LoggerFactory.getLogger(HybridHashTableContainer.class); + private final MapJoinHashTableFactory mapJoinHashTableFactory; + // Factory for creating hash tables. + + private final boolean useMinMax; + // Whether we should maintain min/max for + // small table for optimizing lookup. + private long longMin; + private long longMax; + private final HashPartition[] hashPartitions; // an array of partitions holding the triplets private int totalInMemRowCount = 0; // total number of small table rows in memory private long memoryThreshold; // the max memory limit that can be allocated @@ -96,8 +113,8 @@ private boolean[] sortableSortOrders; private byte[] nullMarkers; private byte[] notNullMarkers; - private MapJoinBytesTableContainer.KeyValueHelper writeHelper; - private final MapJoinBytesTableContainer.DirectKeyValueWriter directWriteHelper; + private KeyValuePut keyValuePutHelper; + /* * this is not a real bloom filter, but is a cheap version of the 1-memory * access bloom filters @@ -120,7 +137,10 @@ * The triplet: hashmap (either in memory or on disk), small table container, big table container */ public static class HashPartition { - BytesBytesMultiHashMap hashMap; // In memory hashMap + MapJoinHashTableFactory mapJoinHashTableFactory; + // Hash table factory to use. + MapJoinHashTable hashTable; // In memory hashTable + Class hashTableClass; // Class of hashTable so we can bring it back from disk. 
KeyValueContainer sidefileKVContainer; // Stores small table key/value pairs ObjectContainer matchfileObjContainer; // Stores big table rows VectorMapJoinRowBytesContainer matchfileRowBytesContainer; @@ -128,7 +148,7 @@ Path hashMapLocalPath; // Local file system path for spilled hashMap boolean hashMapOnDisk; // Status of hashMap. true: on disk, false: in memory boolean hashMapSpilledOnCreation; // When there's no enough memory, cannot create hashMap - int initialCapacity; // Used to create an empty BytesBytesMultiHashMap + int initialCapacity; // Used to create an empty MapJoinHashTable float loadFactor; // Same as above int wbSize; // Same as above int rowsOnDisk; // How many rows saved to the on-disk hashmap (if on disk) @@ -138,11 +158,13 @@ * In that case, we don't create the hashmap, but pretend the hashmap is directly "spilled". */ public HashPartition(int initialCapacity, float loadFactor, int wbSize, long maxProbeSize, - boolean createHashMap, String spillLocalDirs) { + boolean createHashMap, String spillLocalDirs, + MapJoinHashTableFactory mapJoinHashTableFactory) { + this.mapJoinHashTableFactory = mapJoinHashTableFactory; if (createHashMap) { // Probe space should be at least equal to the size of our designated wbSize maxProbeSize = Math.max(maxProbeSize, wbSize); - hashMap = new BytesBytesMultiHashMap(initialCapacity, loadFactor, wbSize, maxProbeSize); + hashTable = mapJoinHashTableFactory.createHashTable(initialCapacity, loadFactor, wbSize, maxProbeSize); } else { hashMapSpilledOnCreation = true; hashMapOnDisk = true; @@ -154,24 +176,24 @@ public HashPartition(int initialCapacity, float loadFactor, int wbSize, long max } /* Get the in memory hashmap */ - public BytesBytesMultiHashMap getHashMapFromMemory() { - return hashMap; + public MapJoinHashTable getHashTableFromMemory() { + return hashTable; } /* Restore the hashmap from disk by deserializing it. * Currently Kryo is used for this purpose. */ - public BytesBytesMultiHashMap getHashMapFromDisk(int rowCount) + public MapJoinHashTable getHashMapFromDisk(int rowCount) throws IOException, ClassNotFoundException { if (hashMapSpilledOnCreation) { - return new BytesBytesMultiHashMap(rowCount, loadFactor, wbSize, -1); + return mapJoinHashTableFactory.createHashTable(rowCount, loadFactor, wbSize, -1); } else { InputStream inputStream = Files.newInputStream(hashMapLocalPath); com.esotericsoftware.kryo.io.Input input = new com.esotericsoftware.kryo.io.Input(inputStream); Kryo kryo = SerializationUtilities.borrowKryo(); - BytesBytesMultiHashMap restoredHashMap = null; + MapJoinHashTable restoredHashMap = null; try { - restoredHashMap = kryo.readObject(input, BytesBytesMultiHashMap.class); + restoredHashMap = (MapJoinHashTable) kryo.readObject(input, hashTableClass); } finally { SerializationUtilities.releaseKryo(kryo); } @@ -221,9 +243,9 @@ public boolean isHashMapOnDisk() { } public void clear() { - if (hashMap != null) { - hashMap.clear(); - hashMap = null; + if (hashTable != null) { + hashTable.clear(); + hashTable = null; } if (hashMapLocalPath != null) { @@ -258,13 +280,14 @@ public int size() { return rowsOnDisk + (sidefileKVContainer != null ? 
sidefileKVContainer.size() : 0); } else { // All rows should be in the in-memory hashmap - return hashMap.size(); + return hashTable.size(); } } } public HybridHashTableContainer(Configuration hconf, long keyCount, long memoryAvailable, - long estimatedTableSize, HybridHashTableConf nwayConf) + long estimatedTableSize, HybridHashTableConf nwayConf, + MapJoinHashTableFactory mapJoinHashTableFactory) throws SerDeException, IOException { this(HiveConf.getFloatVar(hconf, HiveConf.ConfVars.HIVEHASHTABLEKEYCOUNTADJUSTMENT), HiveConf.getIntVar(hconf, HiveConf.ConfVars.HIVEHASHTABLETHRESHOLD), @@ -275,15 +298,24 @@ public HybridHashTableContainer(Configuration hconf, long keyCount, long memoryA HiveConf.getIntVar(hconf, HiveConf.ConfVars.HIVEHYBRIDGRACEHASHJOINMINNUMPARTITIONS), HiveConf.getFloatVar(hconf, HiveConf.ConfVars.HIVEMAPJOINOPTIMIZEDTABLEPROBEPERCENT), estimatedTableSize, keyCount, memoryAvailable, nwayConf, - RowContainer.getLocalDirsForSpillFiles(hconf)); + RowContainer.getLocalDirsForSpillFiles(hconf), mapJoinHashTableFactory); } private HybridHashTableContainer(float keyCountAdj, int threshold, float loadFactor, int memCheckFreq, int minWbSize, int maxWbSize, int minNumParts, float probePercent, long estimatedTableSize, long keyCount, long memoryAvailable, HybridHashTableConf nwayConf, - String spillLocalDirs) + String spillLocalDirs, MapJoinHashTableFactory mapJoinHashTableFactory) throws SerDeException, IOException { - directWriteHelper = new MapJoinBytesTableContainer.DirectKeyValueWriter(); + + this.mapJoinHashTableFactory = mapJoinHashTableFactory; + useMinMax = mapJoinHashTableFactory.useMinMax(); + if (useMinMax) { + // We have a single long key and the hash table supports min/max. + longMin = Long.MAX_VALUE; + longMax = Long.MIN_VALUE; + } else { + longMin = longMax = 0; + } int newKeyCount = HashMapWrapper.calculateTableSize( keyCountAdj, threshold, loadFactor, keyCount); @@ -353,22 +385,22 @@ private HybridHashTableContainer(float keyCountAdj, int threshold, float loadFac nwayConf.getLoadedContainerList().size() == 0) { // n-way join, first (biggest) small table if (i == 0) { // We unconditionally create a hashmap for the first hash partition hashPartitions[i] = new HashPartition(initialCapacity, loadFactor, writeBufferSize, - maxCapacity, true, spillLocalDirs); + maxCapacity, true, spillLocalDirs, mapJoinHashTableFactory); } else { // To check whether we have enough memory to allocate for another hash partition, // we need to get the size of the first hash partition to get an idea. hashPartitions[i] = new HashPartition(initialCapacity, loadFactor, writeBufferSize, - maxCapacity, memoryUsed + hashPartitions[0].hashMap.memorySize() < memoryThreshold, - spillLocalDirs); + maxCapacity, memoryUsed + hashPartitions[0].hashTable.memorySize() < memoryThreshold, + spillLocalDirs, mapJoinHashTableFactory); } } else { // n-way join, all later small tables // For all later small tables, follow the same pattern of the previously loaded tables. 
if (this.nwayConf.doSpillOnCreation(i)) { hashPartitions[i] = new HashPartition(initialCapacity, loadFactor, writeBufferSize, - maxCapacity, false, spillLocalDirs); + maxCapacity, false, spillLocalDirs, mapJoinHashTableFactory); } else { hashPartitions[i] = new HashPartition(initialCapacity, loadFactor, writeBufferSize, - maxCapacity, true, spillLocalDirs); + maxCapacity, true, spillLocalDirs, mapJoinHashTableFactory); } } @@ -380,7 +412,7 @@ private HybridHashTableContainer(float keyCountAdj, int threshold, float loadFac this.nwayConf.setNextSpillPartition(i - 1); } } else { - memoryUsed += hashPartitions[i].hashMap.memorySize(); + memoryUsed += hashPartitions[i].hashTable.memorySize(); } } assert numPartitionsSpilledOnCreation != numPartitions : "All partitions are directly spilled!" + @@ -395,9 +427,8 @@ private HybridHashTableContainer(float keyCountAdj, int threshold, float loadFac } } - - public MapJoinBytesTableContainer.KeyValueHelper getWriteHelper() { - return writeHelper; + public KeyValuePut getKeyValuePutHelper() { + return keyValuePutHelper; } public HashPartition[] getHashPartitions() { @@ -415,8 +446,8 @@ public long getMemoryThreshold() { public long refreshMemoryUsed() { long memUsed = 0; for (HashPartition hp : hashPartitions) { - if (hp.hashMap != null) { - memUsed += hp.hashMap.memorySize(); + if (hp.hashTable != null) { + memUsed += hp.hashTable.memorySize(); } } return memoryUsed = memUsed; @@ -447,25 +478,33 @@ public LazyBinaryStructObjectInspector getInternalValueOi() { @Override public MapJoinKey putRow(Writable currentKey, Writable currentValue) throws SerDeException, HiveException, IOException { - writeHelper.setKeyValue(currentKey, currentValue); - return internalPutRow(writeHelper, currentKey, currentValue); + keyValuePutHelper.setKeyValue(currentKey, currentValue); + return internalPutRow(keyValuePutHelper, currentKey, currentValue); } - private MapJoinKey internalPutRow(KeyValueHelper keyValueHelper, + private MapJoinKey internalPutRow(KeyValuePut keyValuePut, Writable currentKey, Writable currentValue) throws SerDeException, IOException { // Next, put row into corresponding hash partition - int keyHash = keyValueHelper.getHashFromKey(); - int partitionId = keyHash & (hashPartitions.length - 1); + keyValuePut.setKeyValue(currentKey, currentValue); + int hashCode = keyValuePut.getKeyHashCode(); + + int partitionId = hashCode & (hashPartitions.length - 1); HashPartition hashPartition = hashPartitions[partitionId]; - bloom1.addLong(keyHash); + bloom1.addLong(hashCode); + if (useMinMax) { + // We have a single long key and the hash table supports min/max. + long longValue = keyValuePut.getLongKey(); + longMin = Math.min(longValue, longMin); + longMax = Math.max(longValue, longMax); + } if (isOnDisk(partitionId) || isHashMapSpilledOnCreation(partitionId)) { KeyValueContainer kvContainer = hashPartition.getSidefileKVContainer(); kvContainer.add((HiveKey) currentKey, (BytesWritable) currentValue); } else { - hashPartition.hashMap.put(keyValueHelper, keyHash); // Pass along hashcode to avoid recalculation + hashPartition.hashTable.put(keyValuePut); totalInMemRowCount++; if ((totalInMemRowCount & (this.memoryCheckFrequency - 1)) == 0 && // check periodically @@ -505,7 +544,7 @@ public boolean isOnDisk(int partitionId) { /** * Check if the hash table of a specified partition has been "spilled" to disk when it was created. * In fact, in other words, check if a hashmap does exist or not. 
- * @param partitionId hashMap ID + * @param partitionId hashTable ID * @return true if it was not created at all, false if there is a hash table existing there */ public boolean isHashMapSpilledOnCreation(int partitionId) { @@ -534,7 +573,7 @@ private int biggestPartition() { if (isOnDisk(i)) { continue; } else { - size = hashPartitions[i].hashMap.getNumValues(); + size = hashPartitions[i].hashTable.getNumValues(); } if (size > maxSize) { maxSize = size; @@ -551,7 +590,7 @@ private int biggestPartition() { */ public long spillPartition(int partitionId) throws IOException { HashPartition partition = hashPartitions[partitionId]; - int inMemRowCount = partition.hashMap.getNumValues(); + int inMemRowCount = partition.hashTable.getNumValues(); File file = FileUtils.createLocalDirsTempFile( spillLocalDirs, "partition-" + partitionId + "-", null, false); @@ -561,7 +600,8 @@ public long spillPartition(int partitionId) throws IOException { new com.esotericsoftware.kryo.io.Output(outputStream); Kryo kryo = SerializationUtilities.borrowKryo(); try { - kryo.writeObject(output, partition.hashMap); // use Kryo to serialize hashmap + kryo.writeObject(output, partition.hashTable); // use Kryo to serialize hashmap + partition.hashTableClass = partition.hashTable.getClass(); // remember so we can defrost it. output.close(); outputStream.close(); } finally { @@ -572,16 +612,16 @@ public long spillPartition(int partitionId) throws IOException { partition.hashMapOnDisk = true; LOG.info("Spilling hash partition " + partitionId + " (Rows: " + inMemRowCount + - ", Mem size: " + partition.hashMap.memorySize() + "): " + file); + ", Mem size: " + partition.hashTable.memorySize() + "): " + file); LOG.info("Memory usage before spilling: " + memoryUsed); - long memFreed = partition.hashMap.memorySize(); + long memFreed = partition.hashTable.memorySize(); memoryUsed -= memFreed; LOG.info("Memory usage after spilling: " + memoryUsed); partition.rowsOnDisk = inMemRowCount; totalInMemRowCount -= inMemRowCount; - partition.hashMap.clear(); + partition.hashTable.clear(); return memFreed; } @@ -679,25 +719,16 @@ public ReusableGetAdaptor createGetter(MapJoinKey keyTypeFromLoader) { public void seal() { for (HashPartition hp : hashPartitions) { // Only seal those partitions that haven't been spilled and cleared, - // because once a hashMap is cleared, it will become unusable - if (hp.hashMap != null && hp.hashMap.size() != 0) { - hp.hashMap.seal(); + // because once a hashTable is cleared, it will become unusable + if (hp.hashTable != null && hp.hashTable.size() != 0) { + hp.hashTable.seal(); } } } - - // Direct access interfaces. - - @Override - public void put(Writable currentKey, Writable currentValue) throws SerDeException, IOException { - directWriteHelper.setKeyValue(currentKey, currentValue); - internalPutRow(directWriteHelper, currentKey, currentValue); - } - /** Implementation of ReusableGetAdaptor that has Output for key serialization; row * container is also created once and reused for every row. 
*/ - private class GetAdaptor implements ReusableGetAdaptor, ReusableGetAdaptorDirectAccess { + private class GetAdaptor implements ReusableGetAdaptor { private Object[] currentKey; private boolean[] nulls; @@ -712,9 +743,9 @@ public GetAdaptor() { } @Override - public JoinUtil.JoinResult setFromVector(VectorHashKeyWrapper kw, + public MapJoinResult setFromVector(VectorHashKeyWrapper kw, VectorExpressionWriter[] keyOutputWriters, VectorHashKeyWrapperBatch keyWrapperBatch) - throws HiveException { + throws HiveException, IOException { if (nulls == null) { nulls = new boolean[keyOutputWriters.length]; currentKey = new Object[keyOutputWriters.length]; @@ -735,8 +766,8 @@ public GetAdaptor() { } @Override - public JoinUtil.JoinResult setFromRow(Object row, List fields, - List ois) throws HiveException { + public MapJoinResult setFromRow(Object row, List fields, + List ois) throws HiveException, IOException { if (nulls == null) { nulls = new boolean[fields.size()]; currentKey = new Object[fields.size()]; @@ -751,7 +782,7 @@ public GetAdaptor() { } @Override - public JoinUtil.JoinResult setFromOther(ReusableGetAdaptor other) throws HiveException { + public MapJoinResult setFromOther(ReusableGetAdaptor other) throws HiveException, IOException { assert other instanceof GetAdaptor; GetAdaptor other2 = (GetAdaptor)other; nulls = other2.nulls; @@ -779,26 +810,14 @@ public MapJoinRowContainer getCurrentRows() { public Object[] getCurrentKey() { return currentKey; } - - // Direct access interfaces. - - @Override - public JoinUtil.JoinResult setDirect(byte[] bytes, int offset, int length, - BytesBytesMultiHashMap.Result hashMapResult) { - return currentValue.setDirect(bytes, offset, length, hashMapResult); - } - - @Override - public int directSpillPartitionId() { - return currentValue.directSpillPartitionId(); - } } /** Row container that gets and deserializes the rows on demand from bytes provided. */ private class ReusableRowContainer implements MapJoinRowContainer, AbstractRowContainer.RowIterator> { private byte aliasFilter; - private final BytesBytesMultiHashMap.Result hashMapResult; + + private final MapJoinHashMapResult hashMapResult; /** * Sometimes, when container is empty in multi-table mapjoin, we need to add a dummy row. @@ -811,7 +830,7 @@ public int directSpillPartitionId() { private final boolean needsComplexObjectFixup; private final ArrayList complexObjectArrayBuffer; - private int partitionId; // Current hashMap in use + private int partitionId; // Current hashTable in use public ReusableRowContainer() { if (internalValueOi != null) { @@ -831,29 +850,31 @@ public ReusableRowContainer() { complexObjectArrayBuffer = null; } uselessIndirection = new ByteArrayRef(); - hashMapResult = new BytesBytesMultiHashMap.Result(); + + // Use the factory to create a hash map result. + hashMapResult = mapJoinHashTableFactory.createHashMapResult(); clearRows(); } /* Determine if there is a match between big table row and the corresponding hashtable * Three states can be returned: * MATCH: a match is found - * NOMATCH: no match is found from the specified partition + * NO_MATCH: no match is found from the specified partition * SPILL: the specified partition has been spilled to disk and is not available; * the evaluation for this big table row will be postponed. 
*/ - public JoinUtil.JoinResult setFromOutput(Output output) throws HiveException { + public MapJoinResult setFromOutput(Output output) throws HiveException, IOException { int keyHash = HashCodeUtil.murmurHash(output.getData(), 0, output.getLength()); if (!bloom1.testLong(keyHash)) { /* * if the keyHash is missing in the bloom filter, then the value cannot - * exist in any of the spilled partition - return NOMATCH + * exist in any of the spilled partition - return NO_MATCH */ dummyRow = null; aliasFilter = (byte) 0xff; hashMapResult.forget(); - return JoinResult.NOMATCH; + return MapJoinResult.NO_MATCH; } partitionId = keyHash & (hashPartitions.length - 1); @@ -861,19 +882,25 @@ public ReusableRowContainer() { // If the target hash table is on disk, spill this row to disk as well to be processed later if (isOnDisk(partitionId)) { toSpillPartitionId = partitionId; - hashMapResult.forget(); - return JoinUtil.JoinResult.SPILL; + hashMapResult.setSpill(partitionId); + return MapJoinResult.SPILL; } else { - aliasFilter = hashPartitions[partitionId].hashMap.getValueResult(output.getData(), 0, - output.getLength(), hashMapResult); + hashPartitions[partitionId].hashTable.hashMapLookup(output.getData(), 0, output.getLength(), + keyHash, hashMapResult); + MapJoinResult mapJoinResult = hashMapResult.getMapJoinResult(); dummyRow = null; - if (hashMapResult.hasRows()) { - return JoinUtil.JoinResult.MATCH; - } else { + switch (mapJoinResult) { + case MATCH: + aliasFilter = hashMapResult.aliasFilter(); + break; + case NO_MATCH: aliasFilter = (byte) 0xff; - return JoinUtil.JoinResult.NOMATCH; + break; + default: + throw new RuntimeException("Unexpected map join result " + mapJoinResult.name()); } + return mapJoinResult; } } @@ -990,45 +1017,175 @@ public void addRow(Object[] value) { public void write(MapJoinObjectSerDeContext valueContext, ObjectOutputStream out) { throw new RuntimeException(this.getClass().getCanonicalName() + " cannot be serialized"); } + } - // Direct access. 
+ @Override + public MapJoinHashTableFind getMapJoinHashTableFind() { + return (MapJoinHashTableFind) this; + } - public JoinUtil.JoinResult setDirect(byte[] bytes, int offset, int length, - BytesBytesMultiHashMap.Result hashMapResult) { + @Override + public MapJoinHashTableFactory getMapJoinHashTableFactory() { + return mapJoinHashTableFactory; + } - int keyHash = HashCodeUtil.murmurHash(bytes, offset, length); - partitionId = keyHash & (hashPartitions.length - 1); + @Override + public boolean useMinMax() { + return useMinMax; + } - if (!bloom1.testLong(keyHash)) { - /* - * if the keyHash is missing in the bloom filter, then the value cannot exist in any of the - * spilled partition - return NOMATCH - */ - dummyRow = null; - aliasFilter = (byte) 0xff; - hashMapResult.forget(); - return JoinResult.NOMATCH; - } + @Override + public long min() { + return longMin; + } - // If the target hash table is on disk, spill this row to disk as well to be processed later - if (isOnDisk(partitionId)) { - return JoinUtil.JoinResult.SPILL; - } - else { - aliasFilter = hashPartitions[partitionId].hashMap.getValueResult(bytes, offset, length, - hashMapResult); - dummyRow = null; - if (hashMapResult.hasRows()) { - return JoinUtil.JoinResult.MATCH; - } else { - aliasFilter = (byte) 0xff; - return JoinUtil.JoinResult.NOMATCH; - } - } + @Override + public long max() { + return longMax; + } + + @Override + public void hashMapLookup(byte[] keyBytes, int keyStart, int keyLength, int hashCode, + MapJoinHashMapResult hashMapResult) throws IOException { + + if (!bloom1.testLong(hashCode)) { + /* + * if the keyHash is missing in the bloom filter, then the value cannot exist in any of the + * spilled partition - return NOMATCH + */ + hashMapResult.setNoMatch(); + return; + } + + int partitionId = hashCode & (hashPartitions.length - 1); + + // If the target hash table is on disk, spill this row to disk as well to be processed later + if (isOnDisk(partitionId)) { + hashMapResult.setSpill(partitionId); + } else { + hashPartitions[partitionId].hashTable.hashMapLookup(keyBytes, keyStart, keyLength, + hashCode, hashMapResult); + } + } + + @Override + public void hashMapLookup(long key, int hashCode, MapJoinHashMapResult hashMapResult) + throws IOException { + + if (!bloom1.testLong(hashCode)) { + /* + * if the keyHash is missing in the bloom filter, then the value cannot exist in any of the + * spilled partition - return NOMATCH + */ + hashMapResult.setNoMatch(); + return; } - public int directSpillPartitionId() { - return partitionId; + int partitionId = hashCode & (hashPartitions.length - 1); + + // If the target hash table is on disk, spill this row to disk as well to be processed later + if (isOnDisk(partitionId)) { + hashMapResult.setSpill(partitionId); + } else { + hashPartitions[partitionId].hashTable.hashMapLookup(key, + hashCode, hashMapResult); + } + } + + @Override + public void hashMultiSetContains(byte[] keyBytes, int keyStart, int keyLength, int hashCode, + MapJoinHashMultiSetResult hashMultiSetResult) + throws IOException { + + if (!bloom1.testLong(hashCode)) { + /* + * if the keyHash is missing in the bloom filter, then the value cannot exist in any of the + * spilled partition - return NOMATCH + */ + hashMultiSetResult.setNoMatch(); + return; + } + + int partitionId = hashCode & (hashPartitions.length - 1); + + // If the target hash table is on disk, spill this row to disk as well to be processed later + if (isOnDisk(partitionId)) { + hashMultiSetResult.setSpill(partitionId); + } else { + 
hashPartitions[partitionId].hashTable.hashMultiSetContains(keyBytes, keyStart, keyLength, + hashCode, hashMultiSetResult); + } + } + + @Override + public void hashMultiSetContains(long key, int hashCode, + MapJoinHashMultiSetResult hashMultiSetResult) throws IOException { + + if (!bloom1.testLong(hashCode)) { + /* + * if the keyHash is missing in the bloom filter, then the value cannot exist in any of the + * spilled partition - return NOMATCH + */ + hashMultiSetResult.setNoMatch(); + return; + } + + int partitionId = hashCode & (hashPartitions.length - 1); + + // If the target hash table is on disk, spill this row to disk as well to be processed later + if (isOnDisk(partitionId)) { + hashMultiSetResult.setSpill(partitionId); + } else { + hashPartitions[partitionId].hashTable.hashMultiSetContains(key, + hashCode, hashMultiSetResult); + } + } + + @Override + public void hashSetContains(byte[] keyBytes, int keyStart, int keyLength, int hashCode, + MapJoinHashSetResult hashSetResult) throws IOException { + + if (!bloom1.testLong(hashCode)) { + /* + * if the keyHash is missing in the bloom filter, then the value cannot exist in any of the + * spilled partition - return NOMATCH + */ + hashSetResult.setNoMatch(); + return; + } + + int partitionId = hashCode & (hashPartitions.length - 1); + + // If the target hash table is on disk, spill this row to disk as well to be processed later + if (isOnDisk(partitionId)) { + hashSetResult.setSpill(partitionId); + } else { + hashPartitions[partitionId].hashTable.hashSetContains(keyBytes, keyStart, keyLength, + hashCode, hashSetResult); + } + } + + @Override + public void hashSetContains(long key, int hashCode, MapJoinHashSetResult hashSetResult) + throws IOException { + + if (!bloom1.testLong(hashCode)) { + /* + * if the keyHash is missing in the bloom filter, then the value cannot exist in any of the + * spilled partition - return NOMATCH + */ + hashSetResult.setNoMatch(); + return; + } + + int partitionId = hashCode & (hashPartitions.length - 1); + + // If the target hash table is on disk, spill this row to disk as well to be processed later + if (isOnDisk(partitionId)) { + hashSetResult.setSpill(partitionId); + } else { + hashPartitions[partitionId].hashTable.hashSetContains(key, + hashCode, hashSetResult); } } @@ -1036,8 +1193,8 @@ public int directSpillPartitionId() { public void dumpMetrics() { for (int i = 0; i < hashPartitions.length; i++) { HashPartition hp = hashPartitions[i]; - if (hp.hashMap != null) { - hp.hashMap.debugDumpMetrics(); + if (hp.hashTable != null) { + hp.hashTable.debugDumpMetrics(); } } } @@ -1071,22 +1228,25 @@ public int size() { @Override public void setSerde(MapJoinObjectSerDeContext keyCtx, MapJoinObjectSerDeContext valCtx) throws SerDeException { - SerDe keySerde = keyCtx.getSerDe(), valSerde = valCtx.getSerDe(); - - if (writeHelper == null) { - LOG.info("Initializing container with " + keySerde.getClass().getName() + " and " - + valSerde.getClass().getName()); - - // We assume this hashtable is loaded only when tez is enabled - LazyBinaryStructObjectInspector valSoi = - (LazyBinaryStructObjectInspector) valSerde.getObjectInspector(); - writeHelper = new MapJoinBytesTableContainer.LazyBinaryKvWriter(keySerde, valSoi, - valCtx.hasFilterTag()); - if (internalValueOi == null) { - internalValueOi = valSoi; - } - if (sortableSortOrders == null) { - sortableSortOrders = ((BinarySortableSerDe) keySerde).getSortOrders(); + if (mapJoinHashTableFactory.keyValuePutHelperIsExternal()) { + keyValuePutHelper = 
mapJoinHashTableFactory.createKeyValuePut(); + } else { + SerDe keySerde = keyCtx.getSerDe(), valSerde = valCtx.getSerDe(); + if (keyValuePutHelper == null) { + LOG.info("Initializing container with " + keySerde.getClass().getName() + " and " + + valSerde.getClass().getName()); + + // We assume this hashtable is loaded only when tez is enabled + LazyBinaryStructObjectInspector valSoi = + (LazyBinaryStructObjectInspector) valSerde.getObjectInspector(); + keyValuePutHelper = new MapJoinBytesTableContainer.LazyBinaryKvWriter(keySerde, valSoi, + valCtx.hasFilterTag()); + if (internalValueOi == null) { + internalValueOi = valSoi; + } + if (sortableSortOrders == null) { + sortableSortOrders = ((BinarySortableSerDe) keySerde).getSortOrders(); + } } if (nullMarkers == null) { nullMarkers = ((BinarySortableSerDe) keySerde).getNullMarkers(); diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/MapJoinBytesTableContainer.java ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/MapJoinBytesTableContainer.java index a8aa71a..c10b77b 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/MapJoinBytesTableContainer.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/MapJoinBytesTableContainer.java @@ -19,6 +19,7 @@ package org.apache.hadoop.hive.ql.exec.persistence; +import java.io.IOException; import java.io.ObjectOutputStream; import java.util.ArrayList; import java.util.Arrays; @@ -28,7 +29,13 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.ql.exec.ExprNodeEvaluator; -import org.apache.hadoop.hive.ql.exec.JoinUtil; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashMapResult; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTable; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableManage; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableFactory; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableManage.KeyValuePut; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableManage.KeyValuePutWriter; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableResult.MapJoinResult; import org.apache.hadoop.hive.ql.exec.vector.VectorHashKeyWrapper; import org.apache.hadoop.hive.ql.exec.vector.VectorHashKeyWrapperBatch; import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpressionWriter; @@ -65,14 +72,16 @@ /** * Table container that serializes keys and values using LazyBinarySerDe into - * BytesBytesMultiHashMap, with very low memory overhead. However, + * a MapJoinHashTableFind, with very low memory overhead. However, * there may be some perf overhead when retrieving rows. */ -public class MapJoinBytesTableContainer - implements MapJoinTableContainer, MapJoinTableContainerDirectAccess { +public class MapJoinBytesTableContainer implements MapJoinTableContainer { private static final Logger LOG = LoggerFactory.getLogger(MapJoinTableContainer.class); - private final BytesBytesMultiHashMap hashMap; + private final MapJoinHashTableFactory mapJoinHashTableFactory; + + private final MapJoinHashTable hashTable; + /** The OI used to deserialize values. We never deserialize keys. 
*/ private LazyBinaryStructObjectInspector internalValueOi; /** @@ -84,31 +93,35 @@ private boolean[] sortableSortOrders; private byte[] nullMarkers; private byte[] notNullMarkers; - private KeyValueHelper writeHelper; - private DirectKeyValueWriter directWriteHelper; + + private KeyValuePut keyValuePutHelper; private final List EMPTY_LIST = new ArrayList(0); public MapJoinBytesTableContainer(Configuration hconf, - MapJoinObjectSerDeContext valCtx, long keyCount, long memUsage) throws SerDeException { + MapJoinObjectSerDeContext valCtx, long keyCount, long memUsage, + MapJoinHashTableFactory mapJoinHashTableFactory) throws SerDeException { this(HiveConf.getFloatVar(hconf, HiveConf.ConfVars.HIVEHASHTABLEKEYCOUNTADJUSTMENT), HiveConf.getIntVar(hconf, HiveConf.ConfVars.HIVEHASHTABLETHRESHOLD), HiveConf.getFloatVar(hconf, HiveConf.ConfVars.HIVEHASHTABLELOADFACTOR), HiveConf.getIntVar(hconf, HiveConf.ConfVars.HIVEHASHTABLEWBSIZE), - valCtx, keyCount, memUsage); + valCtx, keyCount, memUsage, mapJoinHashTableFactory); } private MapJoinBytesTableContainer(float keyCountAdj, int threshold, float loadFactor, - int wbSize, MapJoinObjectSerDeContext valCtx, long keyCount, long memUsage) + int wbSize, MapJoinObjectSerDeContext valCtx, long keyCount, long memUsage, + MapJoinHashTableFactory mapJoinHashTableFactory) throws SerDeException { int newThreshold = HashMapWrapper.calculateTableSize( keyCountAdj, threshold, loadFactor, keyCount); - hashMap = new BytesBytesMultiHashMap(newThreshold, loadFactor, wbSize, memUsage); - directWriteHelper = new DirectKeyValueWriter(); + this.mapJoinHashTableFactory = mapJoinHashTableFactory; + hashTable = mapJoinHashTableFactory.createHashTable(newThreshold, loadFactor, wbSize, memUsage); } - public MapJoinBytesTableContainer(BytesBytesMultiHashMap hashMap) { - this.hashMap = hashMap; + public MapJoinBytesTableContainer(MapJoinHashTableFactory mapJoinHashTableFactory, + MapJoinHashTable hashTable) { + this.mapJoinHashTableFactory = mapJoinHashTableFactory; + this.hashTable = hashTable; } private LazyBinaryStructObjectInspector createInternalOi( @@ -147,13 +160,8 @@ public void setNotNullMarkers(byte[] notNullMarkers) { this.notNullMarkers = notNullMarkers; } - public static interface KeyValueHelper extends BytesBytesMultiHashMap.KvSource { - void setKeyValue(Writable key, Writable val) throws SerDeException; - /** Get hash value from the key. 
*/ - int getHashFromKey() throws SerDeException; - } + private static class KeyValueWriter implements KeyValuePutWriter { - private static class KeyValueWriter implements KeyValueHelper { private final SerDe keySerDe, valSerDe; private final StructObjectInspector keySoi, valSoi; private final List keyOis, valOis; @@ -213,12 +221,22 @@ public byte updateStateByte(Byte previousValue) { } @Override - public int getHashFromKey() throws SerDeException { + public boolean hasHashCode() { + return false; + } + + @Override + public int getKeyHashCode() throws SerDeException { + throw new UnsupportedOperationException("Not supported for MapJoinBytesTableContainer"); + } + + @Override + public long getLongKey() { throw new UnsupportedOperationException("Not supported for MapJoinBytesTableContainer"); } } - static class LazyBinaryKvWriter implements KeyValueHelper { + static class LazyBinaryKvWriter implements KeyValuePutWriter { private final LazyBinaryStruct.SingleFieldGetter filterGetter; private Writable key, value; private final SerDe keySerDe; @@ -251,7 +269,12 @@ public void writeKey(RandomAccessOutput dest) throws SerDeException { } @Override - public int getHashFromKey() throws SerDeException { + public boolean hasHashCode() { + return true; + } + + @Override + public int getKeyHashCode() throws SerDeException { if (!(key instanceof BinaryComparable)) { throw new SerDeException("Unexpected type " + key.getClass().getCanonicalName()); } @@ -319,80 +342,48 @@ public byte updateStateByte(Byte previousValue) { aliasFilter &= filterGetter.getShort(); return aliasFilter; } - } - - /* - * An implementation of KvSource that can handle key and value as BytesWritable objects. - */ - protected static class DirectKeyValueWriter implements KeyValueHelper { - - private BytesWritable key; - private BytesWritable val; - - @Override - public void setKeyValue(Writable key, Writable val) throws SerDeException { - this.key = (BytesWritable) key; - this.val = (BytesWritable) val; - } - - @Override - public void writeKey(RandomAccessOutput dest) throws SerDeException { - byte[] keyBytes = key.getBytes(); - int keyLength = key.getLength(); - dest.write(keyBytes, 0, keyLength); - } - - @Override - public void writeValue(RandomAccessOutput dest) throws SerDeException { - byte[] valueBytes = val.getBytes(); - int valueLength = val.getLength(); - dest.write(valueBytes, 0 , valueLength); - } @Override - public byte updateStateByte(Byte previousValue) { - // Not used by the direct access client -- native vector map join. 
- throw new UnsupportedOperationException("Updating the state by not supported"); - } - - @Override - public int getHashFromKey() throws SerDeException { - byte[] keyBytes = key.getBytes(); - int keyLength = key.getLength(); - return HashCodeUtil.murmurHash(keyBytes, 0, keyLength); + public long getLongKey() { + throw new UnsupportedOperationException("Not supported for MapJoinBytesTableContainer"); } } @Override public void setSerde(MapJoinObjectSerDeContext keyContext, MapJoinObjectSerDeContext valueContext) throws SerDeException { - SerDe keySerde = keyContext.getSerDe(), valSerde = valueContext.getSerDe(); - if (writeHelper == null) { - LOG.info("Initializing container with " + keySerde.getClass().getName() + " and " - + valSerde.getClass().getName()); - if (keySerde instanceof BinarySortableSerDe && valSerde instanceof LazyBinarySerDe) { - LazyBinaryStructObjectInspector valSoi = - (LazyBinaryStructObjectInspector) valSerde.getObjectInspector(); - writeHelper = new LazyBinaryKvWriter(keySerde, valSoi, valueContext.hasFilterTag()); - internalValueOi = valSoi; - sortableSortOrders = ((BinarySortableSerDe) keySerde).getSortOrders(); - nullMarkers = ((BinarySortableSerDe) keySerde).getNullMarkers(); - notNullMarkers = ((BinarySortableSerDe) keySerde).getNotNullMarkers(); - } else { - writeHelper = new KeyValueWriter(keySerde, valSerde, valueContext.hasFilterTag()); - internalValueOi = createInternalOi(valueContext); - sortableSortOrders = null; - nullMarkers = null; - notNullMarkers = null; + if (mapJoinHashTableFactory.keyValuePutHelperIsExternal()) { + keyValuePutHelper = mapJoinHashTableFactory.createKeyValuePut(); + } else { + SerDe keySerde = keyContext.getSerDe(), valSerde = valueContext.getSerDe(); + if (keyValuePutHelper == null) { + LOG.info("Initializing container with " + keySerde.getClass().getName() + " and " + + valSerde.getClass().getName()); + if (keySerde instanceof BinarySortableSerDe && valSerde instanceof LazyBinarySerDe) { + LazyBinaryStructObjectInspector valSoi = + (LazyBinaryStructObjectInspector) valSerde.getObjectInspector(); + keyValuePutHelper = new LazyBinaryKvWriter(keySerde, valSoi, valueContext.hasFilterTag()); + internalValueOi = valSoi; + sortableSortOrders = ((BinarySortableSerDe) keySerde).getSortOrders(); + nullMarkers = ((BinarySortableSerDe) keySerde).getNullMarkers(); + notNullMarkers = ((BinarySortableSerDe) keySerde).getNotNullMarkers(); + } else { + keyValuePutHelper = new KeyValueWriter(keySerde, valSerde, valueContext.hasFilterTag()); + internalValueOi = createInternalOi(valueContext); + sortableSortOrders = null; + nullMarkers = null; + notNullMarkers = null; + } } } } @SuppressWarnings("deprecation") @Override - public MapJoinKey putRow(Writable currentKey, Writable currentValue) throws SerDeException { - writeHelper.setKeyValue(currentKey, currentValue); - hashMap.put(writeHelper, -1); + public MapJoinKey putRow(Writable currentKey, Writable currentValue) + throws SerDeException, IOException { + keyValuePutHelper.setKeyValue(currentKey, currentValue); + hashTable.put(keyValuePutHelper); return null; // there's no key to return } @@ -416,15 +407,7 @@ public ReusableGetAdaptor createGetter(MapJoinKey keyTypeFromLoader) { @Override public void seal() { - hashMap.seal(); - } - - // Direct access interfaces. 
- - @Override - public void put(Writable currentKey, Writable currentValue) throws SerDeException { - directWriteHelper.setKeyValue(currentKey, currentValue); - hashMap.put(directWriteHelper, -1); + hashTable.seal(); } public static boolean hasComplexObjects(LazyBinaryStructObjectInspector lazyBinaryStructObjectInspector) { @@ -463,7 +446,7 @@ public static boolean hasComplexObjects(LazyBinaryStructObjectInspector lazyBina /** Implementation of ReusableGetAdaptor that has Output for key serialization; row * container is also created once and reused for every row. */ - private class GetAdaptor implements ReusableGetAdaptor, ReusableGetAdaptorDirectAccess { + private class GetAdaptor implements ReusableGetAdaptor { private Object[] currentKey; private boolean[] nulls; @@ -478,9 +461,9 @@ public GetAdaptor() { } @Override - public JoinUtil.JoinResult setFromVector(VectorHashKeyWrapper kw, + public MapJoinResult setFromVector(VectorHashKeyWrapper kw, VectorExpressionWriter[] keyOutputWriters, VectorHashKeyWrapperBatch keyWrapperBatch) - throws HiveException { + throws HiveException, IOException { if (nulls == null) { nulls = new boolean[keyOutputWriters.length]; currentKey = new Object[keyOutputWriters.length]; @@ -501,8 +484,8 @@ public GetAdaptor() { } @Override - public JoinUtil.JoinResult setFromRow(Object row, List fields, - List ois) throws HiveException { + public MapJoinResult setFromRow(Object row, List fields, + List ois) throws HiveException, IOException { if (nulls == null) { nulls = new boolean[fields.size()]; currentKey = new Object[fields.size()]; @@ -517,7 +500,7 @@ public GetAdaptor() { } @Override - public JoinUtil.JoinResult setFromOther(ReusableGetAdaptor other) { + public MapJoinResult setFromOther(ReusableGetAdaptor other) throws IOException { assert other instanceof GetAdaptor; GetAdaptor other2 = (GetAdaptor)other; nulls = other2.nulls; @@ -546,18 +529,6 @@ public MapJoinRowContainer getCurrentRows() { return currentKey; } - // Direct access interfaces. - - @Override - public JoinUtil.JoinResult setDirect(byte[] bytes, int offset, int length, - BytesBytesMultiHashMap.Result hashMapResult) { - return currentValue.setDirect(bytes, offset, length, hashMapResult); - } - - @Override - public int directSpillPartitionId() { - throw new UnsupportedOperationException("Getting the spill hash partition not supported"); - } } /** Row container that gets and deserializes the rows on demand from bytes provided. */ @@ -566,7 +537,7 @@ public int directSpillPartitionId() { private byte aliasFilter; /** Hash table wrapper specific to the container. */ - private final BytesBytesMultiHashMap.Result hashMapResult; + private final MapJoinHashMapResult hashMapResult; /** * Sometimes, when container is empty in multi-table mapjoin, we need to add a dummy row. 
@@ -597,21 +568,28 @@ public ReusableRowContainer() { complexObjectArrayBuffer = null; } uselessIndirection = new ByteArrayRef(); - hashMapResult = new BytesBytesMultiHashMap.Result(); + hashMapResult = mapJoinHashTableFactory.createHashMapResult(); clearRows(); } - public JoinUtil.JoinResult setFromOutput(Output output) { + public MapJoinResult setFromOutput(Output output) throws IOException { + + int keyHash = HashCodeUtil.murmurHash(output.getData(), 0, output.getLength()); - aliasFilter = hashMap.getValueResult( - output.getData(), 0, output.getLength(), hashMapResult); + hashTable.hashMapLookup(output.getData(), 0, output.getLength(), keyHash, hashMapResult); + MapJoinResult mapJoinResult = hashMapResult.getMapJoinResult(); dummyRow = null; - if (hashMapResult.hasRows()) { - return JoinUtil.JoinResult.MATCH; - } else { + switch (mapJoinResult) { + case MATCH: + aliasFilter = hashMapResult.aliasFilter(); + break; + case NO_MATCH: aliasFilter = (byte) 0xff; - return JoinUtil.JoinResult.NOMATCH; + break; + default: + throw new RuntimeException("Unexpected map join result " + mapJoinResult.name()); } + return mapJoinResult; } @@ -728,20 +706,6 @@ public void addRow(Object[] value) { public void write(MapJoinObjectSerDeContext valueContext, ObjectOutputStream out) { throw new RuntimeException(this.getClass().getCanonicalName() + " cannot be serialized"); } - - // Direct access. - - public JoinUtil.JoinResult setDirect(byte[] bytes, int offset, int length, - BytesBytesMultiHashMap.Result hashMapResult) { - aliasFilter = hashMap.getValueResult(bytes, offset, length, hashMapResult); - dummyRow = null; - if (hashMapResult.hasRows()) { - return JoinUtil.JoinResult.MATCH; - } else { - aliasFilter = (byte) 0xff; - return JoinUtil.JoinResult.NOMATCH; - } - } } public static boolean isSupportedKey(ObjectInspector keyOi) { @@ -756,7 +720,7 @@ public static boolean isSupportedKey(ObjectInspector keyOi) { @Override public void dumpMetrics() { - hashMap.debugDumpMetrics(); + hashTable.debugDumpMetrics(); } @Override @@ -766,6 +730,16 @@ public boolean hasSpill() { @Override public int size() { - return hashMap.size(); + return hashTable.size(); + } + + @Override + public MapJoinHashTable getMapJoinHashTableFind() { + return hashTable; + } + + @Override + public MapJoinHashTableFactory getMapJoinHashTableFactory() { + return mapJoinHashTableFactory; } } diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/MapJoinTableContainer.java ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/MapJoinTableContainer.java index 6d71fef..66ba9b7 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/MapJoinTableContainer.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/MapJoinTableContainer.java @@ -22,7 +22,9 @@ import java.util.List; import org.apache.hadoop.hive.ql.exec.ExprNodeEvaluator; -import org.apache.hadoop.hive.ql.exec.JoinUtil; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableFactory; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableFind; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableResult.MapJoinResult; import org.apache.hadoop.hive.ql.exec.vector.VectorHashKeyWrapper; import org.apache.hadoop.hive.ql.exec.vector.VectorHashKeyWrapperBatch; import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpressionWriter; @@ -42,21 +44,21 @@ * Changes current rows to which adaptor is referring to the rows corresponding to * the key represented by a VHKW 
object, and writers and batch used to interpret it. */ - JoinUtil.JoinResult setFromVector(VectorHashKeyWrapper kw, VectorExpressionWriter[] keyOutputWriters, - VectorHashKeyWrapperBatch keyWrapperBatch) throws HiveException; + MapJoinResult setFromVector(VectorHashKeyWrapper kw, VectorExpressionWriter[] keyOutputWriters, + VectorHashKeyWrapperBatch keyWrapperBatch) throws HiveException, IOException; /** * Changes current rows to which adaptor is referring to the rows corresponding to * the key represented by a row object, and fields and ois used to interpret it. */ - JoinUtil.JoinResult setFromRow(Object row, List fields, List ois) - throws HiveException; + MapJoinResult setFromRow(Object row, List fields, List ois) + throws HiveException, IOException; /** * Changes current rows to which adaptor is referring to the rows corresponding to * the key that another adaptor has already deserialized via setFromVector/setFromRow. */ - JoinUtil.JoinResult setFromOther(ReusableGetAdaptor other) throws HiveException; + MapJoinResult setFromOther(ReusableGetAdaptor other) throws HiveException, IOException; /** * Checks whether the current key has any nulls. @@ -93,6 +95,14 @@ MapJoinKey putRow(Writable currentKey, Writable currentValue) */ ReusableGetAdaptor createGetter(MapJoinKey keyTypeFromLoader); + /** + * Provides "managed" access to the map join hash table. For use by the native vector map join + * implementation. + */ + MapJoinHashTableFind getMapJoinHashTableFind(); + + MapJoinHashTableFactory getMapJoinHashTableFactory(); + /** Clears the contents of the table. */ void clear(); diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/MapJoinTableContainerSerDe.java ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/MapJoinTableContainerSerDe.java index 7a36b53..a73e0ac 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/MapJoinTableContainerSerDe.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/MapJoinTableContainerSerDe.java @@ -32,7 +32,6 @@ import org.apache.hadoop.fs.Path; import org.apache.hadoop.hive.common.JavaUtils; import org.apache.hadoop.hive.conf.HiveConf; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.fast.VectorMapJoinFastTableContainer; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.plan.MapJoinDesc; import org.apache.hadoop.hive.serde2.SerDe; @@ -140,7 +139,8 @@ public MapJoinTableContainer load( Map metaData = (Map) in.readObject(); if (tableContainer == null) { tableContainer = useOptimizedContainer ? - new MapJoinBytesTableContainer(hconf, valueContext, -1, 0) : + new MapJoinBytesTableContainer(hconf, valueContext, -1, 0, + new BytesBytesMultiHashMapFactory()) : create(name, metaData); } tableContainer.setSerde(keyContext, valueContext); @@ -198,75 +198,6 @@ private void loadOptimized(MapJoinBytesTableContainer container, ObjectInputStre } } - /** - * Loads the small table into a VectorMapJoinFastTableContainer. Only used on Spark path. - * @param mapJoinDesc The descriptor for the map join - * @param fs FileSystem of the folder. - * @param folder The folder to load table container. - * @param hconf The hive configuration - * @return Loaded table. 
- */ - @SuppressWarnings("unchecked") - public MapJoinTableContainer loadFastContainer(MapJoinDesc mapJoinDesc, - FileSystem fs, Path folder, Configuration hconf) throws HiveException { - try { - if (!fs.isDirectory(folder)) { - throw new HiveException("Error, not a directory: " + folder); - } - FileStatus[] fileStatuses = fs.listStatus(folder); - if (fileStatuses == null || fileStatuses.length == 0) { - return null; - } - - SerDe keySerDe = keyContext.getSerDe(); - SerDe valueSerDe = valueContext.getSerDe(); - Writable key = keySerDe.getSerializedClass().newInstance(); - Writable value = valueSerDe.getSerializedClass().newInstance(); - - VectorMapJoinFastTableContainer tableContainer = - new VectorMapJoinFastTableContainer(mapJoinDesc, hconf, -1); - tableContainer.setSerde(keyContext, valueContext); - - for (FileStatus fileStatus : fileStatuses) { - Path filePath = fileStatus.getPath(); - if (ShimLoader.getHadoopShims().isDirectory(fileStatus)) { - throw new HiveException("Error, not a file: " + filePath); - } - InputStream is = null; - ObjectInputStream in = null; - try { - is = fs.open(filePath, 4096); - in = new ObjectInputStream(is); - // skip the name and metadata - in.readUTF(); - in.readObject(); - int numKeys = in.readInt(); - for (int keyIndex = 0; keyIndex < numKeys; keyIndex++) { - key.readFields(in); - long numRows = in.readLong(); - for (long rowIndex = 0L; rowIndex < numRows; rowIndex++) { - value.readFields(in); - tableContainer.putRow(key, value); - } - } - } finally { - if (in != null) { - in.close(); - } else if (is != null) { - is.close(); - } - } - } - - tableContainer.seal(); - return tableContainer; - } catch (IOException e) { - throw new HiveException("IO error while trying to create table container", e); - } catch (Exception e) { - throw new HiveException("Error while trying to create table container", e); - } - } - public void persist(ObjectOutputStream out, MapJoinPersistableTableContainer tableContainer) throws HiveException { int numKeys = tableContainer.size(); diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/ReusableGetAdaptorDirectAccess.java ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/ReusableGetAdaptorDirectAccess.java deleted file mode 100644 index 0685d84..0000000 --- ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/ReusableGetAdaptorDirectAccess.java +++ /dev/null @@ -1,30 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.hadoop.hive.ql.exec.persistence; - - -import org.apache.hadoop.hive.ql.exec.JoinUtil.JoinResult; - -public interface ReusableGetAdaptorDirectAccess { - - JoinResult setDirect(byte[] bytes, int offset, int length, - BytesBytesMultiHashMap.Result hashMapResult); - - int directSpillPartitionId(); -} \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/mapjoinhashtable/MapJoinHashMapResult.java ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/mapjoinhashtable/MapJoinHashMapResult.java new file mode 100644 index 0000000..b33e623 --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/mapjoinhashtable/MapJoinHashMapResult.java @@ -0,0 +1,73 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable; + +import org.apache.hadoop.hive.serde2.WriteBuffers.ByteSegmentRef; + +/* + * Interface for a hash map result. For reading the values, one-by-one. + */ +public interface MapJoinHashMapResult extends MapJoinHashTableResult { + + /** + * @return Whether there are any rows (i.e. true for match). + */ + boolean hasRows(); + + /** + * @return Whether there is 1 value row. + */ + boolean isSingleRow(); + + /** + * @return Whether there is a capped count available from cappedCount. + */ + boolean isCappedCountAvailable(); + + /** + * @return The count of values, up to a arbitrary cap limit. When available, the capped + * count can be used to make decisions on how to optimally generate join results. + */ + int cappedCount(); + + /** + * @return A reference to the first value, or null if there are no values. + */ + ByteSegmentRef first(); + + /** + * @return The next value, or null if there are no more values to be read. + */ + ByteSegmentRef next(); + + /** + * @return Whether reading is at the end. + */ + boolean isEof(); + + /** + * @return Whether there is alias filter available from aliasFilter. + */ + boolean isAliasFilterAvailable(); + + /** + * @return Alias filter byte. + */ + byte aliasFilter(); +} diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/mapjoinhashtable/MapJoinHashMultiSetResult.java ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/mapjoinhashtable/MapJoinHashMultiSetResult.java new file mode 100644 index 0000000..30db8e2 --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/mapjoinhashtable/MapJoinHashMultiSetResult.java @@ -0,0 +1,35 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable; + +/* + * Interface for a hash multi-set result, with the additional multi-set specific methods. + */ +public interface MapJoinHashMultiSetResult extends MapJoinHashTableResult { + + /* + * @return The multi-set count for the lookup key. + */ + public long count(); + + /* + * Mark the result as matched with count multi-set entries. + */ + public void setMatch(long count); +} diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/mapjoinhashtable/MapJoinHashMultiSetResultImpl.java ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/mapjoinhashtable/MapJoinHashMultiSetResultImpl.java new file mode 100644 index 0000000..b568163 --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/mapjoinhashtable/MapJoinHashMultiSetResultImpl.java @@ -0,0 +1,45 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable; + +/* + * Concrete implementation of a hash multi-set result. + */ +public class MapJoinHashMultiSetResultImpl extends MapJoinHashTableResultImpl + implements MapJoinHashMultiSetResult { + + protected long count; + + public MapJoinHashMultiSetResultImpl() { + super(); + count = 0; + } + + /* + * @return The multi-set count for the lookup key. + */ + public long count() { + return count; + } + + public void setMatch(long count) { + this.count = count; + mapJoinResult = MapJoinResult.MATCH; + } +} diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/mapjoinhashtable/MapJoinHashSetResult.java ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/mapjoinhashtable/MapJoinHashSetResult.java new file mode 100644 index 0000000..5849fff --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/mapjoinhashtable/MapJoinHashSetResult.java @@ -0,0 +1,31 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable; + +/* + * Interface for a hash set result, with the additional set specific methods. + */ +public interface MapJoinHashSetResult extends MapJoinHashTableResult { + + /* + * Mark the result as matched for a set entry. + */ + void setMatch(); + +} diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/mapjoinhashtable/MapJoinHashSetResultImpl.java ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/mapjoinhashtable/MapJoinHashSetResultImpl.java new file mode 100644 index 0000000..18037b6 --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/mapjoinhashtable/MapJoinHashSetResultImpl.java @@ -0,0 +1,35 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable; + +/* + * Concrete implementation of a hash set result. + */ +public class MapJoinHashSetResultImpl extends MapJoinHashTableResultImpl + implements MapJoinHashSetResult { + + public MapJoinHashSetResultImpl() { + super(); + } + + public void setMatch() { + mapJoinResult = MapJoinResult.MATCH; + } + +} diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/mapjoinhashtable/MapJoinHashTable.java ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/mapjoinhashtable/MapJoinHashTable.java new file mode 100644 index 0000000..0d6cae9 --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/mapjoinhashtable/MapJoinHashTable.java @@ -0,0 +1,27 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable; + +/* + * Root interface for a map join hash table (which could be a hash map, hash multi-set, + * or hash set); combines the manage and find interfaces. + */ +public interface MapJoinHashTable extends MapJoinHashTableManage, MapJoinHashTableFind { + +} diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/mapjoinhashtable/MapJoinHashTableFactory.java ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/mapjoinhashtable/MapJoinHashTableFactory.java new file mode 100644 index 0000000..f179d92 --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/mapjoinhashtable/MapJoinHashTableFactory.java @@ -0,0 +1,77 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable; + +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableManage.KeyValuePut; + +/* + * Factory for a map join hash table (which could be a hash map, hash multi-set, + * or hash set) and its implementation-specific result objects. + */ +public interface MapJoinHashTableFactory { + + /** + * @return true when the hash table implementation manages its own key value put helper. + */ + public boolean keyValuePutHelperIsExternal(); + + /** + * @return true if the min/max optimization could be used. + */ + public boolean useMinMax(); + + /** + * @return A key value put helper, for hash tables that manage their own key and value put object. + */ + KeyValuePut createKeyValuePut(); + + /** + * @param initialCapacity + * @param loadFactor + * @param writeBuffersSize + * @param memUsage + * @return A new hash table. + */ + MapJoinHashTable createHashTable(int initialCapacity, float loadFactor, + int writeBuffersSize, long memUsage); + + /* + * @return A new implementation-specific hash map result object. + * + * The object can be used to access the values when there is a match, or + * access spill information when the partition with the key is currently spilled. + */ + MapJoinHashMapResult createHashMapResult(); + + /* + * @return A new implementation-specific hash multi-set result object. + * + * The object can be used to access the *count* of values when the key is contained in the + * multi-set, or access spill information when the partition with the key is currently spilled. + */ + MapJoinHashMultiSetResult createHashMultiSetResult(); + + /* + * @return A new implementation-specific hash set result object. + * + * The object can be used to access spill information when the partition with the key + * is currently spilled. 
+ */ + MapJoinHashSetResult createHashSetResult(); +} diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/mapjoinhashtable/MapJoinHashTableFind.java ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/mapjoinhashtable/MapJoinHashTableFind.java new file mode 100644 index 0000000..936d99a --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/mapjoinhashtable/MapJoinHashTableFind.java @@ -0,0 +1,157 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable; + +import java.io.IOException; + +/* + * Lookup (find) interface for a map join hash table (which could be a hash map, hash multi-set, + * or hash set). + */ +public interface MapJoinHashTableFind { + + //---------------------------- COMMON LONG METHODS (Begin)---------------------------------------- + + boolean useMinMax(); + + long min(); + long max(); + + //----------------------------- COMMON LONG METHODS (End)----------------------------------------- + + //-------------------------------- HASH MAP (Begin)----------------------------------------------- + + /* + * Lookup a byte array key in the hash map. + * + * @param keyBytes + * A byte array containing the key within a range. + * @param keyStart + * The offset of the beginning of the key. + * @param keyLength + * The length of the key. + * @param hashCode + * The key hash code. + * @param hashMapResult + * The object to receive small table value(s) information on a MATCH. + * Or, for SPILL, it has information on where to spill the big table row. + * Examine getMapJoinResult() for the lookup result. + */ + void hashMapLookup(byte[] keyBytes, int keyStart, int keyLength, int hashCode, + MapJoinHashMapResult hashMapResult) throws IOException; + + /* + * Lookup a long key in the hash map. + * + * @param key + * The long key. + * @param hashCode + * The key hash code. + * @param hashMapResult + * The object to receive small table value(s) information on a MATCH. + * Or, for SPILL, it has information on where to spill the big table row. + * Examine getMapJoinResult() for the lookup result. + */ + void hashMapLookup(long key, int hashCode, + MapJoinHashMapResult hashMapResult) throws IOException; + + + //-------------------------------- HASH MAP (End) ------------------------------------------------ + + //---------------------------- HASH MULTI-SET (Begin) ------------------------------------------- + + /* + * Lookup a byte array key in the hash multi-set. + * + * @param keyBytes + * A byte array containing the key within a range. + * @param keyStart + * The offset of the beginning of the key. + * @param keyLength + * The length of the key. + * @param hashCode + * The key hash code. 
+ * @param hashMultiSetResult + * The object to receive small table value(s) information on a MATCH. + * Or, for SPILL, it has information on where to spill the big table row. + * Examine getMapJoinResult() for the lookup result. + */ + void hashMultiSetContains(byte[] keyBytes, int keyStart, int keyLength, int hashCode, + MapJoinHashMultiSetResult hashMultiSetResult) + throws IOException; + + /* + * Lookup a long key in the hash multi-set. + * + * @param key + * The long key. + * @param hashCode + * The key hash code. + * @param hashMultiSetResult + * The object to receive small table value(s) information on a MATCH. + * Or, for SPILL, it has information on where to spill the big table row. + * Examine getMapJoinResult() for the lookup result. + */ + void hashMultiSetContains(long key, int hashCode, + MapJoinHashMultiSetResult hashMultiSetResult) + throws IOException; + + + //----------------------------- HASH MULTI-SET (End) -------------------------------------------- + + //------------------------------- HASH SET (Begin) ---------------------------------------------- + + /* + * Lookup a byte array key in the hash set. + * + * @param keyBytes + * A byte array containing the key within a range. + * @param keyStart + * The offset of the beginning of the key. + * @param keyLength + * The length of the key. + * @param hashCode + * The key hash code. + * @param hashSetResult + * The object to receive small table value(s) information on a MATCH. + * Or, for SPILL, it has information on where to spill the big table row. + * Examine getMapJoinResult() for the lookup result. + */ + void hashSetContains(byte[] keyBytes, int keyStart, int keyLength, int hashCode, + MapJoinHashSetResult hashSetResult) + throws IOException; + + /* + * Lookup a long key in the hash set. + * + * @param key + * The long key. + * @param hashCode + * The key hash code. + * @param hashSetResult + * The object to receive small table value(s) information on a MATCH. + * Or, for SPILL, it has information on where to spill the big table row. + * Examine getMapJoinResult() for the lookup result. + */ + void hashSetContains(long key, int hashCode, MapJoinHashSetResult hashSetResult) + throws IOException; + + //--------------------------------- HASH SET (End) ---------------------------------------------- + +} diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/mapjoinhashtable/MapJoinHashTableManage.java ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/mapjoinhashtable/MapJoinHashTableManage.java new file mode 100644 index 0000000..1d9d182 --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/mapjoinhashtable/MapJoinHashTableManage.java @@ -0,0 +1,102 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable; + +import java.io.IOException; + +import org.apache.hadoop.hive.serde2.SerDeException; +import org.apache.hadoop.hive.serde2.ByteStream.RandomAccessOutput; +import org.apache.hadoop.io.Writable; + +/* + * Management interface (put, size, seal, clear) for a map join hash table (which could be a + * hash map, hash multi-set, or hash set). + */ +public interface MapJoinHashTableManage { + + /* + * The interface for an object that helps adding a new key/value to the hash table. + * + * Hash code calculation is pulled up to support the HybridHashTableContainer, which needs + * the hash code earlier so it can partition the hash tables, and also for bloom filters. + * + * To support single long key min/max, we have a method that can extract the long value out. + */ + public static interface KeyValuePut { + + void setKeyValue(Writable key, Writable value) throws SerDeException, IOException; + + boolean hasHashCode(); + + int getKeyHashCode() throws SerDeException; + + long getLongKey(); + } + + /** + * The source of keys and values to put into the hash table; avoids byte copying. + * Supports BytesBytesMultiHashMap. + */ + public static interface KeyValuePutWriter extends KeyValuePut { + /** Write key into output. */ + void writeKey(RandomAccessOutput dest) throws SerDeException; + + /** Write value into output. */ + void writeValue(RandomAccessOutput dest) throws SerDeException; + + /** + * Provide updated value for state byte for a key. + * @param previousValue Previous value; null if this is the first call per key. + * @return The updated value. + */ + byte updateStateByte(Byte previousValue); + } + + public void put(KeyValuePut keyValuePutHelper) throws SerDeException; + + + /** + * Number of keys in the hash table. + * @return number of keys + */ + int size(); + + /** + * Number of values in the hash table. + * This is equal to or bigger than the number of keys, since several values may share the same key. + * @return number of values + */ + int getNumValues(); + + /** + * Number of bytes used by the hash table. + * There are two main components that take most memory: writeBuffers and refs + * Others include instance fields: 100 + * @return number of bytes + */ + long memorySize(); + + void seal(); + + void clear(); + + void debugDumpMetrics(); + + void expandAndRehashToTarget(int estimateNewRowCount); +} diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/mapjoinhashtable/MapJoinHashTableResult.java ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/mapjoinhashtable/MapJoinHashTableResult.java new file mode 100644 index 0000000..591b0da --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/mapjoinhashtable/MapJoinHashTableResult.java @@ -0,0 +1,60 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable; + +/* + * Root interface for a hash table result. + */ +public interface MapJoinHashTableResult { + + /** + * Represents the hash map lookup result between two tables + */ + public enum MapJoinResult { + NONE, + MATCH, + NO_MATCH, + SPILL + } + + /** + * Forget about the most recent hash table lookup or contains call. + */ + void forget(); + + /** + * Set the map join result. + */ + void setNoMatch(); + + /** + * @return The map join result. + */ + MapJoinResult getMapJoinResult(); + + /** + * Set the spill partition id. + */ + void setSpill(int spillPartitionId); + + /** + * @return The Hybrid Grace spill partition id. + */ + int getSpillPartitionId(); +} diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/mapjoinhashtable/MapJoinHashTableResultImpl.java ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/mapjoinhashtable/MapJoinHashTableResultImpl.java new file mode 100644 index 0000000..0c43655 --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/mapjoinhashtable/MapJoinHashTableResultImpl.java @@ -0,0 +1,87 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable; + +/* + * Abstract implementation class for a hash table result. + */ +public abstract class MapJoinHashTableResultImpl implements MapJoinHashTableResult { + + protected MapJoinResult mapJoinResult; + + private int spillPartitionId; + + public MapJoinHashTableResultImpl() { + mapJoinResult = MapJoinResult.NONE; + spillPartitionId = -1; + } + + /** + * Forget about the most recent hash table lookup or contains call. + */ + @Override + public void forget() { + mapJoinResult = MapJoinResult.NONE; + spillPartitionId = -1; + } + + /** + * Set the map join result. + */ + @Override + public void setNoMatch() { + this.mapJoinResult = MapJoinResult.NO_MATCH; + } + + /** + * @return The map join result. + */ + @Override + public MapJoinResult getMapJoinResult() { + return mapJoinResult; + } + + /** + * Set the spill partition id. + */ + @Override + public void setSpill(int spillPartitionId) { + this.mapJoinResult = MapJoinResult.SPILL; + this.spillPartitionId = spillPartitionId; + } + + /** + * @return The Hybrid Grace spill partition id. 
+ */ + @Override + public int getSpillPartitionId() { + return spillPartitionId; + } + + @Override + public String toString() { + StringBuilder sb = new StringBuilder(); + sb.append(mapJoinResult.name()); + if (mapJoinResult == MapJoinResult.SPILL) { + sb.append(", spillPartitionId "); + sb.append(spillPartitionId); + } + return sb.toString(); + } +} diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/spark/HashTableLoader.java ql/src/java/org/apache/hadoop/hive/ql/exec/spark/HashTableLoader.java index 1634f42..7d3a08c 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/spark/HashTableLoader.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/spark/HashTableLoader.java @@ -64,8 +64,6 @@ private MapJoinOperator joinOp; private MapJoinDesc desc; - private boolean useFastContainer = false; - @Override public void init(ExecMapperContext context, MapredContext mrContext, Configuration hconf, MapJoinOperator joinOp) { @@ -73,12 +71,6 @@ public void init(ExecMapperContext context, MapredContext mrContext, Configurati this.hconf = hconf; this.joinOp = joinOp; this.desc = joinOp.getConf(); - if (desc.getVectorMode() && HiveConf.getBoolVar( - hconf, HiveConf.ConfVars.HIVE_VECTORIZATION_MAPJOIN_NATIVE_FAST_HASHTABLE_ENABLED)) { - VectorMapJoinDesc vectorDesc = desc.getVectorDesc(); - useFastContainer = vectorDesc != null && vectorDesc.hashTableImplementationType() == - VectorMapJoinDesc.HashTableImplementationType.FAST; - } } @Override @@ -108,7 +100,7 @@ public void load(MapJoinTableContainer[] mapJoinTables, FileSystem fs = FileSystem.get(baseDir.toUri(), hconf); BucketMapJoinContext mapJoinCtx = localWork.getBucketMapjoinContext(); boolean firstContainer = true; - boolean useOptimizedContainer = !useFastContainer && HiveConf.getBoolVar( + boolean useOptimizedContainer = HiveConf.getBoolVar( hconf, HiveConf.ConfVars.HIVEMAPJOINUSEOPTIMIZEDTABLE); for (int pos = 0; pos < mapJoinTables.length; pos++) { if (pos == desc.getPosBigTable() || mapJoinTables[pos] != null) { @@ -156,17 +148,14 @@ private MapJoinTableContainer load(FileSystem fs, Path path, MapJoinTableContainerSerDe mapJoinTableSerde) throws HiveException { LOG.info("\tLoad back all hashtable files from tmp folder uri:" + path); if (!SparkUtilities.isDedicatedCluster(hconf)) { - return useFastContainer ? mapJoinTableSerde.loadFastContainer(desc, fs, path, hconf) : - mapJoinTableSerde.load(fs, path, hconf); + return mapJoinTableSerde.load(fs, path, hconf); } MapJoinTableContainer mapJoinTable = SmallTableCache.get(path); if (mapJoinTable == null) { synchronized (path.toString().intern()) { mapJoinTable = SmallTableCache.get(path); if (mapJoinTable == null) { - mapJoinTable = useFastContainer ? 
- mapJoinTableSerde.loadFastContainer(desc, fs, path, hconf) : - mapJoinTableSerde.load(fs, path, hconf); + mapJoinTable = mapJoinTableSerde.load(fs, path, hconf); SmallTableCache.cache(path, mapJoinTable); } } diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/tez/HashTableLoader.java ql/src/java/org/apache/hadoop/hive/ql/exec/tez/HashTableLoader.java index a742458..cf8d1dd 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/tez/HashTableLoader.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/tez/HashTableLoader.java @@ -31,6 +31,7 @@ import org.apache.hadoop.hive.ql.exec.MapJoinOperator; import org.apache.hadoop.hive.ql.exec.MapredContext; import org.apache.hadoop.hive.ql.exec.mr.ExecMapperContext; +import org.apache.hadoop.hive.ql.exec.persistence.BytesBytesMultiHashMapFactory; import org.apache.hadoop.hive.ql.exec.persistence.HashMapWrapper; import org.apache.hadoop.hive.ql.exec.persistence.HybridHashTableConf; import org.apache.hadoop.hive.ql.exec.persistence.HybridHashTableContainer; @@ -38,9 +39,13 @@ import org.apache.hadoop.hive.ql.exec.persistence.MapJoinObjectSerDeContext; import org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainer; import org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainerSerDe; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableFactory; +import org.apache.hadoop.hive.ql.exec.vector.mapjoin.fast.VectorMapJoinFastHashTableFactory; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.plan.ExprNodeDesc; import org.apache.hadoop.hive.ql.plan.MapJoinDesc; +import org.apache.hadoop.hive.ql.plan.VectorMapJoinDesc; +import org.apache.hadoop.hive.ql.plan.VectorMapJoinDesc.HashTableImplementationType; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category; import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector; @@ -63,12 +68,34 @@ private MapJoinDesc desc; private TezContext tezContext; + private VectorMapJoinDesc vectorDesc; + private MapJoinHashTableFactory mapJoinHashTableFactory; + @Override public void init(ExecMapperContext context, MapredContext mrContext, Configuration hconf, MapJoinOperator joinOp) { this.tezContext = (TezContext) mrContext; this.hconf = hconf; this.desc = joinOp.getConf(); + this.vectorDesc = this.desc.getVectorDesc(); + HashTableImplementationType hashTableImplementationType; + if (this.vectorDesc == null) { + hashTableImplementationType = HashTableImplementationType.NONE; + } else { + hashTableImplementationType = vectorDesc.hashTableImplementationType(); + } + switch (hashTableImplementationType) { + case NONE: + // Non-native vector map join uses the regular BytesBytesMultiHashMap. 
+ mapJoinHashTableFactory = new BytesBytesMultiHashMapFactory(); + break; + case FAST: + mapJoinHashTableFactory = new VectorMapJoinFastHashTableFactory(this.desc); + break; + default: + throw new RuntimeException("Unknown vector map join hash table implementation type " + + hashTableImplementationType.name()); + } } @Override @@ -166,7 +193,9 @@ public void load(MapJoinTableContainer[] mapJoinTables, KeyValueReader kvReader = (KeyValueReader) input.getReader(); MapJoinObjectSerDeContext keyCtx = mapJoinTableSerdes[pos].getKeyContext(), valCtx = mapJoinTableSerdes[pos].getValueContext(); - if (useOptimizedTables) { + if (useOptimizedTables && + mapJoinHashTableFactory instanceof BytesBytesMultiHashMapFactory) { + // Some keys are not supported by regular hive key handling. ObjectInspector keyOi = keyCtx.getSerDe().getObjectInspector(); if (!MapJoinBytesTableContainer.isSupportedKey(keyOi)) { if (isFirstKey) { @@ -195,10 +224,11 @@ public void load(MapJoinTableContainer[] mapJoinTables, MapJoinTableContainer tableContainer; if (useOptimizedTables) { if (!useHybridGraceHashJoin || isCrossProduct) { - tableContainer = new MapJoinBytesTableContainer(hconf, valCtx, keyCount, 0); + tableContainer = new MapJoinBytesTableContainer(hconf, valCtx, keyCount, 0, + mapJoinHashTableFactory); } else { tableContainer = new HybridHashTableContainer(hconf, keyCount, memory, - desc.getParentDataSizes().get(pos), nwayConf); + desc.getParentDataSizes().get(pos), nwayConf, mapJoinHashTableFactory); } } else { tableContainer = new HashMapWrapper(hconf, keyCount); diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorColumnMapping.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorColumnMapping.java index c4b95c3..c890674 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorColumnMapping.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorColumnMapping.java @@ -20,6 +20,8 @@ import java.util.Arrays; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; + /** * This class collects column information for copying a row from one VectorizedRowBatch to * same/another batch. 
@@ -30,7 +32,7 @@ protected int[] sourceColumns; protected int[] outputColumns; - protected String[] typeNames; + protected TypeInfo[] typeInfos; protected VectorColumnOrderedMap vectorColumnMapping; @@ -38,7 +40,7 @@ public VectorColumnMapping(String name) { this.vectorColumnMapping = new VectorColumnOrderedMap(name); } - public abstract void add(int sourceColumn, int outputColumn, String typeName); + public abstract void add(int sourceColumn, int outputColumn, TypeInfo typeInfo); public abstract void finalize(); @@ -54,8 +56,8 @@ public int getCount() { return outputColumns; } - public String[] getTypeNames() { - return typeNames; + public TypeInfo[] getTypeInfos() { + return typeInfos; } @Override @@ -65,7 +67,7 @@ public String toString() { sb.append(", "); sb.append("output columns: " + Arrays.toString(outputColumns)); sb.append(", "); - sb.append("type names: " + Arrays.toString(typeNames)); + sb.append("type infos: " + Arrays.toString(typeInfos)); return sb.toString(); } } diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorColumnOrderedMap.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorColumnOrderedMap.java index 0e6014b..97d55f5 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorColumnOrderedMap.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorColumnOrderedMap.java @@ -23,8 +23,10 @@ import java.util.TreeMap; import org.apache.commons.lang.ArrayUtils; + import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; /** * This class collects column information for mapping vector columns, including the hive type name. @@ -43,17 +45,17 @@ private class Value { int valueColumn; - String typeName; + TypeInfo typeInfo; - Value(int valueColumn, String typeName) { + Value(int valueColumn, TypeInfo typeInfo) { this.valueColumn = valueColumn; - this.typeName = typeName; + this.typeInfo = typeInfo; } public String toString() { StringBuilder sb = new StringBuilder(); sb.append("(value column: " + valueColumn); - sb.append(", type name: " + typeName + ")"); + sb.append(", type info: " + typeInfo.toString() + ")"); return sb.toString(); } } @@ -62,12 +64,12 @@ public String toString() { private final int[] orderedColumns; private final int[] valueColumns; - private final String[] typeNames; + private final TypeInfo[] typeInfos; - Mapping(int[] orderedColumns, int[] valueColumns, String[] typeNames) { + Mapping(int[] orderedColumns, int[] valueColumns, TypeInfo[] typeInfos) { this.orderedColumns = orderedColumns; this.valueColumns = valueColumns; - this.typeNames = typeNames; + this.typeInfos = typeInfos; } public int getCount() { @@ -82,8 +84,8 @@ public int getCount() { return valueColumns; } - public String[] getTypeNames() { - return typeNames; + public TypeInfo[] getTypeInfos() { + return typeInfos; } } @@ -92,14 +94,14 @@ public VectorColumnOrderedMap(String name) { orderedTreeMap = new TreeMap(); } - public void add(int orderedColumn, int valueColumn, String typeName) { + public void add(int orderedColumn, int valueColumn, TypeInfo typeInfo) { if (orderedTreeMap.containsKey(orderedColumn)) { throw new RuntimeException( name + " duplicate column " + orderedColumn + " in ordered column map " + orderedTreeMap.toString() + - " when adding value column " + valueColumn + ", type " + typeName); + " when adding value column " + valueColumn + ", type into " + typeInfo.toString()); } - orderedTreeMap.put(orderedColumn, new Value(valueColumn, typeName)); + orderedTreeMap.put(orderedColumn, 
new Value(valueColumn, typeInfo)); } public boolean orderedColumnsContain(int orderedColumn) { @@ -109,17 +111,16 @@ public boolean orderedColumnsContain(int orderedColumn) { public Mapping getMapping() { ArrayList orderedColumns = new ArrayList(); ArrayList valueColumns = new ArrayList(); - ArrayList typeNames = new ArrayList(); + ArrayList typeInfos = new ArrayList(); for (Map.Entry entry : orderedTreeMap.entrySet()) { orderedColumns.add(entry.getKey()); Value value = entry.getValue(); valueColumns.add(value.valueColumn); - typeNames.add(value.typeName); + typeInfos.add(value.typeInfo); } return new Mapping( ArrayUtils.toPrimitive(orderedColumns.toArray(new Integer[0])), ArrayUtils.toPrimitive(valueColumns.toArray(new Integer[0])), - typeNames.toArray(new String[0])); - + typeInfos.toArray(new TypeInfo[0])); } } diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorColumnOutputMapping.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorColumnOutputMapping.java index f35aff7..4ceff6b 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorColumnOutputMapping.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorColumnOutputMapping.java @@ -19,6 +19,7 @@ package org.apache.hadoop.hive.ql.exec.vector; import org.apache.hadoop.hive.ql.exec.vector.VectorColumnOrderedMap.Mapping; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; /** * This class collects column information for copying a row from one VectorizedRowBatch to @@ -35,9 +36,9 @@ public VectorColumnOutputMapping(String name) { } @Override - public void add(int sourceColumn, int outputColumn, String typeName) { + public void add(int sourceColumn, int outputColumn, TypeInfo typeInfo) { // Order on outputColumn. - vectorColumnMapping.add(outputColumn, sourceColumn, typeName); + vectorColumnMapping.add(outputColumn, sourceColumn, typeInfo); } public boolean containsOutputColumn(int outputColumn) { @@ -51,7 +52,7 @@ public void finalize() { // Ordered columns are the output columns. sourceColumns = mapping.getValueColumns(); outputColumns = mapping.getOrderedColumns(); - typeNames = mapping.getTypeNames(); + typeInfos = mapping.getTypeInfos(); // Not needed anymore. vectorColumnMapping = null; diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorColumnSourceMapping.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorColumnSourceMapping.java index 4f5ba9a..061e396 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorColumnSourceMapping.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorColumnSourceMapping.java @@ -19,6 +19,7 @@ package org.apache.hadoop.hive.ql.exec.vector; import org.apache.hadoop.hive.ql.exec.vector.VectorColumnOrderedMap.Mapping; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; /** * This class collects column information for copying a row from one VectorizedRowBatch to @@ -35,9 +36,9 @@ public VectorColumnSourceMapping(String name) { } @Override - public void add(int sourceColumn, int outputColumn, String typeName) { + public void add(int sourceColumn, int outputColumn, TypeInfo typeInfo) { // Order on sourceColumn. - vectorColumnMapping.add(sourceColumn, outputColumn, typeName); + vectorColumnMapping.add(sourceColumn, outputColumn, typeInfo); } @Override @@ -47,7 +48,7 @@ public void finalize() { // Ordered columns are the source columns. 
sourceColumns = mapping.getOrderedColumns(); outputColumns = mapping.getValueColumns(); - typeNames = mapping.getTypeNames(); + typeInfos = mapping.getTypeInfos(); // Not needed anymore. vectorColumnMapping = null; diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorCopyRow.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorCopyRow.java index c8e0284..911aeb0 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorCopyRow.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorCopyRow.java @@ -262,8 +262,7 @@ public void init(VectorColumnMapping columnMapping) throws HiveException { for (int i = 0; i < count; i++) { int inputColumn = columnMapping.getInputColumns()[i]; int outputColumn = columnMapping.getOutputColumns()[i]; - String typeName = columnMapping.getTypeNames()[i].toLowerCase(); - TypeInfo typeInfo = TypeInfoUtils.getTypeInfoFromTypeString(typeName); + TypeInfo typeInfo = columnMapping.getTypeInfos()[i]; Type columnVectorType = VectorizationContext.getColumnVectorTypeFromTypeInfo(typeInfo); CopyRow copyRowByValue = null; diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorDeserializeRow.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorDeserializeRow.java index 3eadc12..5432d67 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorDeserializeRow.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorDeserializeRow.java @@ -717,7 +717,7 @@ public void deserializeByValue(VectorizedRowBatch batch, int batchIndex) throws readersByValue[i].apply(batch, batchIndex); i++; // Increment after the apply which could throw an exception. } - } catch (EOFException e) { + } catch (Exception e) { throwMoreDetailedException(e, i); } deserializeRead.extraFieldsCheck(); @@ -730,13 +730,13 @@ public void deserializeByReference(VectorizedRowBatch batch, int batchIndex) thr readersByReference[i].apply(batch, batchIndex); i++; // Increment after the apply which could throw an exception. 
} - } catch (EOFException e) { + } catch (Exception e) { throwMoreDetailedException(e, i); } deserializeRead.extraFieldsCheck(); } - private void throwMoreDetailedException(IOException e, int index) throws EOFException { + private void throwMoreDetailedException(Exception e, int index) throws IOException { StringBuilder sb = new StringBuilder(); sb.append("Detail: \"" + e.toString() + "\" occured for field " + index + " of " + typeInfos.length + " fields ("); for (int i = 0; i < typeInfos.length; i++) { @@ -746,6 +746,8 @@ private void throwMoreDetailedException(IOException e, int index) throws EOFExce sb.append(((PrimitiveTypeInfo) typeInfos[i]).getPrimitiveCategory().name()); } sb.append(")"); - throw new EOFException(sb.toString()); + sb.append(", deserialize context: "); + sb.append(deserializeRead.getCurrentContext()); + throw new IOException(sb.toString()); } } diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorExtractRow.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorExtractRow.java index e883f38..4ec93d7 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorExtractRow.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorExtractRow.java @@ -45,6 +45,7 @@ import org.apache.hadoop.hive.serde2.typeinfo.CharTypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.DecimalTypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils; import org.apache.hadoop.hive.serde2.typeinfo.VarcharTypeInfo; import org.apache.hadoop.io.BytesWritable; @@ -734,6 +735,18 @@ public void init(List typeNames) throws HiveException { } } + public void init(TypeInfo[] typeInfos) throws HiveException { + + extracters = new Extractor[typeInfos.length]; + + int i = 0; + for (TypeInfo typeInfo : typeInfos) { + PrimitiveTypeInfo primitiveTypeInfo = (PrimitiveTypeInfo) typeInfo; + extracters[i] = createExtractor(primitiveTypeInfo, i); + i++; + } + } + public int getCount() { return extracters.length; } diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorGroupByOperator.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorGroupByOperator.java index 31f5c72..5d14118 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorGroupByOperator.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorGroupByOperator.java @@ -348,7 +348,7 @@ public void processBatch(VectorizedRowBatch batch) throws HiveException { //Validate that some progress is being made if (!(numEntriesHashTable < preFlushEntriesCount)) { - if (LOG.isDebugEnabled()) { + if (isLogDebugEnabled) { LOG.debug(String.format("Flush did not progress: %d entries before, %d entries after", preFlushEntriesCount, numEntriesHashTable)); @@ -427,7 +427,7 @@ private void computeMemoryLimits() { maxHashTblMemory = (int)(maxMemory * memoryThreshold); - if (LOG.isDebugEnabled()) { + if (isLogDebugEnabled) { LOG.debug(String.format("maxMemory:%dMb (%d * %f) fixSize:%d (key:%d agg:%d)", maxHashTblMemory/1024/1024, maxMemory/1024/1024, @@ -450,7 +450,7 @@ private void flush(boolean all) throws HiveException { (int)(numEntriesHashTable * this.percentEntriesToFlush); int entriesFlushed = 0; - if (LOG.isDebugEnabled()) { + if (isLogDebugEnabled) { LOG.debug(String.format( "Flush %d %s entries:%d fixed:%d variable:%d (used:%dMb max:%dMb) gcCanary:%s", entriesToFlush, all ? 
"(all)" : "", @@ -483,7 +483,7 @@ private void flush(boolean all) throws HiveException { numEntriesHashTable = 0; } - if (all && LOG.isDebugEnabled()) { + if (all && isLogDebugEnabled) { LOG.debug(String.format("GC canary caused %d flushes", gcCanaryFlushes)); } } @@ -535,7 +535,7 @@ private void updateAvgVariableSize(VectorizedRowBatch batch) { private void checkHashModeEfficiency() throws HiveException { if (lastModeCheckRowCount > numRowsCompareHashAggr) { lastModeCheckRowCount = 0; - if (LOG.isDebugEnabled()) { + if (isLogDebugEnabled) { LOG.debug(String.format("checkHashModeEfficiency: HT:%d RC:%d MIN:%d", numEntriesHashTable, sumBatchSize, (long)(sumBatchSize * minReductionHashAggr))); } diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorMapJoinOperator.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorMapJoinOperator.java index 622f777..9b86366 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorMapJoinOperator.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorMapJoinOperator.java @@ -18,6 +18,7 @@ package org.apache.hadoop.hive.ql.exec.vector; +import java.io.IOException; import java.util.ArrayList; import java.util.Collection; import java.util.List; @@ -32,6 +33,7 @@ import org.apache.hadoop.hive.ql.exec.JoinUtil; import org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainer; import org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainer.ReusableGetAdaptor; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableResult.MapJoinResult; import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpression; import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpressionWriter; import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpressionWriterFactory; @@ -184,8 +186,8 @@ protected Object _evaluate(Object row, int version) throws HiveException { } @Override - protected JoinUtil.JoinResult setMapJoinKey(ReusableGetAdaptor dest, Object row, byte alias) - throws HiveException { + protected MapJoinResult setMapJoinKey(ReusableGetAdaptor dest, Object row, byte alias) + throws HiveException, IOException { return dest.setFromVector(keyValues[batchIndex], keyOutputWriters, keyWrapperBatch); } diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java index 1eb960d..012579b 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java @@ -171,7 +171,7 @@ public VectorizationContext(String contextName, List initialColumnNames) this.contextName = contextName; level = 0; if (LOG.isDebugEnabled()) { - LOG.debug("VectorizationContext consructor contextName " + contextName + " level " + LOG.debug("VectorizationContext constructor contextName " + contextName + " level " + level + " initialColumnNames " + initialColumnNames); } this.initialColumnNames = initialColumnNames; @@ -195,7 +195,7 @@ public VectorizationContext(String contextName) { this.contextName = contextName; level = 0; if (LOG.isDebugEnabled()) { - LOG.debug("VectorizationContext consructor contextName " + contextName + " level " + level); + LOG.debug("VectorizationContext constructor contextName " + contextName + " level " + level); } initialColumnNames = new ArrayList(); projectedColumns = new ArrayList(); @@ -212,7 +212,7 @@ public VectorizationContext(String contextName) { public 
VectorizationContext(String contextName, VectorizationContext vContext) { this.contextName = contextName; level = vContext.level + 1; - LOG.info("VectorizationContext consructor reference contextName " + contextName + " level " + level); + // LOG.info("VectorizationContext consructor reference contextName " + contextName + " level " + level); this.initialColumnNames = vContext.initialColumnNames; this.projectedColumns = new ArrayList(); this.projectionColumnNames = new ArrayList(); @@ -421,7 +421,7 @@ private VectorExpression getColumnVectorExpression(ExprNodeColumnDesc expr = new SelectColumnIsTrue(columnNum); break; case PROJECTION: - expr = new IdentityExpression(columnNum, exprDesc.getTypeString()); + expr = new IdentityExpression(columnNum, exprDesc.getColumn(), exprDesc.getTypeString()); break; } return expr; @@ -972,20 +972,23 @@ private VectorExpression getIdentityExpression(List childExprList) throws HiveException { ExprNodeDesc childExpr = childExprList.get(0); int inputCol; + String name; String colType; VectorExpression v1 = null; if (childExpr instanceof ExprNodeGenericFuncDesc) { + name = ((ExprNodeGenericFuncDesc) childExpr).getName(); v1 = getVectorExpression(childExpr); inputCol = v1.getOutputColumn(); colType = v1.getOutputType(); } else if (childExpr instanceof ExprNodeColumnDesc) { + name = ((ExprNodeColumnDesc) childExpr) .getColumn(); ExprNodeColumnDesc colDesc = (ExprNodeColumnDesc) childExpr; inputCol = getInputColumnIndex(colDesc.getColumn()); colType = colDesc.getTypeString(); } else { throw new HiveException("Expression not supported: "+childExpr); } - VectorExpression expr = new IdentityExpression(inputCol, colType); + VectorExpression expr = new IdentityExpression(inputCol, name, colType); if (v1 != null) { expr.setChildExpressions(new VectorExpression [] {v1}); } diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizedBatchUtil.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizedBatchUtil.java index be04da8..3257082 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizedBatchUtil.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizedBatchUtil.java @@ -572,7 +572,7 @@ public static StandardStructObjectInspector convertToStandardStructObjectInspect } static ColumnVector cloneColumnVector(ColumnVector source - ) throws HiveException{ + ) throws HiveException { if (source instanceof LongColumnVector) { return new LongColumnVector(((LongColumnVector) source).vector.length); } else if (source instanceof DoubleColumnVector) { @@ -617,6 +617,17 @@ static ColumnVector cloneColumnVector(ColumnVector source " is not supported!"); } + public static PrimitiveTypeInfo[] primitiveTypeInfosFromTypeInfos( + TypeInfo[] typeInfos) throws HiveException { + + PrimitiveTypeInfo[] result = new PrimitiveTypeInfo[typeInfos.length]; + + for(int i = 0; i < typeInfos.length; i++) { + result[i] = (PrimitiveTypeInfo) typeInfos[i]; + } + return result; + } + /** * Make a new (scratch) batch, which is exactly "like" the batch provided, except that it's empty * @param batch the batch to imitate diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/IdentityExpression.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/IdentityExpression.java index 402d0f8..df31c28 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/IdentityExpression.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/IdentityExpression.java @@ -28,13 +28,15 @@ private static final long 
serialVersionUID = 1L; private int colNum = -1; + private String name = null; private String type = null; public IdentityExpression() { } - public IdentityExpression(int colNum, String type) { + public IdentityExpression(int colNum, String name, String type) { this.colNum = colNum; + this.name = name; this.type = type; } @@ -68,6 +70,10 @@ public int getColNum() { return getOutputColumn(); } + public String getName() { + return name; + } + public String getType() { return getOutputType(); } diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/VectorKeySeries.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/VectorKeySeries.java index ac6c4b8..c89c093 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/VectorKeySeries.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/VectorKeySeries.java @@ -23,26 +23,29 @@ import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; /** - * An abstraction of keys within a VectorizedRowBatch. - * - * A key is one or more columns. + * An abstraction of columns representing a key within a VectorizedRowBatch. * * When there is a sequential "run" of equal keys, they are collapsed and represented by a * duplicate count. * - * The batch of keys (with sequential duplicates collapsed) is called a series. + * The keys in the batch, with sequential duplicates collapsed, are called a series. * - * A key can be all null, or a key with no or some nulls. + * A key can be either ALL NULL or have at least one column value and 0 or more column NULLs. * * All keys have a duplicate count. * - * A key with no or some nulls has: + * A key with at least one column value and 0 or more column NULLs has: * 1) A hash code. - * 2) Key values and other value(s) defined by other interfaces. + * 2) Column values defined by other interfaces. * - * The key series is logically indexed. That is, if batch.selectedInUse is true, the indices + * The key series is logically indexed. That is, when batch.selectedInUse is true, the indices * will be logical and need to be mapped through batch.selected to get the physical batch - * indices. Otherwise, the indices are physical batch indices. + * indices. Otherwise, when !batch.selectedInUse, the indices are physical batch indices. + * + * NOTE: This interface doesn't support the series count so that the VectorKeySeriesCombo class + * can combine multiple VectorKeySeries into a combined key without having to compute the series + * count. + * */ public interface VectorKeySeries { @@ -70,7 +73,7 @@ /** * @return true when the current key is all nulls. */ - boolean getCurrentIsAllNull(); + boolean getCurrentKeyAllNull(); /** * @return the number of duplicate keys of the current key. @@ -80,13 +83,13 @@ /** * @return true when there is at least one null in the current key. - * Only valid when getCurrentIsAllNull is false. Otherwise, undefined. + * Only valid when getCurrentKeyAllNull is false. Otherwise, undefined. */ - boolean getCurrentHasAnyNulls(); + boolean getCurrentKeyHasAnyNulls(); /** * @return the hash code of the current key. - * Only valid when getCurrentIsAllNull is false. Otherwise, undefined. + * Only valid when getCurrentKeyAllNull is false. Otherwise, undefined.
*/ int getCurrentHashCode(); diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/VectorKeySeriesBytes.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/VectorKeySeriesBytes.java new file mode 100644 index 0000000..5d57e60 --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/VectorKeySeriesBytes.java @@ -0,0 +1,95 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.vector.keyseries; + +import java.io.IOException; +import java.util.Arrays; + +import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; +import org.apache.hadoop.hive.ql.exec.vector.expressions.StringExpr; +import org.apache.hive.common.util.HashCodeUtil; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.google.common.base.Preconditions; + +/** + * A key series of a single column of byte array keys. + * + */ +public final class VectorKeySeriesBytes extends VectorKeySeriesBytesBase { + + private static final Logger LOG = LoggerFactory.getLogger(VectorKeySeriesBytes.class.getName()); + + private byte[][] keyBytesArrays; + private int[] keyStarts; + private int[] keyLengths; + + private byte[] currentBytes; + private int currentStart; + private int currentLength; + + public VectorKeySeriesBytes(int columnNum) { + super(columnNum); + keyBytesArrays = new byte[VectorizedRowBatch.DEFAULT_SIZE][]; + keyStarts = new int[VectorizedRowBatch.DEFAULT_SIZE]; + keyLengths = new int[VectorizedRowBatch.DEFAULT_SIZE]; + } + + public byte[] getCurrentBytes() { + return currentBytes; + } + + public int getCurrentStart() { + return currentStart; + } + + public int getCurrentLength() { + return currentLength; + } + + @Override + public void processBatch(VectorizedRowBatch batch) throws IOException { + super.processBatch(batch); + + if (nonAllNullKeyCount > 0) { + HashCodeUtil.calculateBytesArrayHashCodes(keyBytesArrays, + keyStarts, keyLengths, hashCodes, nonAllNullKeyCount); + } + + // Do the position after we compute the checksums. 
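// Illustrative sketch only, not part of this patch: one way a caller might walk a key series
// once processBatch() has run (the implementations above also position the series at its first
// key inside processBatch). The walkKeySeries name is hypothetical, and next() is assumed from
// the implementations in this patch; only the getters shown earlier come from VectorKeySeries.
static void walkKeySeries(VectorKeySeries keySeries, VectorizedRowBatch batch) throws IOException {
  keySeries.processBatch(batch);
  do {
    final int duplicateCount = keySeries.getCurrentDuplicateCount();
    if (keySeries.getCurrentKeyAllNull()) {
      // An inner join, for example, would emit nothing for these duplicateCount rows.
    } else {
      final int hashCode = keySeries.getCurrentHashCode();
      // Probe a hash table once with hashCode and reuse the result for all duplicateCount rows.
    }
  } while (keySeries.next());
}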
+ positionToFirst(); + } + + @Override + public void saveBytesKey(int nonAllNullKeyPosition, byte[] keyBytes, int keyByteStart, + int keyByteLength) { + keyBytesArrays[nonAllNullKeyPosition] = keyBytes; + keyStarts[nonAllNullKeyPosition] = keyByteStart; + keyLengths[nonAllNullKeyPosition] = keyByteLength; + } + + @Override + public void setNextNonEmptyKey(int nonAllNullKeyPosition) { + currentBytes = keyBytesArrays[nonAllNullKeyPosition]; + currentStart = keyStarts[nonAllNullKeyPosition]; + currentLength = keyLengths[nonAllNullKeyPosition]; + } +} \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/VectorKeySeriesBytesBase.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/VectorKeySeriesBytesBase.java new file mode 100644 index 0000000..da31a42 --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/VectorKeySeriesBytesBase.java @@ -0,0 +1,259 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.vector.keyseries; + +import java.io.IOException; +import java.util.Arrays; + +import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; +import org.apache.hadoop.hive.ql.exec.vector.expressions.StringExpr; +import com.google.common.base.Preconditions; + +/** + * An abstract class implementing key series of a single column of byte array keys where the keys + * get serialized. 
+ */ +public abstract class VectorKeySeriesBytesBase extends VectorKeySeriesSingleImpl { + + private final int columnNum; + + public VectorKeySeriesBytesBase(int columnNum) { + super(); + this.columnNum = columnNum; + } + + @Override + public void processBatch(VectorizedRowBatch batch) throws IOException { + + currentBatchSize = batch.size; + Preconditions.checkState(currentBatchSize > 0); + + BytesColumnVector bytesColVector = (BytesColumnVector) batch.cols[columnNum]; + + final byte[][] vector = bytesColVector.vector; + final int[] start = bytesColVector.start; + final int[] length = bytesColVector.length; + + if (bytesColVector.isRepeating){ + duplicateCounts[0] = currentBatchSize; + if (bytesColVector.noNulls || !bytesColVector.isNull[0]) { + keyAllNulls[0] = false; + saveBytesKey(0, vector[0], start[0], length[0]); + nonAllNullKeyCount = 1; + } else { + keyAllNulls[0] = true; + nonAllNullKeyCount = 0; + } + keyCount = 1; + Preconditions.checkState(keyCount <= currentBatchSize); + } else { + keyCount = 0; + nonAllNullKeyCount = 0; + if (batch.selectedInUse) { + int[] selected = batch.selected; + if (bytesColVector.noNulls) { + + duplicateCounts[0] = 1; + int index; + index = selected[0]; + byte[] prevKeyBytes = vector[index]; + int prevKeyStart = start[index]; + int prevKeyLength = length[index]; + saveBytesKey(0, prevKeyBytes, prevKeyStart, prevKeyLength); + + int currentKeyStart; + int currentKeyLength; + byte[] currentKeyBytes; + for (int logical = 1; logical < currentBatchSize; logical++) { + index = selected[logical]; + currentKeyBytes = vector[index]; + currentKeyStart = start[index]; + currentKeyLength = length[index]; + if (StringExpr.equal(prevKeyBytes, prevKeyStart, prevKeyLength, + currentKeyBytes, currentKeyStart, currentKeyLength)) { + duplicateCounts[keyCount]++; + } else { + duplicateCounts[++keyCount] = 1; + saveBytesKey(keyCount, currentKeyBytes, currentKeyStart, currentKeyLength); + prevKeyBytes = currentKeyBytes; + prevKeyStart = currentKeyStart; + prevKeyLength = currentKeyLength; + } + } + Arrays.fill(keyAllNulls, 0, ++keyCount, false); + nonAllNullKeyCount = keyCount; + Preconditions.checkState(keyCount <= currentBatchSize); + } else { + boolean[] isNull = bytesColVector.isNull; + + boolean prevKeyIsNull; + byte[] prevKeyBytes = null; + int prevKeyStart = 0; + int prevKeyLength = 0; + duplicateCounts[0] = 1; + int index = selected[0]; + if (isNull[index]) { + keyAllNulls[0] = true; + prevKeyIsNull = true; + } else { + keyAllNulls[0] = false; + prevKeyIsNull = false; + prevKeyBytes = vector[index]; + prevKeyStart = start[index]; + prevKeyLength = length[index]; + saveBytesKey(0, prevKeyBytes, prevKeyStart, prevKeyLength); + nonAllNullKeyCount = 1; + } + + int currentKeyStart; + int currentKeyLength; + byte[] currentKeyBytes; + for (int logical = 1; logical < currentBatchSize; logical++) { + index = selected[logical]; + if (isNull[index]) { + if (prevKeyIsNull) { + duplicateCounts[keyCount]++; + } else { + duplicateCounts[++keyCount] = 1; + keyAllNulls[keyCount] = true; + prevKeyIsNull = true; + } + } else { + currentKeyBytes = vector[index]; + currentKeyStart = start[index]; + currentKeyLength = length[index]; + if (!prevKeyIsNull && + StringExpr.equal(prevKeyBytes, prevKeyStart, prevKeyLength, + currentKeyBytes, currentKeyStart, currentKeyLength)) { + duplicateCounts[keyCount]++; + } else { + duplicateCounts[++keyCount] = 1; + keyAllNulls[keyCount] = false; + saveBytesKey(nonAllNullKeyCount++, currentKeyBytes, currentKeyStart, currentKeyLength); + prevKeyIsNull = 
false; + prevKeyBytes = currentKeyBytes; + prevKeyStart = currentKeyStart; + prevKeyLength = currentKeyLength; + } + } + } + keyCount++; + Preconditions.checkState(keyCount <= currentBatchSize); + } + } else { + + // NOT selectedInUse + + if (bytesColVector.noNulls) { + + duplicateCounts[0] = 1; + byte[] prevKeyBytes = vector[0]; + int prevKeyStart = start[0]; + int prevKeyLength = length[0]; + saveBytesKey(0, prevKeyBytes, prevKeyStart, prevKeyLength); + + int currentKeyStart; + int currentKeyLength; + byte[] currentKeyBytes; + for (int index = 1; index < currentBatchSize; index++) { + currentKeyBytes = vector[index]; + currentKeyStart = start[index]; + currentKeyLength = length[index]; + if (StringExpr.equal(prevKeyBytes, prevKeyStart, prevKeyLength, + currentKeyBytes, currentKeyStart, currentKeyLength)) { + duplicateCounts[keyCount]++; + } else { + duplicateCounts[++keyCount] = 1; + saveBytesKey(keyCount, currentKeyBytes, currentKeyStart, currentKeyLength); + prevKeyBytes = currentKeyBytes; + prevKeyStart = currentKeyStart; + prevKeyLength = currentKeyLength; + } + } + Arrays.fill(keyAllNulls, 0, ++keyCount, false); + nonAllNullKeyCount = keyCount; + Preconditions.checkState(keyCount <= currentBatchSize); + } else { + boolean[] isNull = bytesColVector.isNull; + + boolean prevKeyIsNull; + byte[] prevKeyBytes = null; + int prevKeyStart = 0; + int prevKeyLength = 0; + duplicateCounts[0] = 1; + if (isNull[0]) { + keyAllNulls[0] = true; + prevKeyIsNull = true; + } else { + keyAllNulls[0] = false; + prevKeyIsNull = false; + prevKeyBytes = vector[0]; + prevKeyStart = start[0]; + prevKeyLength = length[0]; + saveBytesKey(0, prevKeyBytes, prevKeyStart, prevKeyLength); + nonAllNullKeyCount = 1; + } + + byte[] currentKeyBytes; + int currentKeyStart; + int currentKeyLength; + for (int index = 1; index < currentBatchSize; index++) { + if (isNull[index]) { + if (prevKeyIsNull) { + duplicateCounts[keyCount]++; + } else { + duplicateCounts[++keyCount] = 1; + keyAllNulls[keyCount] = true; + prevKeyIsNull = true; + } + } else { + currentKeyBytes = vector[index]; + currentKeyStart = start[index]; + currentKeyLength = length[index]; + if (!prevKeyIsNull && + StringExpr.equal(prevKeyBytes, prevKeyStart, prevKeyLength, + currentKeyBytes, currentKeyStart, currentKeyLength)) { + duplicateCounts[keyCount]++; + } else { + duplicateCounts[++keyCount] = 1; + keyAllNulls[keyCount] = false; + saveBytesKey(nonAllNullKeyCount++, currentKeyBytes, currentKeyStart, currentKeyLength); + prevKeyIsNull = false; + prevKeyBytes = currentKeyBytes; + prevKeyStart = currentKeyStart; + prevKeyLength = currentKeyLength; + } + } + } + keyCount++; + Preconditions.checkState(keyCount <= currentBatchSize); + } + } + } + + Preconditions.checkState(validate()); + } + + /* + * Serialize a bytes key. + */ + protected abstract void saveBytesKey(int nonAllNullKeyPosition, byte[] keyBytes, int keyByteStart, + int keyByteLength) throws IOException; +} diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/VectorKeySeriesBytesSerialized.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/VectorKeySeriesBytesSerialized.java deleted file mode 100644 index 81a8455..0000000 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/VectorKeySeriesBytesSerialized.java +++ /dev/null @@ -1,271 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. 
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.hive.ql.exec.vector.keyseries; - -import java.io.IOException; -import java.util.Arrays; - -import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; -import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; -import org.apache.hadoop.hive.ql.exec.vector.expressions.StringExpr; -import org.apache.hadoop.hive.serde2.fast.SerializeWrite; -import com.google.common.base.Preconditions; - -/** - * A key series of a single column of byte array keys where the keys get serialized. - */ -public class VectorKeySeriesBytesSerialized - extends VectorKeySeriesSerializedImpl implements VectorKeySeriesSerialized { - - private final int columnNum; - - private int outputStartPosition; - - public VectorKeySeriesBytesSerialized(int columnNum, T serializeWrite) { - super(serializeWrite); - this.columnNum = columnNum; - } - - @Override - public void processBatch(VectorizedRowBatch batch) throws IOException { - - currentBatchSize = batch.size; - Preconditions.checkState(currentBatchSize > 0); - - BytesColumnVector bytesColVector = (BytesColumnVector) batch.cols[columnNum]; - - byte[][] vectorBytesArrays = bytesColVector.vector; - int[] vectorStarts = bytesColVector.start; - int[] vectorLengths = bytesColVector.length; - - // The serialize routine uses this to build serializedKeyLengths. 
- outputStartPosition = 0; - output.reset(); - - if (bytesColVector.isRepeating){ - duplicateCounts[0] = currentBatchSize; - if (bytesColVector.noNulls || !bytesColVector.isNull[0]) { - seriesIsAllNull[0] = false; - serialize(0, vectorBytesArrays[0], vectorStarts[0], vectorLengths[0]); - nonNullKeyCount = 1; - } else { - seriesIsAllNull[0] = true; - nonNullKeyCount = 0; - } - seriesCount = 1; - Preconditions.checkState(seriesCount <= currentBatchSize); - } else { - seriesCount = 0; - nonNullKeyCount = 0; - if (batch.selectedInUse) { - int[] selected = batch.selected; - if (bytesColVector.noNulls) { - - duplicateCounts[0] = 1; - int index; - index = selected[0]; - byte[] prevKeyBytes = vectorBytesArrays[index]; - int prevKeyStart = vectorStarts[index]; - int prevKeyLength = vectorLengths[index]; - serialize(0, prevKeyBytes, prevKeyStart, prevKeyLength); - - int currentKeyStart; - int currentKeyLength; - byte[] currentKeyBytes; - for (int logical = 1; logical < currentBatchSize; logical++) { - index = selected[logical]; - currentKeyBytes = vectorBytesArrays[index]; - currentKeyStart = vectorStarts[index]; - currentKeyLength = vectorLengths[index]; - if (StringExpr.equal(prevKeyBytes, prevKeyStart, prevKeyLength, - currentKeyBytes, currentKeyStart, currentKeyLength)) { - duplicateCounts[seriesCount]++; - } else { - duplicateCounts[++seriesCount] = 1; - serialize(seriesCount, currentKeyBytes, currentKeyStart, currentKeyLength); - prevKeyBytes = currentKeyBytes; - prevKeyStart = currentKeyStart; - prevKeyLength = currentKeyLength; - } - } - Arrays.fill(seriesIsAllNull, 0, ++seriesCount, false); - nonNullKeyCount = seriesCount; - Preconditions.checkState(seriesCount <= currentBatchSize); - } else { - boolean[] isNull = bytesColVector.isNull; - - boolean prevKeyIsNull; - byte[] prevKeyBytes = null; - int prevKeyStart = 0; - int prevKeyLength = 0; - duplicateCounts[0] = 1; - int index = selected[0]; - if (isNull[index]) { - seriesIsAllNull[0] = true; - prevKeyIsNull = true; - } else { - seriesIsAllNull[0] = false; - prevKeyIsNull = false; - prevKeyBytes = vectorBytesArrays[index]; - prevKeyStart = vectorStarts[index]; - prevKeyLength = vectorLengths[index]; - serialize(0, prevKeyBytes, prevKeyStart, prevKeyLength); - nonNullKeyCount = 1; - } - - int currentKeyStart; - int currentKeyLength; - byte[] currentKeyBytes; - for (int logical = 1; logical < currentBatchSize; logical++) { - index = selected[logical]; - if (isNull[index]) { - if (prevKeyIsNull) { - duplicateCounts[seriesCount]++; - } else { - duplicateCounts[++seriesCount] = 1; - seriesIsAllNull[seriesCount] = true; - prevKeyIsNull = true; - } - } else { - currentKeyBytes = vectorBytesArrays[index]; - currentKeyStart = vectorStarts[index]; - currentKeyLength = vectorLengths[index]; - if (!prevKeyIsNull && - StringExpr.equal(prevKeyBytes, prevKeyStart, prevKeyLength, - currentKeyBytes, currentKeyStart, currentKeyLength)) { - duplicateCounts[seriesCount]++; - } else { - duplicateCounts[++seriesCount] = 1; - seriesIsAllNull[seriesCount] = false; - serialize(nonNullKeyCount++, currentKeyBytes, currentKeyStart, currentKeyLength); - prevKeyIsNull = false; - prevKeyBytes = currentKeyBytes; - prevKeyStart = currentKeyStart; - prevKeyLength = currentKeyLength; - } - } - } - seriesCount++; - Preconditions.checkState(seriesCount <= currentBatchSize); - } - } else { - - // NOT selectedInUse - - if (bytesColVector.noNulls) { - - duplicateCounts[0] = 1; - byte[] prevKeyBytes = vectorBytesArrays[0]; - int prevKeyStart = vectorStarts[0]; - int prevKeyLength 
= vectorLengths[0]; - serialize(0, prevKeyBytes, prevKeyStart, prevKeyLength); - - int currentKeyStart; - int currentKeyLength; - byte[] currentKeyBytes; - for (int index = 1; index < currentBatchSize; index++) { - currentKeyBytes = vectorBytesArrays[index]; - currentKeyStart = vectorStarts[index]; - currentKeyLength = vectorLengths[index]; - if (StringExpr.equal(prevKeyBytes, prevKeyStart, prevKeyLength, - currentKeyBytes, currentKeyStart, currentKeyLength)) { - duplicateCounts[seriesCount]++; - } else { - duplicateCounts[++seriesCount] = 1; - serialize(seriesCount, currentKeyBytes, currentKeyStart, currentKeyLength); - prevKeyBytes = currentKeyBytes; - prevKeyStart = currentKeyStart; - prevKeyLength = currentKeyLength; - } - } - Arrays.fill(seriesIsAllNull, 0, ++seriesCount, false); - nonNullKeyCount = seriesCount; - Preconditions.checkState(seriesCount <= currentBatchSize); - } else { - boolean[] isNull = bytesColVector.isNull; - - boolean prevKeyIsNull; - byte[] prevKeyBytes = null; - int prevKeyStart = 0; - int prevKeyLength = 0; - duplicateCounts[0] = 1; - if (isNull[0]) { - seriesIsAllNull[0] = true; - prevKeyIsNull = true; - } else { - seriesIsAllNull[0] = false; - prevKeyIsNull = false; - prevKeyBytes = vectorBytesArrays[0]; - prevKeyStart = vectorStarts[0]; - prevKeyLength = vectorLengths[0]; - serialize(0, prevKeyBytes, prevKeyStart, prevKeyLength); - nonNullKeyCount = 1; - } - - byte[] currentKeyBytes; - int currentKeyStart; - int currentKeyLength; - for (int index = 1; index < currentBatchSize; index++) { - if (isNull[index]) { - if (prevKeyIsNull) { - duplicateCounts[seriesCount]++; - } else { - duplicateCounts[++seriesCount] = 1; - seriesIsAllNull[seriesCount] = true; - prevKeyIsNull = true; - } - } else { - currentKeyBytes = vectorBytesArrays[index]; - currentKeyStart = vectorStarts[index]; - currentKeyLength = vectorLengths[index]; - if (!prevKeyIsNull && - StringExpr.equal(prevKeyBytes, prevKeyStart, prevKeyLength, - currentKeyBytes, currentKeyStart, currentKeyLength)) { - duplicateCounts[seriesCount]++; - } else { - duplicateCounts[++seriesCount] = 1; - seriesIsAllNull[seriesCount] = false; - serialize(nonNullKeyCount++, currentKeyBytes, currentKeyStart, currentKeyLength); - prevKeyIsNull = false; - prevKeyBytes = currentKeyBytes; - prevKeyStart = currentKeyStart; - prevKeyLength = currentKeyLength; - } - } - } - seriesCount++; - Preconditions.checkState(seriesCount <= currentBatchSize); - } - } - } - - // Finally. - computeSerializedHashCodes(); - positionToFirst(); - Preconditions.checkState(validate()); - } - - private void serialize(int pos, byte[] bytes, int start, int length) throws IOException { - serializeWrite.setAppend(output); - serializeWrite.writeString(bytes, start, length); - int outputNewPosition = output.getLength(); - serializedKeyLengths[pos] = outputNewPosition - outputStartPosition; - outputStartPosition = outputNewPosition; - } -} diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/VectorKeySeriesCombo.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/VectorKeySeriesCombo.java new file mode 100644 index 0000000..19d58d8 --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/VectorKeySeriesCombo.java @@ -0,0 +1,137 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.vector.keyseries; + +import java.io.IOException; + +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; +import com.google.common.base.Preconditions; + +/** + * A key series over multiple columns, composed of an array of VectorKeySeriesSingle. + */ +public class VectorKeySeriesCombo extends VectorKeySeriesImpl implements VectorKeySeries { + + protected VectorKeySeriesSingle[] keySeriesArray; + + protected int currentBatchSize; + + private long allNullMask; + + protected long currentNullMask; + + public VectorKeySeriesCombo() { + super(); + } + + public void init(VectorKeySeriesSingle[] keySeriesArray) { + Preconditions.checkState(keySeriesArray.length > 0); + this.keySeriesArray = keySeriesArray; + allNullMask = (1L << keySeriesArray.length) - 1; + } + + @Override + public boolean getCurrentKeyAllNull() { + return (currentNullMask == allNullMask); + } + + @Override + public boolean getCurrentKeyHasAnyNulls() { + return (currentNullMask != 0); + } + + public long getCurrentNullMask() { + return currentNullMask; + } + + @Override + public void processBatch(VectorizedRowBatch batch) throws IOException { + + currentBatchSize = batch.size; + Preconditions.checkState(currentBatchSize > 0); + + for (int i = 0; i < keySeriesArray.length; i++) { + keySeriesArray[i].processBatch(batch); + } + + positionToFirst(); + } + + @Override + public void positionToFirst() { + currentLogical = 0; + + // Prime the pump with the first key. + keySeriesArray[0].positionToFirst(); + if (keySeriesArray[0].getCurrentKeyAllNull()) { + currentNullMask = (1L << 0); + } else { + currentNullMask = 0; + } + currentDuplicateCount = keySeriesArray[0].getCurrentDuplicateCount(); + + // Determine the minimum duplicate count. + for (int i = 1; i < keySeriesArray.length; i++) { + VectorKeySeriesSingle key = keySeriesArray[i]; + key.positionToFirst(); + if (key.getCurrentKeyAllNull()) { + currentNullMask |= (1L << i); + } + currentDuplicateCount = Math.min(currentDuplicateCount, key.getCurrentDuplicateCount()); + } + Preconditions.checkState(currentDuplicateCount > 0); + Preconditions.checkState(currentDuplicateCount <= currentBatchSize); + } + + public boolean next() { + + currentLogical += currentDuplicateCount; + if (currentLogical >= currentBatchSize) { + return false; + } + + int prevDuplicateCount = currentDuplicateCount; + + // Advance past the series we just used. + keySeriesArray[0].advance(prevDuplicateCount); + + /* + * Calculate the next series. + */ + + // Prime the pump with the first key. + if (keySeriesArray[0].getCurrentKeyAllNull()) { + currentNullMask = (1L << 0); + } else { + currentNullMask = 0; + } + currentDuplicateCount = keySeriesArray[0].getCurrentDuplicateCount(); + + // Determine the minimum duplicate count.
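// Editorial worked example, not part of this patch, of the mask and duplicate-count bookkeeping
// done in the loop below, assuming three key columns: init() sets
//   allNullMask == (1L << 3) - 1 == 0b111.
// If column 0 is all null at the current position while columns 1 and 2 have values,
// currentNullMask == 0b001, so getCurrentKeyAllNull() is false and getCurrentKeyHasAnyNulls()
// is true. If the per-column duplicate counts are 5, 2 and 3, the combined key can only stay
// constant for Math.min(5, Math.min(2, 3)) == 2 rows, which is why the loop takes the minimum
// duplicate count across the columns.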
+ for (int i = 1; i < keySeriesArray.length; i++) { + VectorKeySeriesSingle key = keySeriesArray[i]; + key.advance(prevDuplicateCount); + if (key.getCurrentKeyAllNull()) { + currentNullMask |= (1L << i); + } + currentDuplicateCount = Math.min(currentDuplicateCount, key.getCurrentDuplicateCount()); + } + return true; + } +} \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/VectorKeySeriesDouble.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/VectorKeySeriesDouble.java new file mode 100644 index 0000000..24e7cd9 --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/VectorKeySeriesDouble.java @@ -0,0 +1,75 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.vector.keyseries; + +import java.io.IOException; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; +import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; +import org.apache.hive.common.util.HashCodeUtil; + +/** + * A key series of a single column of double keys. + */ +public final class VectorKeySeriesDouble extends VectorKeySeriesDoubleBase { + + private final double[] doubleKeys; + private final long[] doubleAsLongKeys; + + private double currentDoubleKey; + private long currentDoubleAsLongKey; + + public VectorKeySeriesDouble(int columnNum, PrimitiveTypeInfo primitiveTypeInfo) { + super(columnNum, primitiveTypeInfo); + doubleKeys = new double[VectorizedRowBatch.DEFAULT_SIZE]; + doubleAsLongKeys = new long[VectorizedRowBatch.DEFAULT_SIZE]; + } + + public double getCurrentDoubleKey() { + return currentDoubleKey; + } + + public long getCurrentDoubleAsLongKey() { + return currentDoubleAsLongKey; + } + + @Override + public void processBatch(VectorizedRowBatch batch) throws IOException { + super.processBatch(batch); + + if (nonAllNullKeyCount > 0) { + HashCodeUtil.calculateLongArrayHashCodes(doubleAsLongKeys, hashCodes, nonAllNullKeyCount); + } + + // Do the position after we compute the checksums. 
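// Editorial aside, not part of this patch: the hash codes above are computed from the keys'
// long bit patterns rather than from the double values directly; saveDoubleKey() (below)
// captures those bits with the JDK's Double.doubleToLongBits, roughly:
//   doubleAsLongKeys[position] = Double.doubleToLongBits(key);
// which lets the same long-array hashing path (calculateLongArrayHashCodes) serve both the
// long and double key series.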
+ positionToFirst(); + } + + @Override + protected void saveDoubleKey(int nonAllNullKeyPosition, double key) + throws IOException { + doubleKeys[nonAllNullKeyPosition] = key; + doubleAsLongKeys[nonAllNullKeyPosition] = Double.doubleToLongBits(key); + } + + @Override + public void setNextNonEmptyKey(int nonAllNullKeyPosition) { + currentDoubleKey = doubleKeys[nonAllNullKeyPosition]; + currentDoubleAsLongKey = doubleAsLongKeys[nonAllNullKeyPosition]; + } +} \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/VectorKeySeriesDoubleBase.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/VectorKeySeriesDoubleBase.java new file mode 100644 index 0000000..ccde18a --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/VectorKeySeriesDoubleBase.java @@ -0,0 +1,212 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.vector.keyseries; + +import java.io.IOException; +import java.util.Arrays; + +import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; +import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory; +import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; +import com.google.common.base.Preconditions; + +/** + * An abstract class implementing key series of a single column of double keys where the keys + * get serialized. 
+ */ +public abstract class VectorKeySeriesDoubleBase extends VectorKeySeriesSingleImpl { + + protected final int columnNum; + protected final PrimitiveCategory primitiveCategory; + + public VectorKeySeriesDoubleBase(int columnNum, PrimitiveTypeInfo primitiveTypeInfo) { + super(); + this.columnNum = columnNum; + primitiveCategory = primitiveTypeInfo.getPrimitiveCategory(); + } + + @Override + public void processBatch(VectorizedRowBatch batch) throws IOException { + + currentBatchSize = batch.size; + Preconditions.checkState(currentBatchSize > 0); + + DoubleColumnVector doubleColVector = (DoubleColumnVector) batch.cols[columnNum]; + + double[] vector = doubleColVector.vector; + + if (doubleColVector.isRepeating){ + duplicateCounts[0] = currentBatchSize; + if (doubleColVector.noNulls || !doubleColVector.isNull[0]) { + keyAllNulls[0] = false; + saveDoubleKey(0, vector[0]); + nonAllNullKeyCount = 1; + } else { + keyAllNulls[0] = true; + nonAllNullKeyCount = 0; + } + keyCount = 1; + Preconditions.checkState(keyCount <= currentBatchSize); + } else { + keyCount = 0; + nonAllNullKeyCount = 0; + if (batch.selectedInUse) { + int[] selected = batch.selected; + if (doubleColVector.noNulls) { + + duplicateCounts[0] = 1; + double prevKey = vector[selected[0]]; + saveDoubleKey(0, prevKey); + + double currentKey; + for (int logical = 1; logical < currentBatchSize; logical++) { + currentKey = vector[selected[logical]]; + if (prevKey == currentKey) { + duplicateCounts[keyCount]++; + } else { + duplicateCounts[++keyCount] = 1; + saveDoubleKey(keyCount, currentKey); + prevKey = currentKey; + } + } + Arrays.fill(keyAllNulls, 0, ++keyCount, false); + nonAllNullKeyCount = keyCount; + Preconditions.checkState(keyCount <= currentBatchSize); + } else { + boolean[] isNull = doubleColVector.isNull; + + boolean prevKeyIsNull; + double prevKey = 0; + duplicateCounts[0] = 1; + int index = selected[0]; + if (isNull[index]) { + keyAllNulls[0] = true; + prevKeyIsNull = true; + } else { + keyAllNulls[0] = false; + prevKeyIsNull = false; + prevKey = vector[index]; + saveDoubleKey(0, prevKey); + nonAllNullKeyCount = 1; + } + + double currentKey; + for (int logical = 1; logical < currentBatchSize; logical++) { + index = selected[logical]; + if (isNull[index]) { + if (prevKeyIsNull) { + duplicateCounts[keyCount]++; + } else { + duplicateCounts[++keyCount] = 1; + keyAllNulls[keyCount] = true; + prevKeyIsNull = true; + } + } else { + currentKey = vector[index]; + if (!prevKeyIsNull && prevKey == currentKey) { + duplicateCounts[keyCount]++; + } else { + duplicateCounts[++keyCount] = 1; + keyAllNulls[keyCount] = false; + saveDoubleKey(nonAllNullKeyCount++, currentKey); + prevKeyIsNull = false; + prevKey = currentKey; + } + } + } + keyCount++; + Preconditions.checkState(keyCount <= currentBatchSize); + } + } else { + + // NOT selectedInUse + + if (doubleColVector.noNulls) { + + duplicateCounts[0] = 1; + double prevKey = vector[0]; + saveDoubleKey(0, prevKey); + double currentKey; + for (int index = 1; index < currentBatchSize; index++) { + currentKey = vector[index]; + if (prevKey == currentKey) { + duplicateCounts[keyCount]++; + } else { + duplicateCounts[++keyCount] = 1; + saveDoubleKey(keyCount, currentKey); + prevKey = currentKey; + } + } + Arrays.fill(keyAllNulls, 0, ++keyCount, false); + nonAllNullKeyCount = keyCount; + Preconditions.checkState(keyCount <= currentBatchSize); + } else { + boolean[] isNull = doubleColVector.isNull; + + boolean prevKeyIsNull; + double prevKey = 0; + duplicateCounts[0] = 1; + if (isNull[0]) { + 
keyAllNulls[0] = true; + prevKeyIsNull = true; + } else { + keyAllNulls[0] = false; + prevKeyIsNull = false; + prevKey = vector[0]; + saveDoubleKey(nonAllNullKeyCount++, prevKey); + } + + for (int index = 1; index < currentBatchSize; index++) { + if (isNull[index]) { + if (prevKeyIsNull) { + duplicateCounts[keyCount]++; + } else { + duplicateCounts[++keyCount] = 1; + keyAllNulls[keyCount] = true; + prevKeyIsNull = true; + } + } else { + double currentKey = vector[index]; + if (!prevKeyIsNull && prevKey == currentKey) { + duplicateCounts[keyCount]++; + } else { + duplicateCounts[++keyCount] = 1; + keyAllNulls[keyCount] = false; + saveDoubleKey(nonAllNullKeyCount++, currentKey); + prevKeyIsNull = false; + prevKey = currentKey; + } + } + } + keyCount++; + Preconditions.checkState(keyCount <= currentBatchSize); + } + } + } + + Preconditions.checkState(validate()); + } + + /* + * Serialize a double key. + */ + protected abstract void saveDoubleKey(int nonAllNullKeyPosition, double key) + throws IOException; +} \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/VectorKeySeriesImpl.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/VectorKeySeriesImpl.java index 55e923e..3b35f96 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/VectorKeySeriesImpl.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/VectorKeySeriesImpl.java @@ -19,20 +19,20 @@ package org.apache.hadoop.hive.ql.exec.vector.keyseries; /** - * A base implementation of VectorKeySeries. + * An abstract implementation of VectorKeySeries. * */ public abstract class VectorKeySeriesImpl implements VectorKeySeries { protected int currentLogical; - protected boolean currentIsAllNull; + protected boolean currentKeyAllNull; protected boolean currentHasAnyNulls; protected int currentDuplicateCount; protected int currentHashCode; VectorKeySeriesImpl() { currentLogical = 0; - currentIsAllNull = false; + currentKeyAllNull = false; // Set to true by default. Only actively set in the multiple key case to support Outer Join. currentHasAnyNulls = true; @@ -47,12 +47,12 @@ public int getCurrentLogical() { } @Override - public boolean getCurrentIsAllNull() { - return currentIsAllNull; + public boolean getCurrentKeyAllNull() { + return currentKeyAllNull; } @Override - public boolean getCurrentHasAnyNulls() { + public boolean getCurrentKeyHasAnyNulls() { return currentHasAnyNulls; } @@ -65,4 +65,17 @@ public int getCurrentDuplicateCount() { public int getCurrentHashCode() { return currentHashCode; } + + public static String displayBytes(byte[] bytes, int start, int length) { + StringBuilder sb = new StringBuilder(); + for (int i = start; i < start + length; i++) { + char ch = (char) bytes[i]; + if (ch < ' ' || ch > '~') { + sb.append(String.format("\\%03d", bytes[i] & 0xff)); + } else { + sb.append(ch); + } + } + return sb.toString(); + } } \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/VectorKeySeriesLong.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/VectorKeySeriesLong.java new file mode 100644 index 0000000..7a86ebd --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/VectorKeySeriesLong.java @@ -0,0 +1,66 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership.
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.vector.keyseries; + +import java.io.IOException; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; +import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; +import org.apache.hive.common.util.HashCodeUtil; + +/** + * A key series of a single column of long keys. + */ +public final class VectorKeySeriesLong extends VectorKeySeriesLongBase { + + private final long[] longKeys; + + private long currentKey; + + public VectorKeySeriesLong(int columnNum, PrimitiveTypeInfo primitiveTypeInfo) { + super(columnNum, primitiveTypeInfo); + longKeys = new long[VectorizedRowBatch.DEFAULT_SIZE]; + } + + public long getCurrentKey() { + return currentKey; + } + + @Override + public void processBatch(VectorizedRowBatch batch) throws IOException { + super.processBatch(batch); + + if (nonAllNullKeyCount > 0) { + HashCodeUtil.calculateLongArrayHashCodes(longKeys, hashCodes, nonAllNullKeyCount); + } + + // Do the position after we compute the checksums. + positionToFirst(); + } + + @Override + protected void saveLongKey(int nonAllNullKeyPosition, long key) + throws IOException { + longKeys[nonAllNullKeyPosition] = key; + } + + @Override + public void setNextNonEmptyKey(int nonAllNullKeyPosition) { + currentKey = longKeys[nonAllNullKeyPosition]; + } +} \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/VectorKeySeriesLongBase.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/VectorKeySeriesLongBase.java new file mode 100644 index 0000000..f7a5258 --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/VectorKeySeriesLongBase.java @@ -0,0 +1,212 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.hive.ql.exec.vector.keyseries; + +import java.io.IOException; +import java.util.Arrays; + +import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; +import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory; +import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; +import com.google.common.base.Preconditions; + +/** + * An abstract class implementing key series of a single column of long keys where the keys + * get serialized. + */ +public abstract class VectorKeySeriesLongBase extends VectorKeySeriesSingleImpl { + + protected final int columnNum; + protected final PrimitiveCategory primitiveCategory; + + public VectorKeySeriesLongBase(int columnNum, PrimitiveTypeInfo primitiveTypeInfo) { + super(); + this.columnNum = columnNum; + primitiveCategory = primitiveTypeInfo.getPrimitiveCategory(); + } + + @Override + public void processBatch(VectorizedRowBatch batch) throws IOException { + + currentBatchSize = batch.size; + Preconditions.checkState(currentBatchSize > 0); + + LongColumnVector longColVector = (LongColumnVector) batch.cols[columnNum]; + + long[] vector = longColVector.vector; + + if (longColVector.isRepeating){ + duplicateCounts[0] = currentBatchSize; + if (longColVector.noNulls || !longColVector.isNull[0]) { + keyAllNulls[0] = false; + saveLongKey(0, vector[0]); + nonAllNullKeyCount = 1; + } else { + keyAllNulls[0] = true; + nonAllNullKeyCount = 0; + } + keyCount = 1; + Preconditions.checkState(keyCount <= currentBatchSize); + } else { + keyCount = 0; + nonAllNullKeyCount = 0; + if (batch.selectedInUse) { + int[] selected = batch.selected; + if (longColVector.noNulls) { + + duplicateCounts[0] = 1; + long prevKey = vector[selected[0]]; + saveLongKey(0, prevKey); + + long currentKey; + for (int logical = 1; logical < currentBatchSize; logical++) { + currentKey = vector[selected[logical]]; + if (prevKey == currentKey) { + duplicateCounts[keyCount]++; + } else { + duplicateCounts[++keyCount] = 1; + saveLongKey(keyCount, currentKey); + prevKey = currentKey; + } + } + Arrays.fill(keyAllNulls, 0, ++keyCount, false); + nonAllNullKeyCount = keyCount; + Preconditions.checkState(keyCount <= currentBatchSize); + } else { + boolean[] isNull = longColVector.isNull; + + boolean prevKeyIsNull; + long prevKey = 0; + duplicateCounts[0] = 1; + int index = selected[0]; + if (isNull[index]) { + keyAllNulls[0] = true; + prevKeyIsNull = true; + } else { + keyAllNulls[0] = false; + prevKeyIsNull = false; + prevKey = vector[index]; + saveLongKey(0, prevKey); + nonAllNullKeyCount = 1; + } + + long currentKey; + for (int logical = 1; logical < currentBatchSize; logical++) { + index = selected[logical]; + if (isNull[index]) { + if (prevKeyIsNull) { + duplicateCounts[keyCount]++; + } else { + duplicateCounts[++keyCount] = 1; + keyAllNulls[keyCount] = true; + prevKeyIsNull = true; + } + } else { + currentKey = vector[index]; + if (!prevKeyIsNull && prevKey == currentKey) { + duplicateCounts[keyCount]++; + } else { + duplicateCounts[++keyCount] = 1; + keyAllNulls[keyCount] = false; + saveLongKey(nonAllNullKeyCount++, currentKey); + prevKeyIsNull = false; + prevKey = currentKey; + } + } + } + keyCount++; + Preconditions.checkState(keyCount <= currentBatchSize); + } + } else { + + // NOT selectedInUse + + if (longColVector.noNulls) { + + duplicateCounts[0] = 1; + long prevKey = vector[0]; + saveLongKey(0, prevKey); + long currentKey; + for (int index = 1; index < 
currentBatchSize; index++) { + currentKey = vector[index]; + if (prevKey == currentKey) { + duplicateCounts[keyCount]++; + } else { + duplicateCounts[++keyCount] = 1; + saveLongKey(keyCount, currentKey); + prevKey = currentKey; + } + } + Arrays.fill(keyAllNulls, 0, ++keyCount, false); + nonAllNullKeyCount = keyCount; + Preconditions.checkState(keyCount <= currentBatchSize); + } else { + boolean[] isNull = longColVector.isNull; + + boolean prevKeyIsNull; + long prevKey = 0; + duplicateCounts[0] = 1; + if (isNull[0]) { + keyAllNulls[0] = true; + prevKeyIsNull = true; + } else { + keyAllNulls[0] = false; + prevKeyIsNull = false; + prevKey = vector[0]; + saveLongKey(nonAllNullKeyCount++, prevKey); + } + + for (int index = 1; index < currentBatchSize; index++) { + if (isNull[index]) { + if (prevKeyIsNull) { + duplicateCounts[keyCount]++; + } else { + duplicateCounts[++keyCount] = 1; + keyAllNulls[keyCount] = true; + prevKeyIsNull = true; + } + } else { + long currentKey = vector[index]; + if (!prevKeyIsNull && prevKey == currentKey) { + duplicateCounts[keyCount]++; + } else { + duplicateCounts[++keyCount] = 1; + keyAllNulls[keyCount] = false; + saveLongKey(nonAllNullKeyCount++, currentKey); + prevKeyIsNull = false; + prevKey = currentKey; + } + } + } + keyCount++; + Preconditions.checkState(keyCount <= currentBatchSize); + } + } + } + + Preconditions.checkState(validate()); + } + + /* + * Serialize a long key. + */ + protected abstract void saveLongKey(int nonAllNullKeyPosition, long key) + throws IOException; +} \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/VectorKeySeriesLongSerialized.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/VectorKeySeriesLongSerialized.java deleted file mode 100644 index a0134fd..0000000 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/VectorKeySeriesLongSerialized.java +++ /dev/null @@ -1,249 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.hive.ql.exec.vector.keyseries; - -import java.io.IOException; -import java.util.Arrays; - -import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector; -import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; -import org.apache.hadoop.hive.serde2.fast.SerializeWrite; -import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory; -import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; - -import com.google.common.base.Preconditions; - -/** - * A key series of a single column of long keys where the keys get serialized. 
- */ -public class VectorKeySeriesLongSerialized - extends VectorKeySeriesSerializedImpl implements VectorKeySeriesSerialized { - - private final int columnNum; - private PrimitiveCategory primitiveCategory; - - private int currentKeyStart; - - public VectorKeySeriesLongSerialized(int columnNum, PrimitiveTypeInfo primitiveTypeInfo, - T serializeWrite) { - super(serializeWrite); - this.columnNum = columnNum; - primitiveCategory = primitiveTypeInfo.getPrimitiveCategory(); - } - - @Override - public void processBatch(VectorizedRowBatch batch) throws IOException { - - currentBatchSize = batch.size; - Preconditions.checkState(currentBatchSize > 0); - - LongColumnVector longColVector = (LongColumnVector) batch.cols[columnNum]; - - long[] vector = longColVector.vector; - - // The serialize routine uses this to build serializedKeyLengths. - currentKeyStart = 0; - output.reset(); - - if (longColVector.isRepeating){ - duplicateCounts[0] = currentBatchSize; - if (longColVector.noNulls || !longColVector.isNull[0]) { - seriesIsAllNull[0] = false; - serialize(0, vector[0]); - nonNullKeyCount = 1; - } else { - seriesIsAllNull[0] = true; - nonNullKeyCount = 0; - } - seriesCount = 1; - Preconditions.checkState(seriesCount <= currentBatchSize); - } else { - seriesCount = 0; - nonNullKeyCount = 0; - if (batch.selectedInUse) { - int[] selected = batch.selected; - if (longColVector.noNulls) { - - duplicateCounts[0] = 1; - long prevKey = vector[selected[0]]; - serialize(0, prevKey); - - long currentKey; - for (int logical = 1; logical < currentBatchSize; logical++) { - currentKey = vector[selected[logical]]; - if (prevKey == currentKey) { - duplicateCounts[seriesCount]++; - } else { - duplicateCounts[++seriesCount] = 1; - serialize(seriesCount, currentKey); - prevKey = currentKey; - } - } - Arrays.fill(seriesIsAllNull, 0, ++seriesCount, false); - nonNullKeyCount = seriesCount; - Preconditions.checkState(seriesCount <= currentBatchSize); - } else { - boolean[] isNull = longColVector.isNull; - - boolean prevKeyIsNull; - long prevKey = 0; - duplicateCounts[0] = 1; - int index = selected[0]; - if (isNull[index]) { - seriesIsAllNull[0] = true; - prevKeyIsNull = true; - nonNullKeyCount = 0; - } else { - seriesIsAllNull[0] = false; - prevKeyIsNull = false; - prevKey = vector[index]; - serialize(0, prevKey); - nonNullKeyCount = 1; - } - - long currentKey; - for (int logical = 1; logical < currentBatchSize; logical++) { - index = selected[logical]; - if (isNull[index]) { - if (prevKeyIsNull) { - duplicateCounts[seriesCount]++; - } else { - duplicateCounts[++seriesCount] = 1; - seriesIsAllNull[seriesCount] = true; - prevKeyIsNull = true; - } - } else { - currentKey = vector[index]; - if (!prevKeyIsNull && prevKey == currentKey) { - duplicateCounts[seriesCount]++; - } else { - duplicateCounts[++seriesCount] = 1; - seriesIsAllNull[seriesCount] = false; - serialize(nonNullKeyCount++, currentKey); - prevKeyIsNull = false; - prevKey = currentKey; - } - } - } - seriesCount++; - Preconditions.checkState(seriesCount <= currentBatchSize); - } - } else { - - // NOT selectedInUse - - if (longColVector.noNulls) { - - duplicateCounts[0] = 1; - long prevKey = vector[0]; - serialize(0, prevKey); - long currentKey; - for (int index = 1; index < currentBatchSize; index++) { - currentKey = vector[index]; - if (prevKey == currentKey) { - duplicateCounts[seriesCount]++; - } else { - duplicateCounts[++seriesCount] = 1; - serialize(seriesCount, currentKey); - prevKey = currentKey; - } - } - Arrays.fill(seriesIsAllNull, 0, ++seriesCount, 
false); - nonNullKeyCount = seriesCount; - Preconditions.checkState(seriesCount <= currentBatchSize); - } else { - boolean[] isNull = longColVector.isNull; - - boolean prevKeyIsNull; - long prevKey = 0; - duplicateCounts[0] = 1; - if (isNull[0]) { - seriesIsAllNull[0] = true; - prevKeyIsNull = true; - nonNullKeyCount = 0; - } else { - seriesIsAllNull[0] = false; - prevKeyIsNull = false; - prevKey = vector[0]; - serialize(0, prevKey); - nonNullKeyCount = 1; - } - - for (int index = 1; index < currentBatchSize; index++) { - if (isNull[index]) { - if (prevKeyIsNull) { - duplicateCounts[seriesCount]++; - } else { - duplicateCounts[++seriesCount] = 1; - seriesIsAllNull[seriesCount] = true; - prevKeyIsNull = true; - } - } else { - long currentKey = vector[index]; - if (!prevKeyIsNull && prevKey == currentKey) { - duplicateCounts[seriesCount]++; - } else { - duplicateCounts[++seriesCount] = 1; - seriesIsAllNull[seriesCount] = false; - serialize(nonNullKeyCount++, currentKey); - prevKeyIsNull = false; - prevKey = currentKey; - } - } - } - seriesCount++; - Preconditions.checkState(seriesCount <= currentBatchSize); - } - } - } - - // Finally. - computeSerializedHashCodes(); - positionToFirst(); - Preconditions.checkState(validate()); - } - - private void serialize(int pos, long value) throws IOException { - serializeWrite.setAppend(output); - - // UNDONE: Add support for DATE, TIMESTAMP, INTERVAL_YEAR_MONTH, INTERVAL_DAY_TIME... - switch (primitiveCategory) { - case BOOLEAN: - serializeWrite.writeBoolean(value != 0); - break; - case BYTE: - serializeWrite.writeByte((byte) value); - break; - case SHORT: - serializeWrite.writeShort((short) value); - break; - case INT: - serializeWrite.writeInt((int) value); - break; - case LONG: - serializeWrite.writeLong(value); - break; - default: - throw new RuntimeException("Unexpected primitive category " + primitiveCategory.name()); - } - int outputNewPosition = output.getLength(); - serializedKeyLengths[pos] = outputNewPosition - currentKeyStart; - currentKeyStart = outputNewPosition; - } -} diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/VectorKeySeriesMultiBase.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/VectorKeySeriesMultiBase.java new file mode 100644 index 0000000..d1d4f3a --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/VectorKeySeriesMultiBase.java @@ -0,0 +1,168 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.hive.ql.exec.vector.keyseries; + +import java.io.IOException; + +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.google.common.base.Preconditions; + +/** + * An abstract class implementing key series of multiple column keys where the keys + * get serialized. + * + * It can be one key that requires custom serialization (e.g. HiveDecimal). + */ +public abstract class VectorKeySeriesMultiBase extends VectorKeySeriesSingleImpl { + + private static final Logger LOG = LoggerFactory.getLogger(VectorKeySeriesMultiBase.class.getName()); + + protected boolean[] hasAnyNulls; + + public VectorKeySeriesMultiBase() { + super(); + hasAnyNulls = new boolean[VectorizedRowBatch.DEFAULT_SIZE]; + } + + @Override + public void processBatch(VectorizedRowBatch batch) throws IOException { + + currentBatchSize = batch.size; + Preconditions.checkState(currentBatchSize > 0); + Preconditions.checkState(currentBatchSize <= VectorizedRowBatch.DEFAULT_SIZE); + + keyCount = 0; + boolean prevKeyIsNull; + duplicateCounts[0] = 1; + if (batch.selectedInUse) { + int[] selected = batch.selected; + int index = selected[0]; + writeMultiKey(batch, index, 0); + if (isAllNulls()) { + keyAllNulls[0] = prevKeyIsNull = true; + hasAnyNulls[0] = true; + nonAllNullKeyCount = 0; + } else { + keyAllNulls[0] = prevKeyIsNull = false; + hasAnyNulls[0] = hasAnyNulls(); + nonAllNullKeyCount = 1; + } + + for (int logical = 1; logical < currentBatchSize; logical++) { + index = selected[logical]; + writeMultiKey(batch, index, nonAllNullKeyCount); + if (isAllNulls()) { + if (prevKeyIsNull) { + duplicateCounts[keyCount]++; + } else { + duplicateCounts[++keyCount] = 1; + keyAllNulls[keyCount] = prevKeyIsNull = true; + hasAnyNulls[keyCount] = true; + } + } else { + if (!prevKeyIsNull && equalsPrevKey(nonAllNullKeyCount)) { + forgetKey(); + duplicateCounts[keyCount]++; + } else { + duplicateCounts[++keyCount] = 1; + keyAllNulls[keyCount] = prevKeyIsNull = false; + hasAnyNulls[nonAllNullKeyCount] = hasAnyNulls(); + nonAllNullKeyCount++; + } + } + } + keyCount++; + Preconditions.checkState(keyCount <= currentBatchSize); + } else { + writeMultiKey(batch, 0, 0); + if (isAllNulls()) { + keyAllNulls[0] = prevKeyIsNull = true; + hasAnyNulls[0] = true; + nonAllNullKeyCount = 0; + } else { + keyAllNulls[0] = prevKeyIsNull = false; + hasAnyNulls[0] = hasAnyNulls(); + nonAllNullKeyCount = 1; + } + + for (int index = 1; index < currentBatchSize; index++) { + writeMultiKey(batch, index, nonAllNullKeyCount); + if (isAllNulls()) { + if (prevKeyIsNull) { + duplicateCounts[keyCount]++; + } else { + duplicateCounts[++keyCount] = 1; + keyAllNulls[keyCount] = prevKeyIsNull = true; + hasAnyNulls[keyCount] = true; + } + } else { + if (!prevKeyIsNull && equalsPrevKey(nonAllNullKeyCount)) { + forgetKey(); + duplicateCounts[keyCount]++; + } else { + duplicateCounts[++keyCount] = 1; + keyAllNulls[keyCount] = prevKeyIsNull = false; + hasAnyNulls[nonAllNullKeyCount] = hasAnyNulls(); + nonAllNullKeyCount++; + } + } + } + keyCount++; + Preconditions.checkState(keyCount <= currentBatchSize); + } + + Preconditions.checkState(validate()); + } + + @Override + public void setNextNonEmptyKey(int nonAllNullKeyPosition) { + currentHasAnyNulls = hasAnyNulls[nonAllNullKeyPosition]; + } + + /* + * Serialize a multiple column key. 
+ */ + protected abstract void writeMultiKey(VectorizedRowBatch batch, int index, int nonAllNullKeyCount) + throws IOException; + + /* + * After calling writeMultiKey, this method returns whether ALL the key columns are null. + */ + protected abstract boolean isAllNulls(); + + /* + * After calling writeMultiKey, this method returns whether ANY the key columns is null. + */ + protected abstract boolean hasAnyNulls(); + + /* + * Compare the previous serialized multiple column key is equal to the last serialized + * multiple column key. + */ + protected abstract boolean equalsPrevKey(int nonAllNullKeyCount); + + /* + * Forget the last serialized multiple column key (as a result of finding with + * equalsPrevKey the previous key matched). + */ + protected abstract void forgetKey(); +} diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/VectorKeySeriesMultiSerialized.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/VectorKeySeriesMultiSerialized.java deleted file mode 100644 index 2cc3531..0000000 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/VectorKeySeriesMultiSerialized.java +++ /dev/null @@ -1,187 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.hive.ql.exec.vector.keyseries; - -import java.io.IOException; - -import org.apache.hadoop.hive.ql.exec.vector.VectorSerializeRow; -import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; -import org.apache.hadoop.hive.ql.exec.vector.expressions.StringExpr; -import org.apache.hadoop.hive.ql.metadata.HiveException; -import org.apache.hadoop.hive.serde2.fast.SerializeWrite; -import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.google.common.base.Preconditions; - -/** - * A key series of a multiple columns of keys where the keys get serialized. - * (Or, it can be 1 column). 
- */ -public class VectorKeySeriesMultiSerialized - extends VectorKeySeriesSerializedImpl implements VectorKeySeriesSerialized { - - private static final Logger LOG = LoggerFactory.getLogger( - VectorKeySeriesMultiSerialized.class.getName()); - - private VectorSerializeRow keySerializeRow; - - private boolean[] hasAnyNulls; - - public VectorKeySeriesMultiSerialized(T serializeWrite) { - super(serializeWrite); - } - - public void init(TypeInfo[] typeInfos, int[] columnNums) throws HiveException { - keySerializeRow = new VectorSerializeRow(serializeWrite); - keySerializeRow.init(typeInfos, columnNums); - hasAnyNulls = new boolean[VectorizedRowBatch.DEFAULT_SIZE]; - } - - @Override - public void processBatch(VectorizedRowBatch batch) throws IOException { - - currentBatchSize = batch.size; - Preconditions.checkState(currentBatchSize > 0); - - // LOG.info("VectorKeySeriesMultiSerialized processBatch size " + currentBatchSize + " numCols " + batch.numCols + " selectedInUse " + batch.selectedInUse); - - int prevKeyStart = 0; - int prevKeyLength; - int currentKeyStart = 0; - output.reset(); - - seriesCount = 0; - boolean prevKeyIsNull; - duplicateCounts[0] = 1; - if (batch.selectedInUse) { - int[] selected = batch.selected; - int index = selected[0]; - keySerializeRow.setOutputAppend(output); - keySerializeRow.serializeWrite(batch, index); - if (keySerializeRow.getIsAllNulls()) { - seriesIsAllNull[0] = prevKeyIsNull = true; - prevKeyLength = 0; - output.setWritePosition(0); - nonNullKeyCount = 0; - } else { - seriesIsAllNull[0] = prevKeyIsNull = false; - serializedKeyLengths[0] = currentKeyStart = prevKeyLength = output.getLength(); - hasAnyNulls[0] = keySerializeRow.getHasAnyNulls(); - nonNullKeyCount = 1; - } - - int keyLength; - for (int logical = 1; logical < currentBatchSize; logical++) { - index = selected[logical]; - keySerializeRow.setOutputAppend(output); - keySerializeRow.serializeWrite(batch, index); - if (keySerializeRow.getIsAllNulls()) { - if (prevKeyIsNull) { - duplicateCounts[seriesCount]++; - } else { - duplicateCounts[++seriesCount] = 1; - seriesIsAllNull[seriesCount] = prevKeyIsNull = true; - } - output.setWritePosition(currentKeyStart); - } else { - keyLength = output.getLength() - currentKeyStart; - if (!prevKeyIsNull && - StringExpr.equal( - output.getData(), prevKeyStart, prevKeyLength, - output.getData(), currentKeyStart, keyLength)) { - duplicateCounts[seriesCount]++; - output.setWritePosition(currentKeyStart); - } else { - duplicateCounts[++seriesCount] = 1; - seriesIsAllNull[seriesCount] = prevKeyIsNull = false; - prevKeyStart = currentKeyStart; - serializedKeyLengths[nonNullKeyCount] = prevKeyLength = keyLength; - currentKeyStart += keyLength; - hasAnyNulls[nonNullKeyCount] = keySerializeRow.getHasAnyNulls(); - nonNullKeyCount++; - } - } - } - seriesCount++; - Preconditions.checkState(seriesCount <= currentBatchSize); - } else { - keySerializeRow.setOutputAppend(output); - keySerializeRow.serializeWrite(batch, 0); - if (keySerializeRow.getIsAllNulls()) { - seriesIsAllNull[0] = prevKeyIsNull = true; - prevKeyLength = 0; - output.setWritePosition(0); - nonNullKeyCount = 0; - } else { - seriesIsAllNull[0] = prevKeyIsNull = false; - serializedKeyLengths[0] = currentKeyStart = prevKeyLength = output.getLength(); - hasAnyNulls[0] = keySerializeRow.getHasAnyNulls(); - nonNullKeyCount = 1; - } - - int keyLength; - for (int index = 1; index < currentBatchSize; index++) { - keySerializeRow.setOutputAppend(output); - keySerializeRow.serializeWrite(batch, index); - if 
(keySerializeRow.getIsAllNulls()) { - if (prevKeyIsNull) { - duplicateCounts[seriesCount]++; - } else { - duplicateCounts[++seriesCount] = 1; - seriesIsAllNull[seriesCount] = prevKeyIsNull = true; - } - output.setWritePosition(currentKeyStart); - } else { - keyLength = output.getLength() - currentKeyStart; - if (!prevKeyIsNull && - StringExpr.equal( - output.getData(), prevKeyStart, prevKeyLength, - output.getData(), currentKeyStart, keyLength)) { - duplicateCounts[seriesCount]++; - output.setWritePosition(currentKeyStart); - } else { - duplicateCounts[++seriesCount] = 1; - seriesIsAllNull[seriesCount] = prevKeyIsNull = false; - prevKeyStart = currentKeyStart; - serializedKeyLengths[nonNullKeyCount] = prevKeyLength = keyLength; - currentKeyStart += keyLength; - hasAnyNulls[nonNullKeyCount] = keySerializeRow.getHasAnyNulls(); - nonNullKeyCount++; - } - } - } - seriesCount++; - Preconditions.checkState(seriesCount <= currentBatchSize); - } - - // Finally. - computeSerializedHashCodes(); - positionToFirst(); - Preconditions.checkState(validate()); - } - - @Override - public void setNextNonNullKey(int nonNullKeyPosition) { - super.setNextNonNullKey(nonNullKeyPosition); - - currentHasAnyNulls = hasAnyNulls[nonNullKeyPosition]; - } -} diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/VectorKeySeriesSerialized.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/VectorKeySeriesSerialized.java deleted file mode 100644 index 1dfb3df..0000000 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/VectorKeySeriesSerialized.java +++ /dev/null @@ -1,35 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.hive.ql.exec.vector.keyseries; - -/** - * An abstract adding serialization to key series. - * - * A key with no or some nulls has serialized bytes, offset, and length. - */ -public interface VectorKeySeriesSerialized extends VectorKeySeries { - - /** - * @return the serialized bytes (including optional key tag), start offset of the key in the - * bytes, and key byte length. - */ - byte[] getSerializedBytes(); - int getSerializedStart(); - int getSerializedLength(); -} \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/VectorKeySeriesSerializedImpl.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/VectorKeySeriesSerializedImpl.java deleted file mode 100644 index 1fbafa7..0000000 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/VectorKeySeriesSerializedImpl.java +++ /dev/null @@ -1,130 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. 
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.hive.ql.exec.vector.keyseries; - -import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; -import org.apache.hadoop.hive.serde2.ByteStream.Output; -import org.apache.hadoop.hive.serde2.fast.SerializeWrite; -import org.apache.hive.common.util.HashCodeUtil; - -import com.google.common.base.Preconditions; - -/** - * Implementation of base serialization interface. - * - */ -public abstract class VectorKeySeriesSerializedImpl - extends VectorKeySeriesSingleImpl implements VectorKeySeriesSerialized { - - protected T serializeWrite; - - protected int bufferOffset; - - // The serialized (non-NULL) series keys. These 3 members represent the value. - public int serializedStart; - public int serializedLength; - public byte[] serializedBytes; - - protected final Output output; - - protected final int[] serializedKeyLengths; - - public VectorKeySeriesSerializedImpl(T serializeWrite) { - super(); - this.serializeWrite = serializeWrite; - output = new Output(); - serializedKeyLengths = new int[VectorizedRowBatch.DEFAULT_SIZE]; - } - - public boolean validate() { - super.validate(); - - int nullCount = 0; - for (int i = 0; i < seriesCount; i++) { - if (seriesIsAllNull[i]) { - nullCount++; - } - } - Preconditions.checkState(nullCount + nonNullKeyCount == seriesCount); - - int lengthSum = 0; - int keyLength; - for (int i = 0; i < nonNullKeyCount; i++) { - keyLength = serializedKeyLengths[i]; - Preconditions.checkState(keyLength > 0); - lengthSum += keyLength; - Preconditions.checkState(lengthSum <= output.getLength()); - } - return true; - } - - @Override - public byte[] getSerializedBytes() { - return serializedBytes; - } - - @Override - public int getSerializedStart() { - return serializedStart; - } - - @Override - public int getSerializedLength() { - return serializedLength; - } - - /** - * Batch compute the hash codes for all the serialized keys. - * - * NOTE: MAJOR MAJOR ASSUMPTION: - * We assume that HashCodeUtil.murmurHash produces the same result - * as MurmurHash.hash with seed = 0 (the method used by ReduceSinkOperator for - * UNIFORM distribution). - */ - protected void computeSerializedHashCodes() { - int offset = 0; - int keyLength; - byte[] bytes = output.getData(); - for (int i = 0; i < nonNullKeyCount; i++) { - keyLength = serializedKeyLengths[i]; - hashCodes[i] = HashCodeUtil.murmurHash(bytes, offset, keyLength); - offset += keyLength; - } - } - - @Override - public void positionToFirst() { - - // Reset this before calling positionToFirst. - bufferOffset = 0; - - super.positionToFirst(); - - // This is constant for whole series. 
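/*
 * Illustration only, not part of the patch: both the removed VectorKeySeriesSerializedImpl and the
 * new fast/ implementations keep every serialized key appended back-to-back in one output buffer and
 * walk it with a running offset plus a per-key length array, as setNextNonNullKey /
 * setNextNonEmptyKey do. A minimal JDK-only sketch of that offset/length walk; PackedKeys and keyAt
 * are hypothetical names, and the real code tracks the offset incrementally rather than re-summing.
 */
import java.io.ByteArrayOutputStream;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;

public final class PackedKeys {
  private final ByteArrayOutputStream buffer = new ByteArrayOutputStream();
  private final List<Integer> lengths = new ArrayList<>();   // analogous to serializedKeyLengths

  // Append one serialized key and record its length (the delta of the write position).
  public void append(byte[] keyBytes) {
    buffer.write(keyBytes, 0, keyBytes.length);
    lengths.add(keyBytes.length);
  }

  // Return the i-th key by summing the preceding lengths to find its start offset.
  public String keyAt(int i) {
    int start = 0;
    for (int k = 0; k < i; k++) {
      start += lengths.get(k);
    }
    byte[] data = buffer.toByteArray();
    return new String(data, start, lengths.get(i), StandardCharsets.UTF_8);
  }

  public static void main(String[] args) {
    PackedKeys packed = new PackedKeys();
    packed.append("apple".getBytes(StandardCharsets.UTF_8));
    packed.append("banana".getBytes(StandardCharsets.UTF_8));
    System.out.println(packed.keyAt(1));   // prints: banana
  }
}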
- serializedBytes = output.getData(); - } - - @Override - public void setNextNonNullKey(int nonNullKeyPosition) { - serializedStart = bufferOffset; - serializedLength = serializedKeyLengths[nonNullKeyPosition]; - Preconditions.checkState(serializedStart + serializedLength <= output.getData().length); - bufferOffset += serializedLength; - } -} \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/VectorKeySeriesSingle.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/VectorKeySeriesSingle.java new file mode 100644 index 0000000..b2773db --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/VectorKeySeriesSingle.java @@ -0,0 +1,51 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.vector.keyseries; + +/** + * Interface for a single key VectorKeySeries. + */ +public interface VectorKeySeriesSingle extends VectorKeySeries { + + /** + * @return the number of keys in the series. Excludes series duplicates and + * includes the ALL NULL keys, too. + */ + int getKeyCount(); + + /** + * @return the number of keys key with at least one column value. + * The keys may have 0 or more NULLs -- but NOT ALL NULLS. + */ + int getNonAllNullKeyCount(); + + /** + * Advance the current key by a duplicate key count. + * If there are more than duplicateCount keys left in the current key, then + * we remain in the current key. + * @param duplicateCount + */ + void advance(int duplicateCount); + + /** + * Position the current non-empty key from the specified position. + * @param nonAllNullKeyPosition + */ + void setNextNonEmptyKey(int nonAllNullKeyPosition); +} \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/VectorKeySeriesSingleImpl.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/VectorKeySeriesSingleImpl.java index bf0a25b..b2d0deb 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/VectorKeySeriesSingleImpl.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/VectorKeySeriesSingleImpl.java @@ -28,53 +28,64 @@ * */ public abstract class VectorKeySeriesSingleImpl extends VectorKeySeriesImpl - implements VectorKeySeries { + implements VectorKeySeriesSingle { private static final Log LOG = LogFactory.getLog(VectorKeySeriesSingleImpl.class.getName()); protected int currentBatchSize; - // The number of keys (with sequential duplicates collapsed, both NULL and non-NULL) in the batch. - protected int seriesCount; + // The number of keys in the batch. With sequential duplicates collapsed and including + // ALL NULL keys. + protected int keyCount; // The current position in the key series. 
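/*
 * Illustration only, not part of the patch: the fields being renamed here (keyCount, keyPosition,
 * duplicateCounts, keyAllNulls, nonAllNullKeyCount) encode a batch as a run-length series of
 * adjacent equal keys, with ALL-NULL keys kept as runs of their own. A self-contained sketch of
 * that collapsing step for a single long column, following the same shape as
 * VectorKeySeriesLongBase.processBatch earlier in this diff; SeriesSketch is a hypothetical name.
 */
public final class SeriesSketch {
  public static void main(String[] args) {
    long[] vector = {7, 7, 7, 0, 0, 9};
    boolean[] isNull = {false, false, false, true, true, false};
    int batchSize = vector.length;

    int[] duplicateCounts = new int[batchSize];
    boolean[] keyAllNulls = new boolean[batchSize];
    long[] keys = new long[batchSize];          // only the non-ALL-NULL keys get saved

    int keyCount = 0;
    int nonAllNullKeyCount = 0;

    duplicateCounts[0] = 1;
    boolean prevKeyIsNull = isNull[0];
    long prevKey = 0;
    keyAllNulls[0] = prevKeyIsNull;
    if (!prevKeyIsNull) {
      prevKey = vector[0];
      keys[nonAllNullKeyCount++] = prevKey;
    }

    for (int i = 1; i < batchSize; i++) {
      if (isNull[i]) {
        if (prevKeyIsNull) {
          duplicateCounts[keyCount]++;
        } else {
          duplicateCounts[++keyCount] = 1;
          keyAllNulls[keyCount] = true;
          prevKeyIsNull = true;
        }
      } else if (!prevKeyIsNull && prevKey == vector[i]) {
        duplicateCounts[keyCount]++;
      } else {
        duplicateCounts[++keyCount] = 1;
        keyAllNulls[keyCount] = false;
        keys[nonAllNullKeyCount++] = vector[i];
        prevKeyIsNull = false;
        prevKey = vector[i];
      }
    }
    keyCount++;

    // Expected: 3 series -> key 7 x3, ALL-NULL x2, key 9 x1; nonAllNullKeyCount == 2.
    System.out.println("keyCount=" + keyCount + " nonAllNullKeyCount=" + nonAllNullKeyCount);
    for (int i = 0; i < keyCount; i++) {
      System.out.println((keyAllNulls[i] ? "NULL" : "key") + " x" + duplicateCounts[i]);
    }
  }
}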
- protected int seriesPosition; + protected int keyPosition; - // The number of duplicates for each series key (NULL or non-NULL). + // The number of duplicates for each series key. protected final int[] duplicateCounts; // Whether a series key is NULL. - protected final boolean[] seriesIsAllNull; + protected final boolean[] keyAllNulls; - // The number of non-NULL keys. They have associated hash codes and key data. - protected int nonNullKeyCount; + // The number of non ALL NULL keys. They have associated hash codes and key data. + protected int nonAllNullKeyCount; // The current non-NULL key position. - protected int nonNullKeyPosition; + protected int nonAllNullKeyPosition; // The hash code for each non-NULL key. protected final int[] hashCodes; - VectorKeySeriesSingleImpl() { + protected VectorKeySeriesSingleImpl() { super(); - seriesCount = 0; - seriesPosition = 0; + keyCount = 0; + keyPosition = 0; duplicateCounts = new int[VectorizedRowBatch.DEFAULT_SIZE]; - seriesIsAllNull = new boolean[VectorizedRowBatch.DEFAULT_SIZE]; + keyAllNulls = new boolean[VectorizedRowBatch.DEFAULT_SIZE]; - nonNullKeyCount = 0; - nonNullKeyPosition = -1; + nonAllNullKeyCount = 0; + nonAllNullKeyPosition = -1; hashCodes = new int[VectorizedRowBatch.DEFAULT_SIZE]; } + @Override + public int getKeyCount() { + return keyCount; + } + + @Override + public int getNonAllNullKeyCount() { + return nonAllNullKeyCount; + } + public boolean validate() { - Preconditions.checkState(seriesCount > 0); - Preconditions.checkState(seriesCount <= currentBatchSize); - Preconditions.checkState(nonNullKeyCount >= 0); - Preconditions.checkState(nonNullKeyCount <= seriesCount); + Preconditions.checkState(keyCount > 0); + Preconditions.checkState(keyCount <= currentBatchSize); + Preconditions.checkState(nonAllNullKeyCount >= 0); + Preconditions.checkState(nonAllNullKeyCount <= keyCount); validateDuplicateCount(); return true; @@ -83,7 +94,7 @@ public boolean validate() { private void validateDuplicateCount() { int sum = 0; int duplicateCount; - for (int i = 0; i < seriesCount; i++) { + for (int i = 0; i < keyCount; i++) { duplicateCount = duplicateCounts[i]; Preconditions.checkState(duplicateCount > 0); Preconditions.checkState(duplicateCount <= currentBatchSize); @@ -94,18 +105,18 @@ private void validateDuplicateCount() { @Override public void positionToFirst() { - seriesPosition = 0; + keyPosition = 0; currentLogical = 0; currentDuplicateCount = duplicateCounts[0]; - currentIsAllNull = seriesIsAllNull[0]; + currentKeyAllNull = keyAllNulls[0]; - if (!currentIsAllNull) { - nonNullKeyPosition = 0; + if (!currentKeyAllNull) { + nonAllNullKeyPosition = 0; currentHashCode = hashCodes[0]; - setNextNonNullKey(0); + setNextNonEmptyKey(0); } else { - nonNullKeyPosition = -1; + nonAllNullKeyPosition = -1; } Preconditions.checkState(currentDuplicateCount > 0); } @@ -119,40 +130,41 @@ public boolean next() { return false; } - Preconditions.checkState(seriesPosition + 1 < seriesCount); + Preconditions.checkState(keyPosition + 1 < keyCount); - seriesPosition++; - currentDuplicateCount = duplicateCounts[seriesPosition]; - currentIsAllNull = seriesIsAllNull[seriesPosition]; + keyPosition++; + currentDuplicateCount = duplicateCounts[keyPosition]; + currentKeyAllNull = keyAllNulls[keyPosition]; - if (!currentIsAllNull) { - Preconditions.checkState(nonNullKeyPosition + 1 < nonNullKeyCount); - nonNullKeyPosition++; - currentHashCode = hashCodes[nonNullKeyPosition]; - setNextNonNullKey(nonNullKeyPosition); + if (!currentKeyAllNull) { + 
Preconditions.checkState(nonAllNullKeyPosition + 1 < nonAllNullKeyCount); + nonAllNullKeyPosition++; + currentHashCode = hashCodes[nonAllNullKeyPosition]; + setNextNonEmptyKey(nonAllNullKeyPosition); } Preconditions.checkState(currentDuplicateCount > 0); return true; } // For use by VectorKeySeriesMulti so that the minimum equal key can be advanced. + @Override public void advance(int duplicateCount) { currentLogical += currentDuplicateCount; currentDuplicateCount -= duplicateCount; if (currentDuplicateCount == 0) { - seriesPosition++; - currentIsAllNull = seriesIsAllNull[seriesPosition]; - currentDuplicateCount = duplicateCounts[seriesPosition]; - - if (!currentIsAllNull) { - nonNullKeyPosition++; - currentHashCode = hashCodes[nonNullKeyPosition]; - setNextNonNullKey(nonNullKeyPosition); + keyPosition++; + currentKeyAllNull = keyAllNulls[keyPosition]; + currentDuplicateCount = duplicateCounts[keyPosition]; + + if (!currentKeyAllNull) { + nonAllNullKeyPosition++; + currentHashCode = hashCodes[nonAllNullKeyPosition]; + setNextNonEmptyKey(nonAllNullKeyPosition); } } } - protected abstract void setNextNonNullKey(int nonNullKeyPosition); + public abstract void setNextNonEmptyKey(int nonAllNullKeyPosition); } \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/fast/VectorKeySeriesBytesFast.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/fast/VectorKeySeriesBytesFast.java new file mode 100644 index 0000000..ce0f8bf --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/fast/VectorKeySeriesBytesFast.java @@ -0,0 +1,121 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.vector.keyseries.fast; + +import java.io.IOException; + +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; +import org.apache.hadoop.hive.ql.exec.vector.keyseries.VectorKeySeriesBytesBase; +import org.apache.hadoop.hive.serde2.ByteStream.Output; +import org.apache.hadoop.hive.serde2.fast.SerializeWrite; +import com.google.common.base.Preconditions; + +/** + * A key series of a single column of byte array keys where the keys get serialized using + * fast SerializeWrite style serialization. + */ +public class VectorKeySeriesBytesFast + extends VectorKeySeriesBytesBase implements VectorKeySeriesFast { + + private T serializeWrite; + + // The serialized (non-NULL) series keys. These 3 members represent the value. 
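/*
 * Illustration only, not part of the patch: each non-ALL-NULL key in these fast key-series classes
 * gets a hash code computed over its serialized byte slice (see
 * VectorKeySeriesFastUtil.computeSerializedHashCodes later in this diff), and the hash has to line
 * up with the murmur hash the reduce side used for key distribution. A small sketch of that
 * per-slice hashing loop, assuming Hive's common module is on the classpath; SliceHashSketch and
 * hashKeySlices are hypothetical names.
 */
import org.apache.hive.common.util.HashCodeUtil;

public final class SliceHashSketch {
  // bytes holds all serialized keys back to back; keyLengths[i] is the i-th key's byte length.
  public static int[] hashKeySlices(byte[] bytes, int[] keyLengths, int keyCount) {
    int[] hashCodes = new int[keyCount];
    int offset = 0;
    for (int i = 0; i < keyCount; i++) {
      hashCodes[i] = HashCodeUtil.murmurHash(bytes, offset, keyLengths[i]);
      offset += keyLengths[i];
    }
    return hashCodes;
  }
}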
+ private byte[] serializedBytes; + private int serializedStart; + private int serializedLength; + + private final Output output; + + private int outputOffset; + + private final int[] serializedKeyLengths; + + public VectorKeySeriesBytesFast(int columnNum, T serializeWrite) { + super(columnNum); + this.serializeWrite = serializeWrite; + output = new Output(); + serializedKeyLengths = new int[VectorizedRowBatch.DEFAULT_SIZE]; + } + + @Override + public byte[] getSerializedBytes() { + return serializedBytes; + } + + @Override + public int getSerializedStart() { + return serializedStart; + } + + @Override + public int getSerializedLength() { + return serializedLength; + } + + @Override + public void processBatch(VectorizedRowBatch batch) throws IOException { + outputOffset = 0; + output.reset(); + + super.processBatch(batch); + + Preconditions.checkState( + VectorKeySeriesFastUtil.validate(keyCount, nonAllNullKeyCount, keyAllNulls, + serializedKeyLengths, output.getLength())); + + if (nonAllNullKeyCount > 0) { + // Compute hash codes of fast serialized keys. + VectorKeySeriesFastUtil.computeSerializedHashCodes(output, serializedKeyLengths, + nonAllNullKeyCount, hashCodes); + } + + // Do the posiiton after we compute the checksums. + positionToFirst(); + } + + @Override + public void saveBytesKey(int nonAllNullKeyPosition, byte[] keyBytes, int keyByteStart, + int keyByteLength) throws IOException { + serializeWrite.setAppend(output); + serializeWrite.writeString(keyBytes, keyByteStart, keyByteLength); + int outputNewOffset = output.getLength(); + serializedKeyLengths[nonAllNullKeyPosition] = outputNewOffset - outputOffset; + outputOffset = outputNewOffset; + } + + @Override + public void positionToFirst() { + + // Reset this before calling positionToFirst. + outputOffset = 0; + + super.positionToFirst(); + + // This is constant for whole series. + serializedBytes = output.getData(); + } + + @Override + public void setNextNonEmptyKey(int nonAllNullKeyPosition) { + serializedStart = outputOffset; + serializedLength = serializedKeyLengths[nonAllNullKeyPosition]; + Preconditions.checkState(serializedStart + serializedLength <= output.getData().length); + outputOffset += serializedLength; + } +} diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/fast/VectorKeySeriesFast.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/fast/VectorKeySeriesFast.java new file mode 100644 index 0000000..53cbfff --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/fast/VectorKeySeriesFast.java @@ -0,0 +1,37 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.hive.ql.exec.vector.keyseries.fast; + +import org.apache.hadoop.hive.ql.exec.vector.keyseries.VectorKeySeries; + +/** + * An abstract adding to key series the fast SerializeWrite style serialization. + * + * A key with no or some nulls has serialized bytes, offset, and length. + */ +public interface VectorKeySeriesFast extends VectorKeySeries { + + /** + * @return the serialized bytes, start offset of the key in the + * bytes, and key byte length. + */ + byte[] getSerializedBytes(); + int getSerializedStart(); + int getSerializedLength(); +} \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/fast/VectorKeySeriesFastUtil.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/fast/VectorKeySeriesFastUtil.java new file mode 100644 index 0000000..7b14039 --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/fast/VectorKeySeriesFastUtil.java @@ -0,0 +1,87 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.vector.keyseries.fast; + +import java.util.Arrays; + +import org.apache.hadoop.hive.serde2.ByteStream.Output; +import org.apache.hive.common.util.HashCodeUtil; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.google.common.base.Preconditions; + +/** + * Implementation of base fast SerializeWrite style serialization interface. + * + */ +public class VectorKeySeriesFastUtil { + + private static final String CLASS_NAME = VectorKeySeriesFastUtil.class.getName(); + private static final Logger LOG = LoggerFactory.getLogger(CLASS_NAME); + + public static boolean validate(int keyCount, int nonAllNullKeyCount, boolean[] keyAllNulls, + int[] serializedKeyLengths, int outputUsedLength) { + + int nullCount = 0; + for (int i = 0; i < keyCount; i++) { + if (keyAllNulls[i]) { + nullCount++; + } + } + Preconditions.checkState(nullCount + nonAllNullKeyCount == keyCount); + + Preconditions.checkState(validateKeyLengthSum(nonAllNullKeyCount, serializedKeyLengths, outputUsedLength)); + + return true; + } + + public static boolean validateKeyLengthSum(int nonAllNullKeyCount, + int[] serializedKeyLengths, int outputUsedLength) { + int lengthSum = 0; + int keyLength; + for (int i = 0; i < nonAllNullKeyCount; i++) { + keyLength = serializedKeyLengths[i]; + Preconditions.checkState(keyLength > 0); + lengthSum += keyLength; + Preconditions.checkState(lengthSum <= outputUsedLength); + } + return true; + } + + /** + * Batch compute the hash codes for all the serialized keys. + * + * NOTE: MAJOR MAJOR ASSUMPTION: + * We assume that HashCodeUtil.murmurHash produces the same result + * as MurmurHash.hash with seed = 0 (the method used by ReduceSinkOperator for + * UNIFORM distribution). 
+ */ + protected static void computeSerializedHashCodes(Output output, int[] serializedKeyLengths, + int nonAllNullKeyCount, int[] hashCodes) { + int offset = 0; + int keyLength; + byte[] bytes = output.getData(); + for (int i = 0; i < nonAllNullKeyCount; i++) { + keyLength = serializedKeyLengths[i]; + hashCodes[i] = HashCodeUtil.murmurHash(bytes, offset, keyLength); + offset += keyLength; + } + } +} \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/fast/VectorKeySeriesLongFast.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/fast/VectorKeySeriesLongFast.java new file mode 100644 index 0000000..2120a2d --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/fast/VectorKeySeriesLongFast.java @@ -0,0 +1,147 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.vector.keyseries.fast; + +import java.io.IOException; +import java.util.Arrays; + +import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; +import org.apache.hadoop.hive.ql.exec.vector.keyseries.VectorKeySeriesBytesBase; +import org.apache.hadoop.hive.ql.exec.vector.keyseries.VectorKeySeriesLongBase; +import org.apache.hadoop.hive.serde2.ByteStream.Output; +import org.apache.hadoop.hive.serde2.fast.SerializeWrite; +import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory; +import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; + +import com.google.common.base.Preconditions; + +/** + * A key series of a single column of long keys where the keys get serialized using + * fast SerializeWrite style serialization + */ +public class VectorKeySeriesLongFast + extends VectorKeySeriesLongBase implements VectorKeySeriesFast { + + private T serializeWrite; + + // The serialized (non-NULL) series keys. These 3 members represent the value. 
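/*
 * Illustration only, not part of the patch: saveLongKey below narrows the batch's long value to the
 * column's declared primitive type before serializing, so a SMALLINT key and a BIGINT key are
 * written with different widths. A JDK-only analogue of that dispatch; the real code goes through
 * SerializeWrite, so the byte layout here is not Hive's, and WidthDispatchSketch is a hypothetical
 * name.
 */
import java.io.ByteArrayOutputStream;
import java.io.DataOutputStream;
import java.io.IOException;

public final class WidthDispatchSketch {
  enum Category { BOOLEAN, BYTE, SHORT, INT, LONG }

  // Write one long key using the width implied by its primitive category.
  static void writeKey(DataOutputStream out, Category category, long key) throws IOException {
    switch (category) {
      case BOOLEAN: out.writeBoolean(key != 0); break;
      case BYTE:    out.writeByte((byte) key);  break;
      case SHORT:   out.writeShort((short) key); break;
      case INT:     out.writeInt((int) key);    break;
      case LONG:    out.writeLong(key);         break;
      default: throw new IllegalStateException("Unexpected category " + category);
    }
  }

  public static void main(String[] args) throws IOException {
    ByteArrayOutputStream bytes = new ByteArrayOutputStream();
    DataOutputStream out = new DataOutputStream(bytes);
    writeKey(out, Category.SHORT, 42L);   // 2 bytes
    writeKey(out, Category.LONG, 42L);    // 8 bytes
    out.flush();
    System.out.println("total bytes written: " + bytes.size());   // 10
  }
}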
+ private byte[] serializedBytes; + private int serializedStart; + private int serializedLength; + + private final Output output; + + private int outputOffset; + + private final int[] serializedKeyLengths; + + public VectorKeySeriesLongFast(int columnNum, PrimitiveTypeInfo primitiveTypeInfo, + T serializeWrite) { + super(columnNum, primitiveTypeInfo); + this.serializeWrite = serializeWrite; + output = new Output(); + serializedKeyLengths = new int[VectorizedRowBatch.DEFAULT_SIZE]; + } + + + @Override + public byte[] getSerializedBytes() { + return serializedBytes; + } + + @Override + public int getSerializedStart() { + return serializedStart; + } + + @Override + public int getSerializedLength() { + return serializedLength; + } + + @Override + public void processBatch(VectorizedRowBatch batch) throws IOException { + outputOffset = 0; + output.reset(); + + super.processBatch(batch); + + Preconditions.checkState( + VectorKeySeriesFastUtil.validate(keyCount, nonAllNullKeyCount, keyAllNulls, + serializedKeyLengths, output.getLength())); + + if (nonAllNullKeyCount > 0) { + // Compute hash codes of fast serialized keys. + VectorKeySeriesFastUtil.computeSerializedHashCodes(output, serializedKeyLengths, + nonAllNullKeyCount, hashCodes); + } + + // Do the posiiton after we compute the checksums. + positionToFirst(); + } + + protected void saveLongKey(int nonAllNullKeyPosition, long key) throws IOException { + serializeWrite.setAppend(output); + + // UNDONE: Add support for DATE, TIMESTAMP, INTERVAL_YEAR_MONTH, INTERVAL_DAY_TIME... + switch (primitiveCategory) { + case BOOLEAN: + serializeWrite.writeBoolean(key != 0); + break; + case BYTE: + serializeWrite.writeByte((byte) key); + break; + case SHORT: + serializeWrite.writeShort((short) key); + break; + case INT: + serializeWrite.writeInt((int) key); + break; + case LONG: + serializeWrite.writeLong(key); + break; + default: + throw new RuntimeException("Unexpected primitive category " + primitiveCategory.name()); + } + int outputNewPosition = output.getLength(); + serializedKeyLengths[nonAllNullKeyPosition] = outputNewPosition - outputOffset; + outputOffset = outputNewPosition; + } + + @Override + public void positionToFirst() { + + // Reset this before calling positionToFirst. + outputOffset = 0; + + super.positionToFirst(); + + // This is constant for whole series. + serializedBytes = output.getData(); + } + + @Override + public void setNextNonEmptyKey(int nonAllNullKeyPosition) { + serializedStart = outputOffset; + serializedLength = serializedKeyLengths[nonAllNullKeyPosition]; + Preconditions.checkState(serializedStart + serializedLength <= output.getData().length); + outputOffset += serializedLength; + } +} diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/fast/VectorKeySeriesMultiFast.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/fast/VectorKeySeriesMultiFast.java new file mode 100644 index 0000000..bc4356c --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/fast/VectorKeySeriesMultiFast.java @@ -0,0 +1,220 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.vector.keyseries.fast; + +import java.io.IOException; +import java.util.Arrays; + +import org.apache.hadoop.hive.ql.exec.vector.VectorSerializeRow; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; +import org.apache.hadoop.hive.ql.exec.vector.expressions.StringExpr; +import org.apache.hadoop.hive.ql.exec.vector.keyseries.VectorKeySeriesMultiBase; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.serde2.ByteStream.Output; +import org.apache.hadoop.hive.serde2.fast.SerializeWrite; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.google.common.base.Preconditions; + +/** + * A key series of a multiple columns of keys where the keys get serialized using + * fast SerializeWrite style serialization. (Or, it can be 1 column). + */ +public class VectorKeySeriesMultiFast + extends VectorKeySeriesMultiBase implements VectorKeySeriesFast { + + private static final String CLASS_NAME = VectorKeySeriesMultiFast.class.getName(); + private static final Logger LOG = LoggerFactory.getLogger(CLASS_NAME); + + private final T serializeWrite; + + private VectorSerializeRow keySerializeRow; + + // The serialized (non-NULL) series keys. These 3 members represent the value. + private int serializedStart; + private int serializedLength; + private byte[] serializedBytes; + + private final Output output; + + private int currentKeyOffset; + + private final int[] serializedKeyLengths; + + public VectorKeySeriesMultiFast(T serializeWrite) { + this.serializeWrite = serializeWrite; + output = new Output(); + serializedKeyLengths = new int[VectorizedRowBatch.DEFAULT_SIZE]; + } + + public void init(TypeInfo[] typeInfos, int[] columnNums) throws HiveException { + keySerializeRow = new VectorSerializeRow(serializeWrite); + keySerializeRow.init(typeInfos, columnNums); + } + + @Override + public byte[] getSerializedBytes() { + return serializedBytes; + } + + @Override + public int getSerializedStart() { + return serializedStart; + } + + @Override + public int getSerializedLength() { + return serializedLength; + } + + @Override + public void processBatch(VectorizedRowBatch batch) throws IOException { + currentKeyOffset = 0; + output.reset(); + + super.processBatch(batch); + + Preconditions.checkState( + VectorKeySeriesFastUtil.validate(keyCount, nonAllNullKeyCount, keyAllNulls, + serializedKeyLengths, output.getLength())); + + if (nonAllNullKeyCount > 0) { + // Compute hash codes of fast serialized keys. + VectorKeySeriesFastUtil.computeSerializedHashCodes(output, serializedKeyLengths, + nonAllNullKeyCount, hashCodes); + } + + // Do the position after we compute the checksums. 
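/*
 * Illustration only, not part of the patch: the writeMultiKey / equalsPrevKey / forgetKey overrides
 * just below append the candidate key's serialized form to the shared output, compare that fresh
 * slice against the previous key's slice, and roll the write position back when they match so only
 * the duplicate count grows. A JDK-only sketch of that append / compare / roll-back cycle;
 * DuplicateRollbackSketch and appendKey are hypothetical names.
 */
import java.util.Arrays;

public final class DuplicateRollbackSketch {
  private final byte[] buffer = new byte[1 << 16];
  private int writePosition;          // analogous to output.getLength()
  private int prevKeyStart;
  private int prevKeyLength;
  private int duplicateCount;

  // Append a key; if it is byte-identical to the previous key, forget it and count a duplicate.
  public void appendKey(byte[] key) {
    int keyStart = writePosition;
    System.arraycopy(key, 0, buffer, writePosition, key.length);
    writePosition += key.length;

    boolean sameAsPrev = prevKeyLength == key.length
        && Arrays.equals(
            Arrays.copyOfRange(buffer, prevKeyStart, prevKeyStart + prevKeyLength),
            Arrays.copyOfRange(buffer, keyStart, keyStart + key.length));
    if (sameAsPrev) {
      writePosition = keyStart;       // forgetKey: roll back, like output.setWritePosition(...)
      duplicateCount++;
    } else {
      prevKeyStart = keyStart;
      prevKeyLength = key.length;
    }
  }

  public static void main(String[] args) {
    DuplicateRollbackSketch sketch = new DuplicateRollbackSketch();
    sketch.appendKey(new byte[] {1, 2, 3});
    sketch.appendKey(new byte[] {1, 2, 3});   // duplicate, rolled back
    sketch.appendKey(new byte[] {9});
    System.out.println("bytes kept=" + sketch.writePosition
        + " duplicates=" + sketch.duplicateCount);   // bytes kept=4 duplicates=1
  }
}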
+ positionToFirst(); + } + + @Override + protected void writeMultiKey(VectorizedRowBatch batch, int index, int nonAllNullKeyCount) + throws IOException { + currentKeyOffset = output.getLength(); + keySerializeRow.setOutputAppend(output); + keySerializeRow.serializeWrite(batch, index); + if (keySerializeRow.getIsAllNulls()) { + output.setWritePosition(currentKeyOffset); + return; + } + + serializedKeyLengths[nonAllNullKeyCount] = output.getLength() - currentKeyOffset; + // Preconditions.checkState( + // VectorKeySeriesFastUtil.validateKeyLengthSum(nonAllNullKeyCount + 1, serializedKeyLengths, + // output.getLength())); + } + + @Override + protected boolean isAllNulls() { + return keySerializeRow.getIsAllNulls(); + } + + @Override + protected boolean hasAnyNulls() { + return keySerializeRow.getHasAnyNulls(); + } + + @Override + protected boolean equalsPrevKey(int nonAllNullKeyCount) { + int prevKeyLength = serializedKeyLengths[nonAllNullKeyCount - 1]; + int keyLength = serializedKeyLengths[nonAllNullKeyCount]; + byte[] bytes = output.getData(); + boolean result = + StringExpr.equal(bytes, currentKeyOffset - prevKeyLength, prevKeyLength, + bytes, currentKeyOffset, keyLength); + return result; + } + + @Override + protected void forgetKey() { + output.setWritePosition(currentKeyOffset); + } + + @Override + public void positionToFirst() { + + // Reset this before calling positionToFirst. + currentKeyOffset = 0; + + super.positionToFirst(); + + // This is constant for whole series. + serializedBytes = output.getData(); + } + + @Override + public void setNextNonEmptyKey(int nonAllNullKeyPosition) { + serializedStart = currentKeyOffset; + serializedLength = serializedKeyLengths[nonAllNullKeyPosition]; + Preconditions.checkState(serializedStart + serializedLength <= output.getData().length); + currentKeyOffset += serializedLength; + currentHasAnyNulls = hasAnyNulls[nonAllNullKeyPosition]; + } + + @Override + public String toString() { + StringBuilder sb = new StringBuilder(); + sb.append(CLASS_NAME); + sb.append(" "); + sb.append("keyCount "); + sb.append(keyCount); + sb.append(" nonAllNullKeyCount "); + sb.append(nonAllNullKeyCount); + sb.append("\n"); + int logical = 0; + int nonNullIndex = 0; + int duplicateCount; + boolean isAllNull; + boolean hasAnyNull; + int keyOffset = 0; + int keyLength; + int hashCode; + byte[] bytes = output.getData(); + for (int i = 0; i < keyCount; i++) { + sb.append(logical); + sb.append(" "); + duplicateCount = duplicateCounts[i]; + isAllNull = keyAllNulls[i]; + if (isAllNull) { + sb.append("NULL "); + } else { + keyLength = serializedKeyLengths[nonNullIndex]; + sb.append(displayBytes(bytes, keyOffset, keyLength)); + keyOffset += keyLength; + hashCode = hashCodes[nonNullIndex]; + sb.append(" hashCode "); + sb.append(hashCode); + hasAnyNull = hasAnyNulls[nonNullIndex]; + sb.append(" hasAnyNull "); + sb.append(hasAnyNull); + nonNullIndex++; + } + if (duplicateCount > 1) { + sb.append(" repeat "); + sb.append(duplicateCount); + } + sb.append("\n"); + logical += duplicateCount; + } + return sb.toString(); + } +} diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinCommonOperator.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinCommonOperator.java index 8ad7ca4..4933af9 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinCommonOperator.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinCommonOperator.java @@ -22,6 +22,7 @@ import java.util.Arrays; import 
java.util.List; import java.util.Map; + import org.apache.commons.lang.ArrayUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -32,6 +33,9 @@ import org.apache.hadoop.hive.ql.exec.HashTableLoader; import org.apache.hadoop.hive.ql.exec.MapJoinOperator; import org.apache.hadoop.hive.ql.exec.Utilities; +import org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainer; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableFactory; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableFind; import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; import org.apache.hadoop.hive.ql.exec.vector.VectorColumnMapping; import org.apache.hadoop.hive.ql.exec.vector.VectorColumnOutputMapping; @@ -43,19 +47,14 @@ import org.apache.hadoop.hive.ql.exec.vector.VectorizationContextRegion; import org.apache.hadoop.hive.ql.exec.vector.VectorizedBatchUtil; import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; -import org.apache.hadoop.hive.ql.exec.vector.expressions.IdentityExpression; import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpression; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.optimized.VectorMapJoinOptimizedCreateHashTable; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashTable; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinTableContainer; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.fast.VectorMapJoinFastHashTableLoader; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.plan.BaseWork; import org.apache.hadoop.hive.ql.plan.ExprNodeDesc; import org.apache.hadoop.hive.ql.plan.MapJoinDesc; import org.apache.hadoop.hive.ql.plan.OperatorDesc; import org.apache.hadoop.hive.ql.plan.VectorMapJoinDesc; -import org.apache.hadoop.hive.ql.plan.VectorMapJoinDesc.HashTableImplementationType; +import org.apache.hadoop.hive.ql.plan.VectorMapJoinInfo; import org.apache.hadoop.hive.ql.plan.api.OperatorType; import org.apache.hadoop.hive.serde2.lazybinary.fast.LazyBinaryDeserializeRead; import org.apache.hadoop.hive.serde2.objectinspector.StructField; @@ -63,6 +62,8 @@ import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils; +import com.google.common.base.Preconditions; + /** * This class is common operator class for native vectorized map join. * @@ -72,7 +73,43 @@ */ public abstract class VectorMapJoinCommonOperator extends MapJoinOperator implements VectorizationContextRegion { private static final long serialVersionUID = 1L; - private static final Logger LOG = LoggerFactory.getLogger(VectorMapJoinCommonOperator.class.getName()); + + //------------------------------------------------------------------------------------------------ + + private static final String CLASS_NAME = VectorMapJoinCommonOperator.class.getName(); +private static final Logger LOG = LoggerFactory.getLogger(CLASS_NAME); + + protected abstract String getLoggingPrefix(); + + // For debug tracing: information about the map or reduce task, operator, operator class, etc. + protected transient String loggingPrefix; + + protected String getLoggingPrefix(String className) { + if (loggingPrefix == null) { + initLoggingPrefix(className); + } + return loggingPrefix; + } + + protected void initLoggingPrefix(String className) { + if (hconf == null) { + // Constructor time... 
+ loggingPrefix = className; + } else { + // Determine the name of our map or reduce task for debug tracing. + BaseWork work = Utilities.getMapWork(hconf); + if (work == null) { + work = Utilities.getReduceWork(hconf); + } + loggingPrefix = className + " " + work.getName() + " " + getOperatorId(); + } + } + + //------------------------------------------------------------------------------------------------ + + protected VectorMapJoinDesc vectorDesc; + + protected VectorMapJoinInfo vectorMapJoinInfo; // Whether this operator is an outer join. protected boolean isOuterJoin; @@ -88,10 +125,10 @@ // a mixture of input big table columns and new scratch columns. protected VectorizationContext vOutContext; - // The output column projection of the vectorized row batch. And, the type names of the output + // The output column projection of the vectorized row batch. And, the type infos of the output // columns. protected int[] outputProjection; - protected String[] outputTypeNames; + protected TypeInfo[] outputTypeInfos; // These are the vectorized batch expressions for filtering, key expressions, and value // expressions. @@ -101,15 +138,17 @@ // This is map of which vectorized row batch columns are the big table key columns. Since // we may have key expressions that produce new scratch columns, we need a mapping. - // And, we have their type names. + // And, we have their type infos. protected int[] bigTableKeyColumnMap; - protected ArrayList bigTableKeyTypeNames; + protected String[] bigTableKeyColumnNames; + protected TypeInfo[] bigTableKeyTypeInfos; // Similarly, this is map of which vectorized row batch columns are the big table value columns. // Since we may have value expressions that produce new scratch columns, we need a mapping. - // And, we have their type names. + // And, we have their type infos. protected int[] bigTableValueColumnMap; - protected ArrayList bigTableValueTypeNames; + protected String[] bigTableValueColumnNames; + protected TypeInfo[] bigTableValueTypeInfos; // This is a mapping of which big table columns (input and key/value expressions) will be // part of the big table portion of the join output result. @@ -124,6 +163,8 @@ // to output batch scratch columns for the small table portion. protected VectorColumnSourceMapping smallTableMapping; + protected VectorColumnSourceMapping projectionMapping; + // These are the output columns for the small table and the outer small table keys. protected int[] smallTableOutputVectorColumns; protected int[] bigTableOuterKeyOutputVectorColumns; @@ -137,9 +178,6 @@ // transient. //--------------------------------------------------------------------------- - // For debug tracing: the name of the map or reduce task. - protected transient String taskName; - // The threshold where we should use a repeating vectorized row batch optimization for // generating join output results. protected transient boolean useOverflowRepeatedThreshold; @@ -175,7 +213,12 @@ protected transient boolean needHashTableSetup; // The small table hash table for the native vectorized map join operator. - protected transient VectorMapJoinHashTable vectorMapJoinHashTable; + protected transient MapJoinHashTableFind vectorMapJoinHashTableFind; + + // The map join hash table factory for create hash table results. + protected transient MapJoinHashTableFactory vectormapJoinHashTableFactory; + + protected transient long totalNumSmallTableKeys; /** Kryo ctor. 
*/ protected VectorMapJoinCommonOperator() { @@ -192,6 +235,9 @@ public VectorMapJoinCommonOperator(CompilationOpContext ctx, MapJoinDesc desc = (MapJoinDesc) conf; this.conf = desc; + vectorDesc = desc.getVectorDesc(); + vectorMapJoinInfo = vectorDesc.getVectorMapJoinInfo(); + Preconditions.checkState(vectorMapJoinInfo != null); this.vContext = vContext; @@ -210,214 +256,28 @@ public VectorMapJoinCommonOperator(CompilationOpContext ctx, bigTableFilterExpressions = vContext.getVectorExpressions(filterExpressions.get(posBigTable), VectorExpressionDescriptor.Mode.FILTER); - List keyDesc = desc.getKeys().get(posBigTable); - bigTableKeyExpressions = vContext.getVectorExpressions(keyDesc); - - // Since a key expression can be a calculation and the key will go into a scratch column, - // we need the mapping and type information. - bigTableKeyColumnMap = new int[bigTableKeyExpressions.length]; - bigTableKeyTypeNames = new ArrayList(); - boolean onlyColumns = true; - for (int i = 0; i < bigTableKeyColumnMap.length; i++) { - VectorExpression ve = bigTableKeyExpressions[i]; - if (!IdentityExpression.isColumnOnly(ve)) { - onlyColumns = false; - } - bigTableKeyTypeNames.add(keyDesc.get(i).getTypeString()); - bigTableKeyColumnMap[i] = ve.getOutputColumn(); - } - if (onlyColumns) { - bigTableKeyExpressions = null; - } - - List bigTableExprs = desc.getExprs().get(posBigTable); - bigTableValueExpressions = vContext.getVectorExpressions(bigTableExprs); - - /* - * Similarly, we need a mapping since a value expression can be a calculation and the value - * will go into a scratch column. - */ - bigTableValueColumnMap = new int[bigTableValueExpressions.length]; - bigTableValueTypeNames = new ArrayList(); - onlyColumns = true; - for (int i = 0; i < bigTableValueColumnMap.length; i++) { - VectorExpression ve = bigTableValueExpressions[i]; - if (!IdentityExpression.isColumnOnly(ve)) { - onlyColumns = false; - } - bigTableValueTypeNames.add(bigTableExprs.get(i).getTypeString()); - bigTableValueColumnMap[i] = ve.getOutputColumn(); - } - if (onlyColumns) { - bigTableValueExpressions = null; - } - - determineCommonInfo(isOuterJoin); - } - - protected void determineCommonInfo(boolean isOuter) throws HiveException { - - bigTableRetainedMapping = new VectorColumnOutputMapping("Big Table Retained Mapping"); - - bigTableOuterKeyMapping = new VectorColumnOutputMapping("Big Table Outer Key Mapping"); - - // The order of the fields in the LazyBinary small table value must be used, so - // we use the source ordering flavor for the mapping. - smallTableMapping = new VectorColumnSourceMapping("Small Table Mapping"); - - // We use a mapping object here so we can build the projection in any order and - // get the ordered by 0 to n-1 output columns at the end. - // - // Also, to avoid copying a big table key into the small table result area for inner joins, - // we reference it with the projection so there can be duplicate output columns - // in the projection. - VectorColumnSourceMapping projectionMapping = new VectorColumnSourceMapping("Projection Mapping"); - - /* - * Gather up big and small table output result information from the MapJoinDesc. 
- */ - List bigTableRetainList = conf.getRetainList().get(posBigTable); - int bigTableRetainSize = bigTableRetainList.size(); - - int[] smallTableIndices; - int smallTableIndicesSize; - List smallTableExprs = conf.getExprs().get(posSingleVectorMapJoinSmallTable); - if (conf.getValueIndices() != null && conf.getValueIndices().get(posSingleVectorMapJoinSmallTable) != null) { - smallTableIndices = conf.getValueIndices().get(posSingleVectorMapJoinSmallTable); - smallTableIndicesSize = smallTableIndices.length; - } else { - smallTableIndices = null; - smallTableIndicesSize = 0; - } - - List smallTableRetainList = conf.getRetainList().get(posSingleVectorMapJoinSmallTable); - int smallTableRetainSize = smallTableRetainList.size(); - - int smallTableResultSize = 0; - if (smallTableIndicesSize > 0) { - smallTableResultSize = smallTableIndicesSize; - } else if (smallTableRetainSize > 0) { - smallTableResultSize = smallTableRetainSize; - } - - /* - * Determine the big table retained mapping first so we can optimize out (with - * projection) copying inner join big table keys in the subsequent small table results section. - */ - int nextOutputColumn = (order[0] == posBigTable ? 0 : smallTableResultSize); - for (int i = 0; i < bigTableRetainSize; i++) { - - // Since bigTableValueExpressions may do a calculation and produce a scratch column, we - // need to map to the right batch column. - - int retainColumn = bigTableRetainList.get(i); - int batchColumnIndex = bigTableValueColumnMap[retainColumn]; - String typeName = bigTableValueTypeNames.get(i); + bigTableKeyColumnMap = vectorMapJoinInfo.getBigTableKeyColumnMap(); + bigTableKeyColumnNames = vectorMapJoinInfo.getBigTableKeyColumnNames(); + bigTableKeyTypeInfos = vectorMapJoinInfo.getBigTableKeyTypeInfos(); + bigTableKeyExpressions = vectorMapJoinInfo.getBigTableKeyExpressions(); - // With this map we project the big table batch to make it look like an output batch. - projectionMapping.add(nextOutputColumn, batchColumnIndex, typeName); + bigTableValueColumnMap = vectorMapJoinInfo.getBigTableValueColumnMap(); + bigTableValueColumnNames = vectorMapJoinInfo.getBigTableValueColumnNames(); + bigTableValueTypeInfos = vectorMapJoinInfo.getBigTableValueTypeInfos(); + bigTableValueExpressions = vectorMapJoinInfo.getBigTableValueExpressions(); - // Collect columns we copy from the big table batch to the overflow batch. - if (!bigTableRetainedMapping.containsOutputColumn(batchColumnIndex)) { - // Tolerate repeated use of a big table column. - bigTableRetainedMapping.add(batchColumnIndex, batchColumnIndex, typeName); - } - - nextOutputColumn++; - } - - /* - * Now determine the small table results. - */ - int firstSmallTableOutputColumn; - firstSmallTableOutputColumn = (order[0] == posBigTable ? bigTableRetainSize : 0); - int smallTableOutputCount = 0; - nextOutputColumn = firstSmallTableOutputColumn; - - // Small table indices has more information (i.e. keys) than retain, so use it if it exists... - if (smallTableIndicesSize > 0) { - smallTableOutputCount = smallTableIndicesSize; - - for (int i = 0; i < smallTableIndicesSize; i++) { - if (smallTableIndices[i] >= 0) { - - // Zero and above numbers indicate a big table key is needed for - // small table result "area". - - int keyIndex = smallTableIndices[i]; - - // Since bigTableKeyExpressions may do a calculation and produce a scratch column, we - // need to map the right column. 
- int batchKeyColumn = bigTableKeyColumnMap[keyIndex]; - String typeName = bigTableKeyTypeNames.get(keyIndex); - - if (!isOuter) { - - // Optimize inner join keys of small table results. - - // Project the big table key into the small table result "area". - projectionMapping.add(nextOutputColumn, batchKeyColumn, typeName); - - if (!bigTableRetainedMapping.containsOutputColumn(batchKeyColumn)) { - // If necessary, copy the big table key into the overflow batch's small table - // result "area". - bigTableRetainedMapping.add(batchKeyColumn, batchKeyColumn, typeName); - } - } else { + bigTableRetainedMapping = vectorMapJoinInfo.getBigTableRetainedMapping(); - // For outer joins, since the small table key can be null when there is no match, - // we must have a physical (scratch) column for those keys. We cannot use the - // projection optimization used by inner joins above. + bigTableOuterKeyMapping = vectorMapJoinInfo.getBigTableOuterKeyMapping(); - int scratchColumn = vOutContext.allocateScratchColumn(typeName); - projectionMapping.add(nextOutputColumn, scratchColumn, typeName); - - bigTableRetainedMapping.add(batchKeyColumn, scratchColumn, typeName); - - bigTableOuterKeyMapping.add(batchKeyColumn, scratchColumn, typeName); - } - } else { - - // Negative numbers indicate a column to be (deserialize) read from the small table's - // LazyBinary value row. - int smallTableValueIndex = -smallTableIndices[i] - 1; - - String typeName = smallTableExprs.get(i).getTypeString(); - - // Make a new big table scratch column for the small table value. - int scratchColumn = vOutContext.allocateScratchColumn(typeName); - projectionMapping.add(nextOutputColumn, scratchColumn, typeName); - - smallTableMapping.add(smallTableValueIndex, scratchColumn, typeName); - } - nextOutputColumn++; - } - } else if (smallTableRetainSize > 0) { - smallTableOutputCount = smallTableRetainSize; - - // Only small table values appear in join output result. - - for (int i = 0; i < smallTableRetainSize; i++) { - int smallTableValueIndex = smallTableRetainList.get(i); - - // Make a new big table scratch column for the small table value. - String typeName = smallTableExprs.get(i).getTypeString(); - int scratchColumn = vOutContext.allocateScratchColumn(typeName); - - projectionMapping.add(nextOutputColumn, scratchColumn, typeName); - - smallTableMapping.add(smallTableValueIndex, scratchColumn, typeName); - nextOutputColumn++; - } - } + smallTableMapping = vectorMapJoinInfo.getSmallTableMapping(); - // Convert dynamic arrays and maps to simple arrays. + projectionMapping = vectorMapJoinInfo.getProjectionMapping(); - bigTableRetainedMapping.finalize(); - - bigTableOuterKeyMapping.finalize(); + determineCommonInfo(isOuterJoin); + } - smallTableMapping.finalize(); + protected void determineCommonInfo(boolean isOuter) throws HiveException { bigTableOuterKeyOutputVectorColumns = bigTableOuterKeyMapping.getOutputColumns(); smallTableOutputVectorColumns = smallTableMapping.getOutputColumns(); @@ -429,46 +289,37 @@ protected void determineCommonInfo(boolean isOuter) throws HiveException { smallTableByteColumnVectorColumns = getByteColumnVectorColumns(smallTableMapping); - projectionMapping.finalize(); - - // Verify we added an entry for each output. 
- assert projectionMapping.isSourceSequenceGood(); - outputProjection = projectionMapping.getOutputColumns(); - outputTypeNames = projectionMapping.getTypeNames(); + outputTypeInfos = projectionMapping.getTypeInfos(); if (isLogDebugEnabled) { int[] orderDisplayable = new int[order.length]; for (int i = 0; i < order.length; i++) { orderDisplayable[i] = (int) order[i]; } - LOG.debug(taskName + ", " + getOperatorId() + " VectorMapJoinCommonOperator constructor order " + Arrays.toString(orderDisplayable)); - LOG.debug(taskName + ", " + getOperatorId() + " VectorMapJoinCommonOperator constructor posBigTable " + (int) posBigTable); - LOG.debug(taskName + ", " + getOperatorId() + " VectorMapJoinCommonOperator constructor posSingleVectorMapJoinSmallTable " + (int) posSingleVectorMapJoinSmallTable); - - LOG.debug(taskName + ", " + getOperatorId() + " VectorMapJoinCommonOperator constructor bigTableKeyColumnMap " + Arrays.toString(bigTableKeyColumnMap)); - LOG.debug(taskName + ", " + getOperatorId() + " VectorMapJoinCommonOperator constructor bigTableKeyTypeNames " + bigTableKeyTypeNames); + LOG.debug(getLoggingPrefix() + " VectorMapJoinCommonOperator constructor order " + Arrays.toString(orderDisplayable)); + LOG.debug(getLoggingPrefix() + " VectorMapJoinCommonOperator constructor posBigTable " + (int) posBigTable); + LOG.debug(getLoggingPrefix() + " VectorMapJoinCommonOperator constructor posSingleVectorMapJoinSmallTable " + (int) posSingleVectorMapJoinSmallTable); - LOG.debug(taskName + ", " + getOperatorId() + " VectorMapJoinCommonOperator constructor bigTableValueColumnMap " + Arrays.toString(bigTableValueColumnMap)); - LOG.debug(taskName + ", " + getOperatorId() + " VectorMapJoinCommonOperator constructor bigTableValueTypeNames " + bigTableValueTypeNames); + LOG.debug(getLoggingPrefix() + " VectorMapJoinCommonOperator constructor bigTableKeyColumnMap " + Arrays.toString(bigTableKeyColumnMap)); + LOG.debug(getLoggingPrefix() + " VectorMapJoinCommonOperator constructor bigTableKeyColumnNames " + Arrays.toString(bigTableKeyColumnNames)); + LOG.debug(getLoggingPrefix() + " VectorMapJoinCommonOperator constructor bigTableKeyTypeInfos " + Arrays.toString(bigTableKeyTypeInfos)); - LOG.debug(taskName + ", " + getOperatorId() + " VectorMapJoinCommonOperator constructor smallTableIndices " + Arrays.toString(smallTableIndices)); - LOG.debug(taskName + ", " + getOperatorId() + " VectorMapJoinCommonOperator constructor smallTableRetainList " + smallTableRetainList); + LOG.debug(getLoggingPrefix() + " VectorMapJoinCommonOperator constructor bigTableValueColumnMap " + Arrays.toString(bigTableValueColumnMap)); + LOG.debug(getLoggingPrefix() + " VectorMapJoinCommonOperator constructor bigTableValueColumnNames " + Arrays.toString(bigTableValueColumnNames)); + LOG.debug(getLoggingPrefix() + " VectorMapJoinCommonOperator constructor bigTableValueTypeNames " + Arrays.toString(bigTableValueTypeInfos)); - LOG.debug(taskName + ", " + getOperatorId() + " VectorMapJoinCommonOperator constructor firstSmallTableOutputColumn " + firstSmallTableOutputColumn); - LOG.debug(taskName + ", " + getOperatorId() + " VectorMapJoinCommonOperator constructor smallTableOutputCount " + smallTableOutputCount); + LOG.debug(getLoggingPrefix() + " VectorMapJoinCommonOperator constructor bigTableRetainedMapping " + bigTableRetainedMapping.toString()); - LOG.debug(taskName + ", " + getOperatorId() + " VectorMapJoinCommonOperator constructor bigTableRetainedMapping " + bigTableRetainedMapping.toString()); + LOG.debug(getLoggingPrefix() + " 
VectorMapJoinCommonOperator constructor bigTableOuterKeyMapping " + bigTableOuterKeyMapping.toString()); - LOG.debug(taskName + ", " + getOperatorId() + " VectorMapJoinCommonOperator constructor bigTableOuterKeyMapping " + bigTableOuterKeyMapping.toString()); + LOG.debug(getLoggingPrefix() + " VectorMapJoinCommonOperator constructor smallTableMapping " + smallTableMapping.toString()); - LOG.debug(taskName + ", " + getOperatorId() + " VectorMapJoinCommonOperator constructor smallTableMapping " + smallTableMapping.toString()); + LOG.debug(getLoggingPrefix() + " VectorMapJoinCommonOperator constructor bigTableByteColumnVectorColumns " + Arrays.toString(bigTableByteColumnVectorColumns)); + LOG.debug(getLoggingPrefix() + " VectorMapJoinCommonOperator constructor smallTableByteColumnVectorColumns " + Arrays.toString(smallTableByteColumnVectorColumns)); - LOG.debug(taskName + ", " + getOperatorId() + " VectorMapJoinCommonOperator constructor bigTableByteColumnVectorColumns " + Arrays.toString(bigTableByteColumnVectorColumns)); - LOG.debug(taskName + ", " + getOperatorId() + " VectorMapJoinCommonOperator constructor smallTableByteColumnVectorColumns " + Arrays.toString(smallTableByteColumnVectorColumns)); - - LOG.debug(taskName + ", " + getOperatorId() + " VectorMapJoinCommonOperator constructor outputProjection " + Arrays.toString(outputProjection)); - LOG.debug(taskName + ", " + getOperatorId() + " VectorMapJoinCommonOperator constructor outputTypeNames " + Arrays.toString(outputTypeNames)); + LOG.debug(getLoggingPrefix() + " VectorMapJoinCommonOperator constructor outputProjection " + Arrays.toString(outputProjection)); + LOG.debug(getLoggingPrefix() + " VectorMapJoinCommonOperator constructor outputTypeInfos " + Arrays.toString(outputTypeInfos)); } setupVOutContext(conf.getOutputColumnNames()); @@ -482,10 +333,10 @@ protected void determineCommonInfo(boolean isOuter) throws HiveException { ArrayList list = new ArrayList(); int count = mapping.getCount(); int[] outputColumns = mapping.getOutputColumns(); - String[] typeNames = mapping.getTypeNames(); + TypeInfo[] typeInfos = mapping.getTypeInfos(); for (int i = 0; i < count; i++) { int outputColumn = outputColumns[i]; - String typeName = typeNames[i]; + String typeName = typeInfos[i].getTypeName(); if (VectorizationContext.isStringFamily(typeName)) { list.add(outputColumn); } @@ -500,10 +351,10 @@ protected void determineCommonInfo(boolean isOuter) throws HiveException { */ protected void setupVOutContext(List outputColumnNames) { if (isLogDebugEnabled) { - LOG.debug(taskName + ", " + getOperatorId() + " VectorMapJoinCommonOperator constructor outputColumnNames " + outputColumnNames); + LOG.debug(getLoggingPrefix() + " VectorMapJoinCommonOperator constructor outputColumnNames " + outputColumnNames); } if (outputColumnNames.size() != outputProjection.length) { - throw new RuntimeException("Output column names " + outputColumnNames + " length and output projection " + Arrays.toString(outputProjection) + " / " + Arrays.toString(outputTypeNames) + " length mismatch"); + throw new RuntimeException("Output column names " + outputColumnNames + " length and output projection " + Arrays.toString(outputProjection) + " / " + Arrays.toString(outputTypeInfos) + " length mismatch"); } vOutContext.resetProjectionColumns(); for (int i = 0; i < outputColumnNames.size(); ++i) { @@ -512,49 +363,15 @@ protected void setupVOutContext(List outputColumnNames) { vOutContext.addProjectionColumn(columnName, outputColumn); if (isLogDebugEnabled) { - LOG.debug(taskName + 
", " + getOperatorId() + " VectorMapJoinCommonOperator constructor addProjectionColumn " + i + " columnName " + columnName + " outputColumn " + outputColumn); + LOG.debug(getLoggingPrefix() + " VectorMapJoinCommonOperator constructor addProjectionColumn " + i + " columnName " + columnName + " outputColumn " + outputColumn); } } } - /** - * This override lets us substitute our own fast vectorized hash table loader. - */ - @Override - protected HashTableLoader getHashTableLoader(Configuration hconf) { - VectorMapJoinDesc vectorDesc = conf.getVectorDesc(); - HashTableImplementationType hashTableImplementationType = vectorDesc.hashTableImplementationType(); - HashTableLoader hashTableLoader; - switch (vectorDesc.hashTableImplementationType()) { - case OPTIMIZED: - // Use the Tez hash table loader. - hashTableLoader = HashTableLoaderFactory.getLoader(hconf); - break; - case FAST: - // Use our specialized hash table loader. - hashTableLoader = HiveConf.getVar( - hconf, HiveConf.ConfVars.HIVE_EXECUTION_ENGINE).equals("spark") ? - HashTableLoaderFactory.getLoader(hconf) : new VectorMapJoinFastHashTableLoader(); - break; - default: - throw new RuntimeException("Unknown vector map join hash table implementation type " + hashTableImplementationType.name()); - } - return hashTableLoader; - } - @Override protected void initializeOp(Configuration hconf) throws HiveException { super.initializeOp(hconf); - if (isLogDebugEnabled) { - // Determine the name of our map or reduce task for debug tracing. - BaseWork work = Utilities.getMapWork(hconf); - if (work == null) { - work = Utilities.getReduceWork(hconf); - } - taskName = work.getName(); - } - /* * Get configuration parameters. */ @@ -570,8 +387,7 @@ protected void initializeOp(Configuration hconf) throws HiveException { smallTableVectorDeserializeRow = new VectorDeserializeRow( new LazyBinaryDeserializeRead( - VectorizedBatchUtil.typeInfosFromTypeNames( - smallTableMapping.getTypeNames()))); + smallTableMapping.getTypeInfos())); smallTableVectorDeserializeRow.init(smallTableMapping.getOutputColumns()); } @@ -595,13 +411,13 @@ protected void initializeOp(Configuration hconf) throws HiveException { if (isLogDebugEnabled) { int[] currentScratchColumns = vOutContext.currentScratchColumns(); - LOG.debug(taskName + ", " + getOperatorId() + " VectorMapJoinCommonOperator initializeOp currentScratchColumns " + Arrays.toString(currentScratchColumns)); + LOG.debug(getLoggingPrefix() + " VectorMapJoinCommonOperator initializeOp currentScratchColumns " + Arrays.toString(currentScratchColumns)); StructObjectInspector structOutputObjectInspector = (StructObjectInspector) outputObjInspector; List fields = structOutputObjectInspector.getAllStructFieldRefs(); int i = 0; for (StructField field : fields) { - LOG.debug("VectorMapJoinInnerBigOnlyCommonOperator initializeOp " + i + " field " + field.getFieldName() + " type " + field.getFieldObjectInspector().getTypeName()); + LOG.debug(getLoggingPrefix() + " VectorMapJoinCommonOperator initializeOp " + i + " field " + field.getFieldName() + " type " + field.getFieldObjectInspector().getTypeName()); i++; } } @@ -612,30 +428,16 @@ protected void completeInitializationOp(Object[] os) throws HiveException { // setup mapJoinTables and serdes super.completeInitializationOp(os); - VectorMapJoinDesc vectorDesc = conf.getVectorDesc(); - HashTableImplementationType hashTableImplementationType = vectorDesc.hashTableImplementationType(); - switch (vectorDesc.hashTableImplementationType()) { - case OPTIMIZED: - { - // Create our vector 
map join optimized hash table variation *above* the - // map join table container. - vectorMapJoinHashTable = VectorMapJoinOptimizedCreateHashTable.createHashTable(conf, - mapJoinTables[posSingleVectorMapJoinSmallTable]); - } - break; - - case FAST: - { - // Get our vector map join fast hash table variation from the - // vector map join table container. - VectorMapJoinTableContainer vectorMapJoinTableContainer = - (VectorMapJoinTableContainer) mapJoinTables[posSingleVectorMapJoinSmallTable]; - vectorMapJoinHashTable = vectorMapJoinTableContainer.vectorMapJoinHashTable(); - } - break; - default: - throw new RuntimeException("Unknown vector map join hash table implementation type " + hashTableImplementationType.name()); - } + MapJoinTableContainer mapJoinTableContainer = + mapJoinTables[posSingleVectorMapJoinSmallTable]; + + // The hash table for the specialized operator. + vectorMapJoinHashTableFind = mapJoinTableContainer.getMapJoinHashTableFind(); + + // The factory so we can create result objects. + vectormapJoinHashTableFactory = mapJoinTableContainer.getMapJoinHashTableFactory(); + + totalNumSmallTableKeys = mapJoinTableContainer.size(); } /* @@ -653,7 +455,7 @@ protected VectorizedRowBatch setupOverflowBatch() throws HiveException { // First, just allocate just the projection columns we will be using. for (int i = 0; i < outputProjection.length; i++) { int outputColumn = outputProjection[i]; - String typeName = outputTypeNames[i]; + String typeName = outputTypeInfos[i].getTypeName(); allocateOverflowBatchColumnVector(overflowBatch, outputColumn, typeName); } @@ -685,7 +487,7 @@ private void allocateOverflowBatchColumnVector(VectorizedRowBatch overflowBatch, overflowBatch.cols[outputColumn] = VectorizedBatchUtil.createColumnVector(typeInfo); if (isLogDebugEnabled) { - LOG.debug(taskName + ", " + getOperatorId() + " VectorMapJoinCommonOperator initializeOp overflowBatch outputColumn " + outputColumn + " class " + overflowBatch.cols[outputColumn].getClass().getSimpleName()); + LOG.debug(getLoggingPrefix() + " VectorMapJoinCommonOperator initializeOp overflowBatch outputColumn " + outputColumn + " class " + overflowBatch.cols[outputColumn].getClass().getSimpleName()); } } } @@ -722,9 +524,9 @@ protected void commonSetup(VectorizedRowBatch batch) throws HiveException { } protected void displayBatchColumns(VectorizedRowBatch batch, String batchName) { - LOG.debug("commonSetup " + batchName + " column count " + batch.numCols); + LOG.debug(getLoggingPrefix() + " VectorMapJoinCommonOperator commonSetup " + batchName + " column count " + batch.numCols); for (int column = 0; column < batch.numCols; column++) { - LOG.debug("commonSetup " + batchName + " column " + column + " type " + (batch.cols[column] == null ? "NULL" : batch.cols[column].getClass().getSimpleName())); + LOG.debug(getLoggingPrefix() + " VectorMapJoinCommonOperator commonSetup " + batchName + " column " + column + " type " + (batch.cols[column] == null ? 
"NULL" : batch.cols[column].getClass().getSimpleName())); } } diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinGenerateResultOperator.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinGenerateResultOperator.java index 5cbace4..e43a09f 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinGenerateResultOperator.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinGenerateResultOperator.java @@ -23,13 +23,15 @@ import java.util.List; import org.apache.commons.lang.ArrayUtils; + import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.hive.ql.CompilationOpContext; import org.apache.hadoop.hive.ql.exec.persistence.HybridHashTableContainer; import org.apache.hadoop.hive.ql.exec.persistence.HybridHashTableContainer.HashPartition; -import org.apache.hadoop.hive.ql.exec.persistence.MapJoinBytesTableContainer; import org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainer; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashMapResult; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableResult; import org.apache.hadoop.hive.ql.exec.vector.ColumnVector; import org.apache.hadoop.hive.ql.exec.vector.VectorDeserializeRow; import org.apache.hadoop.hive.ql.exec.vector.VectorSerializeRow; @@ -37,9 +39,6 @@ import org.apache.hadoop.hive.ql.exec.vector.VectorizedBatchUtil; import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpression; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashTableResult; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashMapResult; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.optimized.VectorMapJoinOptimizedCreateHashTable; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.plan.OperatorDesc; import org.apache.hadoop.hive.serde2.SerDeException; @@ -47,11 +46,11 @@ import org.apache.hadoop.hive.serde2.lazybinary.fast.LazyBinaryDeserializeRead; import org.apache.hadoop.hive.serde2.lazybinary.fast.LazyBinarySerializeWrite; import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; -import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; -import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils; import org.apache.hadoop.hive.serde2.ByteStream.Output; +import com.google.common.base.Preconditions; + /** * This class has methods for generating vectorized join results and forwarding batchs. * @@ -73,8 +72,16 @@ public abstract class VectorMapJoinGenerateResultOperator extends VectorMapJoinCommonOperator { private static final long serialVersionUID = 1L; - private static final Logger LOG = LoggerFactory.getLogger(VectorMapJoinGenerateResultOperator.class.getName()); + + //------------------------------------------------------------------------------------------------ + private static final String CLASS_NAME = VectorMapJoinGenerateResultOperator.class.getName(); + private static final Logger LOG = LoggerFactory.getLogger(CLASS_NAME); + + protected String getLoggingPrefix(String className) { + // Use operator's class name. + return super.getLoggingPrefix(className); + } //------------------------------------------------------------------------------------------------ @@ -86,6 +93,13 @@ // Debug display. 
protected transient long batchCounter; + protected transient long spilledRowCounter; + protected transient long inputRowCounter; + protected transient long bigTableOutputRowCounter; + protected transient long overflowOutputRowCounter; + + protected transient long singleValueCounter; + protected transient long multiValueCounter; /** Kryo ctor. */ protected VectorMapJoinGenerateResultOperator() { @@ -105,13 +119,138 @@ protected void commonSetup(VectorizedRowBatch batch) throws HiveException { super.commonSetup(batch); batchCounter = 0; + spilledRowCounter = 0; + inputRowCounter = 0; + bigTableOutputRowCounter = 0; + overflowOutputRowCounter = 0; + singleValueCounter = 0; + multiValueCounter = 0; } //------------------------------------------------------------------------------------------------ - protected void performValueExpressions(VectorizedRowBatch batch, - int[] allMatchs, int allMatchCount) { + protected static int makeSelectedByRemovingSeries( + boolean selectedInUse, int[] selected, int selectedSize, + int[] seriesLogicalIndices, int[] seriesDuplicateCounts, int keyCount, + boolean seriesSelectedInUse, int[] seriesSelected, int seriesSelectedSize, + int[] resultSelected) { + int resultCount = 0; + int logical = 0; + for (int i = 0; i < keyCount; i++) { + int batchIndex = (selectedInUse ? selected[logical] : logical); + + int seriesLogical = seriesLogicalIndices[i]; + int seriesBatchIndex = (seriesSelectedInUse ? seriesSelected[seriesLogical] : seriesLogical); + + // Add any selected batch indices before series batch index; + while (batchIndex < seriesBatchIndex) { + resultSelected[resultCount++] = batchIndex; + logical++; + batchIndex = (selectedInUse ? selected[logical] : logical); + } + Preconditions.checkState(batchIndex == seriesBatchIndex); + + // Skip series + logical += seriesDuplicateCounts[i]; + } + + // Grab non series after last series. + while (logical < selectedSize) { + int batchIndex = (selectedInUse ? selected[logical] : logical); + resultSelected[resultCount++] = batchIndex; + logical++; + } + return resultCount; + } + + protected static int makeSelectedByRemovingMultiValues( + boolean selectedInUse, int[] selected, int selectedSize, + int[] matchLogicalIndices, int[] matchDuplicateCounts, boolean[] matchIsSingleValue, + int matchSeriesCount, + boolean seriesSelectedInUse, int[] seriesSelected, int seriesSelectedSize, + int[] resultSelected) { + int resultCount = 0; + int logical = 0; + for (int i = 0; i < matchSeriesCount; i++) { + if (matchIsSingleValue[i]) { + continue; + } + + int batchIndex = (selectedInUse ? selected[logical] : logical); + + int seriesLogical = matchLogicalIndices[i]; + int seriesBatchIndex = (seriesSelectedInUse ? seriesSelected[seriesLogical] : seriesLogical); + + // Add any selected batch indices before match series multi-value batch index, including + // match single-value series. + while (batchIndex < seriesBatchIndex) { + resultSelected[resultCount++] = batchIndex; + logical++; + batchIndex = (selectedInUse ? selected[logical] : logical); + } + Preconditions.checkState(batchIndex == seriesBatchIndex); + + // Skip series + logical += matchDuplicateCounts[i]; + } + + // Grab non series after last match series multi-value. + while (logical < selectedSize) { + int batchIndex = (selectedInUse ? 
selected[logical] : logical); + resultSelected[resultCount++] = batchIndex; + logical++; + } + return resultCount; + } + + + protected static int flattenLogicalSeriesIntoSelected( + boolean selectedInUse, int[] selected, int size, + int[] seriesLogicalIndices, int[] seriesDuplicateCounts, int keyCount, + int[] resultSelected) { + + int resultCount = 0; + for (int i = 0; i < keyCount; i++) { + int seriesLogical = seriesLogicalIndices[i]; + int seriesDuplicateCount = seriesDuplicateCounts[i]; + for (int s = seriesLogical; s < seriesLogical + seriesDuplicateCount; s++) { + int batchIndex = (selectedInUse ? selected[s] : s); + resultSelected[resultCount++] = batchIndex; + } + } + return resultCount; + } + + protected static int makeMatchSelectedWithoutMultiValues( + boolean selectedInUse, int[] selected, int size, + int[] matchLogicalIndices, int[] matchDuplicateCounts, boolean[] matchIsSingleValue, + int matchSeriesCount, + int[] resultSelected) { + + int resultCount = 0; + int matchLogical; + int matchDuplicateCount; + for (int i = 0; i < matchSeriesCount; i++) { + if (matchIsSingleValue[i]) { + // Only include single-value small table result series. + matchLogical = matchLogicalIndices[i]; + matchDuplicateCount = matchDuplicateCounts[i]; + for (int m = matchLogical; m < matchLogical + matchDuplicateCount; m++) { + int batchIndex = (selectedInUse ? selected[m] : m); + resultSelected[resultCount++] = batchIndex; + } + } + } + + return resultCount; + } + + //------------------------------------------------------------------------------------------------ + + protected void performValueExpressions(VectorizedRowBatch batch, int[] newSelected, + int newSelectedCount) { + /* * For the moment, pretend all matched are selected so we can evaluate the value * expressions. @@ -120,10 +259,10 @@ protected void performValueExpressions(VectorizedRowBatch batch, * selected and real batch size later... */ int[] saveSelected = batch.selected; - batch.selected = allMatchs; + batch.selected = newSelected; boolean saveSelectedInUse = batch.selectedInUse; batch.selectedInUse = true; - batch.size = allMatchCount; + batch.size = newSelectedCount; // Run our value expressions over whole batch. for(VectorExpression ve: bigTableValueExpressions) { @@ -158,9 +297,11 @@ protected void performValueExpressions(VectorizedRowBatch batch, * @return * The new count of selected rows. */ - protected int generateHashMapResultSingleValue(VectorizedRowBatch batch, - VectorMapJoinHashMapResult hashMapResult, int[] allMatchs, int allMatchesIndex, - int duplicateCount, int numSel) throws HiveException, IOException { + protected void generateHashMapResultSingleValue(VectorizedRowBatch batch, + MapJoinHashMapResult hashMapResult, int logical, int duplicateCount, + boolean selectedInUse, int[] selected, int size) throws HiveException, IOException { + + singleValueCounter += duplicateCount; // Read single value. @@ -168,9 +309,9 @@ protected int generateHashMapResultSingleValue(VectorizedRowBatch batch, // Generate result within big table batch itself. - for (int i = 0; i < duplicateCount; i++) { + for (int i = logical; i < logical + duplicateCount; i++) { - int batchIndex = allMatchs[allMatchesIndex + i]; + int batchIndex = (selectedInUse ? selected[i] : i); // Outer key copying is only used when we are using the input BigTable batch as the output. 
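    // A small worked example for the selection-rewriting helpers defined earlier in this class
    // (flattenLogicalSeriesIntoSelected / makeSelectedByRemovingSeries), assuming the key series
    // was computed over the same selected view as the batch:
    //
    //   selectedInUse = true, selected = {1, 3, 4, 6, 8}, selectedSize = 5
    //   seriesLogicalIndices = {1, 3}, seriesDuplicateCounts = {2, 1}, keyCount = 2
    //     (one series covering batch rows 3 and 4, and one covering batch row 6)
    //
    // Then flattenLogicalSeriesIntoSelected fills resultSelected = {3, 4, 6} and returns 3,
    // while makeSelectedByRemovingSeries fills resultSelected = {1, 8} and returns 2.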
// @@ -189,13 +330,8 @@ protected int generateHashMapResultSingleValue(VectorizedRowBatch batch, smallTableVectorDeserializeRow.deserializeByValue(batch, batchIndex); } - // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, "generateHashMapResultSingleValue big table"); - - // Use the big table row as output. - batch.selected[numSel++] = batchIndex; + // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, getLoggingPrefix() + " generateHashMapResultSingleValue"); } - - return numSel; } /** @@ -213,8 +349,10 @@ protected int generateHashMapResultSingleValue(VectorizedRowBatch batch, * Number of equal key rows. */ protected void generateHashMapResultMultiValue(VectorizedRowBatch batch, - VectorMapJoinHashMapResult hashMapResult, int[] allMatchs, int allMatchesIndex, - int duplicateCount) throws HiveException, IOException { + MapJoinHashMapResult hashMapResult, int logical, int duplicateCount, + boolean selectedInUse, int[] selected, int size) throws HiveException, IOException { + + multiValueCounter += duplicateCount; if (useOverflowRepeatedThreshold && hashMapResult.isCappedCountAvailable() && @@ -224,16 +362,16 @@ protected void generateHashMapResultMultiValue(VectorizedRowBatch batch, // row batch optimization in the overflow batch. generateHashMapResultLargeMultiValue( - batch, hashMapResult, allMatchs, allMatchesIndex, duplicateCount); + batch, hashMapResult, logical, duplicateCount, selectedInUse, selected, size); return; } // We do the cross product of the N big table equal key row's values against the // small table matching key which has M value rows into overflow batch. - for (int i = 0; i < duplicateCount; i++) { + for (int i = logical; i < logical + duplicateCount; i++) { - int batchIndex = allMatchs[allMatchesIndex + i]; + int batchIndex = (selectedInUse ? selected[i] : i); ByteSegmentRef byteSegmentRef = hashMapResult.first(); while (byteSegmentRef != null) { @@ -283,8 +421,8 @@ protected void generateHashMapResultMultiValue(VectorizedRowBatch batch, * Number of equal key rows. */ private void generateHashMapResultLargeMultiValue(VectorizedRowBatch batch, - VectorMapJoinHashMapResult hashMapResult, int[] allMatchs, int allMatchesIndex, - int duplicateCount) throws HiveException, IOException { + MapJoinHashMapResult hashMapResult, int logical, int duplicateCount, + boolean selectedInUse, int[] selected, int size) throws HiveException, IOException { // Kick out previous overflow batch results. if (overflowBatch.size > 0) { @@ -325,9 +463,9 @@ private void generateHashMapResultLargeMultiValue(VectorizedRowBatch batch, // And, not set repeating every time... // - for (int i = 0; i < duplicateCount; i++) { + for (int i = logical; i < logical + duplicateCount; i++) { - int batchIndex = allMatchs[allMatchesIndex + i]; + int batchIndex = (selectedInUse ? selected[i] : i); if (bigTableRetainedVectorCopy != null) { // The one big table row's values repeat. @@ -360,41 +498,6 @@ private void generateHashMapResultLargeMultiValue(VectorizedRowBatch batch, overflowBatch.reset(); } - /** - * Generate optimized results when entire batch key is repeated and it matched the hash map. - * - * @param batch - * The big table batch. - * @param hashMapResult - * The hash map results for the repeated key. - */ - protected void generateHashMapResultRepeatedAll(VectorizedRowBatch batch, - VectorMapJoinHashMapResult hashMapResult) throws IOException, HiveException { - - int[] selected = batch.selected; - - if (batch.selectedInUse) { - // The selected array is already filled in as we want it. 
- } else { - for (int i = 0; i < batch.size; i++) { - selected[i] = i; - } - batch.selectedInUse = true; - } - - int numSel = 0; - if (hashMapResult.isSingleRow()) { - numSel = generateHashMapResultSingleValue(batch, hashMapResult, - batch.selected, 0, batch.size, numSel); - - } else { - generateHashMapResultMultiValue(batch, hashMapResult, - batch.selected, 0, batch.size); - } - - batch.size = numSel; - } - //----------------------------------------------------------------------------------------------- /* @@ -441,27 +544,23 @@ private void setupSpillSerDe(VectorizedRowBatch batch) throws HiveException { } private void spillSerializeRow(VectorizedRowBatch batch, int batchIndex, - VectorMapJoinHashTableResult hashTableResult) throws IOException { + MapJoinHashTableResult hashTableResult) throws IOException { - int partitionId = hashTableResult.spillPartitionId(); + int partitionId = hashTableResult.getSpillPartitionId(); HybridHashTableContainer ht = (HybridHashTableContainer) mapJoinTables[posSingleVectorMapJoinSmallTable]; HashPartition hp = ht.getHashPartitions()[partitionId]; VectorMapJoinRowBytesContainer rowBytesContainer = hp.getMatchfileRowBytesContainer(); Output output = rowBytesContainer.getOuputForRowBytes(); -// int offset = output.getLength(); bigTableVectorSerializeRow.setOutputAppend(output); bigTableVectorSerializeRow.serializeWrite(batch, batchIndex); -// int length = output.getLength() - offset; rowBytesContainer.finishRow(); - -// LOG.debug("spillSerializeRow spilled batchIndex " + batchIndex + ", length " + length); } - protected void spillHashMapBatch(VectorizedRowBatch batch, - VectorMapJoinHashTableResult[] hashTableResults, - int[] spills, int[] spillHashTableResultIndices, int spillCount) + protected void spillHashMapBatch(VectorizedRowBatch batch, int[] spillLogicalIndices, + int[] spillDuplicateCounts, MapJoinHashTableResult[] spillHashTableResults, + int spillCount, boolean selectedInUse, int[] selected, int size) throws HiveException, IOException { if (bigTableVectorSerializeRow == null) { @@ -469,27 +568,31 @@ protected void spillHashMapBatch(VectorizedRowBatch batch, } for (int i = 0; i < spillCount; i++) { - int batchIndex = spills[i]; - - int hashTableResultIndex = spillHashTableResultIndices[i]; - VectorMapJoinHashTableResult hashTableResult = hashTableResults[hashTableResultIndex]; - - spillSerializeRow(batch, batchIndex, hashTableResult); + int logical = spillLogicalIndices[i]; + int duplicateCount = spillDuplicateCounts[i]; + MapJoinHashTableResult hashTableResult = spillHashTableResults[i]; + for (int s = logical; s < logical + duplicateCount; s++) { + int batchIndex = (selectedInUse ? 
selected[s] : s); + spillSerializeRow(batch, batchIndex, hashTableResult); + } } } - protected void spillBatchRepeated(VectorizedRowBatch batch, - VectorMapJoinHashTableResult hashTableResult) throws HiveException, IOException { + protected void spillHashMapBatch(VectorizedRowBatch batch, + MapJoinHashTableResult[] hashTableResults, + int[] spills, int[] spillHashTableResultIndices, int spillCount) + throws HiveException, IOException { if (bigTableVectorSerializeRow == null) { setupSpillSerDe(batch); } - int[] selected = batch.selected; - boolean selectedInUse = batch.selectedInUse; + for (int i = 0; i < spillCount; i++) { + int batchIndex = spills[i]; + + int hashTableResultIndex = spillHashTableResultIndices[i]; + MapJoinHashTableResult hashTableResult = hashTableResults[hashTableResultIndex]; - for (int logical = 0; logical < batch.size; logical++) { - int batchIndex = (selectedInUse ? selected[logical] : logical); spillSerializeRow(batch, batchIndex, hashTableResult); } } @@ -504,8 +607,8 @@ protected void reloadHashTable(byte pos, int partitionId) MapJoinTableContainer smallTable = spilledMapJoinTables[pos]; - vectorMapJoinHashTable = VectorMapJoinOptimizedCreateHashTable.createHashTable(conf, - smallTable); + vectorMapJoinHashTableFind = smallTable.getMapJoinHashTableFind(); + needHashTableSetup = true; if (isLogDebugEnabled) { @@ -542,14 +645,11 @@ protected void reProcessBigTable(int partitionId) int offset = bigTable.currentOffset(); int length = bigTable.currentLength(); -// LOG.debug(CLASS_NAME + " reProcessBigTable serialized row #" + rowCount + ", offset " + offset + ", length " + length); - bigTableVectorDeserializeRow.setBytes(bytes, offset, length); bigTableVectorDeserializeRow.deserializeByValue(spillReplayBatch, spillReplayBatch.size); spillReplayBatch.size++; if (spillReplayBatch.size == VectorizedRowBatch.DEFAULT_SIZE) { - // LOG.debug("reProcessBigTable going to call process with spillReplayBatch.size " + spillReplayBatch.size + " rows"); process(spillReplayBatch, posBigTable); // call process once we have a full batch spillReplayBatch.reset(); batchCount++; @@ -557,7 +657,6 @@ protected void reProcessBigTable(int partitionId) } // Process the row batch that has less than DEFAULT_SIZE rows if (spillReplayBatch.size > 0) { - // LOG.debug("reProcessBigTable going to call process with spillReplayBatch.size " + spillReplayBatch.size + " rows"); process(spillReplayBatch, posBigTable); spillReplayBatch.reset(); batchCount++; @@ -596,7 +695,10 @@ public void forwardBigTableBatch(VectorizedRowBatch batch) throws HiveException batch.projectionSize = outputProjection.length; batch.projectedColumns = outputProjection; + VectorizedBatchUtil.debugDisplayBatch(batch, CLASS_NAME); + forward(batch, null); + bigTableOutputRowCounter += batch.size; // Revert the projected columns back, because batch can be re-used by our parent operators. 
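    // Example for the series-based spillHashMapBatch overload earlier in this class: with
    // selectedInUse = true, selected = {1, 3, 4, 6, 8}, spillCount = 1,
    // spillLogicalIndices = {2} and spillDuplicateCounts = {3}, the big-table rows at batch
    // indices 4, 6 and 8 are each serialized into the row-bytes container of the hash
    // partition identified by spillHashTableResults[0].getSpillPartitionId().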
batch.projectionSize = originalProjectionSize; @@ -609,6 +711,7 @@ public void forwardBigTableBatch(VectorizedRowBatch batch) throws HiveException */ protected void forwardOverflow() throws HiveException { forward(overflowBatch, null); + overflowOutputRowCounter += overflowBatch.size; overflowBatch.reset(); } @@ -617,6 +720,7 @@ protected void forwardOverflow() throws HiveException { */ private void forwardOverflowNoReset() throws HiveException { forward(overflowBatch, null); + overflowOutputRowCounter += overflowBatch.size; } /* @@ -629,11 +733,20 @@ private void forwardOverflowNoReset() throws HiveException { @Override public void closeOp(boolean aborted) throws HiveException { super.closeOp(aborted); - if (!aborted && overflowBatch.size > 0) { - forwardOverflow(); - } - if (isLogDebugEnabled) { - LOG.debug("VectorMapJoinInnerLongOperator closeOp " + batchCounter + " batches processed"); + if (!aborted) { + if (overflowBatch.size > 0) { + forwardOverflow(); + } + if (isLogDebugEnabled) { + LOG.debug(getLoggingPrefix() + " closeOp " + batchCounter + " batches processed, " + + inputRowCounter + " big table input rows, " + + totalNumSmallTableKeys + " small table keys, " + + spilledRowCounter + " spilled rows, " + + singleValueCounter + " single value rows, " + + multiValueCounter + " multiple value rows " + + bigTableOutputRowCounter + " big table output rows, " + + overflowOutputRowCounter + " overflow output rows"); + } } } diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinInnerBigOnlyGenerateResultOperator.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinInnerBigOnlyGenerateResultOperator.java index dfb5bf8..f7663f2 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinInnerBigOnlyGenerateResultOperator.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinInnerBigOnlyGenerateResultOperator.java @@ -23,13 +23,9 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.hive.ql.CompilationOpContext; -import org.apache.hadoop.hive.ql.exec.JoinUtil; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashMultiSetResult; import org.apache.hadoop.hive.ql.exec.vector.VectorizationContext; import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; -import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpression; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashMultiSet; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashTableResult; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashMultiSetResult; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.plan.OperatorDesc; @@ -48,42 +44,41 @@ extends VectorMapJoinGenerateResultOperator { private static final long serialVersionUID = 1L; - private static final Logger LOG = LoggerFactory.getLogger(VectorMapJoinInnerBigOnlyGenerateResultOperator.class.getName()); - //--------------------------------------------------------------------------- + //------------------------------------------------------------------------------------------------ + + private static final String CLASS_NAME = VectorMapJoinInnerBigOnlyGenerateResultOperator.class.getName(); + private static final Logger LOG = LoggerFactory.getLogger(CLASS_NAME); + + protected String getLoggingPrefix() { + return super.getLoggingPrefix(CLASS_NAME); + } + + 
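    // The mapjoinhashtable interfaces these operators depend on (MapJoinHashTableFind,
    // MapJoinHashTableFactory, MapJoinHashTableResult and its HashMap / HashMultiSet variants)
    // are not included in this excerpt. A rough sketch of the split implied by the call sites
    // in this patch: lookups go through the "find" side obtained from the table container in
    // completeInitializationOp, while per-batch result objects come from the factory. Method
    // names beyond those visible here (createHashMultiSetResult, getSpillPartitionId) are
    // assumptions, not the patch's actual signatures:
    //
    //   public interface MapJoinHashTableFactory {
    //     MapJoinHashMapResult createHashMapResult();          // assumed, by analogy
    //     MapJoinHashMultiSetResult createHashMultiSetResult();
    //   }
    //
    //   public interface MapJoinHashTableResult {
    //     int getSpillPartitionId();
    //   }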
//------------------------------------------------------------------------------------------------ + + //------------------------------------------------------------------------------------------------ // Inner big-table only join specific members. // // An array of hash multi-set results so we can do lookups on the whole batch before output result // generation. - protected transient VectorMapJoinHashMultiSetResult hashMultiSetResults[]; + protected transient MapJoinHashMultiSetResult hashMultiSetResults[]; - // Pre-allocated member for storing the (physical) batch index of matching row (single- or - // multi-small-table-valued) indexes during a process call. - protected transient int[] allMatchs; + // Pre-allocated member for storing the batch indices of the matched rows. + protected transient int[] matchSelected; - /* - * Pre-allocated members for storing information on single- and multi-valued-small-table matches. - * - * ~ValueCounts - * Number of (empty) small table values. - * ~AllMatchIndices - * (Logical) indices into allMatchs to the first row of a match of a - * possible series of duplicate keys. - * ~DuplicateCounts - * The duplicate count for each matched key. - * - */ - protected transient long[] equalKeySeriesValueCounts; - protected transient int[] equalKeySeriesAllMatchIndices; - protected transient int[] equalKeySeriesDuplicateCounts; + // Pre-allocated member for storing the new logical batch index (within newSelected) and + // series count of rows that matched. + protected transient int[] matchLogicalIndices; + protected transient int[] matchDuplicateCounts; + protected transient long[] matchValueCounts; + // Pre-allocated member for storing the logical batch index and series count of rows that spilled. + protected transient int[] spillLogicalIndices; + protected transient int[] spillDuplicateCounts; - // Pre-allocated member for storing the (physical) batch index of rows that need to be spilled. - protected transient int[] spills; - - // Pre-allocated member for storing index into the hashMultiSetResults for each spilled row. - protected transient int[] spillHashMapResultIndices; + // Pre-allocated member for storing the reference to the hash multi-set results of rows that spilled. + protected transient MapJoinHashMultiSetResult[] spillHashMultiSetResults; /** Kryo ctor. */ protected VectorMapJoinInnerBigOnlyGenerateResultOperator() { @@ -106,21 +101,20 @@ protected void commonSetup(VectorizedRowBatch batch) throws HiveException { super.commonSetup(batch); // Inner big-table only join specific. 
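    // The match*/spill* members declared above are parallel, per-series arrays filled during
    // hash table probing and then consumed by finishInnerBigOnly. For example,
    // matchSeriesCount = 2 with matchLogicalIndices = {0, 4}, matchDuplicateCounts = {3, 2}
    // and matchValueCounts = {1, 5} describes a series of 3 equal-key big-table rows whose key
    // occurs once in the small table (single-value case), followed by a series of 2 rows whose
    // key occurs 5 times (each of those rows is emitted 5 times in the join output).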
- VectorMapJoinHashMultiSet baseHashMultiSet = (VectorMapJoinHashMultiSet) vectorMapJoinHashTable; - - hashMultiSetResults = new VectorMapJoinHashMultiSetResult[batch.DEFAULT_SIZE]; + hashMultiSetResults = new MapJoinHashMultiSetResult[batch.DEFAULT_SIZE]; for (int i = 0; i < hashMultiSetResults.length; i++) { - hashMultiSetResults[i] = baseHashMultiSet.createHashMultiSetResult(); + hashMultiSetResults[i] = vectormapJoinHashTableFactory.createHashMultiSetResult(); } - allMatchs = new int[batch.DEFAULT_SIZE]; + matchSelected = new int[batch.DEFAULT_SIZE]; - equalKeySeriesValueCounts = new long[batch.DEFAULT_SIZE]; - equalKeySeriesAllMatchIndices = new int[batch.DEFAULT_SIZE]; - equalKeySeriesDuplicateCounts = new int[batch.DEFAULT_SIZE]; + matchLogicalIndices = new int[batch.DEFAULT_SIZE]; + matchDuplicateCounts = new int[batch.DEFAULT_SIZE]; + matchValueCounts = new long[batch.DEFAULT_SIZE]; - spills = new int[batch.DEFAULT_SIZE]; - spillHashMapResultIndices = new int[batch.DEFAULT_SIZE]; + spillLogicalIndices = new int[batch.DEFAULT_SIZE]; + spillDuplicateCounts = new int[batch.DEFAULT_SIZE]; + spillHashMultiSetResults = new MapJoinHashMultiSetResult[batch.DEFAULT_SIZE]; } //----------------------------------------------------------------------------------------------- @@ -135,50 +129,59 @@ protected void commonSetup(VectorizedRowBatch batch) throws HiveException { * @param batch * The big table batch with any matching and any non matching rows both as * selected in use. - * @param allMatchCount - * Number of matches in allMatchs. - * @param equalKeySeriesCount - * Number of single value matches. - * @param spillCount - * Number of spills in spills. - * @param hashTableResults - * The array of all hash table results for the batch. We need the - * VectorMapJoinHashTableResult for the spill information. - * @param hashMapResultCount - * Number of entries in hashMapResults. + * @param matchSeriesCount + * Number of matches in the match* arrays. + * @param matchSelectedCount + * The selected count in matchSelected. + * @param spillSeriesCount + * Number of spills in spill* arrays. * **/ - protected void finishInnerBigOnly(VectorizedRowBatch batch, - int allMatchCount, int equalKeySeriesCount, int spillCount, - VectorMapJoinHashTableResult[] hashTableResults, int hashMapResultCount) - throws HiveException, IOException { + protected void finishInnerBigOnly(VectorizedRowBatch batch, int matchSeriesCount, + int spillSeriesCount) throws HiveException, IOException { + + final int selectedSize = batch.size; + boolean selectedInUse = batch.selectedInUse; + int[] selected = batch.selected; + + // Dump out the spill rows now. + if (spillSeriesCount > 0) { - // Get rid of spills before we start modifying the batch. - if (spillCount > 0) { - spillHashMapBatch(batch, hashTableResults, - spills, spillHashMapResultIndices, spillCount); + spillHashMapBatch(batch, spillLogicalIndices, spillDuplicateCounts, spillHashMultiSetResults, + spillSeriesCount, selectedInUse, selected, selectedSize); + + if (spillSeriesCount == selectedSize) { + batch.size = 0; + batch.selectedInUse = false; + return; + } } + int matchSelectedCount = + flattenLogicalSeriesIntoSelected( + selectedInUse, selected, selectedSize, + matchLogicalIndices, matchDuplicateCounts, matchSeriesCount, + matchSelected); + /* * Optimize by running value expressions only over the matched rows. 
*/ - if (allMatchCount > 0 && bigTableValueExpressions != null) { - performValueExpressions(batch, allMatchs, allMatchCount); + if (matchSeriesCount > 0 && bigTableValueExpressions != null) { + performValueExpressions(batch, matchSelected, matchSelectedCount); } int numSel = 0; - for (int i = 0; i < equalKeySeriesCount; i++) { - long count = equalKeySeriesValueCounts[i]; - int allMatchesIndex = equalKeySeriesAllMatchIndices[i]; - int duplicateCount = equalKeySeriesDuplicateCounts[i]; + for (int i = 0; i < matchSeriesCount; i++) { + int logical = matchLogicalIndices[i]; + int duplicateCount = matchDuplicateCounts[i]; + long count = matchValueCounts[i]; if (count == 1) { - numSel = generateHashMultiSetResultSingleValue( - batch, allMatchs, allMatchesIndex, duplicateCount, numSel); + numSel = generateHashMultiSetResultSingleValue(batch, + matchSelected, logical, duplicateCount, numSel); } else { generateHashMultiSetResultMultiValue(batch, - allMatchs, allMatchesIndex, - duplicateCount, count); + matchSelected, logical, duplicateCount, count); } } batch.size = numSel; @@ -206,12 +209,8 @@ private int generateHashMultiSetResultSingleValue(VectorizedRowBatch batch, int[] allMatchs, int allMatchesIndex, int duplicateCount, int numSel) throws HiveException, IOException { - // LOG.debug("generateHashMultiSetResultSingleValue enter..."); - // Generate result within big table batch itself. - // LOG.debug("generateHashMultiSetResultSingleValue with big table..."); - for (int i = 0; i < duplicateCount; i++) { int batchIndex = allMatchs[allMatchesIndex + i]; @@ -241,8 +240,6 @@ private void generateHashMultiSetResultMultiValue(VectorizedRowBatch batch, int[] allMatchs, int allMatchesIndex, int duplicateCount, long count) throws HiveException, IOException { - // LOG.debug("generateHashMultiSetResultMultiValue allMatchesIndex " + allMatchesIndex + " duplicateCount " + duplicateCount + " count " + count); - // TODO: Look at repeating optimizations... for (int i = 0; i < duplicateCount; i++) { @@ -265,70 +262,4 @@ private void generateHashMultiSetResultMultiValue(VectorizedRowBatch batch, } } } - - /** - * Generate the inner big table only join output results for one vectorized row batch with - * a repeated key. - * - * @param batch - * The big table batch with any matching and any non matching rows both as - * selected in use. - * @param hashMultiSetResult - * The hash multi-set results for the batch. - */ - protected int generateHashMultiSetResultRepeatedAll(VectorizedRowBatch batch, - VectorMapJoinHashMultiSetResult hashMultiSetResult) throws HiveException { - - long count = hashMultiSetResult.count(); - - if (batch.selectedInUse) { - // The selected array is already filled in as we want it. - } else { - int[] selected = batch.selected; - for (int i = 0; i < batch.size; i++) { - selected[i] = i; - } - batch.selectedInUse = true; - } - - do { - forwardBigTableBatch(batch); - count--; - } while (count > 0); - - // We forwarded the batch in this method. - return 0; - } - - protected void finishInnerBigOnlyRepeated(VectorizedRowBatch batch, JoinUtil.JoinResult joinResult, - VectorMapJoinHashMultiSetResult hashMultiSetResult) throws HiveException, IOException { - - switch (joinResult) { - case MATCH: - - if (bigTableValueExpressions != null) { - // Run our value expressions over whole batch. - for(VectorExpression ve: bigTableValueExpressions) { - ve.evaluate(batch); - } - } - - // Generate special repeated case. 
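In the big-table-only variant the small table contributes only a per-key count, so each matched big-table row appears count times in the output. A sketch of the two cases split between generateHashMultiSetResultSingleValue and generateHashMultiSetResultMultiValue, with a simplified emit callback standing in for Hive's overflow-batch forwarding:

import java.util.function.IntConsumer;

public final class MultiSetResultSketch {

  /**
   * Emits result rows for one equal-key series that matched the multi-set.
   * count == 1: each matched big-table row is a result exactly once and can
   *             simply stay selected in the batch (tracked via numSel).
   * count  > 1: each matched big-table row is duplicated count times through
   *             the emit callback (the overflow path in the real operator).
   * Returns the updated numSel for the in-place case.
   */
  public static int generateForSeries(int[] matchSelected, int seriesStart,
      int duplicateCount, long count, int[] batchSelected, int numSel,
      IntConsumer emitDuplicate) {
    if (count == 1) {
      for (int i = 0; i < duplicateCount; i++) {
        batchSelected[numSel++] = matchSelected[seriesStart + i];
      }
    } else {
      for (int i = 0; i < duplicateCount; i++) {
        int batchIndex = matchSelected[seriesStart + i];
        for (long c = 0; c < count; c++) {
          emitDuplicate.accept(batchIndex);
        }
      }
    }
    return numSel;
  }
}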
- int numSel = generateHashMultiSetResultRepeatedAll(batch, hashMultiSetResult); - batch.size = numSel; - batch.selectedInUse = true; - break; - - case SPILL: - // Whole batch is spilled. - spillBatchRepeated(batch, (VectorMapJoinHashTableResult) hashMultiSetResult); - batch.size = 0; - break; - - case NOMATCH: - // No match for entire batch. - batch.size = 0; - break; - } - } } diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinInnerBigOnlyLongOperator.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinInnerBigOnlyLongOperator.java index 0bba141..1748d15 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinInnerBigOnlyLongOperator.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinInnerBigOnlyLongOperator.java @@ -25,18 +25,16 @@ import org.slf4j.LoggerFactory; import org.apache.hadoop.hive.ql.CompilationOpContext; import org.apache.hadoop.hive.ql.exec.JoinUtil; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashMultiSetResult; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableFind; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableResult; import org.apache.hadoop.hive.ql.exec.vector.VectorizationContext; import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpression; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashTableResult; +import org.apache.hadoop.hive.ql.exec.vector.keyseries.VectorKeySeriesLong; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.plan.OperatorDesc; - -// Single-Column Long hash table import. -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinLongHashMultiSet; - -// Single-Column Long specific imports. -import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector; +import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; /* * Specialized class for doing a vectorized map join that is an inner join on a Single-Column Long @@ -45,8 +43,17 @@ public class VectorMapJoinInnerBigOnlyLongOperator extends VectorMapJoinInnerBigOnlyGenerateResultOperator { private static final long serialVersionUID = 1L; - private static final Logger LOG = LoggerFactory.getLogger(VectorMapJoinInnerBigOnlyLongOperator.class.getName()); + + //------------------------------------------------------------------------------------------------ + private static final String CLASS_NAME = VectorMapJoinInnerBigOnlyLongOperator.class.getName(); + private static final Logger LOG = LoggerFactory.getLogger(CLASS_NAME); + + protected String getLoggingPrefix() { + return super.getLoggingPrefix(CLASS_NAME); + } + + //------------------------------------------------------------------------------------------------ // (none) @@ -55,7 +62,7 @@ //--------------------------------------------------------------------------- // The hash map for this specialized class. - private transient VectorMapJoinLongHashMultiSet hashMultiSet; + private transient MapJoinHashTableFind hashMultiSet; //--------------------------------------------------------------------------- // Single-Column Long specific members. @@ -68,6 +75,10 @@ // The column number for this one column join specialization. 
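The repeated-key path deleted above handled batches whose key column is isRepeating: one probe covered the whole batch, and on a match the entire batch was forwarded once per small-table occurrence of the key. The new key-series objects collapse a repeating batch into a single series, so the special case is no longer needed. A compressed sketch of the old behaviour, with forwardBatch standing in for the operator's batch forwarding:

public final class RepeatedKeyPathSketch {

  interface BatchForwarder {
    void forwardBatch();
  }

  /**
   * Old-style handling of a batch with one repeated key: the multi-set is
   * probed once, and on a match the whole batch is forwarded count times.
   */
  public static void processRepeatedKey(boolean matched, long count,
      BatchForwarder forwarder) {
    if (!matched || count <= 0) {
      return; // NOMATCH: the entire batch produces no output.
    }
    for (long c = 0; c < count; c++) {
      forwarder.forwardBatch();
    }
  }
}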
private transient int singleJoinColumn; + private transient PrimitiveTypeInfo singleJoinColumnPrimitiveTypeInfo; + + // The object that determines equal key series. + private transient VectorKeySeriesLong longKeySeries; //--------------------------------------------------------------------------- // Pass-thru constructors. @@ -108,6 +119,10 @@ public void process(Object row, int tag) throws HiveException { */ singleJoinColumn = bigTableKeyColumnMap[0]; + singleJoinColumnPrimitiveTypeInfo = (PrimitiveTypeInfo) bigTableKeyTypeInfos[0]; + + longKeySeries = + new VectorKeySeriesLong(singleJoinColumn, singleJoinColumnPrimitiveTypeInfo); needCommonSetup = false; } @@ -119,8 +134,7 @@ public void process(Object row, int tag) throws HiveException { /* * Get our Single-Column Long hash multi-set information for this specialized class. */ - - hashMultiSet = (VectorMapJoinLongHashMultiSet) vectorMapJoinHashTable; + hashMultiSet = vectorMapJoinHashTableFind; useMinMax = hashMultiSet.useMinMax(); if (useMinMax) { min = hashMultiSet.min(); @@ -131,6 +145,7 @@ public void process(Object row, int tag) throws HiveException { } batchCounter++; + inputRowCounter += batch.size; // Do the per-batch setup for an inner big-only join. @@ -142,9 +157,9 @@ public void process(Object row, int tag) throws HiveException { ve.evaluate(batch); } - final int inputLogicalSize = batch.size; + final int filteredSize = batch.size; - if (inputLogicalSize == 0) { + if (filteredSize == 0) { if (isLogDebugEnabled) { LOG.debug(CLASS_NAME + " batch #" + batchCounter + " empty"); } @@ -158,232 +173,89 @@ public void process(Object row, int tag) throws HiveException { } } - /* - * Single-Column Long specific declarations. - */ + longKeySeries.processBatch(batch); - // The one join column for this specialized class. - LongColumnVector joinColVector = (LongColumnVector) batch.cols[singleJoinColumn]; - long[] vector = joinColVector.vector; + int matchSeriesCount = 0; + int spillSeriesCount = 0; + int hashMultiSetResultCount = 0; - /* - * Single-Column Long check for repeating. - */ + MapJoinHashMultiSetResult hashMultiSetResult; + MapJoinHashTableResult.MapJoinResult containsResult; + long key; + do { + // Use the next hash multi-set result entry. + hashMultiSetResult = hashMultiSetResults[hashMultiSetResultCount]; - // Check single column for repeating. - boolean allKeyInputColumnsRepeating = joinColVector.isRepeating; + if (longKeySeries.getCurrentKeyAllNull()) { - if (allKeyInputColumnsRepeating) { + // CONSIDER: Add support for NullSafe option. - /* - * Repeating. - */ + containsResult = MapJoinHashTableResult.MapJoinResult.NO_MATCH; - // All key input columns are repeating. Generate key once. Lookup once. - // Since the key is repeated, we must use entry 0 regardless of selectedInUse. - - /* - * Single-Column Long specific repeated lookup. - */ - - JoinUtil.JoinResult joinResult; - if (!joinColVector.noNulls && joinColVector.isNull[0]) { - joinResult = JoinUtil.JoinResult.NOMATCH; } else { - long key = vector[0]; + + key = longKeySeries.getCurrentKey(); if (useMinMax && (key < min || key > max)) { // Out of range for whole batch. - joinResult = JoinUtil.JoinResult.NOMATCH; + containsResult = MapJoinHashTableResult.MapJoinResult.NO_MATCH; } else { - joinResult = hashMultiSet.contains(key, hashMultiSetResults[0]); + hashMultiSet.hashMultiSetContains( + key, + longKeySeries.getCurrentHashCode(), + hashMultiSetResult); + containsResult = hashMultiSetResult.getMapJoinResult(); } - } - /* - * Common repeated join result processing. 
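The useMinMax/min/max check above prunes probes for long keys: the small-table hash table remembers the smallest and largest key it holds, so any big-table key outside that range is a guaranteed non-match and never touches the table. A minimal sketch of the idea, using a plain HashMap of counts as a stand-in multi-set:

import java.util.HashMap;
import java.util.Map;

public final class MinMaxPruningSketch {

  private final Map<Long, Long> counts = new HashMap<>();
  private long min = Long.MAX_VALUE;
  private long max = Long.MIN_VALUE;

  public void add(long key) {
    counts.merge(key, 1L, Long::sum);
    if (key < min) { min = key; }
    if (key > max) { max = key; }
  }

  /**
   * Returns the multi-set count for key, skipping the probe entirely when
   * the key lies outside the [min, max] range of the small table.
   */
  public long contains(long key) {
    if (counts.isEmpty() || key < min || key > max) {
      return 0; // Out of range: cheap no-match.
    }
    return counts.getOrDefault(key, 0L);
  }
}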
- */ - - if (isLogDebugEnabled) { - LOG.debug(CLASS_NAME + " batch #" + batchCounter + " repeated joinResult " + joinResult.name()); } - finishInnerBigOnlyRepeated(batch, joinResult, hashMultiSetResults[0]); - } else { /* - * NOT Repeating. + * Common inner join result processing. */ - if (isLogDebugEnabled) { - LOG.debug(CLASS_NAME + " batch #" + batchCounter + " non-repeated"); + switch (containsResult) { + case MATCH: + // We have extracted the existence and count from the hash multi-set result, so we don't + // keep it. + matchLogicalIndices[matchSeriesCount] = longKeySeries.getCurrentLogical(); + matchDuplicateCounts[matchSeriesCount] = longKeySeries.getCurrentDuplicateCount(); + matchValueCounts[matchSeriesCount] = hashMultiSetResult.count(); + matchSeriesCount++; + break; + + case SPILL: + spillLogicalIndices[spillSeriesCount] = longKeySeries.getCurrentLogical(); + spillDuplicateCounts[spillSeriesCount] = longKeySeries.getCurrentDuplicateCount(); + spillHashMultiSetResults[spillSeriesCount] = hashMultiSetResult; + spillSeriesCount++; + hashMultiSetResultCount++; + spilledRowCounter += longKeySeries.getCurrentDuplicateCount(); + break; + + case NO_MATCH: + break; + + default: + throw new RuntimeException("Unexpected contains result " + containsResult.name()); } - // We remember any matching rows in matchs / matchSize. At the end of the loop, - // selected / batch.size will represent both matching and non-matching rows for outer join. - // Only deferred rows will have been removed from selected. - int selected[] = batch.selected; - boolean selectedInUse = batch.selectedInUse; - - int hashMultiSetResultCount = 0; - int allMatchCount = 0; - int equalKeySeriesCount = 0; - int spillCount = 0; - - /* - * Single-Column Long specific variables. - */ - - long saveKey = 0; - - // We optimize performance by only looking up the first key in a series of equal keys. - boolean haveSaveKey = false; - JoinUtil.JoinResult saveJoinResult = JoinUtil.JoinResult.NOMATCH; - - // Logical loop over the rows in the batch since the batch may have selected in use. - for (int logical = 0; logical < inputLogicalSize; logical++) { - int batchIndex = (selectedInUse ? selected[logical] : logical); - - /* - * Single-Column Long get key. - */ - - long currentKey; - boolean isNull; - if (!joinColVector.noNulls && joinColVector.isNull[batchIndex]) { - currentKey = 0; - isNull = true; - } else { - currentKey = vector[batchIndex]; - isNull = false; - } - - /* - * Equal key series checking. - */ - - if (isNull || !haveSaveKey || currentKey != saveKey) { - - // New key. - - if (haveSaveKey) { - // Move on with our counts. - switch (saveJoinResult) { - case MATCH: - // We have extracted the count from the hash multi-set result, so we don't keep it. - equalKeySeriesCount++; - break; - case SPILL: - // We keep the hash multi-set result for its spill information. - hashMultiSetResultCount++; - break; - case NOMATCH: - break; - } - } - - if (isNull) { - saveJoinResult = JoinUtil.JoinResult.NOMATCH; - haveSaveKey = false; - } else { - // Regardless of our matching result, we keep that information to make multiple use - // of it for a possible series of equal keys. - haveSaveKey = true; - - /* - * Single-Column Long specific save key. - */ - - saveKey = currentKey; - - /* - * Single-Column Long specific lookup key. - */ - - if (useMinMax && (currentKey < min || currentKey > max)) { - // Key out of range for whole hash table. 
- saveJoinResult = JoinUtil.JoinResult.NOMATCH; - } else { - saveJoinResult = hashMultiSet.contains(currentKey, hashMultiSetResults[hashMultiSetResultCount]); - } - } - - /* - * Common inner big-only join result processing. - */ - - switch (saveJoinResult) { - case MATCH: - equalKeySeriesValueCounts[equalKeySeriesCount] = hashMultiSetResults[hashMultiSetResultCount].count(); - equalKeySeriesAllMatchIndices[equalKeySeriesCount] = allMatchCount; - equalKeySeriesDuplicateCounts[equalKeySeriesCount] = 1; - allMatchs[allMatchCount++] = batchIndex; - // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " MATCH isSingleValue " + equalKeySeriesIsSingleValue[equalKeySeriesCount] + " currentKey " + currentKey); - break; - - case SPILL: - spills[spillCount] = batchIndex; - spillHashMapResultIndices[spillCount] = hashMultiSetResultCount; - spillCount++; - break; - - case NOMATCH: - // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " NOMATCH" + " currentKey " + currentKey); - break; - } - } else { - // Series of equal keys. - - switch (saveJoinResult) { - case MATCH: - equalKeySeriesDuplicateCounts[equalKeySeriesCount]++; - allMatchs[allMatchCount++] = batchIndex; - // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " MATCH duplicate"); - break; - - case SPILL: - spills[spillCount] = batchIndex; - spillHashMapResultIndices[spillCount] = hashMultiSetResultCount; - spillCount++; - break; - - case NOMATCH: - // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " NOMATCH duplicate"); - break; - } - } - } - - if (haveSaveKey) { - // Update our counts for the last key. - switch (saveJoinResult) { - case MATCH: - // We have extracted the count from the hash multi-set result, so we don't keep it. - equalKeySeriesCount++; - break; - case SPILL: - // We keep the hash multi-set result for its spill information. 
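The block removed here is the old inline equal-key optimization: the row loop remembered the previous key (saveKey/haveSaveKey) together with its join result and reused that result for every consecutive row with the same key, closing out the series counters only when the key changed. A compressed sketch of that pattern, assuming a simple boolean lookup:

import java.util.function.LongPredicate;

public final class SaveKeySeriesSketch {

  /**
   * Old save-key scheme: one hash table probe per run of consecutive equal
   * keys rather than one per row. Returns { probes, matchedRows }.
   */
  public static int[] probeOncePerEqualKeySeries(long[] keys, int size,
      LongPredicate lookup) {
    boolean haveSaveKey = false;
    long saveKey = 0;
    boolean saveResult = false;
    int probes = 0;
    int matchedRows = 0;
    for (int i = 0; i < size; i++) {
      long currentKey = keys[i];
      if (!haveSaveKey || currentKey != saveKey) {
        // New key: probe once and remember the result for possible duplicates.
        haveSaveKey = true;
        saveKey = currentKey;
        saveResult = lookup.test(currentKey);
        probes++;
      }
      if (saveResult) {
        matchedRows++; // This row belongs to a matched series.
      }
    }
    return new int[] { probes, matchedRows };
  }
}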
- hashMultiSetResultCount++; - break; - case NOMATCH: - break; - } + if (!longKeySeries.next()) { + break; } - - if (isLogDebugEnabled) { - LOG.debug(CLASS_NAME + - " allMatchs " + intArrayToRangesString(allMatchs, allMatchCount) + - " equalKeySeriesValueCounts " + longArrayToRangesString(equalKeySeriesValueCounts, equalKeySeriesCount) + - " equalKeySeriesAllMatchIndices " + intArrayToRangesString(equalKeySeriesAllMatchIndices, equalKeySeriesCount) + - " equalKeySeriesDuplicateCounts " + intArrayToRangesString(equalKeySeriesDuplicateCounts, equalKeySeriesCount) + - " spills " + intArrayToRangesString(spills, spillCount) + - " spillHashMapResultIndices " + intArrayToRangesString(spillHashMapResultIndices, spillCount) + - " hashMapResults " + Arrays.toString(Arrays.copyOfRange(hashMultiSetResults, 0, hashMultiSetResultCount))); - } - - finishInnerBigOnly(batch, - allMatchCount, equalKeySeriesCount, spillCount, - (VectorMapJoinHashTableResult[]) hashMultiSetResults, hashMultiSetResultCount); + } while (true); + + if (isLogDebugEnabled) { + LOG.debug(CLASS_NAME + " batch #" + batchCounter + " batch info " + + " filteredSize " + filteredSize + + " matchSeriesCount " + matchSeriesCount + + " matchLogicalIndices " + intArrayToRangesString(matchLogicalIndices, matchSeriesCount) + + " matchDuplicateCounts " + Arrays.toString(Arrays.copyOfRange(matchDuplicateCounts, 0, matchSeriesCount)) + + " spillSeriesCount " + spillSeriesCount + + " spillLogicalIndices " + intArrayToRangesString(spillLogicalIndices, spillSeriesCount) + + " spillDuplicateCounts " + Arrays.toString(Arrays.copyOfRange(spillDuplicateCounts, 0, spillSeriesCount))); } + finishInnerBigOnly(batch, matchSeriesCount, spillSeriesCount); + if (batch.size > 0) { // Forward any remaining selected rows. forwardBigTableBatch(batch); diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinInnerBigOnlyMultiKeyOperator.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinInnerBigOnlyMultiKeyOperator.java index 621804b..15f6d68 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinInnerBigOnlyMultiKeyOperator.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinInnerBigOnlyMultiKeyOperator.java @@ -25,19 +25,17 @@ import org.slf4j.LoggerFactory; import org.apache.hadoop.hive.ql.CompilationOpContext; import org.apache.hadoop.hive.ql.exec.JoinUtil; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashMultiSetResult; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableFind; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableResult; import org.apache.hadoop.hive.ql.exec.vector.VectorizationContext; import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpression; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashTableResult; +import org.apache.hadoop.hive.ql.exec.vector.keyseries.fast.VectorKeySeriesMultiFast; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.plan.OperatorDesc; -// Multi-Key hash table import. -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinBytesHashMultiSet; - // Multi-Key specific imports. 
-import org.apache.hadoop.hive.ql.exec.vector.VectorSerializeRow; -import org.apache.hadoop.hive.serde2.ByteStream.Output; import org.apache.hadoop.hive.serde2.binarysortable.fast.BinarySortableSerializeWrite; /* @@ -48,8 +46,17 @@ public class VectorMapJoinInnerBigOnlyMultiKeyOperator extends VectorMapJoinInnerBigOnlyGenerateResultOperator { private static final long serialVersionUID = 1L; - private static final Logger LOG = LoggerFactory.getLogger(VectorMapJoinInnerBigOnlyMultiKeyOperator.class.getName()); + + //------------------------------------------------------------------------------------------------ + private static final String CLASS_NAME = VectorMapJoinInnerBigOnlyMultiKeyOperator.class.getName(); + private static final Logger LOG = LoggerFactory.getLogger(CLASS_NAME); + + protected String getLoggingPrefix() { + return super.getLoggingPrefix(CLASS_NAME); + } + + //------------------------------------------------------------------------------------------------ // (none) @@ -58,21 +65,17 @@ //--------------------------------------------------------------------------- // The hash map for this specialized class. - private transient VectorMapJoinBytesHashMultiSet hashMultiSet; + private transient MapJoinHashTableFind hashMultiSet; //--------------------------------------------------------------------------- // Multi-Key specific members. // - // Object that can take a set of columns in row in a vectorized row batch and serialized it. - // Known to not have any nulls. - private transient VectorSerializeRow keyVectorSerializeWrite; + // Binary sortable multi-key serializer. + private transient BinarySortableSerializeWrite binarySortableSerializeWrite; - // The BinarySortable serialization of the current key. - private transient Output currentKeyOutput; - - // The BinarySortable serialization of the saved key for a possible series of equal keys. - private transient Output saveKeyOutput; + // The object that determines equal key series. + private transient VectorKeySeriesMultiFast serializedMultiKeySeries; //--------------------------------------------------------------------------- // Pass-thru constructors. @@ -112,12 +115,15 @@ public void process(Object row, int tag) throws HiveException { * Initialize Multi-Key members for this specialized class. */ - keyVectorSerializeWrite = new VectorSerializeRow( - new BinarySortableSerializeWrite(bigTableKeyColumnMap.length)); - keyVectorSerializeWrite.init(bigTableKeyTypeNames, bigTableKeyColumnMap); + binarySortableSerializeWrite = + new BinarySortableSerializeWrite(bigTableKeyColumnMap.length); - currentKeyOutput = new Output(); - saveKeyOutput = new Output(); + // For Multi-Key the Fast hash table computes the serialized key's hash code, + // so we don't use VectorKeySeriesMulti's hash code. + serializedMultiKeySeries = + new VectorKeySeriesMultiFast( + binarySortableSerializeWrite); + serializedMultiKeySeries.init(bigTableKeyTypeInfos, bigTableKeyColumnMap); needCommonSetup = false; } @@ -130,12 +136,13 @@ public void process(Object row, int tag) throws HiveException { * Get our Multi-Key hash multi-set information for this specialized class. */ - hashMultiSet = (VectorMapJoinBytesHashMultiSet) vectorMapJoinHashTable; + hashMultiSet = vectorMapJoinHashTableFind; needHashTableSetup = false; } batchCounter++; + inputRowCounter += batch.size; // Do the per-batch setup for an inner big-only join. 
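For multi-column keys the patch serializes all key columns into one byte string (BinarySortableSerializeWrite in the real code) and probes a bytes hash table, computing the hash over the serialized form. A sketch of the idea with a deliberately simplified serializer for a (long, string) key; the byte layout below is illustrative and not Hive's binary-sortable format:

import java.io.ByteArrayOutputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;

public final class MultiKeySerializeSketch {

  /**
   * Serializes a (long, String) key pair into one byte[] so that equal keys
   * always produce identical bytes and can share a single hash table probe.
   */
  public static byte[] serializeKey(long longKey, String stringKey)
      throws IOException {
    ByteArrayOutputStream bytes = new ByteArrayOutputStream();
    DataOutputStream out = new DataOutputStream(bytes);
    out.writeLong(longKey);
    byte[] str = stringKey.getBytes(StandardCharsets.UTF_8);
    out.writeInt(str.length); // Length prefix keeps the encoding unambiguous.
    out.write(str);
    out.flush();
    return bytes.toByteArray();
  }

  /** Hash over the serialized key bytes; Arrays.hashCode is only a stand-in
   *  for the hash function the real hash table uses. */
  public static int hashKey(byte[] serializedKey) {
    return Arrays.hashCode(serializedKey);
  }
}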
@@ -147,9 +154,9 @@ public void process(Object row, int tag) throws HiveException { ve.evaluate(batch); } - final int inputLogicalSize = batch.size; + final int filteredSize = batch.size; - if (inputLogicalSize == 0) { + if (filteredSize == 0) { if (isLogDebugEnabled) { LOG.debug(CLASS_NAME + " batch #" + batchCounter + " empty"); } @@ -163,234 +170,88 @@ public void process(Object row, int tag) throws HiveException { } } - /* - * Multi-Key specific declarations. - */ - - // None. - - /* - * Multi-Key check for repeating. - */ - - // If all BigTable input columns to key expressions are isRepeating, then - // calculate key once; lookup once. - boolean allKeyInputColumnsRepeating; - if (bigTableKeyColumnMap.length == 0) { - allKeyInputColumnsRepeating = false; - } else { - allKeyInputColumnsRepeating = true; - for (int i = 0; i < bigTableKeyColumnMap.length; i++) { - if (!batch.cols[bigTableKeyColumnMap[i]].isRepeating) { - allKeyInputColumnsRepeating = false; - break; - } - } - } + serializedMultiKeySeries.processBatch(batch); - if (allKeyInputColumnsRepeating) { + LOG.info(CLASS_NAME + " " + serializedMultiKeySeries.toString()); - /* - * Repeating. - */ + int matchSeriesCount = 0; + int spillSeriesCount = 0; + int hashMultiSetResultCount = 0; - // All key input columns are repeating. Generate key once. Lookup once. - // Since the key is repeated, we must use entry 0 regardless of selectedInUse. + MapJoinHashMultiSetResult hashMultiSetResult; + MapJoinHashTableResult.MapJoinResult containsResult; + do { + // Use the next hash multi-set result entry. + hashMultiSetResult = hashMultiSetResults[hashMultiSetResultCount]; - /* - * Multi-Key specific repeated lookup. - */ + // NOTE: Any null column in the key for inner join is a non-match. + if (serializedMultiKeySeries.getCurrentKeyHasAnyNulls()) { - keyVectorSerializeWrite.setOutput(currentKeyOutput); - keyVectorSerializeWrite.serializeWrite(batch, 0); - JoinUtil.JoinResult joinResult; - if (keyVectorSerializeWrite.getHasAnyNulls()) { - joinResult = JoinUtil.JoinResult.NOMATCH; - } else { - byte[] keyBytes = currentKeyOutput.getData(); - int keyLength = currentKeyOutput.getLength(); - joinResult = hashMultiSet.contains(keyBytes, 0, keyLength, hashMultiSetResults[0]); - } + // CONSIDER: Add support for NullSafe option. - /* - * Common repeated join result processing. - */ + containsResult = MapJoinHashTableResult.MapJoinResult.NO_MATCH; - if (isLogDebugEnabled) { - LOG.debug(CLASS_NAME + " batch #" + batchCounter + " repeated joinResult " + joinResult.name()); - } - finishInnerBigOnlyRepeated(batch, joinResult, hashMultiSetResults[0]); - } else { + LOG.info(CLASS_NAME + " logical " + serializedMultiKeySeries.getCurrentLogical() + " hasAnyNulls true"); + } else { - /* - * NOT Repeating. - */ + hashMultiSet.hashMultiSetContains( + serializedMultiKeySeries.getSerializedBytes(), + serializedMultiKeySeries.getSerializedStart(), + serializedMultiKeySeries.getSerializedLength(), + serializedMultiKeySeries.getCurrentHashCode(), + hashMultiSetResult); + containsResult = hashMultiSetResult.getMapJoinResult(); - if (isLogDebugEnabled) { - LOG.debug(CLASS_NAME + " batch #" + batchCounter + " non-repeated"); } - // We remember any matching rows in matchs / matchSize. At the end of the loop, - // selected / batch.size will represent both matching and non-matching rows for outer join. - // Only deferred rows will have been removed from selected. 
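The getCurrentKeyHasAnyNulls branch above encodes standard inner-join semantics: a key containing any NULL column cannot equal anything, so the probe is skipped and the whole series is recorded as a non-match (null-safe comparison would need the option the CONSIDER comment mentions). A tiny sketch of that short-circuit, with null object references standing in for NULL column values:

public final class NullKeyShortCircuitSketch {

  /** Returns true when any key column of the row is NULL; an inner join can
   *  then skip the hash table probe entirely and record a non-match. */
  public static boolean hasAnyNullKeyColumn(Object[] keyColumns) {
    for (Object column : keyColumns) {
      if (column == null) {
        return true;
      }
    }
    return false;
  }

  public static void main(String[] args) {
    System.out.println(hasAnyNullKeyColumn(new Object[] { 42L, null })); // true
    System.out.println(hasAnyNullKeyColumn(new Object[] { 42L, "a" }));  // false
  }
}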
- int selected[] = batch.selected; - boolean selectedInUse = batch.selectedInUse; - - int hashMultiSetResultCount = 0; - int allMatchCount = 0; - int equalKeySeriesCount = 0; - int spillCount = 0; - /* - * Multi-Key specific variables. + * Common inner join result processing. */ - Output temp; - - // We optimize performance by only looking up the first key in a series of equal keys. - boolean haveSaveKey = false; - JoinUtil.JoinResult saveJoinResult = JoinUtil.JoinResult.NOMATCH; - - // Logical loop over the rows in the batch since the batch may have selected in use. - for (int logical = 0; logical < inputLogicalSize; logical++) { - int batchIndex = (selectedInUse ? selected[logical] : logical); - - /* - * Multi-Key get key. - */ - - // Generate binary sortable key for current row in vectorized row batch. - keyVectorSerializeWrite.setOutput(currentKeyOutput); - keyVectorSerializeWrite.serializeWrite(batch, batchIndex); - boolean isAnyNulls = keyVectorSerializeWrite.getHasAnyNulls(); - - /* - * Equal key series checking. - */ - - if (isAnyNulls || !haveSaveKey || !saveKeyOutput.arraysEquals(currentKeyOutput)) { - - // New key. - - if (haveSaveKey) { - // Move on with our counts. - switch (saveJoinResult) { - case MATCH: - // We have extracted the count from the hash multi-set result, so we don't keep it. - equalKeySeriesCount++; - break; - case SPILL: - // We keep the hash multi-set result for its spill information. - hashMultiSetResultCount++; - break; - case NOMATCH: - break; - } - } - - if (isAnyNulls) { - saveJoinResult = JoinUtil.JoinResult.NOMATCH; - haveSaveKey = false; - } else { - // Regardless of our matching result, we keep that information to make multiple use - // of it for a possible series of equal keys. - haveSaveKey = true; - - /* - * Multi-Key specific save key. - */ - - temp = saveKeyOutput; - saveKeyOutput = currentKeyOutput; - currentKeyOutput = temp; - - /* - * Single-Column Long specific lookup key. - */ - - byte[] keyBytes = saveKeyOutput.getData(); - int keyLength = saveKeyOutput.getLength(); - saveJoinResult = hashMultiSet.contains(keyBytes, 0, keyLength, hashMultiSetResults[hashMultiSetResultCount]); - } - - /* - * Common inner big-only join result processing. - */ - - switch (saveJoinResult) { - case MATCH: - equalKeySeriesValueCounts[equalKeySeriesCount] = hashMultiSetResults[hashMultiSetResultCount].count(); - equalKeySeriesAllMatchIndices[equalKeySeriesCount] = allMatchCount; - equalKeySeriesDuplicateCounts[equalKeySeriesCount] = 1; - allMatchs[allMatchCount++] = batchIndex; - // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " MATCH isSingleValue " + equalKeySeriesIsSingleValue[equalKeySeriesCount] + " currentKey " + currentKey); - break; - - case SPILL: - spills[spillCount] = batchIndex; - spillHashMapResultIndices[spillCount] = hashMultiSetResultCount; - spillCount++; - break; - - case NOMATCH: - // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " NOMATCH" + " currentKey " + currentKey); - break; - } - } else { - // Series of equal keys. 
- - switch (saveJoinResult) { - case MATCH: - equalKeySeriesDuplicateCounts[equalKeySeriesCount]++; - allMatchs[allMatchCount++] = batchIndex; - // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " MATCH duplicate"); - break; - - case SPILL: - spills[spillCount] = batchIndex; - spillHashMapResultIndices[spillCount] = hashMultiSetResultCount; - spillCount++; - break; - - case NOMATCH: - // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " NOMATCH duplicate"); - break; - } - } - } - - if (haveSaveKey) { - // Update our counts for the last key. - switch (saveJoinResult) { - case MATCH: - // We have extracted the count from the hash multi-set result, so we don't keep it. - equalKeySeriesCount++; - break; - case SPILL: - // We keep the hash multi-set result for its spill information. - hashMultiSetResultCount++; - break; - case NOMATCH: - break; - } + switch (containsResult) { + case MATCH: + // We have extracted the existence and count from the hash multi-set result, so we don't + // keep it. + matchLogicalIndices[matchSeriesCount] = serializedMultiKeySeries.getCurrentLogical(); + matchDuplicateCounts[matchSeriesCount] = serializedMultiKeySeries.getCurrentDuplicateCount(); + matchValueCounts[matchSeriesCount] = hashMultiSetResult.count(); + matchSeriesCount++; + break; + + case SPILL: + spillLogicalIndices[spillSeriesCount] = serializedMultiKeySeries.getCurrentLogical(); + spillDuplicateCounts[spillSeriesCount] = serializedMultiKeySeries.getCurrentDuplicateCount(); + spillHashMultiSetResults[spillSeriesCount] = hashMultiSetResult; + spillSeriesCount++; + hashMultiSetResultCount++; + spilledRowCounter += serializedMultiKeySeries.getCurrentDuplicateCount(); + break; + + case NO_MATCH: + break; + + default: + throw new RuntimeException("Unexpected contains result " + containsResult.name()); } - if (isLogDebugEnabled) { - LOG.debug(CLASS_NAME + - " allMatchs " + intArrayToRangesString(allMatchs, allMatchCount) + - " equalKeySeriesValueCounts " + longArrayToRangesString(equalKeySeriesValueCounts, equalKeySeriesCount) + - " equalKeySeriesAllMatchIndices " + intArrayToRangesString(equalKeySeriesAllMatchIndices, equalKeySeriesCount) + - " equalKeySeriesDuplicateCounts " + intArrayToRangesString(equalKeySeriesDuplicateCounts, equalKeySeriesCount) + - " spills " + intArrayToRangesString(spills, spillCount) + - " spillHashMapResultIndices " + intArrayToRangesString(spillHashMapResultIndices, spillCount) + - " hashMapResults " + Arrays.toString(Arrays.copyOfRange(hashMultiSetResults, 0, hashMultiSetResultCount))); + if (!serializedMultiKeySeries.next()) { + break; } - - finishInnerBigOnly(batch, - allMatchCount, equalKeySeriesCount, spillCount, - (VectorMapJoinHashTableResult[]) hashMultiSetResults, hashMultiSetResultCount); + } while (true); + + if (isLogDebugEnabled) { + LOG.debug(CLASS_NAME + " batch #" + batchCounter + " batch info " + + " filteredSize " + filteredSize + + " matchSeriesCount " + matchSeriesCount + + " matchLogicalIndices " + intArrayToRangesString(matchLogicalIndices, matchSeriesCount) + + " matchDuplicateCounts " + Arrays.toString(Arrays.copyOfRange(matchDuplicateCounts, 0, matchSeriesCount)) + + " spillSeriesCount " + spillSeriesCount + + " spillLogicalIndices " + intArrayToRangesString(spillLogicalIndices, spillSeriesCount) + + " spillDuplicateCounts " + Arrays.toString(Arrays.copyOfRange(spillDuplicateCounts, 0, spillSeriesCount))); } + finishInnerBigOnly(batch, matchSeriesCount, spillSeriesCount); + if (batch.size > 0) { // Forward 
any remaining selected rows. forwardBigTableBatch(batch); diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinInnerBigOnlyStringOperator.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinInnerBigOnlyStringOperator.java index 10e75ab..7387faa 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinInnerBigOnlyStringOperator.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinInnerBigOnlyStringOperator.java @@ -25,19 +25,17 @@ import org.slf4j.LoggerFactory; import org.apache.hadoop.hive.ql.CompilationOpContext; import org.apache.hadoop.hive.ql.exec.JoinUtil; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashMultiSetResult; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableFind; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableResult; import org.apache.hadoop.hive.ql.exec.vector.VectorizationContext; import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpression; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashTableResult; +import org.apache.hadoop.hive.ql.exec.vector.keyseries.VectorKeySeriesBytes; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.plan.OperatorDesc; // Single-Column String hash table import. -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinBytesHashMultiSet; - -// Single-Column String specific imports. -import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; -import org.apache.hadoop.hive.ql.exec.vector.expressions.StringExpr; /* * Specialized class for doing a vectorized map join that is an inner join on a Single-Column String @@ -46,8 +44,17 @@ public class VectorMapJoinInnerBigOnlyStringOperator extends VectorMapJoinInnerBigOnlyGenerateResultOperator { private static final long serialVersionUID = 1L; - private static final Logger LOG = LoggerFactory.getLogger(VectorMapJoinInnerBigOnlyStringOperator.class.getName()); + + //------------------------------------------------------------------------------------------------ + private static final String CLASS_NAME = VectorMapJoinInnerBigOnlyStringOperator.class.getName(); + private static final Logger LOG = LoggerFactory.getLogger(CLASS_NAME); + + protected String getLoggingPrefix() { + return super.getLoggingPrefix(CLASS_NAME); + } + + //------------------------------------------------------------------------------------------------ // (none) @@ -56,7 +63,7 @@ //--------------------------------------------------------------------------- // The hash map for this specialized class. - private transient VectorMapJoinBytesHashMultiSet hashMultiSet; + private transient MapJoinHashTableFind hashMultiSet; //--------------------------------------------------------------------------- // Single-Column String specific members. @@ -65,6 +72,9 @@ // The column number for this one column join specialization. private transient int singleJoinColumn; + // The object that determines equal key series.
+ private transient VectorKeySeriesBytes bytesKeySeries; + //--------------------------------------------------------------------------- // Pass-thru constructors. // @@ -105,6 +115,8 @@ public void process(Object row, int tag) throws HiveException { singleJoinColumn = bigTableKeyColumnMap[0]; + bytesKeySeries = new VectorKeySeriesBytes(singleJoinColumn); + needCommonSetup = false; } @@ -116,12 +128,13 @@ public void process(Object row, int tag) throws HiveException { * Get our Single-Column String hash multi-set information for this specialized class. */ - hashMultiSet = (VectorMapJoinBytesHashMultiSet) vectorMapJoinHashTable; + hashMultiSet = vectorMapJoinHashTableFind; needHashTableSetup = false; } batchCounter++; + inputRowCounter += batch.size; // Do the per-batch setup for an inner big-only join. @@ -133,9 +146,9 @@ public void process(Object row, int tag) throws HiveException { ve.evaluate(batch); } - final int inputLogicalSize = batch.size; + final int filteredSize = batch.size; - if (inputLogicalSize == 0) { + if (filteredSize == 0) { if (isLogDebugEnabled) { LOG.debug(CLASS_NAME + " batch #" + batchCounter + " empty"); } @@ -149,227 +162,84 @@ public void process(Object row, int tag) throws HiveException { } } - // We rebuild in-place the selected array with rows destine to be forwarded. - int numSel = 0; + bytesKeySeries.processBatch(batch); - /* - * Single-Column String specific declarations. - */ + int matchCount = 0; + int spillCount = 0; + int hashMultiSetResultCount = 0; - // The one join column for this specialized class. - BytesColumnVector joinColVector = (BytesColumnVector) batch.cols[singleJoinColumn]; - byte[][] vector = joinColVector.vector; - int[] start = joinColVector.start; - int[] length = joinColVector.length; + MapJoinHashMultiSetResult hashMultiSetResult; + MapJoinHashTableResult.MapJoinResult containsResult; + do { + // Use the next hash multi-set result entry. + hashMultiSetResult = hashMultiSetResults[hashMultiSetResultCount]; - /* - * Single-Column String check for repeating. - */ + if (bytesKeySeries.getCurrentKeyAllNull()) { - // Check single column for repeating. - boolean allKeyInputColumnsRepeating = joinColVector.isRepeating; + // CONSIDER: Add support for NullSafe option. - if (allKeyInputColumnsRepeating) { + containsResult = MapJoinHashTableResult.MapJoinResult.NO_MATCH; - /* - * Repeating. - */ - - // All key input columns are repeating. Generate key once. Lookup once. - // Since the key is repeated, we must use entry 0 regardless of selectedInUse. - - /* - * Single-Column String specific repeated lookup. - */ - - JoinUtil.JoinResult joinResult; - if (!joinColVector.noNulls && joinColVector.isNull[0]) { - joinResult = JoinUtil.JoinResult.NOMATCH; } else { - byte[] keyBytes = vector[0]; - int keyStart = start[0]; - int keyLength = length[0]; - joinResult = hashMultiSet.contains(keyBytes, keyStart, keyLength, hashMultiSetResults[0]); - } - /* - * Common repeated join result processing. - */ - - if (isLogDebugEnabled) { - LOG.debug(CLASS_NAME + " batch #" + batchCounter + " repeated joinResult " + joinResult.name()); - } - finishInnerBigOnlyRepeated(batch, joinResult, hashMultiSetResults[0]); - } else { - - /* - * NOT Repeating. 
- */ + hashMultiSet.hashMultiSetContains( + bytesKeySeries.getCurrentBytes(), + bytesKeySeries.getCurrentStart(), + bytesKeySeries.getCurrentLength(), + bytesKeySeries.getCurrentHashCode(), + hashMultiSetResult); + containsResult = hashMultiSetResult.getMapJoinResult(); - if (isLogDebugEnabled) { - LOG.debug(CLASS_NAME + " batch #" + batchCounter + " non-repeated"); } - // We remember any matching rows in matchs / matchSize. At the end of the loop, - // selected / batch.size will represent both matching and non-matching rows for outer join. - // Only deferred rows will have been removed from selected. - int selected[] = batch.selected; - boolean selectedInUse = batch.selectedInUse; - - int hashMultiSetResultCount = 0; - int allMatchCount = 0; - int equalKeySeriesCount = 0; - int spillCount = 0; - /* - * Single-Column String specific variables. + * Common inner join result processing. */ - int saveKeyBatchIndex = -1; - - // We optimize performance by only looking up the first key in a series of equal keys. - boolean haveSaveKey = false; - JoinUtil.JoinResult saveJoinResult = JoinUtil.JoinResult.NOMATCH; - - // Logical loop over the rows in the batch since the batch may have selected in use. - for (int logical = 0; logical < inputLogicalSize; logical++) { - int batchIndex = (selectedInUse ? selected[logical] : logical); - - /* - * Single-Column String get key. - */ - - // Implicit -- use batchIndex. - boolean isNull = !joinColVector.noNulls && joinColVector.isNull[batchIndex]; - - /* - * Equal key series checking. - */ - - if (isNull || !haveSaveKey || - StringExpr.equal(vector[saveKeyBatchIndex], start[saveKeyBatchIndex], length[saveKeyBatchIndex], - vector[batchIndex], start[batchIndex], length[batchIndex]) == false) { - - // New key. - - if (haveSaveKey) { - // Move on with our counts. - switch (saveJoinResult) { - case MATCH: - // We have extracted the count from the hash multi-set result, so we don't keep it. - equalKeySeriesCount++; - break; - case SPILL: - // We keep the hash multi-set result for its spill information. - hashMultiSetResultCount++; - break; - case NOMATCH: - break; - } - } - - if (isNull) { - saveJoinResult = JoinUtil.JoinResult.NOMATCH; - haveSaveKey = false; - } else { - // Regardless of our matching result, we keep that information to make multiple use - // of it for a possible series of equal keys. - haveSaveKey = true; - - /* - * Single-Column String specific save key. - */ - - saveKeyBatchIndex = batchIndex; - - /* - * Single-Column String specific lookup key. - */ - - byte[] keyBytes = vector[batchIndex]; - int keyStart = start[batchIndex]; - int keyLength = length[batchIndex]; - saveJoinResult = hashMultiSet.contains(keyBytes, keyStart, keyLength, hashMultiSetResults[hashMultiSetResultCount]); - } - - /* - * Common inner big-only join result processing. 
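hashMultiSetContains above is handed a (bytes, start, length) slice plus the hash code precomputed by the key series, so the operator issues one probe per run of equal string keys. A sketch of a byte-slice probe against a simplified multi-set; the wrapper key class is hypothetical and copies the slice, whereas the real hash table works directly on serialized buffers:

import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;

public final class BytesKeyProbeSketch {

  /** Hash-map key wrapping a defensive copy of a byte slice. */
  private static final class BytesKey {
    private final byte[] bytes;
    private final int hash;

    BytesKey(byte[] src, int start, int length) {
      this.bytes = Arrays.copyOfRange(src, start, start + length);
      this.hash = Arrays.hashCode(this.bytes);
    }

    @Override
    public boolean equals(Object other) {
      return other instanceof BytesKey && Arrays.equals(bytes, ((BytesKey) other).bytes);
    }

    @Override
    public int hashCode() {
      return hash;
    }
  }

  private final Map<BytesKey, Long> counts = new HashMap<>();

  public void add(byte[] key) {
    counts.merge(new BytesKey(key, 0, key.length), 1L, Long::sum);
  }

  /** One probe per equal-key series: returns the multi-set count for the slice. */
  public long contains(byte[] bytes, int start, int length) {
    return counts.getOrDefault(new BytesKey(bytes, start, length), 0L);
  }
}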
- */ - - switch (saveJoinResult) { - case MATCH: - equalKeySeriesValueCounts[equalKeySeriesCount] = hashMultiSetResults[hashMultiSetResultCount].count(); - equalKeySeriesAllMatchIndices[equalKeySeriesCount] = allMatchCount; - equalKeySeriesDuplicateCounts[equalKeySeriesCount] = 1; - allMatchs[allMatchCount++] = batchIndex; - // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " MATCH isSingleValue " + equalKeySeriesIsSingleValue[equalKeySeriesCount] + " currentKey " + currentKey); - break; - - case SPILL: - spills[spillCount] = batchIndex; - spillHashMapResultIndices[spillCount] = hashMultiSetResultCount; - spillCount++; - break; - - case NOMATCH: - // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " NOMATCH" + " currentKey " + currentKey); - break; - } - } else { - // Series of equal keys. - - switch (saveJoinResult) { - case MATCH: - equalKeySeriesDuplicateCounts[equalKeySeriesCount]++; - allMatchs[allMatchCount++] = batchIndex; - // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " MATCH duplicate"); - break; - - case SPILL: - spills[spillCount] = batchIndex; - spillHashMapResultIndices[spillCount] = hashMultiSetResultCount; - spillCount++; - break; - - case NOMATCH: - // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " NOMATCH duplicate"); - break; - } - } - } - - if (haveSaveKey) { - // Update our counts for the last key. - switch (saveJoinResult) { - case MATCH: - // We have extracted the count from the hash multi-set result, so we don't keep it. - equalKeySeriesCount++; - break; - case SPILL: - // We keep the hash multi-set result for its spill information. - hashMultiSetResultCount++; - break; - case NOMATCH: - break; - } + switch (containsResult) { + case MATCH: + // We have extracted the existence and count from the hash multi-set result, so we don't + // keep it. 
+ matchLogicalIndices[matchCount] = bytesKeySeries.getCurrentLogical(); + matchDuplicateCounts[matchCount] = bytesKeySeries.getCurrentDuplicateCount(); + matchValueCounts[matchCount] = hashMultiSetResult.count(); + matchCount++; + break; + + case SPILL: + spillLogicalIndices[spillCount] = bytesKeySeries.getCurrentLogical(); + spillDuplicateCounts[spillCount] = bytesKeySeries.getCurrentDuplicateCount(); + spillHashMultiSetResults[spillCount] = hashMultiSetResult; + spillCount++; + hashMultiSetResultCount++; + spilledRowCounter += bytesKeySeries.getCurrentDuplicateCount(); + break; + + case NO_MATCH: + break; + + default: + throw new RuntimeException("Unexpected contains result " + containsResult.name()); } - if (isLogDebugEnabled) { - LOG.debug(CLASS_NAME + - " allMatchs " + intArrayToRangesString(allMatchs, allMatchCount) + - " equalKeySeriesValueCounts " + longArrayToRangesString(equalKeySeriesValueCounts, equalKeySeriesCount) + - " equalKeySeriesAllMatchIndices " + intArrayToRangesString(equalKeySeriesAllMatchIndices, equalKeySeriesCount) + - " equalKeySeriesDuplicateCounts " + intArrayToRangesString(equalKeySeriesDuplicateCounts, equalKeySeriesCount) + - " spills " + intArrayToRangesString(spills, spillCount) + - " spillHashMapResultIndices " + intArrayToRangesString(spillHashMapResultIndices, spillCount) + - " hashMapResults " + Arrays.toString(Arrays.copyOfRange(hashMultiSetResults, 0, hashMultiSetResultCount))); + if (!bytesKeySeries.next()) { + break; } - - finishInnerBigOnly(batch, - allMatchCount, equalKeySeriesCount, spillCount, - (VectorMapJoinHashTableResult[]) hashMultiSetResults, hashMultiSetResultCount); + } while (true); + + if (isLogDebugEnabled) { + LOG.debug(CLASS_NAME + " batch #" + batchCounter + " batch info " + + " filteredSize " + filteredSize + + " matchCount " + matchCount + + " matchLogicalIndices " + intArrayToRangesString(matchLogicalIndices, matchCount) + + " matchDuplicateCounts " + Arrays.toString(Arrays.copyOfRange(matchDuplicateCounts, 0, matchCount)) + + " spillCount " + spillCount + + " spillLogicalIndices " + intArrayToRangesString(spillLogicalIndices, spillCount) + + " spillDuplicateCounts " + Arrays.toString(Arrays.copyOfRange(spillDuplicateCounts, 0, spillCount))); } + finishInnerBigOnly(batch, matchCount, spillCount); + if (batch.size > 0) { // Forward any remaining selected rows. 
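One detail worth calling out in the MATCH/SPILL handling above: the pre-allocated hash multi-set result entry is only consumed (hashMultiSetResultCount++) when a series spills, because the spill arrays keep a reference to it until spillHashMapBatch runs; a matched series only needs the extracted count, so its result object can be reused for the next probe. A sketch of that retain-on-spill pattern, with all names hypothetical:

public final class ResultRetentionSketch {

  static final class Result {
    long count; // Whatever the probe filled in for the current series.
  }

  /**
   * Walks series outcomes, reusing one pre-allocated Result per pending probe
   * and advancing to the next slot only when the current one must survive
   * until the spilled rows are written out. Returns the spill series count.
   */
  public static int processSeries(boolean[] seriesSpilled, long[] seriesCounts,
      Result[] preallocated, Result[] spillResults) {
    int resultIndex = 0;
    int spillCount = 0;
    for (int s = 0; s < seriesSpilled.length; s++) {
      Result result = preallocated[resultIndex];
      result.count = seriesCounts[s];
      if (seriesSpilled[s]) {
        spillResults[spillCount++] = result; // Retained: spill code reads it later.
        resultIndex++;                       // Next series needs a fresh slot.
      }
      // Matched or unmatched series: the count is already extracted, so the
      // same slot is reused for the next probe.
    }
    return spillCount;
  }
}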
forwardBigTableBatch(batch); diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinInnerGenerateResultOperator.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinInnerGenerateResultOperator.java index 319a2b0..7eb16bb 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinInnerGenerateResultOperator.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinInnerGenerateResultOperator.java @@ -19,18 +19,13 @@ package org.apache.hadoop.hive.ql.exec.vector.mapjoin; import java.io.IOException; - import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.hive.ql.CompilationOpContext; -import org.apache.hadoop.hive.ql.exec.JoinUtil; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashMapResult; import org.apache.hadoop.hive.ql.exec.vector.ColumnVector; import org.apache.hadoop.hive.ql.exec.vector.VectorizationContext; import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; -import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpression; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashMap; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashTableResult; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashMapResult; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.plan.OperatorDesc; @@ -59,36 +54,25 @@ // An array of hash map results so we can do lookups on the whole batch before output result // generation. - protected transient VectorMapJoinHashMapResult hashMapResults[]; + protected transient MapJoinHashMapResult[] hashMapResults; - // Pre-allocated member for storing the (physical) batch index of matching row (single- or - // multi-small-table-valued) indexes during a process call. - protected transient int[] allMatchs; + // Pre-allocated member for storing the new logical batch index (within matchSelected), + // series count of rows, and hash map results that matched. + protected transient int[] matchLogicalIndices; + protected transient int[] matchDuplicateCounts; + protected transient boolean[] matchIsSingleValue; + protected transient MapJoinHashMapResult[] matchHashMapResults; - /* - * Pre-allocated members for storing information equal key series for small-table matches. - * - * ~HashMapResultIndices - * Index into the hashMapResults array for the match. - * ~AllMatchIndices - * (Logical) indices into allMatchs to the first row of a match of a - * possible series of duplicate keys. - * ~IsSingleValue - * Whether there is 1 or multiple small table values. - * ~DuplicateCounts - * The duplicate count for each matched key. - * - */ - protected transient int[] equalKeySeriesHashMapResultIndices; - protected transient int[] equalKeySeriesAllMatchIndices; - protected transient boolean[] equalKeySeriesIsSingleValue; - protected transient int[] equalKeySeriesDuplicateCounts; + // Pre-allocated member for storing the logical batch index and series count of rows that spilled. + protected transient int[] spillLogicalIndices; + protected transient int[] spillDuplicateCounts; - // Pre-allocated member for storing the (physical) batch index of rows that need to be spilled. - protected transient int[] spills; + // Pre-allocated member for storing the reference to the hash map results of rows that spilled. 
+ protected transient MapJoinHashMapResult[] spillHashMapResults; - // Pre-allocated member for storing index into the hashMapResults for each spilled row. - protected transient int[] spillHashMapResultIndices; + // Pre-allocated member for storing the batch indices of the matched rows. + protected transient int[] matchSelected; + protected transient int[] resultSelected; /** Kryo ctor. */ protected VectorMapJoinInnerGenerateResultOperator() { @@ -111,22 +95,23 @@ protected void commonSetup(VectorizedRowBatch batch) throws HiveException { super.commonSetup(batch); // Inner join specific. - VectorMapJoinHashMap baseHashMap = (VectorMapJoinHashMap) vectorMapJoinHashTable; - - hashMapResults = new VectorMapJoinHashMapResult[batch.DEFAULT_SIZE]; + hashMapResults = new MapJoinHashMapResult[batch.DEFAULT_SIZE]; for (int i = 0; i < hashMapResults.length; i++) { - hashMapResults[i] = baseHashMap.createHashMapResult(); + hashMapResults[i] = vectormapJoinHashTableFactory.createHashMapResult(); } - allMatchs = new int[batch.DEFAULT_SIZE]; + matchLogicalIndices = new int[batch.DEFAULT_SIZE]; + matchDuplicateCounts = new int[batch.DEFAULT_SIZE]; + matchIsSingleValue = new boolean[batch.DEFAULT_SIZE]; + matchHashMapResults = new MapJoinHashMapResult[batch.DEFAULT_SIZE]; - equalKeySeriesHashMapResultIndices = new int[batch.DEFAULT_SIZE]; - equalKeySeriesAllMatchIndices = new int[batch.DEFAULT_SIZE]; - equalKeySeriesIsSingleValue = new boolean[batch.DEFAULT_SIZE]; - equalKeySeriesDuplicateCounts = new int[batch.DEFAULT_SIZE]; + spillLogicalIndices = new int[batch.DEFAULT_SIZE]; + spillDuplicateCounts = new int[batch.DEFAULT_SIZE]; + spillHashMapResults = new MapJoinHashMapResult[batch.DEFAULT_SIZE]; + + matchSelected = new int[batch.DEFAULT_SIZE]; + resultSelected = new int[batch.DEFAULT_SIZE]; - spills = new int[batch.DEFAULT_SIZE]; - spillHashMapResultIndices = new int[batch.DEFAULT_SIZE]; } /* @@ -153,82 +138,79 @@ protected void innerPerBatchSetup(VectorizedRowBatch batch) { * @param batch * The big table batch with any matching and any non matching rows both as * selected in use. - * @param allMatchCount - * Number of matches in allMatchs. - * @param equalKeySeriesCount - * Number of single value matches. - * @param spillCount - * Number of spills in spills. - * @param hashMapResultCount - * Number of entries in hashMapResults. + * @param matchSeriesCount + * Number of matches in the match* arrays. + * @param matchSelectedCount + * The selected count in matchSelected. + * @param spillSeriesCount + * Number of spills in spill* arrays. */ - protected void finishInner(VectorizedRowBatch batch, - int allMatchCount, int equalKeySeriesCount, int spillCount, int hashMapResultCount) - throws HiveException, IOException { + protected void finishInner(VectorizedRowBatch batch, int matchSeriesCount, int spillSeriesCount) + throws HiveException, IOException { - int numSel = 0; + final int size = batch.size; + boolean selectedInUse = batch.selectedInUse; + int[] selected = batch.selected; - /* - * Optimize by running value expressions only over the matched rows. - */ - if (allMatchCount > 0 && bigTableValueExpressions != null) { - performValueExpressions(batch, allMatchs, allMatchCount); - } + // Dump out the spill rows now. 
+ if (spillSeriesCount > 0) { - for (int i = 0; i < equalKeySeriesCount; i++) { - int hashMapResultIndex = equalKeySeriesHashMapResultIndices[i]; - VectorMapJoinHashMapResult hashMapResult = hashMapResults[hashMapResultIndex]; - int allMatchesIndex = equalKeySeriesAllMatchIndices[i]; - boolean isSingleValue = equalKeySeriesIsSingleValue[i]; - int duplicateCount = equalKeySeriesDuplicateCounts[i]; + spillHashMapBatch(batch, spillLogicalIndices, spillDuplicateCounts, spillHashMapResults, + spillSeriesCount, selectedInUse, selected, size); - if (isSingleValue) { - numSel = generateHashMapResultSingleValue( - batch, hashMapResult, allMatchs, allMatchesIndex, duplicateCount, numSel); - } else { - generateHashMapResultMultiValue( - batch, hashMapResult, allMatchs, allMatchesIndex, duplicateCount); + if (spillSeriesCount == size) { + batch.size = 0; + batch.selectedInUse = false; + return; } } - if (spillCount > 0) { - spillHashMapBatch(batch, (VectorMapJoinHashTableResult[]) hashMapResults, - spills, spillHashMapResultIndices, spillCount); - } + /* + * Optimize by running value expressions only over the matched rows. + */ + int matchSelectedCount = 0; + if (matchSeriesCount > 0 && bigTableValueExpressions != null) { - batch.size = numSel; - batch.selectedInUse = true; - } + // Create matchSelected by adding the batch indices for the match logical index ranges from + // the input batch range. + matchSelectedCount = + flattenLogicalSeriesIntoSelected( + selectedInUse, selected, size, + matchLogicalIndices, matchDuplicateCounts, matchSeriesCount, + matchSelected); - protected void finishInnerRepeated(VectorizedRowBatch batch, JoinUtil.JoinResult joinResult, - VectorMapJoinHashTableResult hashMapResult) throws HiveException, IOException { + performValueExpressions(batch, matchSelected, matchSelectedCount); - int numSel = 0; + } - switch (joinResult) { - case MATCH: + // Output matches + for (int i = 0; i < matchSeriesCount; i++) { + MapJoinHashMapResult hashMapResult = matchHashMapResults[i]; + int logical = matchLogicalIndices[i]; + int duplicateCount = matchDuplicateCounts[i]; - if (bigTableValueExpressions != null) { - // Run our value expressions over whole batch. - for(VectorExpression ve: bigTableValueExpressions) { - ve.evaluate(batch); - } + if (matchIsSingleValue[i]) { + generateHashMapResultSingleValue(batch, hashMapResult, + logical, duplicateCount, selectedInUse, selected, size); + } else { + generateHashMapResultMultiValue(batch, hashMapResult, + logical, duplicateCount, selectedInUse, selected, size); } + } - // Generate special repeated case. - generateHashMapResultRepeatedAll(batch, hashMapResults[0]); - break; + // Create selected by not including match logical indices for multi-value small table results + // from input batch logical range, and then putting the batch indices in selected. - case SPILL: - // Whole batch is spilled. - spillBatchRepeated(batch, (VectorMapJoinHashTableResult) hashMapResults[0]); - batch.size = 0; - break; + int numSel = + makeMatchSelectedWithoutMultiValues( + selectedInUse, selected, size, + matchLogicalIndices, matchDuplicateCounts, matchIsSingleValue, matchSeriesCount, + resultSelected); - case NOMATCH: - // No match for entire batch. 
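finishInner above splits matched series by matchIsSingleValue: single-value matches keep the big-table row in place (the lone small-table value is written into the batch), while multi-value matches are emitted through the overflow path as cross products and must therefore be excluded from the batch's final selected array. A sketch of what a helper like makeMatchSelectedWithoutMultiValues could compute, under the same logical-index assumptions as the flatten sketch earlier; the real implementation may differ:

public final class SingleValueSelectionSketch {

  /**
   * Builds the final selected array from match series, keeping only rows whose
   * key had exactly one small-table value. Returns the new selected size.
   */
  public static int makeMatchSelectedWithoutMultiValues(
      boolean selectedInUse, int[] selected,
      int[] seriesLogical, int[] seriesDuplicateCount, boolean[] seriesIsSingleValue,
      int seriesCount, int[] resultSelected) {
    int numSel = 0;
    for (int s = 0; s < seriesCount; s++) {
      if (!seriesIsSingleValue[s]) {
        continue; // Multi-value rows were already emitted via the overflow batch.
      }
      int logical = seriesLogical[s];
      for (int d = 0; d < seriesDuplicateCount[s]; d++) {
        int position = logical + d;
        resultSelected[numSel++] = selectedInUse ? selected[position] : position;
      }
    }
    return numSel;
  }
}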
- batch.size = 0; - break; + batch.selectedInUse = true; + batch.size = numSel; + if (numSel > 0) { + System.arraycopy(resultSelected, 0, selected, 0, numSel); } } } diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinInnerLongOperator.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinInnerLongOperator.java index 804d69c..8b7755f 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinInnerLongOperator.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinInnerLongOperator.java @@ -25,17 +25,19 @@ import org.slf4j.LoggerFactory; import org.apache.hadoop.hive.ql.CompilationOpContext; import org.apache.hadoop.hive.ql.exec.JoinUtil; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashMapResult; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableFind; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableResult; import org.apache.hadoop.hive.ql.exec.vector.VectorizationContext; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedBatchUtil; import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpression; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.plan.OperatorDesc; -// Single-Column Long hash table import. -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinLongHashMap; - // Single-Column Long specific imports. -import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.keyseries.VectorKeySeriesLong; +import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; /* * Specialized class for doing a vectorized map join that is an inner join on a Single-Column Long @@ -44,8 +46,17 @@ public class VectorMapJoinInnerLongOperator extends VectorMapJoinInnerGenerateResultOperator { private static final long serialVersionUID = 1L; - private static final Logger LOG = LoggerFactory.getLogger(VectorMapJoinInnerLongOperator.class.getName()); + + //------------------------------------------------------------------------------------------------ + private static final String CLASS_NAME = VectorMapJoinInnerLongOperator.class.getName(); + private static final Logger LOG = LoggerFactory.getLogger(CLASS_NAME); + + protected String getLoggingPrefix() { + return super.getLoggingPrefix(CLASS_NAME); + } + + //------------------------------------------------------------------------------------------------ // (none) @@ -54,7 +65,7 @@ //--------------------------------------------------------------------------- // The hash map for this specialized class. - private transient VectorMapJoinLongHashMap hashMap; + private transient MapJoinHashTableFind hashMap; //--------------------------------------------------------------------------- // Single-Column Long specific members. @@ -67,6 +78,10 @@ // The column number for this one column join specialization. private transient int singleJoinColumn; + private transient PrimitiveTypeInfo singleJoinColumnPrimitiveTypeInfo; + + // The object that determines equal key series. + private transient VectorKeySeriesLong longKeySeries; //--------------------------------------------------------------------------- // Pass-thru constructors. 
@@ -107,6 +122,10 @@ public void process(Object row, int tag) throws HiveException { */ singleJoinColumn = bigTableKeyColumnMap[0]; + singleJoinColumnPrimitiveTypeInfo = (PrimitiveTypeInfo) bigTableKeyTypeInfos[0]; + + longKeySeries = + new VectorKeySeriesLong(singleJoinColumn, singleJoinColumnPrimitiveTypeInfo); needCommonSetup = false; } @@ -119,7 +138,7 @@ public void process(Object row, int tag) throws HiveException { * Get our Single-Column Long hash map information for this specialized class. */ - hashMap = (VectorMapJoinLongHashMap) vectorMapJoinHashTable; + hashMap = vectorMapJoinHashTableFind; useMinMax = hashMap.useMinMax(); if (useMinMax) { min = hashMap.min(); @@ -130,6 +149,7 @@ public void process(Object row, int tag) throws HiveException { } batchCounter++; + inputRowCounter += batch.size; // Do the per-batch setup for an inner join. @@ -140,9 +160,9 @@ public void process(Object row, int tag) throws HiveException { ve.evaluate(batch); } - final int inputLogicalSize = batch.size; + final int filteredSize = batch.size; - if (inputLogicalSize == 0) { + if (filteredSize == 0) { if (isLogDebugEnabled) { LOG.debug(CLASS_NAME + " batch #" + batchCounter + " empty"); } @@ -156,231 +176,89 @@ public void process(Object row, int tag) throws HiveException { } } - /* - * Single-Column Long specific declarations. - */ - - // The one join column for this specialized class. - LongColumnVector joinColVector = (LongColumnVector) batch.cols[singleJoinColumn]; - long[] vector = joinColVector.vector; + longKeySeries.processBatch(batch); - /* - * Single-Column Long check for repeating. - */ + int matchSeriesCount = 0; + int spillSeriesCount = 0; + int hashMapResultCount = 0; - // Check single column for repeating. - boolean allKeyInputColumnsRepeating = joinColVector.isRepeating; + MapJoinHashMapResult hashMapResult; + MapJoinHashTableResult.MapJoinResult lookupResult; + long key; + do { + // Use the next hash map result entry. + hashMapResult = hashMapResults[hashMapResultCount]; - if (allKeyInputColumnsRepeating) { - - /* - * Repeating. - */ + if (longKeySeries.getCurrentKeyAllNull()) { - // All key input columns are repeating. Generate key once. Lookup once. - // Since the key is repeated, we must use entry 0 regardless of selectedInUse. + // CONSIDER: Add support for NullSafe option. - /* - * Single-Column Long specific repeated lookup. - */ + lookupResult = MapJoinHashTableResult.MapJoinResult.NO_MATCH; - JoinUtil.JoinResult joinResult; - if (!joinColVector.noNulls && joinColVector.isNull[0]) { - joinResult = JoinUtil.JoinResult.NOMATCH; } else { - long key = vector[0]; + + key = longKeySeries.getCurrentKey(); if (useMinMax && (key < min || key > max)) { // Out of range for whole batch. - joinResult = JoinUtil.JoinResult.NOMATCH; + lookupResult = MapJoinHashTableResult.MapJoinResult.NO_MATCH; } else { - joinResult = hashMap.lookup(key, hashMapResults[0]); + hashMap.hashMapLookup( + key, + longKeySeries.getCurrentHashCode(), + hashMapResult); + lookupResult = hashMapResult.getMapJoinResult(); } - } - /* - * Common repeated join result processing. - */ - - if (isLogDebugEnabled) { - LOG.debug(CLASS_NAME + " batch #" + batchCounter + " repeated joinResult " + joinResult.name()); } - finishInnerRepeated(batch, joinResult, hashMapResults[0]); - } else { /* - * NOT Repeating. + * Common inner join result processing. 
*/ - if (isLogDebugEnabled) { - LOG.debug(CLASS_NAME + " batch #" + batchCounter + " non-repeated"); + switch (lookupResult) { + case MATCH: + matchLogicalIndices[matchSeriesCount] = longKeySeries.getCurrentLogical(); + matchDuplicateCounts[matchSeriesCount] = longKeySeries.getCurrentDuplicateCount(); + matchIsSingleValue[matchSeriesCount] = hashMapResult.isSingleRow(); + matchHashMapResults[matchSeriesCount] = hashMapResult; + matchSeriesCount++; + hashMapResultCount++; + break; + + case SPILL: + spillLogicalIndices[spillSeriesCount] = longKeySeries.getCurrentLogical(); + spillDuplicateCounts[spillSeriesCount] = longKeySeries.getCurrentDuplicateCount(); + spillHashMapResults[spillSeriesCount] = hashMapResult; + spillSeriesCount++; + hashMapResultCount++; + spilledRowCounter += longKeySeries.getCurrentDuplicateCount(); + break; + + case NO_MATCH: + break; + + default: + throw new RuntimeException("Unexpected lookup result " + lookupResult.name()); } - // We remember any matching rows in matchs / matchSize. At the end of the loop, - // selected / batch.size will represent both matching and non-matching rows for outer join. - // Only deferred rows will have been removed from selected. - int selected[] = batch.selected; - boolean selectedInUse = batch.selectedInUse; - - int hashMapResultCount = 0; - int allMatchCount = 0; - int equalKeySeriesCount = 0; - int spillCount = 0; - - /* - * Single-Column Long specific variables. - */ - - long saveKey = 0; - - // We optimize performance by only looking up the first key in a series of equal keys. - boolean haveSaveKey = false; - JoinUtil.JoinResult saveJoinResult = JoinUtil.JoinResult.NOMATCH; - - // Logical loop over the rows in the batch since the batch may have selected in use. - for (int logical = 0; logical < inputLogicalSize; logical++) { - int batchIndex = (selectedInUse ? selected[logical] : logical); - - /* - * Single-Column Long get key. - */ - - long currentKey; - boolean isNull; - if (!joinColVector.noNulls && joinColVector.isNull[batchIndex]) { - currentKey = 0; - isNull = true; - } else { - currentKey = vector[batchIndex]; - isNull = false; - } - - /* - * Equal key series checking. - */ - - if (isNull || !haveSaveKey || currentKey != saveKey) { - - // New key. - - if (haveSaveKey) { - // Move on with our counts. - switch (saveJoinResult) { - case MATCH: - hashMapResultCount++; - equalKeySeriesCount++; - break; - case SPILL: - hashMapResultCount++; - break; - case NOMATCH: - break; - } - } - - if (isNull) { - saveJoinResult = JoinUtil.JoinResult.NOMATCH; - haveSaveKey = false; - } else { - // Regardless of our matching result, we keep that information to make multiple use - // of it for a possible series of equal keys. - haveSaveKey = true; - - /* - * Single-Column Long specific save key. - */ - - saveKey = currentKey; - - /* - * Single-Column Long specific lookup key. - */ - - if (useMinMax && (currentKey < min || currentKey > max)) { - // Key out of range for whole hash table. - saveJoinResult = JoinUtil.JoinResult.NOMATCH; - } else { - saveJoinResult = hashMap.lookup(currentKey, hashMapResults[hashMapResultCount]); - } - } - - /* - * Common inner join result processing. 
- */ - - switch (saveJoinResult) { - case MATCH: - equalKeySeriesHashMapResultIndices[equalKeySeriesCount] = hashMapResultCount; - equalKeySeriesAllMatchIndices[equalKeySeriesCount] = allMatchCount; - equalKeySeriesIsSingleValue[equalKeySeriesCount] = hashMapResults[hashMapResultCount].isSingleRow(); - equalKeySeriesDuplicateCounts[equalKeySeriesCount] = 1; - allMatchs[allMatchCount++] = batchIndex; - // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " MATCH isSingleValue " + equalKeySeriesIsSingleValue[equalKeySeriesCount] + " currentKey " + currentKey); - break; - - case SPILL: - spills[spillCount] = batchIndex; - spillHashMapResultIndices[spillCount] = hashMapResultCount; - spillCount++; - break; - - case NOMATCH: - // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " NOMATCH" + " currentKey " + currentKey); - break; - } - } else { - // Series of equal keys. - - switch (saveJoinResult) { - case MATCH: - equalKeySeriesDuplicateCounts[equalKeySeriesCount]++; - allMatchs[allMatchCount++] = batchIndex; - // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " MATCH duplicate"); - break; - - case SPILL: - spills[spillCount] = batchIndex; - spillHashMapResultIndices[spillCount] = hashMapResultCount; - spillCount++; - break; - - case NOMATCH: - // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " NOMATCH duplicate"); - break; - } - } - } - - if (haveSaveKey) { - // Update our counts for the last key. - switch (saveJoinResult) { - case MATCH: - hashMapResultCount++; - equalKeySeriesCount++; - break; - case SPILL: - hashMapResultCount++; - break; - case NOMATCH: - break; - } + if (!longKeySeries.next()) { + break; } - - if (isLogDebugEnabled) { - LOG.debug(CLASS_NAME + - " allMatchs " + intArrayToRangesString(allMatchs,allMatchCount) + - " equalKeySeriesHashMapResultIndices " + intArrayToRangesString(equalKeySeriesHashMapResultIndices, equalKeySeriesCount) + - " equalKeySeriesAllMatchIndices " + intArrayToRangesString(equalKeySeriesAllMatchIndices, equalKeySeriesCount) + - " equalKeySeriesIsSingleValue " + Arrays.toString(Arrays.copyOfRange(equalKeySeriesIsSingleValue, 0, equalKeySeriesCount)) + - " equalKeySeriesDuplicateCounts " + Arrays.toString(Arrays.copyOfRange(equalKeySeriesDuplicateCounts, 0, equalKeySeriesCount)) + - " spills " + intArrayToRangesString(spills, spillCount) + - " spillHashMapResultIndices " + intArrayToRangesString(spillHashMapResultIndices, spillCount) + - " hashMapResults " + Arrays.toString(Arrays.copyOfRange(hashMapResults, 0, hashMapResultCount))); - } - - finishInner(batch, - allMatchCount, equalKeySeriesCount, spillCount, hashMapResultCount); + } while (true); + + if (isLogDebugEnabled) { + LOG.debug(CLASS_NAME + " batch #" + batchCounter + " batch info " + + " filteredSize " + filteredSize + + " matchSeriesCount " + matchSeriesCount + + " matchLogicalIndices " + intArrayToRangesString(matchLogicalIndices, matchSeriesCount) + + " matchDuplicateCounts " + Arrays.toString(Arrays.copyOfRange(matchDuplicateCounts, 0, matchSeriesCount)) + + " spillSeriesCount " + spillSeriesCount + + " spillLogicalIndices " + intArrayToRangesString(spillLogicalIndices, spillSeriesCount) + + " spillDuplicateCounts " + Arrays.toString(Arrays.copyOfRange(spillDuplicateCounts, 0, spillSeriesCount))); } + finishInner(batch, matchSeriesCount, spillSeriesCount); + if (batch.size > 0) { // Forward any remaining selected rows. 
forwardBigTableBatch(batch); diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinInnerMultiKeyOperator.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinInnerMultiKeyOperator.java index fcfa0bd..c4e16b7 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinInnerMultiKeyOperator.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinInnerMultiKeyOperator.java @@ -25,18 +25,18 @@ import org.slf4j.LoggerFactory; import org.apache.hadoop.hive.ql.CompilationOpContext; import org.apache.hadoop.hive.ql.exec.JoinUtil; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashMapResult; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableFind; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableResult; import org.apache.hadoop.hive.ql.exec.vector.VectorizationContext; import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpression; +import org.apache.hadoop.hive.ql.exec.vector.keyseries.fast.VectorKeySeriesMultiFast; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.plan.OperatorDesc; -// Multi-Key hash table import. -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinBytesHashMap; // Multi-Key specific imports. -import org.apache.hadoop.hive.ql.exec.vector.VectorSerializeRow; -import org.apache.hadoop.hive.serde2.ByteStream.Output; import org.apache.hadoop.hive.serde2.binarysortable.fast.BinarySortableSerializeWrite; /* @@ -46,8 +46,17 @@ public class VectorMapJoinInnerMultiKeyOperator extends VectorMapJoinInnerGenerateResultOperator { private static final long serialVersionUID = 1L; - private static final Logger LOG = LoggerFactory.getLogger(VectorMapJoinInnerMultiKeyOperator.class.getName()); + + //------------------------------------------------------------------------------------------------ + private static final String CLASS_NAME = VectorMapJoinInnerMultiKeyOperator.class.getName(); + private static final Logger LOG = LoggerFactory.getLogger(VectorMapJoinInnerMultiKeyOperator.class.getName()); + + protected String getLoggingPrefix() { + return super.getLoggingPrefix(CLASS_NAME); + } + + //------------------------------------------------------------------------------------------------ // (none) @@ -56,21 +65,17 @@ //--------------------------------------------------------------------------- // The hash map for this specialized class. - private transient VectorMapJoinBytesHashMap hashMap; + private transient MapJoinHashTableFind hashMap; //--------------------------------------------------------------------------- // Multi-Key specific members. // - // Object that can take a set of columns in row in a vectorized row batch and serialized it. - // Known to not have any nulls. - private transient VectorSerializeRow keyVectorSerializeWrite; - - // The BinarySortable serialization of the current key. - private transient Output currentKeyOutput; + // Binary sortable multi-key serializer. + private transient BinarySortableSerializeWrite binarySortableSerializeWrite; - // The BinarySortable serialization of the saved key for a possible series of equal keys. - private transient Output saveKeyOutput; + // The object that determines equal key series. 
+ private transient VectorKeySeriesMultiFast serializedMultiKeySeries; //--------------------------------------------------------------------------- // Pass-thru constructors. @@ -110,12 +115,15 @@ public void process(Object row, int tag) throws HiveException { * Initialize Multi-Key members for this specialized class. */ - keyVectorSerializeWrite = new VectorSerializeRow( - new BinarySortableSerializeWrite(bigTableKeyColumnMap.length)); - keyVectorSerializeWrite.init(bigTableKeyTypeNames, bigTableKeyColumnMap); + binarySortableSerializeWrite = + new BinarySortableSerializeWrite(bigTableKeyColumnMap.length); - currentKeyOutput = new Output(); - saveKeyOutput = new Output(); + // For Multi-Key the Fast hash table computes the serialized key's hash code, + // so we don't use VectorKeySeriesMulti's hash code. + serializedMultiKeySeries = + new VectorKeySeriesMultiFast( + binarySortableSerializeWrite); + serializedMultiKeySeries.init(bigTableKeyTypeInfos, bigTableKeyColumnMap); needCommonSetup = false; } @@ -128,12 +136,13 @@ public void process(Object row, int tag) throws HiveException { * Get our Multi-Key hash map information for this specialized class. */ - hashMap = (VectorMapJoinBytesHashMap) vectorMapJoinHashTable; + hashMap = vectorMapJoinHashTableFind; needHashTableSetup = false; } batchCounter++; + inputRowCounter += batch.size; // Do the per-batch setup for an inner join. @@ -144,9 +153,9 @@ public void process(Object row, int tag) throws HiveException { ve.evaluate(batch); } - final int inputLogicalSize = batch.size; + final int filteredSize = batch.size; - if (inputLogicalSize == 0) { + if (filteredSize == 0) { if (isLogDebugEnabled) { LOG.debug(CLASS_NAME + " batch #" + batchCounter + " empty"); } @@ -160,233 +169,85 @@ public void process(Object row, int tag) throws HiveException { } } - /* - * Multi-Key specific declarations. - */ - - // None. - - /* - * Multi-Key check for repeating. - */ - - // If all BigTable input columns to key expressions are isRepeating, then - // calculate key once; lookup once. - boolean allKeyInputColumnsRepeating; - if (bigTableKeyColumnMap.length == 0) { - allKeyInputColumnsRepeating = false; - } else { - allKeyInputColumnsRepeating = true; - for (int i = 0; i < bigTableKeyColumnMap.length; i++) { - if (!batch.cols[bigTableKeyColumnMap[i]].isRepeating) { - allKeyInputColumnsRepeating = false; - break; - } - } - } + serializedMultiKeySeries.processBatch(batch); - if (allKeyInputColumnsRepeating) { - - /* - * Repeating. - */ + int matchSeriesCount = 0; + int spillSeriesCount = 0; + int hashMapResultCount = 0; - // All key input columns are repeating. Generate key once. Lookup once. - // Since the key is repeated, we must use entry 0 regardless of selectedInUse. + MapJoinHashMapResult hashMapResult; + MapJoinHashTableResult.MapJoinResult lookupResult; + do { + // Use the next hash map result entry. + hashMapResult = hashMapResults[hashMapResultCount]; - /* - * Multi-Key specific repeated lookup. - */ + // NOTE: Any null column in the key for inner join is a non-match. 
+ if (serializedMultiKeySeries.getCurrentKeyHasAnyNulls()) { - keyVectorSerializeWrite.setOutput(currentKeyOutput); - keyVectorSerializeWrite.serializeWrite(batch, 0); - JoinUtil.JoinResult joinResult; - if (keyVectorSerializeWrite.getHasAnyNulls()) { - joinResult = JoinUtil.JoinResult.NOMATCH; - } else { - byte[] keyBytes = currentKeyOutput.getData(); - int keyLength = currentKeyOutput.getLength(); - joinResult = hashMap.lookup(keyBytes, 0, keyLength, hashMapResults[0]); - } + // CONSIDER: Add support for NullSafe option. - /* - * Common repeated join result processing. - */ + lookupResult = MapJoinHashTableResult.MapJoinResult.NO_MATCH; - if (isLogDebugEnabled) { - LOG.debug(CLASS_NAME + " batch #" + batchCounter + " repeated joinResult " + joinResult.name()); - } - finishInnerRepeated(batch, joinResult, hashMapResults[0]); - } else { + } else { - /* - * NOT Repeating. - */ + hashMap.hashMapLookup( + serializedMultiKeySeries.getSerializedBytes(), + serializedMultiKeySeries.getSerializedStart(), + serializedMultiKeySeries.getSerializedLength(), + serializedMultiKeySeries.getCurrentHashCode(), + hashMapResult); + lookupResult = hashMapResult.getMapJoinResult(); - if (isLogDebugEnabled) { - LOG.debug(CLASS_NAME + " batch #" + batchCounter + " non-repeated"); } - // We remember any matching rows in matchs / matchSize. At the end of the loop, - // selected / batch.size will represent both matching and non-matching rows for outer join. - // Only deferred rows will have been removed from selected. - int selected[] = batch.selected; - boolean selectedInUse = batch.selectedInUse; - - int hashMapResultCount = 0; - int allMatchCount = 0; - int equalKeySeriesCount = 0; - int spillCount = 0; - /* - * Multi-Key specific variables. + * Common inner join result processing. */ - Output temp; - - // We optimize performance by only looking up the first key in a series of equal keys. - boolean haveSaveKey = false; - JoinUtil.JoinResult saveJoinResult = JoinUtil.JoinResult.NOMATCH; - - // Logical loop over the rows in the batch since the batch may have selected in use. - for (int logical = 0; logical < inputLogicalSize; logical++) { - int batchIndex = (selectedInUse ? selected[logical] : logical); - - /* - * Multi-Key get key. - */ - - // Generate binary sortable key for current row in vectorized row batch. - keyVectorSerializeWrite.setOutput(currentKeyOutput); - keyVectorSerializeWrite.serializeWrite(batch, batchIndex); - boolean isAnyNull = keyVectorSerializeWrite.getHasAnyNulls(); - - /* - * Equal key series checking. - */ - - if (isAnyNull || !haveSaveKey || !saveKeyOutput.arraysEquals(currentKeyOutput)) { - - // New key. - - if (haveSaveKey) { - // Move on with our counts. - switch (saveJoinResult) { - case MATCH: - hashMapResultCount++; - equalKeySeriesCount++; - break; - case SPILL: - hashMapResultCount++; - break; - case NOMATCH: - break; - } - } - - if (isAnyNull) { - saveJoinResult = JoinUtil.JoinResult.NOMATCH; - haveSaveKey = false; - } else { - // Regardless of our matching result, we keep that information to make multiple use - // of it for a possible series of equal keys. - haveSaveKey = true; - - /* - * Multi-Key specific save key. - */ - - temp = saveKeyOutput; - saveKeyOutput = currentKeyOutput; - currentKeyOutput = temp; - - /* - * Multi-Key specific lookup key. 
- */ - - byte[] keyBytes = saveKeyOutput.getData(); - int keyLength = saveKeyOutput.getLength(); - saveJoinResult = hashMap.lookup(keyBytes, 0, keyLength, hashMapResults[hashMapResultCount]); - } - - /* - * Common inner join result processing. - */ - - switch (saveJoinResult) { - case MATCH: - equalKeySeriesHashMapResultIndices[equalKeySeriesCount] = hashMapResultCount; - equalKeySeriesAllMatchIndices[equalKeySeriesCount] = allMatchCount; - equalKeySeriesIsSingleValue[equalKeySeriesCount] = hashMapResults[hashMapResultCount].isSingleRow(); - equalKeySeriesDuplicateCounts[equalKeySeriesCount] = 1; - allMatchs[allMatchCount++] = batchIndex; - // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " MATCH isSingleValue " + equalKeySeriesIsSingleValue[equalKeySeriesCount] + " currentKey " + currentKey); - break; - - case SPILL: - spills[spillCount] = batchIndex; - spillHashMapResultIndices[spillCount] = hashMapResultCount; - spillCount++; - break; - - case NOMATCH: - // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " NOMATCH" + " currentKey " + currentKey); - break; - } - } else { - // Series of equal keys. - - switch (saveJoinResult) { - case MATCH: - equalKeySeriesDuplicateCounts[equalKeySeriesCount]++; - allMatchs[allMatchCount++] = batchIndex; - // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " MATCH duplicate"); - break; - - case SPILL: - spills[spillCount] = batchIndex; - spillHashMapResultIndices[spillCount] = hashMapResultCount; - spillCount++; - break; - - case NOMATCH: - // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " NOMATCH duplicate"); - break; - } - } - } - - if (haveSaveKey) { - // Update our counts for the last key. - switch (saveJoinResult) { - case MATCH: - hashMapResultCount++; - equalKeySeriesCount++; - break; - case SPILL: - hashMapResultCount++; - break; - case NOMATCH: - break; - } + switch (lookupResult) { + case MATCH: + matchLogicalIndices[matchSeriesCount] = serializedMultiKeySeries.getCurrentLogical(); + matchDuplicateCounts[matchSeriesCount] = serializedMultiKeySeries.getCurrentDuplicateCount(); + matchIsSingleValue[matchSeriesCount] = hashMapResult.isSingleRow(); + matchHashMapResults[matchSeriesCount] = hashMapResult; + matchSeriesCount++; + hashMapResultCount++; + break; + + case SPILL: + spillLogicalIndices[spillSeriesCount] = serializedMultiKeySeries.getCurrentLogical(); + spillDuplicateCounts[spillSeriesCount] = serializedMultiKeySeries.getCurrentDuplicateCount(); + spillHashMapResults[spillSeriesCount] = hashMapResult; + spillSeriesCount++; + hashMapResultCount++; + spilledRowCounter += serializedMultiKeySeries.getCurrentDuplicateCount(); + break; + + case NO_MATCH: + break; + + default: + throw new RuntimeException("Unexpected lookup result " + lookupResult.name()); } - if (isLogDebugEnabled) { - LOG.debug(CLASS_NAME + - " allMatchs " + intArrayToRangesString(allMatchs,allMatchCount) + - " equalKeySeriesHashMapResultIndices " + intArrayToRangesString(equalKeySeriesHashMapResultIndices, equalKeySeriesCount) + - " equalKeySeriesAllMatchIndices " + intArrayToRangesString(equalKeySeriesAllMatchIndices, equalKeySeriesCount) + - " equalKeySeriesIsSingleValue " + Arrays.toString(Arrays.copyOfRange(equalKeySeriesIsSingleValue, 0, equalKeySeriesCount)) + - " equalKeySeriesDuplicateCounts " + Arrays.toString(Arrays.copyOfRange(equalKeySeriesDuplicateCounts, 0, equalKeySeriesCount)) + - " spills " + intArrayToRangesString(spills, spillCount) + - " 
spillHashMapResultIndices " + intArrayToRangesString(spillHashMapResultIndices, spillCount) + - " hashMapResults " + Arrays.toString(Arrays.copyOfRange(hashMapResults, 0, hashMapResultCount))); + if (!serializedMultiKeySeries.next()) { + break; } - - finishInner(batch, - allMatchCount, equalKeySeriesCount, spillCount, hashMapResultCount); + } while (true); + + if (isLogDebugEnabled) { + LOG.debug(CLASS_NAME + " batch #" + batchCounter + " batch info " + + " filteredSize " + filteredSize + + " matchSeriesCount " + matchSeriesCount + + " matchLogicalIndices " + intArrayToRangesString(matchLogicalIndices, matchSeriesCount) + + " matchDuplicateCounts " + Arrays.toString(Arrays.copyOfRange(matchDuplicateCounts, 0, matchSeriesCount)) + + " spillSeriesCount " + spillSeriesCount + + " spillLogicalIndices " + intArrayToRangesString(spillLogicalIndices, spillSeriesCount) + + " spillDuplicateCounts " + Arrays.toString(Arrays.copyOfRange(spillDuplicateCounts, 0, spillSeriesCount))); } + finishInner(batch, matchSeriesCount, spillSeriesCount); + if (batch.size > 0) { // Forward any remaining selected rows. forwardBigTableBatch(batch); diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinInnerStringOperator.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinInnerStringOperator.java index 0f9baae..0f2ea55 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinInnerStringOperator.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinInnerStringOperator.java @@ -24,7 +24,9 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.hive.ql.CompilationOpContext; -import org.apache.hadoop.hive.ql.exec.JoinUtil; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashMapResult; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableFind; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableResult; import org.apache.hadoop.hive.ql.exec.vector.VectorizationContext; import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpression; @@ -32,11 +34,9 @@ import org.apache.hadoop.hive.ql.plan.OperatorDesc; // Single-Column String hash table import. -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinBytesHashMap; +import org.apache.hadoop.hive.ql.exec.vector.keyseries.VectorKeySeriesBytes; // Single-Column String specific imports. 
-import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; -import org.apache.hadoop.hive.ql.exec.vector.expressions.StringExpr; /* * Specialized class for doing a vectorized map join that is an inner join on a Single-Column String @@ -45,8 +45,17 @@ public class VectorMapJoinInnerStringOperator extends VectorMapJoinInnerGenerateResultOperator { private static final long serialVersionUID = 1L; - private static final Logger LOG = LoggerFactory.getLogger(VectorMapJoinInnerStringOperator.class.getName()); + + //------------------------------------------------------------------------------------------------ + private static final String CLASS_NAME = VectorMapJoinInnerStringOperator.class.getName(); + private static final Logger LOG = LoggerFactory.getLogger(CLASS_NAME); + + protected String getLoggingPrefix() { + return super.getLoggingPrefix(CLASS_NAME); + } + + //------------------------------------------------------------------------------------------------ // (none) @@ -55,7 +64,7 @@ //--------------------------------------------------------------------------- // The hash map for this specialized class. - private transient VectorMapJoinBytesHashMap hashMap; + private transient MapJoinHashTableFind hashMap; //--------------------------------------------------------------------------- // Single-Column String specific members. @@ -64,6 +73,9 @@ // The column number for this one column join specialization. private transient int singleJoinColumn; + // The object that determines equal key series. + private transient VectorKeySeriesBytes bytesKeySeries; + //--------------------------------------------------------------------------- // Pass-thru constructors. // @@ -104,6 +116,8 @@ public void process(Object row, int tag) throws HiveException { singleJoinColumn = bigTableKeyColumnMap[0]; + bytesKeySeries = new VectorKeySeriesBytes(singleJoinColumn); + needCommonSetup = false; } @@ -115,12 +129,13 @@ public void process(Object row, int tag) throws HiveException { * Get our Single-Column String hash map information for this specialized class. */ - hashMap = (VectorMapJoinBytesHashMap) vectorMapJoinHashTable; + hashMap = vectorMapJoinHashTableFind; needHashTableSetup = false; } batchCounter++; + inputRowCounter += batch.size; // Do the per-batch setup for an inner join. @@ -131,9 +146,9 @@ public void process(Object row, int tag) throws HiveException { ve.evaluate(batch); } - final int inputLogicalSize = batch.size; + final int filteredSize = batch.size; - if (inputLogicalSize == 0) { + if (filteredSize == 0) { if (isLogDebugEnabled) { LOG.debug(CLASS_NAME + " batch #" + batchCounter + " empty"); } @@ -147,222 +162,88 @@ public void process(Object row, int tag) throws HiveException { } } - /* - * Single-Column String specific declarations. - */ + bytesKeySeries.processBatch(batch); - // The one join column for this specialized class. - BytesColumnVector joinColVector = (BytesColumnVector) batch.cols[singleJoinColumn]; - byte[][] vector = joinColVector.vector; - int[] start = joinColVector.start; - int[] length = joinColVector.length; + int matchSeriesCount = 0; + int spillSeriesCount = 0; + int hashMapResultCount = 0; - /* - * Single-Column String check for repeating. - */ + MapJoinHashMapResult hashMapResult; + MapJoinHashTableResult.MapJoinResult lookupResult; + do { + // Use the next hash map result entry. + hashMapResult = hashMapResults[hashMapResultCount]; - // Check single column for repeating. 
- boolean allKeyInputColumnsRepeating = joinColVector.isRepeating; + if (bytesKeySeries.getCurrentKeyAllNull()) { - if (allKeyInputColumnsRepeating) { + // CONSIDER: Add support for NullSafe option. - /* - * Repeating. - */ - - // All key input columns are repeating. Generate key once. Lookup once. - // Since the key is repeated, we must use entry 0 regardless of selectedInUse. + lookupResult = MapJoinHashTableResult.MapJoinResult.NO_MATCH; - /* - * Single-Column String specific repeated lookup. - */ - JoinUtil.JoinResult joinResult; - if (!joinColVector.noNulls && joinColVector.isNull[0]) { - joinResult = JoinUtil.JoinResult.NOMATCH; } else { - byte[] keyBytes = vector[0]; - int keyStart = start[0]; - int keyLength = length[0]; - joinResult = hashMap.lookup(keyBytes, keyStart, keyLength, hashMapResults[0]); - } - /* - * Common repeated join result processing. - */ + hashMap.hashMapLookup( + bytesKeySeries.getCurrentBytes(), + bytesKeySeries.getCurrentStart(), + bytesKeySeries.getCurrentLength(), + bytesKeySeries.getCurrentHashCode(), + hashMapResult); + lookupResult = hashMapResult.getMapJoinResult(); - if (isLogDebugEnabled) { - LOG.debug(CLASS_NAME + " batch #" + batchCounter + " repeated joinResult " + joinResult.name()); - } - finishInnerRepeated(batch, joinResult, hashMapResults[0]); - } else { + // LOG.info(CLASS_NAME + " " + + // VectorizedBatchUtil.displayBytes(bytesKeySeries.currentBytes, bytesKeySeries.currentStart, + // bytesKeySeries.currentLength) + " hashCode " + Integer.toHexString(bytesKeySeries.getCurrentHashCode()) + + // " lookupResult " + lookupResult.name()); - /* - * NOT Repeating. - */ - - if (isLogDebugEnabled) { - LOG.debug(CLASS_NAME + " batch #" + batchCounter + " non-repeated"); } - // We remember any matching rows in matchs / matchSize. At the end of the loop, - // selected / batch.size will represent both matching and non-matching rows for outer join. - // Only deferred rows will have been removed from selected. - int selected[] = batch.selected; - boolean selectedInUse = batch.selectedInUse; - - int hashMapResultCount = 0; - int allMatchCount = 0; - int equalKeySeriesCount = 0; - int spillCount = 0; - /* - * Single-Column String specific variables. + * Common inner join result processing. */ - - int saveKeyBatchIndex = -1; - - // We optimize performance by only looking up the first key in a series of equal keys. - boolean haveSaveKey = false; - JoinUtil.JoinResult saveJoinResult = JoinUtil.JoinResult.NOMATCH; - - // Logical loop over the rows in the batch since the batch may have selected in use. - for (int logical = 0; logical < inputLogicalSize; logical++) { - int batchIndex = (selectedInUse ? selected[logical] : logical); - - /* - * Single-Column String get key. - */ - - // Implicit -- use batchIndex. - boolean isNull = !joinColVector.noNulls && joinColVector.isNull[batchIndex]; - - /* - * Equal key series checking. - */ - - if (isNull || !haveSaveKey || - StringExpr.equal(vector[saveKeyBatchIndex], start[saveKeyBatchIndex], length[saveKeyBatchIndex], - vector[batchIndex], start[batchIndex], length[batchIndex]) == false) { - - // New key. - - if (haveSaveKey) { - // Move on with our counts. 
- switch (saveJoinResult) { - case MATCH: - hashMapResultCount++; - equalKeySeriesCount++; - break; - case SPILL: - hashMapResultCount++; - break; - case NOMATCH: - break; - } - } - - if (isNull) { - saveJoinResult = JoinUtil.JoinResult.NOMATCH; - haveSaveKey = false; - } else { - // Regardless of our matching result, we keep that information to make multiple use - // of it for a possible series of equal keys. - haveSaveKey = true; - - /* - * Single-Column String specific save key. - */ - - saveKeyBatchIndex = batchIndex; - - /* - * Single-Column String specific lookup key. - */ - - byte[] keyBytes = vector[batchIndex]; - int keyStart = start[batchIndex]; - int keyLength = length[batchIndex]; - saveJoinResult = hashMap.lookup(keyBytes, keyStart, keyLength, hashMapResults[hashMapResultCount]); - } - - /* - * Common inner join result processing. - */ - - switch (saveJoinResult) { - case MATCH: - equalKeySeriesHashMapResultIndices[equalKeySeriesCount] = hashMapResultCount; - equalKeySeriesAllMatchIndices[equalKeySeriesCount] = allMatchCount; - equalKeySeriesIsSingleValue[equalKeySeriesCount] = hashMapResults[hashMapResultCount].isSingleRow(); - equalKeySeriesDuplicateCounts[equalKeySeriesCount] = 1; - allMatchs[allMatchCount++] = batchIndex; - // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " MATCH isSingleValue " + equalKeySeriesIsSingleValue[equalKeySeriesCount] + " currentKey " + currentKey); - break; - - case SPILL: - spills[spillCount] = batchIndex; - spillHashMapResultIndices[spillCount] = hashMapResultCount; - spillCount++; - break; - - case NOMATCH: - // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " NOMATCH" + " currentKey " + currentKey); - break; - } - } else { - // Series of equal keys. - - switch (saveJoinResult) { - case MATCH: - equalKeySeriesDuplicateCounts[equalKeySeriesCount]++; - allMatchs[allMatchCount++] = batchIndex; - // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " MATCH duplicate"); - break; - - case SPILL: - spills[spillCount] = batchIndex; - spillHashMapResultIndices[spillCount] = hashMapResultCount; - spillCount++; - break; - - case NOMATCH: - // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " NOMATCH duplicate"); - break; - } - } + switch (lookupResult) { + case MATCH: + matchLogicalIndices[matchSeriesCount] = bytesKeySeries.getCurrentLogical(); + matchDuplicateCounts[matchSeriesCount] = bytesKeySeries.getCurrentDuplicateCount(); + matchIsSingleValue[matchSeriesCount] = hashMapResult.isSingleRow(); + matchHashMapResults[matchSeriesCount] = hashMapResult; + matchSeriesCount++; + hashMapResultCount++; + break; + + case SPILL: + spillLogicalIndices[spillSeriesCount] = bytesKeySeries.getCurrentLogical(); + spillDuplicateCounts[spillSeriesCount] = bytesKeySeries.getCurrentDuplicateCount(); + spillHashMapResults[spillSeriesCount] = hashMapResult; + spillSeriesCount++; + hashMapResultCount++; + spilledRowCounter += bytesKeySeries.getCurrentDuplicateCount(); + break; + + case NO_MATCH: + break; + + default: + throw new RuntimeException("Unexpected lookup result " + lookupResult.name()); } - if (haveSaveKey) { - // Update our counts for the last key. 
- switch (saveJoinResult) { - case MATCH: - hashMapResultCount++; - equalKeySeriesCount++; - break; - case SPILL: - hashMapResultCount++; - break; - case NOMATCH: - break; - } + if (!bytesKeySeries.next()) { + break; } - - if (isLogDebugEnabled) { - LOG.debug(CLASS_NAME + - " allMatchs " + intArrayToRangesString(allMatchs,allMatchCount) + - " equalKeySeriesHashMapResultIndices " + intArrayToRangesString(equalKeySeriesHashMapResultIndices, equalKeySeriesCount) + - " equalKeySeriesAllMatchIndices " + intArrayToRangesString(equalKeySeriesAllMatchIndices, equalKeySeriesCount) + - " equalKeySeriesIsSingleValue " + Arrays.toString(Arrays.copyOfRange(equalKeySeriesIsSingleValue, 0, equalKeySeriesCount)) + - " equalKeySeriesDuplicateCounts " + Arrays.toString(Arrays.copyOfRange(equalKeySeriesDuplicateCounts, 0, equalKeySeriesCount)) + - " spills " + intArrayToRangesString(spills, spillCount) + - " spillHashMapResultIndices " + intArrayToRangesString(spillHashMapResultIndices, spillCount) + - " hashMapResults " + Arrays.toString(Arrays.copyOfRange(hashMapResults, 0, hashMapResultCount))); - } - - finishInner(batch, - allMatchCount, equalKeySeriesCount, spillCount, hashMapResultCount); + } while (true); + + if (isLogDebugEnabled) { + LOG.debug(CLASS_NAME + " batch #" + batchCounter + " batch info " + + " filteredSize " + filteredSize + + " matchSeriesCount " + matchSeriesCount + + " matchLogicalIndices " + intArrayToRangesString(matchLogicalIndices, matchSeriesCount) + + " matchDuplicateCounts " + Arrays.toString(Arrays.copyOfRange(matchDuplicateCounts, 0, matchSeriesCount)) + + " spillSeriesCount " + spillSeriesCount + + " spillLogicalIndices " + intArrayToRangesString(spillLogicalIndices, spillSeriesCount) + + " spillDuplicateCounts " + Arrays.toString(Arrays.copyOfRange(spillDuplicateCounts, 0, spillSeriesCount))); } + finishInner(batch, matchSeriesCount, spillSeriesCount); + if (batch.size > 0) { // Forward any remaining selected rows. forwardBigTableBatch(batch); diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinLeftSemiGenerateResultOperator.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinLeftSemiGenerateResultOperator.java index c71ebba..acc9f14 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinLeftSemiGenerateResultOperator.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinLeftSemiGenerateResultOperator.java @@ -23,13 +23,9 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.hive.ql.CompilationOpContext; -import org.apache.hadoop.hive.ql.exec.JoinUtil; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashSetResult; import org.apache.hadoop.hive.ql.exec.vector.VectorizationContext; import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; -import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpression; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashSet; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashSetResult; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashTableResult; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.plan.OperatorDesc; @@ -59,17 +55,22 @@ // An array of hash set results so we can do lookups on the whole batch before output result // generation. 
- protected transient VectorMapJoinHashSetResult hashSetResults[]; + protected transient MapJoinHashSetResult hashSetResults[]; - // Pre-allocated member for storing the (physical) batch index of matching row (single- or - // multi-small-table-valued) indexes during a process call. - protected transient int[] allMatchs; + // Pre-allocated member for storing the batch indices of the matched rows. + protected transient int[] matchSelected; - // Pre-allocated member for storing the (physical) batch index of rows that need to be spilled. - protected transient int[] spills; + // Pre-allocated member for storing the new logical batch index (within newSelected) and + // series count of rows that matched. + protected transient int[] matchLogicalIndices; + protected transient int[] matchDuplicateCounts; - // Pre-allocated member for storing index into the hashSetResults for each spilled row. - protected transient int[] spillHashMapResultIndices; + // Pre-allocated member for storing the logical batch index and series count of rows that spilled. + protected transient int[] spillLogicalIndices; + protected transient int[] spillDuplicateCounts; + + // Pre-allocated member for storing the reference to the hash map results of rows that spilled. + protected transient MapJoinHashSetResult[] spillHashSetResults; /** Kryo ctor. */ protected VectorMapJoinLeftSemiGenerateResultOperator() { @@ -92,17 +93,19 @@ protected void commonSetup(VectorizedRowBatch batch) throws HiveException { super.commonSetup(batch); // Semi join specific. - VectorMapJoinHashSet baseHashSet = (VectorMapJoinHashSet) vectorMapJoinHashTable; - - hashSetResults = new VectorMapJoinHashSetResult[batch.DEFAULT_SIZE]; + hashSetResults = new MapJoinHashSetResult[batch.DEFAULT_SIZE]; for (int i = 0; i < hashSetResults.length; i++) { - hashSetResults[i] = baseHashSet.createHashSetResult(); + hashSetResults[i] = vectormapJoinHashTableFactory.createHashSetResult(); } - allMatchs = new int[batch.DEFAULT_SIZE]; + matchSelected = new int[batch.DEFAULT_SIZE]; - spills = new int[batch.DEFAULT_SIZE]; - spillHashMapResultIndices = new int[batch.DEFAULT_SIZE]; + matchLogicalIndices = new int[batch.DEFAULT_SIZE]; + matchDuplicateCounts = new int[batch.DEFAULT_SIZE]; + + spillLogicalIndices = new int[batch.DEFAULT_SIZE]; + spillDuplicateCounts = new int[batch.DEFAULT_SIZE]; + spillHashSetResults = new MapJoinHashSetResult[batch.DEFAULT_SIZE]; } //----------------------------------------------------------------------------------------------- @@ -117,32 +120,47 @@ protected void commonSetup(VectorizedRowBatch batch) throws HiveException { * @param batch * The big table batch with any matching and any non matching rows both as * selected in use. - * @param allMatchCount - * Number of matches in allMatchs. - * @param spillCount - * Number of spills in spills. - * @param hashTableResults - * The array of all hash table results for the batch. We need the - * VectorMapJoinHashTableResult for the spill information. + * @param matchSeriesCount + * Number of matches in the match* arrays. + * @param matchSelectedCount + * The selected count in matchSelected. + * @param spillSeriesCount + * Number of spills in spill* arrays. */ - protected void finishLeftSemi(VectorizedRowBatch batch, - int allMatchCount, int spillCount, - VectorMapJoinHashTableResult[] hashTableResults) throws HiveException, IOException { - - // Get rid of spills before we start modifying the batch. 
- if (spillCount > 0) { - spillHashMapBatch(batch, hashTableResults, - spills, spillHashMapResultIndices, spillCount); + protected void finishLeftSemi(VectorizedRowBatch batch, int matchSeriesCount, + int spillSeriesCount) throws HiveException, IOException { + + final int selectedSize = batch.size; + boolean selectedInUse = batch.selectedInUse; + int[] selected = batch.selected; + + // Dump out the spill rows now. + if (spillSeriesCount > 0) { + + spillHashMapBatch(batch, spillLogicalIndices, spillDuplicateCounts, spillHashSetResults, + spillSeriesCount, selectedInUse, selected, selectedSize); + + if (spillSeriesCount == selectedSize) { + batch.size = 0; + batch.selectedInUse = false; + return; + } } + int matchSelectedCount = + flattenLogicalSeriesIntoSelected( + selectedInUse, selected, selectedSize, + matchLogicalIndices, matchDuplicateCounts, matchSeriesCount, + matchSelected); + /* * Optimize by running value expressions only over the matched rows. */ - if (allMatchCount > 0 && bigTableValueExpressions != null) { - performValueExpressions(batch, allMatchs, allMatchCount); + if (matchSeriesCount > 0 && bigTableValueExpressions != null) { + performValueExpressions(batch, matchSelected, matchSelectedCount); } - int numSel = generateHashSetResults(batch, allMatchs, allMatchCount); + int numSel = generateHashSetResults(batch, matchSelected, matchSelectedCount); batch.size = numSel; batch.selectedInUse = true; } @@ -175,57 +193,4 @@ private int generateHashSetResults(VectorizedRowBatch batch, return numSel; } - - /** - * Generate the left semi join output results for one vectorized row batch with a repeated key. - * - * @param batch - * The big table batch whose repeated key matches. - */ - protected int generateHashSetResultRepeatedAll(VectorizedRowBatch batch) throws HiveException { - - if (batch.selectedInUse) { - // The selected array is already filled in as we want it. - } else { - int[] selected = batch.selected; - for (int i = 0; i < batch.size; i++) { - selected[i] = i; - } - batch.selectedInUse = true; - } - - return batch.size; - } - - protected void finishLeftSemiRepeated(VectorizedRowBatch batch, JoinUtil.JoinResult joinResult, - VectorMapJoinHashTableResult hashSetResult) throws HiveException, IOException { - - switch (joinResult) { - case MATCH: - - if (bigTableValueExpressions != null) { - // Run our value expressions over whole batch. - for(VectorExpression ve: bigTableValueExpressions) { - ve.evaluate(batch); - } - } - - // Generate special repeated case. - int numSel = generateHashSetResultRepeatedAll(batch); - batch.size = numSel; - batch.selectedInUse = true; - break; - - case SPILL: - // Whole batch is spilled. - spillBatchRepeated(batch, (VectorMapJoinHashTableResult) hashSetResult); - batch.size = 0; - break; - - case NOMATCH: - // No match for entire batch. 
- batch.size = 0; - break; - } - } } diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinLeftSemiLongOperator.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinLeftSemiLongOperator.java index 1149a9d..9af5d75 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinLeftSemiLongOperator.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinLeftSemiLongOperator.java @@ -25,18 +25,19 @@ import org.slf4j.LoggerFactory; import org.apache.hadoop.hive.ql.CompilationOpContext; import org.apache.hadoop.hive.ql.exec.JoinUtil; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashSetResult; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableFind; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableResult; import org.apache.hadoop.hive.ql.exec.vector.VectorizationContext; import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpression; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashTableResult; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.plan.OperatorDesc; // Single-Column Long hash table import. -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinLongHashSet; - -// Single-Column Long specific imports. -import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.keyseries.VectorKeySeriesLong; +import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; +import org.apache.hive.common.util.HashCodeUtil; /* * Specialized class for doing a vectorized map join that is an left semi join on a Single-Column Long @@ -45,8 +46,17 @@ public class VectorMapJoinLeftSemiLongOperator extends VectorMapJoinLeftSemiGenerateResultOperator { private static final long serialVersionUID = 1L; - private static final Logger LOG = LoggerFactory.getLogger(VectorMapJoinInnerBigOnlyLongOperator.class.getName()); + + //------------------------------------------------------------------------------------------------ + private static final String CLASS_NAME = VectorMapJoinLeftSemiLongOperator.class.getName(); + private static final Logger LOG = LoggerFactory.getLogger(CLASS_NAME); + + protected String getLoggingPrefix() { + return super.getLoggingPrefix(CLASS_NAME); + } + + //------------------------------------------------------------------------------------------------ // (none) @@ -55,7 +65,7 @@ //--------------------------------------------------------------------------- // The hash map for this specialized class. - private transient VectorMapJoinLongHashSet hashSet; + private transient MapJoinHashTableFind hashSet; //--------------------------------------------------------------------------- // Single-Column Long specific members. @@ -68,6 +78,10 @@ // The column number for this one column join specialization. private transient int singleJoinColumn; + private transient PrimitiveTypeInfo singleJoinColumnPrimitiveTypeInfo; + + // The object that determines equal key series. + private transient VectorKeySeriesLong longKeySeries; //--------------------------------------------------------------------------- // Pass-thru constructors. 
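Context for the hunks that follow: the left semi operators record matches as (logical index, duplicate count) series, and finishLeftSemi above expands them back into physical batch indices with flattenLogicalSeriesIntoSelected before running value expressions. The sketch below is a hypothetical re-implementation of what such a helper has to do, with the signature inferred from the call site; it is not the actual Hive method.

import java.util.Arrays;

/**
 * Illustrative sketch (not the Hive implementation) of what a helper like
 * flattenLogicalSeriesIntoSelected has to do: expand (logical index,
 * duplicate count) series back into physical batch indices, honoring the
 * batch's selected[] mapping when selectedInUse is true.
 */
public class FlattenSeriesSketch {

  static int flattenLogicalSeriesIntoSelected(
      boolean selectedInUse, int[] selected, int size,
      int[] seriesLogicalIndices, int[] seriesDuplicateCounts, int seriesCount,
      int[] outSelected) {

    int outCount = 0;
    for (int s = 0; s < seriesCount; s++) {
      int logical = seriesLogicalIndices[s];
      int duplicateCount = seriesDuplicateCounts[s];
      for (int i = 0; i < duplicateCount; i++) {
        int logicalIndex = logical + i;
        // Map the logical position back to the physical batch index.
        outSelected[outCount++] = selectedInUse ? selected[logicalIndex] : logicalIndex;
      }
    }
    return outCount;
  }

  public static void main(String[] args) {
    // A batch of 6 logical rows that map to physical rows 2, 3, 5, 8, 9, 11.
    int[] selected = {2, 3, 5, 8, 9, 11};
    // Two match series: logical 1 with 2 duplicates, logical 4 with 1 duplicate.
    int[] matchLogicalIndices = {1, 4};
    int[] matchDuplicateCounts = {2, 1};
    int[] matchSelected = new int[6];

    int matchSelectedCount = flattenLogicalSeriesIntoSelected(
        true, selected, 6, matchLogicalIndices, matchDuplicateCounts, 2, matchSelected);

    // Prints [3, 5, 9].
    System.out.println(Arrays.toString(Arrays.copyOf(matchSelected, matchSelectedCount)));
  }
}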
@@ -108,6 +122,10 @@ public void process(Object row, int tag) throws HiveException { */ singleJoinColumn = bigTableKeyColumnMap[0]; + singleJoinColumnPrimitiveTypeInfo = (PrimitiveTypeInfo) bigTableKeyTypeInfos[0]; + + longKeySeries = + new VectorKeySeriesLong(singleJoinColumn, singleJoinColumnPrimitiveTypeInfo); needCommonSetup = false; } @@ -120,7 +138,7 @@ public void process(Object row, int tag) throws HiveException { * Get our Single-Column Long hash set information for this specialized class. */ - hashSet = (VectorMapJoinLongHashSet) vectorMapJoinHashTable; + hashSet = vectorMapJoinHashTableFind; useMinMax = hashSet.useMinMax(); if (useMinMax) { min = hashSet.min(); @@ -131,6 +149,7 @@ public void process(Object row, int tag) throws HiveException { } batchCounter++; + inputRowCounter += batch.size; // Do the per-batch setup for an left semi join. @@ -142,9 +161,9 @@ public void process(Object row, int tag) throws HiveException { ve.evaluate(batch); } - final int inputLogicalSize = batch.size; + final int filteredSize = batch.size; - if (inputLogicalSize == 0) { + if (filteredSize == 0) { if (isLogDebugEnabled) { LOG.debug(CLASS_NAME + " batch #" + batchCounter + " empty"); } @@ -158,222 +177,87 @@ public void process(Object row, int tag) throws HiveException { } } - /* - * Single-Column Long specific declarations. - */ - - // The one join column for this specialized class. - LongColumnVector joinColVector = (LongColumnVector) batch.cols[singleJoinColumn]; - long[] vector = joinColVector.vector; - /* - * Single-Column Long check for repeating. - */ + longKeySeries.processBatch(batch); - // Check single column for repeating. - boolean allKeyInputColumnsRepeating = joinColVector.isRepeating; + int matchSeriesCount = 0; + int spillSeriesCount = 0; + int hashSetResultCount = 0; - if (allKeyInputColumnsRepeating) { + MapJoinHashSetResult hashSetResult; + MapJoinHashTableResult.MapJoinResult containsResult; + long key; + do { + // Use the next hash set result entry. + hashSetResult = hashSetResults[hashSetResultCount]; - /* - * Repeating. - */ + if (longKeySeries.getCurrentKeyAllNull()) { - // All key input columns are repeating. Generate key once. Lookup once. - // Since the key is repeated, we must use entry 0 regardless of selectedInUse. + // CONSIDER: Add support for NullSafe option. - /* - * Single-Column Long specific repeated lookup. - */ + containsResult = MapJoinHashTableResult.MapJoinResult.NO_MATCH; - JoinUtil.JoinResult joinResult; - if (!joinColVector.noNulls && joinColVector.isNull[0]) { - joinResult = JoinUtil.JoinResult.NOMATCH; } else { - long key = vector[0]; + + key = longKeySeries.getCurrentKey(); if (useMinMax && (key < min || key > max)) { // Out of range for whole batch. - joinResult = JoinUtil.JoinResult.NOMATCH; + containsResult = MapJoinHashTableResult.MapJoinResult.NO_MATCH; } else { - joinResult = hashSet.contains(key, hashSetResults[0]); + hashSet.hashSetContains( + key, + longKeySeries.getCurrentHashCode(), + hashSetResult); + containsResult = hashSetResult.getMapJoinResult(); } - } - /* - * Common repeated join result processing. - */ - - if (isLogDebugEnabled) { - LOG.debug(CLASS_NAME + " batch #" + batchCounter + " repeated joinResult " + joinResult.name()); } - finishLeftSemiRepeated(batch, joinResult, hashSetResults[0]); - } else { /* - * NOT Repeating. + * Common inner join result processing. 
*/ - - if (isLogDebugEnabled) { - LOG.debug(CLASS_NAME + " batch #" + batchCounter + " non-repeated"); + switch (containsResult) { + case MATCH: + // We have extracted the existence from the hash set result, so we don't keep it. + matchLogicalIndices[matchSeriesCount] = longKeySeries.getCurrentLogical(); + matchDuplicateCounts[matchSeriesCount] = longKeySeries.getCurrentDuplicateCount(); + matchSeriesCount++; + break; + + case SPILL: + spillLogicalIndices[spillSeriesCount] = longKeySeries.getCurrentLogical(); + spillDuplicateCounts[spillSeriesCount] = longKeySeries.getCurrentDuplicateCount(); + spillHashSetResults[spillSeriesCount] = hashSetResult; + spillSeriesCount++; + hashSetResultCount++; + spilledRowCounter += longKeySeries.getCurrentDuplicateCount(); + break; + + case NO_MATCH: + break; + + default: + throw new RuntimeException("Unexpected contains result " + containsResult.name()); } - // We remember any matching rows in matchs / matchSize. At the end of the loop, - // selected / batch.size will represent both matching and non-matching rows for outer join. - // Only deferred rows will have been removed from selected. - int selected[] = batch.selected; - boolean selectedInUse = batch.selectedInUse; - - int hashSetResultCount = 0; - int allMatchCount = 0; - int spillCount = 0; - - /* - * Single-Column Long specific variables. - */ - - long saveKey = 0; - - // We optimize performance by only looking up the first key in a series of equal keys. - boolean haveSaveKey = false; - JoinUtil.JoinResult saveJoinResult = JoinUtil.JoinResult.NOMATCH; - - // Logical loop over the rows in the batch since the batch may have selected in use. - for (int logical = 0; logical < inputLogicalSize; logical++) { - int batchIndex = (selectedInUse ? selected[logical] : logical); - - /* - * Single-Column Long get key. - */ - - long currentKey; - boolean isNull; - if (!joinColVector.noNulls && joinColVector.isNull[batchIndex]) { - currentKey = 0; - isNull = true; - } else { - currentKey = vector[batchIndex]; - isNull = false; - } - - /* - * Equal key series checking. - */ - - if (isNull || !haveSaveKey || currentKey != saveKey) { - - // New key. - - if (haveSaveKey) { - // Move on with our counts. - switch (saveJoinResult) { - case MATCH: - // We have extracted the existence from the hash set result, so we don't keep it. - break; - case SPILL: - // We keep the hash set result for its spill information. - hashSetResultCount++; - break; - case NOMATCH: - break; - } - } - - if (isNull) { - saveJoinResult = JoinUtil.JoinResult.NOMATCH; - haveSaveKey = false; - } else { - // Regardless of our matching result, we keep that information to make multiple use - // of it for a possible series of equal keys. - haveSaveKey = true; - - /* - * Single-Column Long specific save key. - */ - - saveKey = currentKey; - - /* - * Single-Column Long specific lookup key. - */ - - if (useMinMax && (currentKey < min || currentKey > max)) { - // Key out of range for whole hash table. - saveJoinResult = JoinUtil.JoinResult.NOMATCH; - } else { - saveJoinResult = hashSet.contains(currentKey, hashSetResults[hashSetResultCount]); - } - } - - /* - * Common left-semi join result processing. 
-           */
-
-          switch (saveJoinResult) {
-          case MATCH:
-            allMatchs[allMatchCount++] = batchIndex;
-            // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " MATCH isSingleValue " + equalKeySeriesIsSingleValue[equalKeySeriesCount] + " currentKey " + currentKey);
-            break;
-
-          case SPILL:
-            spills[spillCount] = batchIndex;
-            spillHashMapResultIndices[spillCount] = hashSetResultCount;
-            spillCount++;
-            break;
-
-          case NOMATCH:
-            // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " NOMATCH" + " currentKey " + currentKey);
-            break;
-          }
-        } else {
-          // Series of equal keys.
-
-          switch (saveJoinResult) {
-          case MATCH:
-            allMatchs[allMatchCount++] = batchIndex;
-            // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " MATCH duplicate");
-            break;
-
-          case SPILL:
-            spills[spillCount] = batchIndex;
-            spillHashMapResultIndices[spillCount] = hashSetResultCount;
-            spillCount++;
-            break;
-
-          case NOMATCH:
-            // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " NOMATCH duplicate");
-            break;
-          }
-        }
-      }
-
-      if (haveSaveKey) {
-        // Update our counts for the last key.
-        switch (saveJoinResult) {
-        case MATCH:
-          // We have extracted the existence from the hash set result, so we don't keep it.
-          break;
-        case SPILL:
-          // We keep the hash set result for its spill information.
-          hashSetResultCount++;
-          break;
-        case NOMATCH:
-          break;
-        }
+      if (!longKeySeries.next()) {
+        break;
       }
-
-      if (isLogDebugEnabled) {
-        LOG.debug(CLASS_NAME +
-            " allMatchs " + intArrayToRangesString(allMatchs, allMatchCount) +
-            " spills " + intArrayToRangesString(spills, spillCount) +
-            " spillHashMapResultIndices " + intArrayToRangesString(spillHashMapResultIndices, spillCount) +
-            " hashMapResults " + Arrays.toString(Arrays.copyOfRange(hashSetResults, 0, hashSetResultCount)));
-      }
-
-      finishLeftSemi(batch,
-          allMatchCount, spillCount,
-          (VectorMapJoinHashTableResult[]) hashSetResults);
+    } while (true);
+
+    if (isLogDebugEnabled) {
+      LOG.debug(CLASS_NAME + " batch #" + batchCounter + " batch info " +
+          " filteredSize " + filteredSize +
+          " matchSeriesCount " + matchSeriesCount +
+          " matchLogicalIndices " + intArrayToRangesString(matchLogicalIndices, matchSeriesCount) +
+          " matchDuplicateCounts " + Arrays.toString(Arrays.copyOfRange(matchDuplicateCounts, 0, matchSeriesCount)) +
+          " spillSeriesCount " + spillSeriesCount +
+          " spillLogicalIndices " + intArrayToRangesString(spillLogicalIndices, spillSeriesCount) +
+          " spillDuplicateCounts " + Arrays.toString(Arrays.copyOfRange(spillDuplicateCounts, 0, spillSeriesCount)));
     }
+    finishLeftSemi(batch, matchSeriesCount, spillSeriesCount);
+
     if (batch.size > 0) {
       // Forward any remaining selected rows.
forwardBigTableBatch(batch); diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinLeftSemiMultiKeyOperator.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinLeftSemiMultiKeyOperator.java index e0baebc..bf4cb69 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinLeftSemiMultiKeyOperator.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinLeftSemiMultiKeyOperator.java @@ -25,19 +25,17 @@ import org.slf4j.LoggerFactory; import org.apache.hadoop.hive.ql.CompilationOpContext; import org.apache.hadoop.hive.ql.exec.JoinUtil; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashSetResult; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableFind; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableResult; import org.apache.hadoop.hive.ql.exec.vector.VectorizationContext; import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpression; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashTableResult; +import org.apache.hadoop.hive.ql.exec.vector.keyseries.fast.VectorKeySeriesMultiFast; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.plan.OperatorDesc; -// Multi-Key hash table import. -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinBytesHashSet; - // Multi-Key specific imports. -import org.apache.hadoop.hive.ql.exec.vector.VectorSerializeRow; -import org.apache.hadoop.hive.serde2.ByteStream.Output; import org.apache.hadoop.hive.serde2.binarysortable.fast.BinarySortableSerializeWrite; /* @@ -47,8 +45,17 @@ public class VectorMapJoinLeftSemiMultiKeyOperator extends VectorMapJoinLeftSemiGenerateResultOperator { private static final long serialVersionUID = 1L; - private static final Logger LOG = LoggerFactory.getLogger(VectorMapJoinInnerBigOnlyLongOperator.class.getName()); + + //------------------------------------------------------------------------------------------------ + private static final String CLASS_NAME = VectorMapJoinLeftSemiMultiKeyOperator.class.getName(); + private static final Logger LOG = LoggerFactory.getLogger(CLASS_NAME); + + protected String getLoggingPrefix() { + return super.getLoggingPrefix(CLASS_NAME); + } + + //------------------------------------------------------------------------------------------------ // (none) @@ -57,21 +64,17 @@ //--------------------------------------------------------------------------- // The hash map for this specialized class. - private transient VectorMapJoinBytesHashSet hashSet; + private transient MapJoinHashTableFind hashSet; //--------------------------------------------------------------------------- // Multi-Key specific members. // - // Object that can take a set of columns in row in a vectorized row batch and serialized it. - // Known to not have any nulls. - private transient VectorSerializeRow keyVectorSerializeWrite; + // Binary sortable multi-key serializer. + private transient BinarySortableSerializeWrite binarySortableSerializeWrite; - // The BinarySortable serialization of the current key. - private transient Output currentKeyOutput; - - // The BinarySortable serialization of the saved key for a possible series of equal keys. - private transient Output saveKeyOutput; + // The object that determines equal key series. 
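Editor's note: the "object that determines equal key series" referenced above (VectorKeySeriesLong / VectorKeySeriesBytes / VectorKeySeriesMultiFast in this patch) is built on run-length grouping: consecutive batch rows that carry the same join key are collapsed into one (logical start, duplicate count) series, so the hash table is probed once per series instead of once per row. The following standalone sketch illustrates that grouping for a single long key column; the class and method names are illustrative only and are not part of this patch.

    import java.util.ArrayList;
    import java.util.List;

    // Simplified stand-in for the key-series idea: collapse consecutive equal keys
    // into (logicalStart, duplicateCount) runs so each run needs only one hash probe.
    public class LongKeyRunSketch {

      // One run of consecutive equal keys within the (logical) batch order.
      static final class Run {
        final int logicalStart;   // first logical row index of the run
        final int duplicateCount; // how many consecutive rows share the key
        final long key;           // the shared key value
        final boolean anyNull;    // true when the key is null for this run

        Run(int logicalStart, int duplicateCount, long key, boolean anyNull) {
          this.logicalStart = logicalStart;
          this.duplicateCount = duplicateCount;
          this.key = key;
          this.anyNull = anyNull;
        }
      }

      // Group a batch of keys (with a null indicator per row) into runs.
      static List<Run> buildRuns(long[] keys, boolean[] isNull, int size) {
        List<Run> runs = new ArrayList<>();
        int runStart = 0;
        for (int i = 1; i <= size; i++) {
          boolean newRun =
              i == size
              || isNull[i] != isNull[runStart]
              || (!isNull[i] && keys[i] != keys[runStart]);
          if (newRun) {
            runs.add(new Run(runStart, i - runStart, keys[runStart], isNull[runStart]));
            runStart = i;
          }
        }
        return runs;
      }

      public static void main(String[] args) {
        long[] keys = {7, 7, 7, 42, 42, 9};
        boolean[] nulls = new boolean[keys.length];
        for (Run r : buildRuns(keys, nulls, keys.length)) {
          // Each run would trigger a single hash-set/hash-map probe in the operator.
          System.out.println("logical=" + r.logicalStart
              + " count=" + r.duplicateCount + " key=" + r.key);
        }
      }
    }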
+ private transient VectorKeySeriesMultiFast serializedMultiKeySeries; //--------------------------------------------------------------------------- // Pass-thru constructors. @@ -111,12 +114,15 @@ public void process(Object row, int tag) throws HiveException { * Initialize Multi-Key members for this specialized class. */ - keyVectorSerializeWrite = new VectorSerializeRow( - new BinarySortableSerializeWrite(bigTableKeyColumnMap.length)); - keyVectorSerializeWrite.init(bigTableKeyTypeNames, bigTableKeyColumnMap); + binarySortableSerializeWrite = + new BinarySortableSerializeWrite(bigTableKeyColumnMap.length); - currentKeyOutput = new Output(); - saveKeyOutput = new Output(); + // For Multi-Key the Fast hash table computes the serialized key's hash code, + // so we don't use VectorKeySeriesMulti's hash code. + serializedMultiKeySeries = + new VectorKeySeriesMultiFast( + binarySortableSerializeWrite); + serializedMultiKeySeries.init(bigTableKeyTypeInfos, bigTableKeyColumnMap); needCommonSetup = false; } @@ -129,12 +135,13 @@ public void process(Object row, int tag) throws HiveException { * Get our Multi-Key hash set information for this specialized class. */ - hashSet = (VectorMapJoinBytesHashSet) vectorMapJoinHashTable; + hashSet = vectorMapJoinHashTableFind; needHashTableSetup = false; } batchCounter++; + inputRowCounter += batch.size; // Do the per-batch setup for an left semi join. @@ -146,9 +153,9 @@ public void process(Object row, int tag) throws HiveException { ve.evaluate(batch); } - final int inputLogicalSize = batch.size; + final int filteredSize = batch.size; - if (inputLogicalSize == 0) { + if (filteredSize == 0) { if (isLogDebugEnabled) { LOG.debug(CLASS_NAME + " batch #" + batchCounter + " empty"); } @@ -162,228 +169,83 @@ public void process(Object row, int tag) throws HiveException { } } - /* - * Multi-Key specific declarations. - */ - - // None. - - /* - * Multi-Key Long check for repeating. - */ - - // If all BigTable input columns to key expressions are isRepeating, then - // calculate key once; lookup once. - boolean allKeyInputColumnsRepeating; - if (bigTableKeyColumnMap.length == 0) { - allKeyInputColumnsRepeating = false; - } else { - allKeyInputColumnsRepeating = true; - for (int i = 0; i < bigTableKeyColumnMap.length; i++) { - if (!batch.cols[bigTableKeyColumnMap[i]].isRepeating) { - allKeyInputColumnsRepeating = false; - break; - } - } - } - if (allKeyInputColumnsRepeating) { + serializedMultiKeySeries.processBatch(batch); - /* - * Repeating. - */ + int matchSeriesCount = 0; + int spillSeriesCount = 0; + int hashSetResultCount = 0; - // All key input columns are repeating. Generate key once. Lookup once. - // Since the key is repeated, we must use entry 0 regardless of selectedInUse. + MapJoinHashSetResult hashSetResult; + MapJoinHashTableResult.MapJoinResult containsResult; + do { + // Use the next hash set result entry. + hashSetResult = hashSetResults[hashSetResultCount]; - /* - * Multi-Key specific repeated lookup. - */ + // NOTE: Any null column in the key for inner join is a non-match. 
+ if (serializedMultiKeySeries.getCurrentKeyHasAnyNulls()) { - keyVectorSerializeWrite.setOutput(currentKeyOutput); - keyVectorSerializeWrite.serializeWrite(batch, 0); - JoinUtil.JoinResult joinResult; - if (keyVectorSerializeWrite.getHasAnyNulls()) { - joinResult = JoinUtil.JoinResult.NOMATCH; - } else { - byte[] keyBytes = currentKeyOutput.getData(); - int keyLength = currentKeyOutput.getLength(); - // LOG.debug(CLASS_NAME + " processOp all " + displayBytes(keyBytes, 0, keyLength)); - joinResult = hashSet.contains(keyBytes, 0, keyLength, hashSetResults[0]); - } + // CONSIDER: Add support for NullSafe option. - /* - * Common repeated join result processing. - */ + containsResult = MapJoinHashTableResult.MapJoinResult.NO_MATCH; - if (isLogDebugEnabled) { - LOG.debug(CLASS_NAME + " batch #" + batchCounter + " repeated joinResult " + joinResult.name()); - } - finishLeftSemiRepeated(batch, joinResult, hashSetResults[0]); - } else { + } else { - /* - * NOT Repeating. - */ + hashSet.hashSetContains( + serializedMultiKeySeries.getSerializedBytes(), + serializedMultiKeySeries.getSerializedStart(), + serializedMultiKeySeries.getSerializedLength(), + serializedMultiKeySeries.getCurrentHashCode(), + hashSetResult); + containsResult = hashSetResult.getMapJoinResult(); - if (isLogDebugEnabled) { - LOG.debug(CLASS_NAME + " batch #" + batchCounter + " non-repeated"); } - // We remember any matching rows in matchs / matchSize. At the end of the loop, - // selected / batch.size will represent both matching and non-matching rows for outer join. - // Only deferred rows will have been removed from selected. - int selected[] = batch.selected; - boolean selectedInUse = batch.selectedInUse; - - int hashSetResultCount = 0; - int allMatchCount = 0; - int spillCount = 0; - /* - * Multi-Key specific variables. + * Common inner join result processing. */ - - Output temp; - - // We optimize performance by only looking up the first key in a series of equal keys. - boolean haveSaveKey = false; - JoinUtil.JoinResult saveJoinResult = JoinUtil.JoinResult.NOMATCH; - - // Logical loop over the rows in the batch since the batch may have selected in use. - for (int logical = 0; logical < inputLogicalSize; logical++) { - int batchIndex = (selectedInUse ? selected[logical] : logical); - - /* - * Multi-Key get key. - */ - - // Generate binary sortable key for current row in vectorized row batch. - keyVectorSerializeWrite.setOutput(currentKeyOutput); - keyVectorSerializeWrite.serializeWrite(batch, batchIndex); - boolean isAnyNull = keyVectorSerializeWrite.getHasAnyNulls(); - - // LOG.debug(CLASS_NAME + " currentKey " + - // VectorizedBatchUtil.displayBytes(currentKeyOutput.getData(), 0, currentKeyOutput.getLength())); - - /* - * Equal key series checking. - */ - - if (isAnyNull || !haveSaveKey || !saveKeyOutput.arraysEquals(currentKeyOutput)) { - - // New key. - - if (haveSaveKey) { - // Move on with our counts. - switch (saveJoinResult) { - case MATCH: - // We have extracted the existence from the hash set result, so we don't keep it. - break; - case SPILL: - // We keep the hash set result for its spill information. - hashSetResultCount++; - break; - case NOMATCH: - break; - } - } - - if (isAnyNull) { - saveJoinResult = JoinUtil.JoinResult.NOMATCH; - haveSaveKey = false; - } else { - // Regardless of our matching result, we keep that information to make multiple use - // of it for a possible series of equal keys. - haveSaveKey = true; - - /* - * Multi-Key specific save key and lookup. 
- */ - - temp = saveKeyOutput; - saveKeyOutput = currentKeyOutput; - currentKeyOutput = temp; - - /* - * Multi-key specific lookup key. - */ - - byte[] keyBytes = saveKeyOutput.getData(); - int keyLength = saveKeyOutput.getLength(); - saveJoinResult = hashSet.contains(keyBytes, 0, keyLength, hashSetResults[hashSetResultCount]); - } - - /* - * Common left-semi join result processing. - */ - - switch (saveJoinResult) { - case MATCH: - allMatchs[allMatchCount++] = batchIndex; - // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " MATCH isSingleValue " + equalKeySeriesIsSingleValue[equalKeySeriesCount] + " currentKey " + currentKey); - break; - - case SPILL: - spills[spillCount] = batchIndex; - spillHashMapResultIndices[spillCount] = hashSetResultCount; - spillCount++; - break; - - case NOMATCH: - // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " NOMATCH" + " currentKey " + currentKey); - break; - } - } else { - // Series of equal keys. - - switch (saveJoinResult) { - case MATCH: - allMatchs[allMatchCount++] = batchIndex; - // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " MATCH duplicate"); - break; - - case SPILL: - spills[spillCount] = batchIndex; - spillHashMapResultIndices[spillCount] = hashSetResultCount; - spillCount++; - break; - - case NOMATCH: - // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " NOMATCH duplicate"); - break; - } - } + switch (containsResult) { + case MATCH: + // We have extracted the existence from the hash set result, so we don't keep it. + matchLogicalIndices[matchSeriesCount] = serializedMultiKeySeries.getCurrentLogical(); + matchDuplicateCounts[matchSeriesCount] = serializedMultiKeySeries.getCurrentDuplicateCount(); + matchSeriesCount++; + break; + + case SPILL: + spillLogicalIndices[spillSeriesCount] = serializedMultiKeySeries.getCurrentLogical(); + spillDuplicateCounts[spillSeriesCount] = serializedMultiKeySeries.getCurrentDuplicateCount(); + spillHashSetResults[spillSeriesCount] = hashSetResult; + spillSeriesCount++; + hashSetResultCount++; + spilledRowCounter += serializedMultiKeySeries.getCurrentDuplicateCount(); + break; + + case NO_MATCH: + break; + + default: + throw new RuntimeException("Unexpected contains result " + containsResult.name()); } - if (haveSaveKey) { - // Update our counts for the last key. - switch (saveJoinResult) { - case MATCH: - // We have extracted the existence from the hash set result, so we don't keep it. - break; - case SPILL: - // We keep the hash set result for its spill information. 
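Editor's note on the result bookkeeping visible in both the removed loop and the new series loop: a pre-allocated hash-result entry is only consumed (hashSetResultCount++) when the lookup returns SPILL, because spill processing later needs the partition information captured in that result object; for MATCH the boolean existence has already been extracted, so the same entry is reused for the next series. A standalone sketch of that slot-reuse pattern, with illustrative names only (not the patch's classes):

    // Illustrative sketch of reusing pre-allocated result slots: a slot is only
    // retained (the index advances) when the probe result must be kept for later
    // spill processing; MATCH / NO_MATCH reuse the same slot for the next series.
    public class ResultSlotReuseSketch {

      enum ProbeResult { MATCH, NO_MATCH, SPILL }

      static final class HashResult {
        ProbeResult result;
        int spillPartition; // only meaningful when result == SPILL
      }

      public static void main(String[] args) {
        ProbeResult[] probes = {
            ProbeResult.MATCH, ProbeResult.SPILL, ProbeResult.NO_MATCH, ProbeResult.SPILL };

        HashResult[] results = new HashResult[probes.length];
        for (int i = 0; i < results.length; i++) {
          results[i] = new HashResult();
        }

        int resultCount = 0;   // next free slot
        int matchSeries = 0;
        int spillSeries = 0;

        for (ProbeResult probe : probes) {
          HashResult slot = results[resultCount]; // tentatively use the next slot
          slot.result = probe;
          switch (probe) {
            case MATCH:
              matchSeries++;          // existence extracted; slot reused next time
              break;
            case SPILL:
              slot.spillPartition = spillSeries; // stand-in for real spill info
              resultCount++;          // keep the slot: spill info needed later
              spillSeries++;
              break;
            case NO_MATCH:
              break;                  // slot reused next time
          }
        }

        System.out.println("match series: " + matchSeries
            + ", spill series: " + spillSeries
            + ", result slots retained: " + resultCount);
      }
    }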
- hashSetResultCount++; - break; - case NOMATCH: - break; - } + if (!serializedMultiKeySeries.next()) { + break; } - - if (isLogDebugEnabled) { - LOG.debug(CLASS_NAME + - " allMatchs " + intArrayToRangesString(allMatchs, allMatchCount) + - " spills " + intArrayToRangesString(spills, spillCount) + - " spillHashMapResultIndices " + intArrayToRangesString(spillHashMapResultIndices, spillCount) + - " hashMapResults " + Arrays.toString(Arrays.copyOfRange(hashSetResults, 0, hashSetResultCount))); - } - - finishLeftSemi(batch, - allMatchCount, spillCount, - (VectorMapJoinHashTableResult[]) hashSetResults); + } while (true); + + if (isLogDebugEnabled) { + LOG.debug(CLASS_NAME + " batch #" + batchCounter + " batch info " + + " filteredSize " + filteredSize + + " matchSeriesCount " + matchSeriesCount + + " matchLogicalIndices " + intArrayToRangesString(matchLogicalIndices, matchSeriesCount) + + " matchDuplicateCounts " + Arrays.toString(Arrays.copyOfRange(matchDuplicateCounts, 0, matchSeriesCount)) + + " spillSeriesCount " + spillSeriesCount + + " spillLogicalIndices " + intArrayToRangesString(spillLogicalIndices, spillSeriesCount) + + " spillDuplicateCounts " + Arrays.toString(Arrays.copyOfRange(spillDuplicateCounts, 0, spillSeriesCount))); } + finishLeftSemi(batch, matchSeriesCount, spillSeriesCount); + if (batch.size > 0) { // Forward any remaining selected rows. forwardBigTableBatch(batch); diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinLeftSemiStringOperator.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinLeftSemiStringOperator.java index 49e1177..69d5275 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinLeftSemiStringOperator.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinLeftSemiStringOperator.java @@ -25,19 +25,17 @@ import org.slf4j.LoggerFactory; import org.apache.hadoop.hive.ql.CompilationOpContext; import org.apache.hadoop.hive.ql.exec.JoinUtil; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashSetResult; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableFind; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableResult; import org.apache.hadoop.hive.ql.exec.vector.VectorizationContext; import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpression; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashTableResult; -import org.apache.hadoop.hive.ql.metadata.HiveException; -import org.apache.hadoop.hive.ql.plan.OperatorDesc; // Single-Column String hash table import. -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinBytesHashSet; - -// Single-Column String specific imports. 
-import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; -import org.apache.hadoop.hive.ql.exec.vector.expressions.StringExpr; +import org.apache.hadoop.hive.ql.exec.vector.keyseries.VectorKeySeriesBytes; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.plan.OperatorDesc; /* * Specialized class for doing a vectorized map join that is an left semi join on a Single-Column String @@ -46,8 +44,17 @@ public class VectorMapJoinLeftSemiStringOperator extends VectorMapJoinLeftSemiGenerateResultOperator { private static final long serialVersionUID = 1L; - private static final Logger LOG = LoggerFactory.getLogger(VectorMapJoinInnerBigOnlyLongOperator.class.getName()); + + //------------------------------------------------------------------------------------------------ + private static final String CLASS_NAME = VectorMapJoinLeftSemiStringOperator.class.getName(); + private static final Logger LOG = LoggerFactory.getLogger(CLASS_NAME); + + protected String getLoggingPrefix() { + return super.getLoggingPrefix(CLASS_NAME); + } + + //------------------------------------------------------------------------------------------------ // (none) @@ -56,7 +63,7 @@ //--------------------------------------------------------------------------- // The hash map for this specialized class. - private transient VectorMapJoinBytesHashSet hashSet; + private transient MapJoinHashTableFind hashSet; //--------------------------------------------------------------------------- // Single-Column String specific members. @@ -65,6 +72,9 @@ // The column number for this one column join specialization. private transient int singleJoinColumn; + // The object that determines equal key series. + private transient VectorKeySeriesBytes bytesKeySeries; + //--------------------------------------------------------------------------- // Pass-thru constructors. // @@ -105,6 +115,8 @@ public void process(Object row, int tag) throws HiveException { singleJoinColumn = bigTableKeyColumnMap[0]; + bytesKeySeries = new VectorKeySeriesBytes(singleJoinColumn); + needCommonSetup = false; } @@ -116,12 +128,13 @@ public void process(Object row, int tag) throws HiveException { * Get our Single-Column String hash set information for this specialized class. */ - hashSet = (VectorMapJoinBytesHashSet) vectorMapJoinHashTable; + hashSet = vectorMapJoinHashTableFind; needHashTableSetup = false; } batchCounter++; + inputRowCounter += batch.size; // Do the per-batch setup for an left semi join. @@ -133,9 +146,9 @@ public void process(Object row, int tag) throws HiveException { ve.evaluate(batch); } - final int inputLogicalSize = batch.size; + final int filteredSize = batch.size; - if (inputLogicalSize == 0) { + if (filteredSize == 0) { if (isLogDebugEnabled) { LOG.debug(CLASS_NAME + " batch #" + batchCounter + " empty"); } @@ -149,214 +162,79 @@ public void process(Object row, int tag) throws HiveException { } } - /* - * Single-Column String specific declarations. - */ - - // The one join column for this specialized class. - BytesColumnVector joinColVector = (BytesColumnVector) batch.cols[singleJoinColumn]; - byte[][] vector = joinColVector.vector; - int[] start = joinColVector.start; - int[] length = joinColVector.length; + bytesKeySeries.processBatch(batch); - /* - * Single-Column Long check for repeating. - */ + int matchSeriesCount = 0; + int spillSeriesCount = 0; + int hashSetResultCount = 0; - // Check single column for repeating. 
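Editor's note: the isRepeating fast path being removed here relied on the vectorized column convention that, when isRepeating is set, entry 0 supplies the value for every row, so one lookup served the whole batch. The key-series objects subsume that case, since a fully repeating column yields a single series spanning the batch. A simplified illustration of the convention, using stand-in types rather than the Hive column classes:

    // Simplified stand-in for a vectorized long column: when isRepeating is true,
    // entry 0 supplies the value for every row, so one hash probe serves the batch.
    public class RepeatingColumnSketch {

      static final class LongColumn {
        long[] vector;
        boolean isRepeating;
      }

      // Returns how many hash probes a key-series walk would issue for this column
      // (nulls ignored for brevity).
      static int probesNeeded(LongColumn col, int batchSize) {
        if (batchSize == 0) {
          return 0;
        }
        if (col.isRepeating) {
          return 1; // single series covering the whole batch
        }
        int probes = 1;
        for (int i = 1; i < batchSize; i++) {
          if (col.vector[i] != col.vector[i - 1]) {
            probes++; // new series starts here
          }
        }
        return probes;
      }

      public static void main(String[] args) {
        LongColumn repeating = new LongColumn();
        repeating.vector = new long[] {5, 5, 5, 5};
        repeating.isRepeating = true;

        LongColumn varied = new LongColumn();
        varied.vector = new long[] {5, 5, 8, 9};

        System.out.println(probesNeeded(repeating, 4)); // 1
        System.out.println(probesNeeded(varied, 4));    // 3
      }
    }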
- boolean allKeyInputColumnsRepeating = joinColVector.isRepeating; + MapJoinHashSetResult hashSetResult; + MapJoinHashTableResult.MapJoinResult containsResult; + do { + // Use the next hash set result entry. + hashSetResult = hashSetResults[hashSetResultCount]; - if (allKeyInputColumnsRepeating) { + if (bytesKeySeries.getCurrentKeyAllNull()) { - /* - * Repeating. - */ + // CONSIDER: Add support for NullSafe option. - // All key input columns are repeating. Generate key once. Lookup once. - // Since the key is repeated, we must use entry 0 regardless of selectedInUse. + containsResult = MapJoinHashTableResult.MapJoinResult.NO_MATCH; - /* - * Single-Column String specific repeated lookup. - */ - - JoinUtil.JoinResult joinResult; - if (!joinColVector.noNulls && joinColVector.isNull[0]) { - joinResult = JoinUtil.JoinResult.NOMATCH; } else { - byte[] keyBytes = vector[0]; - int keyStart = start[0]; - int keyLength = length[0]; - joinResult = hashSet.contains(keyBytes, keyStart, keyLength, hashSetResults[0]); - } - - /* - * Common repeated join result processing. - */ - - if (isLogDebugEnabled) { - LOG.debug(CLASS_NAME + " batch #" + batchCounter + " repeated joinResult " + joinResult.name()); + hashSet.hashSetContains( + bytesKeySeries.getCurrentBytes(), + bytesKeySeries.getCurrentStart(), + bytesKeySeries.getCurrentLength(), + bytesKeySeries.getCurrentHashCode(), + hashSetResult); + containsResult = hashSetResult.getMapJoinResult(); } - finishLeftSemiRepeated(batch, joinResult, hashSetResults[0]); - } else { /* - * NOT Repeating. + * Common inner join result processing. */ - - if (isLogDebugEnabled) { - LOG.debug(CLASS_NAME + " batch #" + batchCounter + " non-repeated"); + switch (containsResult) { + case MATCH: + // We have extracted the existence from the hash set result, so we don't keep it. + matchLogicalIndices[matchSeriesCount] = bytesKeySeries.getCurrentLogical(); + matchDuplicateCounts[matchSeriesCount] = bytesKeySeries.getCurrentDuplicateCount(); + matchSeriesCount++; + break; + + case SPILL: + spillLogicalIndices[spillSeriesCount] = bytesKeySeries.getCurrentLogical(); + spillDuplicateCounts[spillSeriesCount] = bytesKeySeries.getCurrentDuplicateCount(); + spillHashSetResults[spillSeriesCount] = hashSetResult; + spillSeriesCount++; + hashSetResultCount++; + spilledRowCounter += bytesKeySeries.getCurrentDuplicateCount(); + break; + + case NO_MATCH: + break; + + default: + throw new RuntimeException("Unexpected contains result " + containsResult.name()); } - // We remember any matching rows in matchs / matchSize. At the end of the loop, - // selected / batch.size will represent both matching and non-matching rows for outer join. - // Only deferred rows will have been removed from selected. - int selected[] = batch.selected; - boolean selectedInUse = batch.selectedInUse; - - int hashSetResultCount = 0; - int allMatchCount = 0; - int spillCount = 0; - - /* - * Single-Column String specific variables. - */ - - int saveKeyBatchIndex = -1; - - // We optimize performance by only looking up the first key in a series of equal keys. - boolean haveSaveKey = false; - JoinUtil.JoinResult saveJoinResult = JoinUtil.JoinResult.NOMATCH; - - // Logical loop over the rows in the batch since the batch may have selected in use. - for (int logical = 0; logical < inputLogicalSize; logical++) { - int batchIndex = (selectedInUse ? selected[logical] : logical); - - /* - * Single-Column String get key. - */ - - // Implicit -- use batchIndex. 
- boolean isNull = !joinColVector.noNulls && joinColVector.isNull[batchIndex]; - - /* - * Equal key series checking. - */ - - if (isNull || !haveSaveKey || - StringExpr.equal(vector[saveKeyBatchIndex], start[saveKeyBatchIndex], length[saveKeyBatchIndex], - vector[batchIndex], start[batchIndex], length[batchIndex]) == false) { - - // New key. - - if (haveSaveKey) { - // Move on with our counts. - switch (saveJoinResult) { - case MATCH: - // We have extracted the existence from the hash set result, so we don't keep it. - break; - case SPILL: - // We keep the hash set result for its spill information. - hashSetResultCount++; - break; - case NOMATCH: - break; - } - } - - if (isNull) { - saveJoinResult = JoinUtil.JoinResult.NOMATCH; - haveSaveKey = false; - } else { - // Regardless of our matching result, we keep that information to make multiple use - // of it for a possible series of equal keys. - haveSaveKey = true; - - /* - * Single-Column String specific save key and lookup. - */ - - saveKeyBatchIndex = batchIndex; - - /* - * Single-Column String specific lookup key. - */ - - byte[] keyBytes = vector[batchIndex]; - int keyStart = start[batchIndex]; - int keyLength = length[batchIndex]; - saveJoinResult = hashSet.contains(keyBytes, keyStart, keyLength, hashSetResults[hashSetResultCount]); - } - - /* - * Common left-semi join result processing. - */ - - switch (saveJoinResult) { - case MATCH: - allMatchs[allMatchCount++] = batchIndex; - // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " MATCH isSingleValue " + equalKeySeriesIsSingleValue[equalKeySeriesCount] + " currentKey " + currentKey); - break; - - case SPILL: - spills[spillCount] = batchIndex; - spillHashMapResultIndices[spillCount] = hashSetResultCount; - spillCount++; - break; - - case NOMATCH: - // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " NOMATCH" + " currentKey " + currentKey); - break; - } - } else { - // Series of equal keys. - - switch (saveJoinResult) { - case MATCH: - allMatchs[allMatchCount++] = batchIndex; - // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " MATCH duplicate"); - break; - - case SPILL: - spills[spillCount] = batchIndex; - spillHashMapResultIndices[spillCount] = hashSetResultCount; - spillCount++; - break; - - case NOMATCH: - // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " NOMATCH duplicate"); - break; - } - } + if (!bytesKeySeries.next()) { + break; } - - if (haveSaveKey) { - // Update our counts for the last key. - switch (saveJoinResult) { - case MATCH: - // We have extracted the existence from the hash set result, so we don't keep it. - break; - case SPILL: - // We keep the hash set result for its spill information. 
- hashSetResultCount++; - break; - case NOMATCH: - break; - } - } - - if (isLogDebugEnabled) { - LOG.debug(CLASS_NAME + - " allMatchs " + intArrayToRangesString(allMatchs, allMatchCount) + - " spills " + intArrayToRangesString(spills, spillCount) + - " spillHashMapResultIndices " + intArrayToRangesString(spillHashMapResultIndices, spillCount) + - " hashMapResults " + Arrays.toString(Arrays.copyOfRange(hashSetResults, 0, hashSetResultCount))); - } - - finishLeftSemi(batch, - allMatchCount, spillCount, - (VectorMapJoinHashTableResult[]) hashSetResults); + } while (true); + + if (isLogDebugEnabled) { + LOG.debug(CLASS_NAME + " batch #" + batchCounter + " batch info " + + " filteredSize " + filteredSize + + " matchSeriesCount " + matchSeriesCount + + " matchLogicalIndices " + intArrayToRangesString(matchLogicalIndices, matchSeriesCount) + + " matchDuplicateCounts " + Arrays.toString(Arrays.copyOfRange(matchDuplicateCounts, 0, matchSeriesCount)) + + " spillSeriesCount " + spillSeriesCount + + " spillLogicalIndices " + intArrayToRangesString(spillLogicalIndices, spillSeriesCount) + + " spillDuplicateCounts " + Arrays.toString(Arrays.copyOfRange(spillDuplicateCounts, 0, spillSeriesCount))); } + finishLeftSemi(batch, matchSeriesCount, spillSeriesCount); + if (batch.size > 0) { // Forward any remaining selected rows. forwardBigTableBatch(batch); diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinOuterGenerateResultOperator.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinOuterGenerateResultOperator.java index 0e2d65a..d5602ea 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinOuterGenerateResultOperator.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinOuterGenerateResultOperator.java @@ -19,20 +19,17 @@ package org.apache.hadoop.hive.ql.exec.vector.mapjoin; import java.io.IOException; + import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.hive.ql.CompilationOpContext; -import org.apache.hadoop.hive.ql.exec.JoinUtil; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashMapResult; import org.apache.hadoop.hive.ql.exec.vector.ColumnVector; import org.apache.hadoop.hive.ql.exec.vector.VectorizationContext; import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpression; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashMap; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashMapResult; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashTableResult; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.plan.OperatorDesc; -import org.apache.hadoop.hive.serde2.WriteBuffers.ByteSegmentRef; /** * This class has methods for generating vectorized join results for outer joins. @@ -67,50 +64,44 @@ // Outer join specific members. // + protected transient long outerJoinNullKeyCounter; + protected transient long outerJoinFilteredOutCounter; + // An array of hash map results so we can do lookups on the whole batch before output result // generation. - protected transient VectorMapJoinHashMapResult hashMapResults[]; + protected transient MapJoinHashMapResult hashMapResults[]; - // Pre-allocated member for remembering the big table's selected array at the beginning of - // the process method before applying any filter. 
For outer join we need to remember which - // rows did not match since they will appear the in outer join result with NULLs for the - // small table. - protected transient int[] inputSelected; + // For outer join, we must some how retain our input row selection before ON expression + // filtering and before hash table matching so we can generate results for all rows (matching + // and non matching) later. Since we are knocking rows out in different phases, we use a + // copy of the selected array. + protected int inputLogicalSize; + protected int filteredSize; + protected boolean inputSelectedInUse; - // Pre-allocated member for storing the (physical) batch index of matching row (single- or - // multi-small-table-valued) indexes during a process call. - protected transient int[] allMatchs; + // Pre-allocated member for storing the batch indices of the input batch rows. + protected transient int[] inputSelected; - /* - * Pre-allocated members for storing information equal key series for small-table matches. - * - * ~HashMapResultIndices - * Index into the hashMapResults array for the match. - * ~AllMatchIndices - * (Logical) indices into allMatchs to the first row of a match of a - * possible series of duplicate keys. - * ~IsSingleValue - * Whether there is 1 or multiple small table values. - * ~DuplicateCounts - * The duplicate count for each matched key. - * - */ - protected transient int[] equalKeySeriesHashMapResultIndices; - protected transient int[] equalKeySeriesAllMatchIndices; - protected transient boolean[] equalKeySeriesIsSingleValue; - protected transient int[] equalKeySeriesDuplicateCounts; + // Pre-allocated member for storing the logical batch index (within input batch), + // series count of rows, and hash map results that matched. + protected transient int[] matchLogicalIndices; + protected transient int[] matchDuplicateCounts; + protected transient boolean[] matchIsSingleValue; + protected transient MapJoinHashMapResult[] matchHashMapResults; - // Pre-allocated member for storing the (physical) batch index of rows that need to be spilled. - protected transient int[] spills; + // Pre-allocated member for storing the logical batch index and series count of rows that spilled. + protected transient int[] spillLogicalIndices; + protected transient int[] spillDuplicateCounts; - // Pre-allocated member for storing index into the hashSetResults for each spilled row. - protected transient int[] spillHashMapResultIndices; + // Pre-allocated member for storing the reference to the hash map results of rows that spilled. + protected transient MapJoinHashMapResult[] spillHashMapResults; - // Pre-allocated member for storing any non-spills, non-matches, or merged row indexes during a - // process method call. - protected transient int[] nonSpills; - protected transient int[] noMatchs; - protected transient int[] merged; + // Pre-allocated member for storing any non-spills, matches, and non-matches batch indexes during + // a process method call. + protected transient int[] nonSpillSelected; + protected transient int[] matchSelected; + protected transient int[] nonMatchSelected; + protected transient int[] resultSelected; /** Kryo ctor. */ protected VectorMapJoinOuterGenerateResultOperator() { @@ -132,29 +123,30 @@ public VectorMapJoinOuterGenerateResultOperator(CompilationOpContext ctx, protected void commonSetup(VectorizedRowBatch batch) throws HiveException { super.commonSetup(batch); - // Outer join specific. 
- VectorMapJoinHashMap baseHashMap = (VectorMapJoinHashMap) vectorMapJoinHashTable; + outerJoinNullKeyCounter = 0; + outerJoinFilteredOutCounter = 0; - hashMapResults = new VectorMapJoinHashMapResult[batch.DEFAULT_SIZE]; + // Outer join specific. + hashMapResults = new MapJoinHashMapResult[batch.DEFAULT_SIZE]; for (int i = 0; i < hashMapResults.length; i++) { - hashMapResults[i] = baseHashMap.createHashMapResult(); + hashMapResults[i] = vectormapJoinHashTableFactory.createHashMapResult(); } inputSelected = new int[batch.DEFAULT_SIZE]; - allMatchs = new int[batch.DEFAULT_SIZE]; + matchLogicalIndices = new int[batch.DEFAULT_SIZE]; + matchDuplicateCounts = new int[batch.DEFAULT_SIZE]; + matchIsSingleValue = new boolean[batch.DEFAULT_SIZE]; + matchHashMapResults = new MapJoinHashMapResult[batch.DEFAULT_SIZE]; - equalKeySeriesHashMapResultIndices = new int[batch.DEFAULT_SIZE]; - equalKeySeriesAllMatchIndices = new int[batch.DEFAULT_SIZE]; - equalKeySeriesIsSingleValue = new boolean[batch.DEFAULT_SIZE]; - equalKeySeriesDuplicateCounts = new int[batch.DEFAULT_SIZE]; + spillLogicalIndices = new int[batch.DEFAULT_SIZE]; + spillDuplicateCounts = new int[batch.DEFAULT_SIZE]; + spillHashMapResults = new MapJoinHashMapResult[batch.DEFAULT_SIZE]; - spills = new int[batch.DEFAULT_SIZE]; - spillHashMapResultIndices = new int[batch.DEFAULT_SIZE]; - - nonSpills = new int[batch.DEFAULT_SIZE]; - noMatchs = new int[batch.DEFAULT_SIZE]; - merged = new int[batch.DEFAULT_SIZE]; + nonSpillSelected = new int[batch.DEFAULT_SIZE]; + matchSelected = new int[batch.DEFAULT_SIZE]; + nonMatchSelected = new int[batch.DEFAULT_SIZE]; + resultSelected = new int[batch.DEFAULT_SIZE]; } @@ -167,7 +159,20 @@ protected void commonSetup(VectorizedRowBatch batch) throws HiveException { /** * Do the per-batch setup for an outer join. */ - protected void outerPerBatchSetup(VectorizedRowBatch batch) { + protected boolean outerPerBatchSetup(VectorizedRowBatch batch) { + + inputLogicalSize = batch.size; + if (inputLogicalSize == 0) { + return false; + } + + inputSelectedInUse = batch.selectedInUse; + if (inputSelectedInUse) { + // if (!verifyMonotonicallyIncreasing(batch.selected, batch.size)) { + // throw new HiveException("batch.selected is not in sort order and unique"); + // } + System.arraycopy(batch.selected, 0, inputSelected, 0, inputLogicalSize); + } // For join operators that can generate small table results, reset their // (target) scratch columns. @@ -181,373 +186,184 @@ protected void outerPerBatchSetup(VectorizedRowBatch batch) { ColumnVector bigTableOuterKeyColumn = batch.cols[column]; bigTableOuterKeyColumn.reset(); } - } - - /** - * Apply the value expression to rows in the (original) input selected array. - * - * @param batch - * The vectorized row batch. - * @param inputSelectedInUse - * Whether the (original) input batch is selectedInUse. - * @param inputLogicalSize - * The (original) input batch size. - */ - private void doValueExprOnInputSelected(VectorizedRowBatch batch, - boolean inputSelectedInUse, int inputLogicalSize) { - - int saveBatchSize = batch.size; - int[] saveSelected = batch.selected; - boolean saveSelectedInUse = batch.selectedInUse; - batch.size = inputLogicalSize; - batch.selected = inputSelected; - batch.selectedInUse = inputSelectedInUse; - - if (bigTableValueExpressions != null) { - for(VectorExpression ve: bigTableValueExpressions) { + // Filtering for outer join just removes rows available for hash table matching. 
+ if (bigTableFilterExpressions.length > 0) { + // Since the input + for (VectorExpression ve : bigTableFilterExpressions) { ve.evaluate(batch); } - } - batch.size = saveBatchSize; - batch.selected = saveSelected; - batch.selectedInUse = saveSelectedInUse; - } - - /** - * Apply the value expression to rows specified by a selected array. - * - * @param batch - * The vectorized row batch. - * @param selected - * The (physical) batch indices to apply the expression to. - * @param size - * The size of selected. - */ - private void doValueExpr(VectorizedRowBatch batch, - int[] selected, int size) { - - int saveBatchSize = batch.size; - int[] saveSelected = batch.selected; - boolean saveSelectedInUse = batch.selectedInUse; - - batch.size = size; - batch.selected = selected; - batch.selectedInUse = true; - - if (bigTableValueExpressions != null) { - for(VectorExpression ve: bigTableValueExpressions) { - ve.evaluate(batch); - } + // Since outer join outputs non matches, we do not return here on filteredSize == 0. + filteredSize = batch.size; + outerJoinFilteredOutCounter += (inputLogicalSize - filteredSize); + } else { + filteredSize = inputLogicalSize; } - batch.size = saveBatchSize; - batch.selected = saveSelected; - batch.selectedInUse = saveSelectedInUse; + return true; } /** - * Remove (subtract) members from the input selected array and produce the results into - * a difference array. + * Generate the outer join output results for one vectorized row batch. * - * @param inputSelectedInUse - * Whether the (original) input batch is selectedInUse. - * @param inputLogicalSize - * The (original) input batch size. - * @param remove - * The indices to remove. They must all be present in input selected array. - * @param removeSize - * The size of remove. - * @param difference - * The resulting difference -- the input selected array indices not in the - * remove array. - * @return - * The resulting size of the difference array. - * @throws HiveException + * @param batch + * The big table batch with any matching and any non matching rows both as + * selected in use. + * @param matchSeriesCount + * Number of match duplicate key series. + * @param spillSeriesCount + * Number of spill duplicate key series. */ - private int subtractFromInputSelected(boolean inputSelectedInUse, int inputLogicalSize, - int[] remove, int removeSize, int[] difference) throws HiveException { - - // if (!verifyMonotonicallyIncreasing(remove, removeSize)) { - // throw new HiveException("remove is not in sort order and unique"); - // } - - int differenceCount = 0; - - // Determine which rows are left. - int removeIndex = 0; - if (inputSelectedInUse) { - for (int i = 0; i < inputLogicalSize; i++) { - int candidateIndex = inputSelected[i]; - if (removeIndex < removeSize && candidateIndex == remove[removeIndex]) { - removeIndex++; - } else { - difference[differenceCount++] = candidateIndex; - } - } - } else { - for (int candidateIndex = 0; candidateIndex < inputLogicalSize; candidateIndex++) { - if (removeIndex < removeSize && candidateIndex == remove[removeIndex]) { - removeIndex++; - } else { - difference[differenceCount++] = candidateIndex; - } - } - } + protected void finishOuter(VectorizedRowBatch batch, int matchSeriesCount, int spillSeriesCount) + throws IOException, HiveException { - if (removeIndex != removeSize) { - throw new HiveException("Not all batch indices removed"); - } + // The match and spill information is with respect to the batch selected not inputSelected. 
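Editor's note: the outer-join setup above snapshots the batch's selection (inputSelected / inputSelectedInUse / inputLogicalSize) before the filter expressions shrink batch.selected, because an outer join must still emit the filtered-out rows with NULL small-table values. A standalone sketch of that save-then-filter pattern, using plain arrays instead of the Hive batch types:

    import java.util.Arrays;
    import java.util.function.LongPredicate;

    // Sketch: snapshot the selected rows before filtering so that rows knocked out
    // by the filter can still be emitted later (as outer-join non-matches).
    public class SaveSelectionSketch {

      public static void main(String[] args) {
        long[] keyColumn = {10, -1, 25, -1, 30};     // -1 marks rows the filter rejects
        int[] selected = {0, 1, 2, 3, 4};
        int size = selected.length;

        // Snapshot before filtering (the analogue of inputSelected / inputLogicalSize).
        int[] inputSelected = Arrays.copyOf(selected, size);
        int inputLogicalSize = size;

        // Apply the filter in place: keep only rows available for hash-table matching.
        LongPredicate filter = key -> key >= 0;
        int filteredSize = 0;
        for (int i = 0; i < size; i++) {
          int row = selected[i];
          if (filter.test(keyColumn[row])) {
            selected[filteredSize++] = row;
          }
        }

        System.out.println("rows probed against the hash table: "
            + Arrays.toString(Arrays.copyOf(selected, filteredSize)));
        System.out.println("rows still owed a NULL-extended result: "
            + (inputLogicalSize - filteredSize)
            + " (recoverable from " + Arrays.toString(inputSelected) + ")");
      }
    }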
+ boolean selectedInUse = batch.selectedInUse; + int[] selected = batch.selected; + final int selectedSize = batch.size; - // if (!verifyMonotonicallyIncreasing(difference, differenceCount)) { - // throw new HiveException("difference is not in sort order and unique"); - // } + // Dump out the spill rows now and determine what is left. + int nonSpillCount = 0; + if (spillSeriesCount > 0) { - return differenceCount; - } + spillHashMapBatch(batch, spillLogicalIndices, spillDuplicateCounts, spillHashMapResults, + spillSeriesCount, selectedInUse, selected, selectedSize); - /** - * Remove (subtract) members from an array and produce the results into - * a difference array. - - * @param all - * The selected array containing all members. - * @param allSize - * The size of all. - * @param remove - * The indices to remove. They must all be present in input selected array. - * @param removeSize - * The size of remove. - * @param difference - * The resulting difference -- the all array indices not in the - * remove array. - * @return - * The resulting size of the difference array. - * @throws HiveException - */ - private int subtract(int[] all, int allSize, - int[] remove, int removeSize, int[] difference) throws HiveException { + if (spillSeriesCount == inputLogicalSize) { + batch.size = 0; + batch.selectedInUse = false; + return; + } - // if (!verifyMonotonicallyIncreasing(remove, removeSize)) { - // throw new HiveException("remove is not in sort order and unique"); - // } + // Create nonSpillSelected by not including spill batch indices. Note the spill series + // logical indices are with respect to the batch selected not inputSelected. - int differenceCount = 0; + nonSpillCount = + makeSelectedByRemovingSeries( + /* Remove batch indices from input batch: */ + inputSelectedInUse, inputSelected, inputLogicalSize, + /* That are in the series: */ + spillLogicalIndices, spillDuplicateCounts, spillSeriesCount, + selectedInUse, selected, selectedSize, + /* Result: */ + nonSpillSelected); - // Determine which rows are left. - int removeIndex = 0; - for (int i = 0; i < allSize; i++) { - int candidateIndex = all[i]; - if (removeIndex < removeSize && candidateIndex == remove[removeIndex]) { - removeIndex++; - } else { - difference[differenceCount++] = candidateIndex; - } } - if (removeIndex != removeSize) { - throw new HiveException("Not all batch indices removed"); - } + /* + * Optimize by running value expressions only over the matched rows. + */ + if (matchSeriesCount > 0 && bigTableValueExpressions != null) { - return differenceCount; - } + // Create matchSelected by adding the batch indices for the match logical index ranges from + // the input batch range. - /** - * Sort merge two select arrays so the resulting array is ordered by (batch) index. - * - * @param selected1 - * @param selected1Count - * @param selected2 - * @param selected2Count - * @param sortMerged - * The resulting sort merge of selected1 and selected2. - * @return - * The resulting size of the sortMerged array. 
- * @throws HiveException - */ - private int sortMerge(int[] selected1, int selected1Count, - int[] selected2, int selected2Count, int[] sortMerged) throws HiveException { - - // if (!verifyMonotonicallyIncreasing(selected1, selected1Count)) { - // throw new HiveException("selected1 is not in sort order and unique"); - // } - - // if (!verifyMonotonicallyIncreasing(selected2, selected2Count)) { - // throw new HiveException("selected1 is not in sort order and unique"); - // } - - - int sortMergeCount = 0; - - int selected1Index = 0; - int selected2Index = 0; - for (int i = 0; i < selected1Count + selected2Count; i++) { - if (selected1Index < selected1Count && selected2Index < selected2Count) { - if (selected1[selected1Index] < selected2[selected2Index]) { - sortMerged[sortMergeCount++] = selected1[selected1Index++]; - } else { - sortMerged[sortMergeCount++] = selected2[selected2Index++]; - } - } else if (selected1Index < selected1Count) { - sortMerged[sortMergeCount++] = selected1[selected1Index++]; - } else { - sortMerged[sortMergeCount++] = selected2[selected2Index++]; - } - } + int matchSelectedCount = + flattenLogicalSeriesIntoSelected( + selectedInUse, selected, selectedSize, + matchLogicalIndices, matchDuplicateCounts, matchSeriesCount, + matchSelected); - // if (!verifyMonotonicallyIncreasing(sortMerged, sortMergeCount)) { - // throw new HiveException("sortMerged is not in sort order and unique"); - // } + performValueExpressions(batch, matchSelected, matchSelectedCount); - return sortMergeCount; - } - - /** - * Generate the outer join output results for one vectorized row batch. - * - * @param batch - * The big table batch with any matching and any non matching rows both as - * selected in use. - * @param allMatchCount - * Number of matches in allMatchs. - * @param equalKeySeriesCount - * Number of single value matches. - * @param atLeastOneNonMatch - * Whether at least one row was a non-match. - * @param inputSelectedInUse - * A copy of the batch's selectedInUse flag on input to the process method. - * @param inputLogicalSize - * The batch's size on input to the process method. - * @param spillCount - * Number of spills in spills. - * @param hashMapResultCount - * Number of entries in hashMapResults. - */ - public void finishOuter(VectorizedRowBatch batch, - int allMatchCount, int equalKeySeriesCount, boolean atLeastOneNonMatch, - boolean inputSelectedInUse, int inputLogicalSize, - int spillCount, int hashMapResultCount) throws IOException, HiveException { - - // Get rid of spills before we start modifying the batch. - if (spillCount > 0) { - spillHashMapBatch(batch, (VectorMapJoinHashTableResult[]) hashMapResults, - spills, spillHashMapResultIndices, spillCount); } - int noMatchCount = 0; - if (spillCount > 0) { + int nonMatchSelectedCount = 0; + if (spillSeriesCount > 0) { - // Subtract the spills to get all match and non-match rows. - int nonSpillCount = subtractFromInputSelected( - inputSelectedInUse, inputLogicalSize, spills, spillCount, nonSpills); + // Create non match selected by not including match batch indices from non spill selected. + // Note the spill series logical indices are with respect to the batch selected not + // inputSelected. - if (isLogDebugEnabled) { - LOG.debug("finishOuter spillCount > 0" + - " nonSpills " + intArrayToRangesString(nonSpills, nonSpillCount)); - } - - // Big table value expressions apply to ALL matching and non-matching rows. 
- if (bigTableValueExpressions != null) { - - doValueExpr(batch, nonSpills, nonSpillCount); - - } - - if (atLeastOneNonMatch) { - noMatchCount = subtract(nonSpills, nonSpillCount, allMatchs, allMatchCount, - noMatchs); - - if (isLogDebugEnabled) { - LOG.debug("finishOuter spillCount > 0" + - " noMatchs " + intArrayToRangesString(noMatchs, noMatchCount)); - } + nonMatchSelectedCount = + makeSelectedByRemovingSeries( + /* Remove batch indices from non spill: */ + true, nonSpillSelected, nonSpillCount, + /* That are in the series: */ + matchLogicalIndices, matchDuplicateCounts, matchSeriesCount, + selectedInUse, selected, selectedSize, + /* Result: */ + nonMatchSelected); - } } else { - // Run value expressions over original (whole) input batch. - doValueExprOnInputSelected(batch, inputSelectedInUse, inputLogicalSize); - - if (atLeastOneNonMatch) { - noMatchCount = subtractFromInputSelected( - inputSelectedInUse, inputLogicalSize, allMatchs, allMatchCount, noMatchs); - - if (isLogDebugEnabled) { - LOG.debug("finishOuter spillCount == 0" + - " noMatchs " + intArrayToRangesString(noMatchs, noMatchCount)); - } - } + // Create non match selected by not including match batch indices from input batch. + // Note the spill series logical indices are with respect to the batch selected + // not inputSelected. + + nonMatchSelectedCount = + makeSelectedByRemovingSeries( + /* Remove batch indices from input batch: */ + inputSelectedInUse, inputSelected, inputLogicalSize, + /* That are in the series: */ + matchLogicalIndices, matchDuplicateCounts, matchSeriesCount, + selectedInUse, selected, selectedSize, + /* Result: */ + nonMatchSelected); } - // When we generate results into the overflow batch, we may still end up with fewer rows - // in the big table batch. So, nulSel and the batch's selected array will be rebuilt with - // just the big table rows that need to be forwarded, minus any rows processed with the - // overflow batch. - if (allMatchCount > 0) { - - int numSel = 0; - for (int i = 0; i < equalKeySeriesCount; i++) { - int hashMapResultIndex = equalKeySeriesHashMapResultIndices[i]; - VectorMapJoinHashMapResult hashMapResult = hashMapResults[hashMapResultIndex]; - int allMatchesIndex = equalKeySeriesAllMatchIndices[i]; - boolean isSingleValue = equalKeySeriesIsSingleValue[i]; - int duplicateCount = equalKeySeriesDuplicateCounts[i]; - - if (isSingleValue) { - numSel = generateHashMapResultSingleValue( - batch, hashMapResult, allMatchs, allMatchesIndex, duplicateCount, numSel); - } else { - generateHashMapResultMultiValue( - batch, hashMapResult, allMatchs, allMatchesIndex, duplicateCount); - } - } + for (int i = 0; i < matchSeriesCount; i++) { + MapJoinHashMapResult hashMapResult = matchHashMapResults[i]; + int logical = matchLogicalIndices[i]; + int duplicateCount = matchDuplicateCounts[i]; - // The number of single value rows that were generated in the big table batch. - batch.size = numSel; - batch.selectedInUse = true; - if (isLogDebugEnabled) { - LOG.debug("finishOuter allMatchCount > 0" + - " batch.selected " + intArrayToRangesString(batch.selected, batch.size)); + // Logical indices are with respect to the batch selected not inputSelected. 
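Editor's note: helpers such as flattenLogicalSeriesIntoSelected and makeSelectedByRemovingSeries are called above but their implementations are not part of these hunks; the comments indicate they translate between series logical indices and physical batch indices via the selected array. The following is one plausible sketch of the flattening direction, not the patch's actual helper:

    import java.util.Arrays;

    // Plausible sketch of expanding (logical start, duplicate count) series into
    // physical batch indices via the selected array. Not the patch's actual helper.
    public class FlattenSeriesSketch {

      static int flatten(boolean selectedInUse, int[] selected,
          int[] seriesLogical, int[] seriesCounts, int seriesCount, int[] out) {
        int outCount = 0;
        for (int s = 0; s < seriesCount; s++) {
          int logical = seriesLogical[s];
          for (int d = 0; d < seriesCounts[s]; d++) {
            int logicalIndex = logical + d;
            out[outCount++] = selectedInUse ? selected[logicalIndex] : logicalIndex;
          }
        }
        return outCount;
      }

      public static void main(String[] args) {
        // Two matched series: logical rows 1..2 and logical row 4.
        int[] selected = {3, 7, 8, 10, 12};
        int[] seriesLogical = {1, 4};
        int[] seriesCounts = {2, 1};
        int[] out = new int[selected.length];

        int n = flatten(true, selected, seriesLogical, seriesCounts, 2, out);
        System.out.println(Arrays.toString(Arrays.copyOf(out, n))); // [7, 8, 12]
      }
    }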
+ if (matchIsSingleValue[i]) { + generateHashMapResultSingleValue(batch, hashMapResult, + logical, duplicateCount, selectedInUse, selected, selectedSize); + } else { + generateHashMapResultMultiValue(batch, hashMapResult, + logical, duplicateCount, selectedInUse, selected, selectedSize); } - - } else { - batch.size = 0; } - if (noMatchCount > 0) { - if (batch.size > 0) { - - generateOuterNulls(batch, noMatchs, noMatchCount); - - // Merge noMatchs and (match) selected. - int mergeCount = sortMerge( - noMatchs, noMatchCount, batch.selected, batch.size, merged); - - if (isLogDebugEnabled) { - LOG.debug("finishOuter noMatchCount > 0 && batch.size > 0" + - " merged " + intArrayToRangesString(merged, mergeCount)); - } - - System.arraycopy(merged, 0, batch.selected, 0, mergeCount); - batch.size = mergeCount; - batch.selectedInUse = true; - } else { - - // We can use the whole batch for output of no matches. + // Output non matches + if (nonMatchSelectedCount > 0) { + generateOuterNulls(batch, nonMatchSelectedCount); + } - generateOuterNullsRepeatedAll(batch); + // Create selected by not including match logical indices for multi-value small table results + // from input batch logical range, and then putting the batch indices in selected. + + int numSel; + if (spillSeriesCount > 0) { + + // Create non match selected by not including match batch indices from non spills. + // Note the spill series logical indices are with respect to the batch selected + // not inputSelected. + + numSel = + makeSelectedByRemovingMultiValues( + /* Remove batch indices from non spill: */ + true, nonSpillSelected, nonSpillCount, + /* That are in the series: */ + matchLogicalIndices, matchDuplicateCounts, matchIsSingleValue, matchSeriesCount, + selectedInUse, selected, selectedSize, + /* Result: */ + resultSelected); + } else { - System.arraycopy(noMatchs, 0, batch.selected, 0, noMatchCount); - batch.size = noMatchCount; - batch.selectedInUse = true; + // Create non match selected by not including match batch indices from input batch. + // Note the spill series logical indices are with respect to the batch selected + // not inputSelected. + + numSel = + makeSelectedByRemovingMultiValues( + /* Remove batch indices from input batch: */ + inputSelectedInUse, inputSelected, inputLogicalSize, + /* That are in the series: */ + matchLogicalIndices, matchDuplicateCounts, matchIsSingleValue, matchSeriesCount, + selectedInUse, selected, selectedSize, + /* Result: */ + resultSelected); + } - if (isLogDebugEnabled) { - LOG.debug("finishOuter noMatchCount > 0 && batch.size == 0" + - " batch.selected " + intArrayToRangesString(batch.selected, batch.size)); - } - } + batch.selectedInUse = true; + batch.size = numSel; + if (numSel > 0) { + System.arraycopy(resultSelected, 0, selected, 0, numSel); } } @@ -564,13 +380,13 @@ public void finishOuter(VectorizedRowBatch batch, * @param noMatchSize * Number of non matches in noMatchs. */ - protected void generateOuterNulls(VectorizedRowBatch batch, int[] noMatchs, - int noMatchSize) throws IOException, HiveException { + protected void generateOuterNulls(VectorizedRowBatch batch, int noMatchSize) + throws IOException, HiveException { // Set null information in the small table results area. for (int i = 0; i < noMatchSize; i++) { - int batchIndex = noMatchs[i]; + int batchIndex = nonMatchSelected[i]; // Mark any scratch small table scratch columns that would normally receive a copy of the // key as null, too. 
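Editor's note: the null generation in generateOuterNulls follows the standard vectorized convention shown in the surrounding context lines: for each small-table output column, clear noNulls and set isNull[batchIndex] for every non-matching big-table row, so downstream operators see NULL small-table values. A minimal standalone sketch of that convention with a stand-in column type:

    import java.util.Arrays;

    // Sketch of NULL-extending non-matching rows in a vectorized batch: the
    // small-table output columns get isNull set for each non-match batch index.
    public class OuterNullsSketch {

      static final class Column {
        boolean noNulls = true;
        boolean[] isNull;
        Column(int capacity) { isNull = new boolean[capacity]; }
      }

      static void generateOuterNulls(Column[] smallTableOutputs, int[] nonMatchSelected,
          int nonMatchCount) {
        for (int i = 0; i < nonMatchCount; i++) {
          int batchIndex = nonMatchSelected[i];
          for (Column col : smallTableOutputs) {
            col.noNulls = false;
            col.isNull[batchIndex] = true;
          }
        }
      }

      public static void main(String[] args) {
        Column[] smallTableOutputs = { new Column(8), new Column(8) };
        int[] nonMatchSelected = {2, 5};
        generateOuterNulls(smallTableOutputs, nonMatchSelected, nonMatchSelected.length);
        System.out.println(Arrays.toString(smallTableOutputs[0].isNull));
      }
    }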
@@ -589,163 +405,15 @@ protected void generateOuterNulls(VectorizedRowBatch batch, int[] noMatchs, } } - /** - * Generate the outer join output results for one vectorized row batch with a repeated key. - * - * Any filter expressions will apply now since hash map lookup for outer join is complete. - * - * @param batch - * The big table batch with any matching and any non matching rows both as - * selected in use. - * @param joinResult - * The hash map lookup result for the repeated key. - * @param hashMapResults - * The array of all hash map results for the batch. - * @param someRowsFilteredOut - * Whether some rows of the repeated key batch were knocked out by the filter. - * @param inputSelectedInUse - * A copy of the batch's selectedInUse flag on input to the process method. - * @param inputLogicalSize - * The batch's size on input to the process method. - * @param scratch1 - * Pre-allocated storage to internal use. - * @param scratch2 - * Pre-allocated storage to internal use. - */ - public void finishOuterRepeated(VectorizedRowBatch batch, JoinUtil.JoinResult joinResult, - VectorMapJoinHashMapResult hashMapResult, boolean someRowsFilteredOut, - boolean inputSelectedInUse, int inputLogicalSize) - throws IOException, HiveException { - - // LOG.debug("finishOuterRepeated batch #" + batchCounter + " " + joinResult.name() + " batch.size " + batch.size + " someRowsFilteredOut " + someRowsFilteredOut); - - switch (joinResult) { - case MATCH: - - // Rows we looked up as one repeated key are a match. But filtered out rows - // need to be generated as non-matches, too. - - if (someRowsFilteredOut) { - - // For the filtered out rows that didn't (logically) get looked up in the hash table, - // we need to generate no match results for those too... - - // Run value expressions over original (whole) input batch. - doValueExprOnInputSelected(batch, inputSelectedInUse, inputLogicalSize); - - // Now calculate which rows were filtered out (they are logically no matches). - - // Determine which rows are non matches by determining the delta between inputSelected and - // (current) batch selected. - - int noMatchCount = subtractFromInputSelected( - inputSelectedInUse, inputLogicalSize, batch.selected, batch.size, noMatchs); - - generateOuterNulls(batch, noMatchs, noMatchCount); - - // Now generate the matchs. Single small table values will be put into the big table - // batch and come back in matchs. Any multiple small table value results will go into - // the overflow batch. - generateHashMapResultRepeatedAll(batch, hashMapResult); - - // Merge noMatchs and (match) selected. - int mergeCount = sortMerge( - noMatchs, noMatchCount, batch.selected, batch.size, merged); - - System.arraycopy(merged, 0, batch.selected, 0, mergeCount); - batch.size = mergeCount; - batch.selectedInUse = true; - } else { - - // Just run our value expressions over input batch. - - if (bigTableValueExpressions != null) { - for(VectorExpression ve: bigTableValueExpressions) { - ve.evaluate(batch); - } - } - - generateHashMapResultRepeatedAll(batch, hashMapResult); - } - break; - - case SPILL: - - // Rows we looked up as one repeated key need to spill. But filtered out rows - // need to be generated as non-matches, too. - - spillBatchRepeated(batch, (VectorMapJoinHashTableResult) hashMapResult); - - // After using selected to generate spills, generate non-matches, if any. - if (someRowsFilteredOut) { - - // Determine which rows are non matches by determining the delta between inputSelected and - // (current) batch selected. 
- - int noMatchCount = subtractFromInputSelected( - inputSelectedInUse, inputLogicalSize, batch.selected, batch.size, noMatchs); - - System.arraycopy(noMatchs, 0, batch.selected, 0, noMatchCount); - batch.size = noMatchCount; - batch.selectedInUse = true; - - generateOuterNullsRepeatedAll(batch); - } else { - batch.size = 0; - } - - break; - - case NOMATCH: - - if (someRowsFilteredOut) { - - // When the repeated no match is due to filtering, we need to restore the - // selected information. - - if (inputSelectedInUse) { - System.arraycopy(inputSelected, 0, batch.selected, 0, inputLogicalSize); - } - batch.size = inputLogicalSize; - } - - // Run our value expressions over whole batch. - if (bigTableValueExpressions != null) { - for(VectorExpression ve: bigTableValueExpressions) { - ve.evaluate(batch); - } - } - - generateOuterNullsRepeatedAll(batch); - break; - } - } - - /** - * Generate the non-match outer join output results for the whole repeating vectorized - * row batch. - * - * Each row will get nulls for all small table values. - * - * @param batch - * The big table batch. - */ - protected void generateOuterNullsRepeatedAll(VectorizedRowBatch batch) throws HiveException { - - for (int column : smallTableOutputVectorColumns) { - ColumnVector colVector = batch.cols[column]; - colVector.noNulls = false; - colVector.isNull[0] = true; - colVector.isRepeating = true; - } - - // Mark any scratch small table scratch columns that would normally receive a copy of the key - // as null, too. - for (int column : bigTableOuterKeyOutputVectorColumns) { - ColumnVector colVector = batch.cols[column]; - colVector.noNulls = false; - colVector.isNull[0] = true; - colVector.isRepeating = true; - } - } + @Override + public void closeOp(boolean aborted) throws HiveException { + super.closeOp(aborted); + if (!aborted) { + if (isLogDebugEnabled) { + LOG.debug(getLoggingPrefix() + " closeOp outer join " + + outerJoinNullKeyCounter + " null keys, " + + outerJoinFilteredOutCounter + " filtered out rows"); + } + } + } } diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinOuterLongOperator.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinOuterLongOperator.java index 58bd0ab..18c5cfe 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinOuterLongOperator.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinOuterLongOperator.java @@ -26,6 +26,9 @@ import org.apache.hadoop.hive.ql.CompilationOpContext; import org.apache.hadoop.hive.ql.exec.JoinUtil; import org.apache.hadoop.hive.ql.exec.vector.ColumnVector; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashMapResult; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableFind; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableResult; import org.apache.hadoop.hive.ql.exec.vector.VectorizationContext; import org.apache.hadoop.hive.ql.exec.vector.VectorizedBatchUtil; import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; @@ -34,10 +37,8 @@ import org.apache.hadoop.hive.ql.plan.OperatorDesc; // Single-Column Long hash table import. -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinLongHashMap; - -// Single-Column Long specific imports. 
-import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.keyseries.VectorKeySeriesLong; +import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; /* * Specialized class for doing a vectorized map join that is an outer join on a Single-Column Long @@ -45,8 +46,17 @@ */ public class VectorMapJoinOuterLongOperator extends VectorMapJoinOuterGenerateResultOperator { private static final long serialVersionUID = 1L; - private static final Logger LOG = LoggerFactory.getLogger(VectorMapJoinOuterLongOperator.class.getName()); + + //------------------------------------------------------------------------------------------------ + private static final String CLASS_NAME = VectorMapJoinOuterLongOperator.class.getName(); + private static final Logger LOG = LoggerFactory.getLogger(CLASS_NAME); + + protected String getLoggingPrefix() { + return super.getLoggingPrefix(CLASS_NAME); + } + + //------------------------------------------------------------------------------------------------ // (none) @@ -55,7 +65,7 @@ //--------------------------------------------------------------------------- // The hash map for this specialized class. - private transient VectorMapJoinLongHashMap hashMap; + private transient MapJoinHashTableFind hashMap; //--------------------------------------------------------------------------- // Single-Column Long specific members. @@ -68,6 +78,10 @@ // The column number for this one column join specialization. private transient int singleJoinColumn; + private transient PrimitiveTypeInfo singleJoinColumnPrimitiveTypeInfo; + + // The object that determines equal key series. + private transient VectorKeySeriesLong longKeySeries; //--------------------------------------------------------------------------- // Pass-thru constructors. @@ -108,6 +122,10 @@ public void process(Object row, int tag) throws HiveException { */ singleJoinColumn = bigTableKeyColumnMap[0]; + singleJoinColumnPrimitiveTypeInfo = (PrimitiveTypeInfo) bigTableKeyTypeInfos[0]; + + longKeySeries = + new VectorKeySeriesLong(singleJoinColumn, singleJoinColumnPrimitiveTypeInfo); needCommonSetup = false; } @@ -120,7 +138,7 @@ public void process(Object row, int tag) throws HiveException { * Get our Single-Column Long hash map information for this specialized class. */ - hashMap = (VectorMapJoinLongHashMap) vectorMapJoinHashTable; + hashMap = vectorMapJoinHashTableFind; useMinMax = hashMap.useMinMax(); if (useMinMax) { min = hashMap.min(); @@ -131,313 +149,121 @@ public void process(Object row, int tag) throws HiveException { } batchCounter++; - - final int inputLogicalSize = batch.size; - - if (inputLogicalSize == 0) { - if (isLogDebugEnabled) { - LOG.debug(CLASS_NAME + " batch #" + batchCounter + " empty"); - } - return; - } + inputRowCounter += batch.size; // Do the per-batch setup for an outer join. - outerPerBatchSetup(batch); - - // For outer join, remember our input rows before ON expression filtering or before - // hash table matching so we can generate results for all rows (matching and non matching) - // later. - boolean inputSelectedInUse = batch.selectedInUse; - if (inputSelectedInUse) { - // if (!verifyMonotonicallyIncreasing(batch.selected, batch.size)) { - // throw new HiveException("batch.selected is not in sort order and unique"); - // } - System.arraycopy(batch.selected, 0, inputSelected, 0, inputLogicalSize); - } - - // Filtering for outer join just removes rows available for hash table matching. 
- boolean someRowsFilteredOut = false; - if (bigTableFilterExpressions.length > 0) { - // Since the input - for (VectorExpression ve : bigTableFilterExpressions) { - ve.evaluate(batch); - } - someRowsFilteredOut = (batch.size != inputLogicalSize); + if (!outerPerBatchSetup(batch)) { if (isLogDebugEnabled) { - if (batch.selectedInUse) { - if (inputSelectedInUse) { - LOG.debug(CLASS_NAME + - " inputSelected " + intArrayToRangesString(inputSelected, inputLogicalSize) + - " filtered batch.selected " + intArrayToRangesString(batch.selected, batch.size)); - } else { - LOG.debug(CLASS_NAME + - " inputLogicalSize " + inputLogicalSize + - " filtered batch.selected " + intArrayToRangesString(batch.selected, batch.size)); - } - } + LOG.info(getLoggingPrefix() + " batch #" + batchCounter + " empty"); } + return; } - // Perform any key expressions. Results will go into scratch columns. - if (bigTableKeyExpressions != null) { - for (VectorExpression ve : bigTableKeyExpressions) { - ve.evaluate(batch); - } - } + if (filteredSize > 0) { + longKeySeries.processBatch(batch); - /* - * Single-Column Long specific declarations. - */ + int matchSeriesCount = 0; + int spillSeriesCount = 0; + int hashMapResultCount = 0; - // The one join column for this specialized class. - LongColumnVector joinColVector = (LongColumnVector) batch.cols[singleJoinColumn]; - long[] vector = joinColVector.vector; + // UNDONE: Debugging + int logical; + int batchIndex; + boolean selectedInUse = batch.selectedInUse; + int[] selected = batch.selected; - /* - * Single-Column Long check for repeating. - */ + MapJoinHashMapResult hashMapResult; + MapJoinHashTableResult.MapJoinResult lookupResult; + long key; + do { + logical = longKeySeries.getCurrentLogical(); + batchIndex = (selectedInUse ? selected[logical] : logical); - // Check single column for repeating. - boolean allKeyInputColumnsRepeating = joinColVector.isRepeating; + // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, getLoggingPrefix() + " candidate batch"); - if (allKeyInputColumnsRepeating) { + // Use the next hash map result entry. + hashMapResult = hashMapResults[hashMapResultCount]; - /* - * Repeating. - */ + if (longKeySeries.getCurrentKeyAllNull()) { - // All key input columns are repeating. Generate key once. Lookup once. - // Since the key is repeated, we must use entry 0 regardless of selectedInUse. + // CONSIDER: Add support for NullSafe option. - /* - * Single-Column Long specific repeated lookup. - */ + lookupResult = MapJoinHashTableResult.MapJoinResult.NO_MATCH; + outerJoinNullKeyCounter += longKeySeries.getCurrentDuplicateCount(); - JoinUtil.JoinResult joinResult; - if (batch.size == 0) { - // Whole repeated key batch was filtered out. - joinResult = JoinUtil.JoinResult.NOMATCH; - } else if (!joinColVector.noNulls && joinColVector.isNull[0]) { - // Any (repeated) null key column is no match for whole batch. - joinResult = JoinUtil.JoinResult.NOMATCH; - } else { - // Handle *repeated* join key, if found. - long key = vector[0]; - // LOG.debug(CLASS_NAME + " repeated key " + key); - if (useMinMax && (key < min || key > max)) { - // Out of range for whole batch. - joinResult = JoinUtil.JoinResult.NOMATCH; } else { - joinResult = hashMap.lookup(key, hashMapResults[0]); - } - } - - /* - * Common repeated join result processing. 
- */ - - if (isLogDebugEnabled) { - LOG.debug(CLASS_NAME + " batch #" + batchCounter + " repeated joinResult " + joinResult.name()); - } - finishOuterRepeated(batch, joinResult, hashMapResults[0], someRowsFilteredOut, - inputSelectedInUse, inputLogicalSize); - } else { - - /* - * NOT Repeating. - */ - - if (isLogDebugEnabled) { - LOG.debug(CLASS_NAME + " batch #" + batchCounter + " non-repeated"); - } - - int selected[] = batch.selected; - boolean selectedInUse = batch.selectedInUse; - - int hashMapResultCount = 0; - int allMatchCount = 0; - int equalKeySeriesCount = 0; - int spillCount = 0; - - boolean atLeastOneNonMatch = someRowsFilteredOut; - /* - * Single-Column Long specific variables. - */ - - long saveKey = 0; + key = longKeySeries.getCurrentKey(); - // We optimize performance by only looking up the first key in a series of equal keys. - boolean haveSaveKey = false; - JoinUtil.JoinResult saveJoinResult = JoinUtil.JoinResult.NOMATCH; - - // Logical loop over the rows in the batch since the batch may have selected in use. - for (int logical = 0; logical < batch.size; logical++) { - int batchIndex = (selectedInUse ? selected[logical] : logical); + if (useMinMax && (key < min || key > max)) { + // Out of range for whole batch. + lookupResult = MapJoinHashTableResult.MapJoinResult.NO_MATCH; + } else { + hashMap.hashMapLookup( + key, + longKeySeries.getCurrentHashCode(), + hashMapResult); + lookupResult = hashMapResult.getMapJoinResult(); + } - // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, taskName + ", " + getOperatorId() + " candidate " + CLASS_NAME + " batch"); + } /* - * Single-Column Long outer null detection. + * Common inner join result processing. */ - boolean isNull = !joinColVector.noNulls && joinColVector.isNull[batchIndex]; - - if (isNull) { - - // Have that the NULL does not interfere with the current equal key series, if there - // is one. We do not set saveJoinResult. - // - // Let a current MATCH equal key series keep going, or - // Let a current SPILL equal key series keep going, or - // Let a current NOMATCH keep not matching. - - atLeastOneNonMatch = true; - - // LOG.debug(CLASS_NAME + " logical " + logical + " batchIndex " + batchIndex + " NULL"); - } else { - - /* - * Single-Column Long outer get key. - */ - - long currentKey = vector[batchIndex]; - - /* - * Equal key series checking. - */ - - if (!haveSaveKey || currentKey != saveKey) { - // New key. - - if (haveSaveKey) { - // Move on with our counts. - switch (saveJoinResult) { - case MATCH: - hashMapResultCount++; - equalKeySeriesCount++; - break; - case SPILL: - hashMapResultCount++; - break; - case NOMATCH: - break; - } - } - - // Regardless of our matching result, we keep that information to make multiple use - // of it for a possible series of equal keys. - haveSaveKey = true; - - /* - * Single-Column Long specific save key. - */ - - saveKey = currentKey; - - /* - * Single-Column Long specific lookup key. - */ - - if (useMinMax && (currentKey < min || currentKey > max)) { - // Key out of range for whole hash table. - saveJoinResult = JoinUtil.JoinResult.NOMATCH; - } else { - saveJoinResult = hashMap.lookup(currentKey, hashMapResults[hashMapResultCount]); - } - - // LOG.debug(CLASS_NAME + " logical " + logical + " batchIndex " + batchIndex + " New Key " + currentKey + " " + saveJoinResult.name()); - - /* - * Common outer join result processing. 
- */ - - switch (saveJoinResult) { - case MATCH: - equalKeySeriesHashMapResultIndices[equalKeySeriesCount] = hashMapResultCount; - equalKeySeriesAllMatchIndices[equalKeySeriesCount] = allMatchCount; - equalKeySeriesIsSingleValue[equalKeySeriesCount] = hashMapResults[hashMapResultCount].isSingleRow(); - equalKeySeriesDuplicateCounts[equalKeySeriesCount] = 1; - allMatchs[allMatchCount++] = batchIndex; - // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " MATCH isSingleValue " + equalKeySeriesIsSingleValue[equalKeySeriesCount] + " currentKey " + currentKey); - break; - - case SPILL: - spills[spillCount] = batchIndex; - spillHashMapResultIndices[spillCount] = hashMapResultCount; - spillCount++; - break; - - case NOMATCH: - atLeastOneNonMatch = true; - // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " NOMATCH" + " currentKey " + currentKey); - break; - } - } else { - // LOG.debug(CLASS_NAME + " logical " + logical + " batchIndex " + batchIndex + " Key Continues " + saveKey + " " + saveJoinResult.name()); - - // Series of equal keys. - - switch (saveJoinResult) { - case MATCH: - equalKeySeriesDuplicateCounts[equalKeySeriesCount]++; - allMatchs[allMatchCount++] = batchIndex; - // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " MATCH duplicate"); - break; - - case SPILL: - spills[spillCount] = batchIndex; - spillHashMapResultIndices[spillCount] = hashMapResultCount; - spillCount++; - break; - - case NOMATCH: - // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " NOMATCH duplicate"); - break; - } - } - // if (!verifyMonotonicallyIncreasing(allMatchs, allMatchCount)) { - // throw new HiveException("allMatchs is not in sort order and unique"); - // } - } - } - - if (haveSaveKey) { - // Update our counts for the last key. 
- switch (saveJoinResult) { + switch (lookupResult) { case MATCH: + matchLogicalIndices[matchSeriesCount] = longKeySeries.getCurrentLogical(); + matchDuplicateCounts[matchSeriesCount] = longKeySeries.getCurrentDuplicateCount(); + matchIsSingleValue[matchSeriesCount] = hashMapResult.isSingleRow(); + matchHashMapResults[matchSeriesCount] = hashMapResult; + matchSeriesCount++; hashMapResultCount++; - equalKeySeriesCount++; + // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, getLoggingPrefix() + " MATCH"); break; + case SPILL: + spillLogicalIndices[spillSeriesCount] = longKeySeries.getCurrentLogical(); + spillDuplicateCounts[spillSeriesCount] = longKeySeries.getCurrentDuplicateCount(); + spillHashMapResults[spillSeriesCount] = hashMapResult; + spillSeriesCount++; hashMapResultCount++; + spilledRowCounter += longKeySeries.getCurrentDuplicateCount(); break; - case NOMATCH: + + case NO_MATCH: + // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, getLoggingPrefix() + " NOMATCH"); break; + + default: + throw new RuntimeException("Unexpected lookup result " + lookupResult.name()); } - } + + if (!longKeySeries.next()) { + break; + } + } while (true); if (isLogDebugEnabled) { - LOG.debug(CLASS_NAME + " batch #" + batchCounter + - " allMatchs " + intArrayToRangesString(allMatchs,allMatchCount) + - " equalKeySeriesHashMapResultIndices " + intArrayToRangesString(equalKeySeriesHashMapResultIndices, equalKeySeriesCount) + - " equalKeySeriesAllMatchIndices " + intArrayToRangesString(equalKeySeriesAllMatchIndices, equalKeySeriesCount) + - " equalKeySeriesIsSingleValue " + Arrays.toString(Arrays.copyOfRange(equalKeySeriesIsSingleValue, 0, equalKeySeriesCount)) + - " equalKeySeriesDuplicateCounts " + Arrays.toString(Arrays.copyOfRange(equalKeySeriesDuplicateCounts, 0, equalKeySeriesCount)) + - " atLeastOneNonMatch " + atLeastOneNonMatch + - " inputSelectedInUse " + inputSelectedInUse + + LOG.debug(getLoggingPrefix() + " batch #" + batchCounter + " batch info " + " inputLogicalSize " + inputLogicalSize + - " spills " + intArrayToRangesString(spills, spillCount) + - " spillHashMapResultIndices " + intArrayToRangesString(spillHashMapResultIndices, spillCount) + - " hashMapResults " + Arrays.toString(Arrays.copyOfRange(hashMapResults, 0, hashMapResultCount))); + " matchSeriesCount " + matchSeriesCount + + " matchLogicalIndices " + intArrayToRangesString(matchLogicalIndices, matchSeriesCount) + + " matchDuplicateCounts " + Arrays.toString(Arrays.copyOfRange(matchDuplicateCounts, 0, matchSeriesCount)) + + " matchHashMapResults " + Arrays.toString(Arrays.copyOfRange(matchHashMapResults, 0, matchSeriesCount)) + + " spillSeriesCount " + spillSeriesCount + + " spillLogicalIndices " + intArrayToRangesString(spillLogicalIndices, spillSeriesCount) + + " spillDuplicateCounts " + Arrays.toString(Arrays.copyOfRange(spillDuplicateCounts, 0, spillSeriesCount))); } - - // We will generate results for all matching and non-matching rows. 
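The new code path above replaces the old per-row saved-key comparison: a key-series object walks runs of equal adjacent keys and the operator issues one hash table probe per run, optionally pre-filtered by the small table's [min, max] key range. A self-contained sketch of that pattern over a plain long[] (hypothetical lookup table; the real operator uses the VectorKeySeriesLong and MapJoinHashTableFind interfaces introduced by this patch):

    import java.util.HashMap;
    import java.util.Map;

    public class EqualKeySeriesSketch {
      public static void main(String[] args) {
        long[] keys = {5, 5, 5, 9, 9, 2, 42};   // big-table keys in batch order
        Map<Long, String> smallTable = new HashMap<>();
        smallTable.put(5L, "five");
        smallTable.put(2L, "two");
        long min = 2, max = 9;                  // key range of the loaded small table

        int logical = 0;
        while (logical < keys.length) {
          long currentKey = keys[logical];
          // Extend the run of equal adjacent keys (the "duplicate count").
          int duplicateCount = 1;
          while (logical + duplicateCount < keys.length
              && keys[logical + duplicateCount] == currentKey) {
            duplicateCount++;
          }
          // One lookup covers the whole run; out-of-range keys skip the probe.
          String value = (currentKey < min || currentKey > max)
              ? null : smallTable.get(currentKey);
          System.out.println("key " + currentKey + " x" + duplicateCount + " -> "
              + (value == null ? "NO_MATCH" : "MATCH(" + value + ")"));
          logical += duplicateCount;
        }
      }
    }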
- finishOuter(batch, - allMatchCount, equalKeySeriesCount, atLeastOneNonMatch, - inputSelectedInUse, inputLogicalSize, - spillCount, hashMapResultCount); + finishOuter(batch, matchSeriesCount, spillSeriesCount); + } else { + if (isLogDebugEnabled) { + LOG.debug(getLoggingPrefix() + " batch #" + batchCounter + " batch info " + + " inputLogicalSize " + inputLogicalSize + " filtered out"); + } + finishOuter(batch, 0, 0); } if (batch.size > 0) { diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinOuterMultiKeyOperator.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinOuterMultiKeyOperator.java index 7f9afd2..2a8a865 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinOuterMultiKeyOperator.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinOuterMultiKeyOperator.java @@ -26,18 +26,15 @@ import org.apache.hadoop.hive.ql.CompilationOpContext; import org.apache.hadoop.hive.ql.exec.JoinUtil; import org.apache.hadoop.hive.ql.exec.vector.ColumnVector; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashMapResult; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableFind; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableResult; import org.apache.hadoop.hive.ql.exec.vector.VectorizationContext; import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpression; +import org.apache.hadoop.hive.ql.exec.vector.keyseries.fast.VectorKeySeriesMultiFast; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.plan.OperatorDesc; - -// Multi-Key hash table import. -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinBytesHashMap; - -// Multi-Key specific imports. -import org.apache.hadoop.hive.ql.exec.vector.VectorSerializeRow; -import org.apache.hadoop.hive.serde2.ByteStream.Output; import org.apache.hadoop.hive.serde2.binarysortable.fast.BinarySortableSerializeWrite; /* @@ -47,8 +44,17 @@ public class VectorMapJoinOuterMultiKeyOperator extends VectorMapJoinOuterGenerateResultOperator { private static final long serialVersionUID = 1L; - private static final Logger LOG = LoggerFactory.getLogger(VectorMapJoinOuterMultiKeyOperator.class.getName()); + + //------------------------------------------------------------------------------------------------ + private static final String CLASS_NAME = VectorMapJoinOuterMultiKeyOperator.class.getName(); + private static final Logger LOG = LoggerFactory.getLogger(CLASS_NAME); + + protected String getLoggingPrefix() { + return super.getLoggingPrefix(CLASS_NAME); + } + + //------------------------------------------------------------------------------------------------ // (none) @@ -57,20 +63,17 @@ //--------------------------------------------------------------------------- // The hash map for this specialized class. - private transient VectorMapJoinBytesHashMap hashMap; + private transient MapJoinHashTableFind hashMap; //--------------------------------------------------------------------------- // Multi-Key specific members. // - // Object that can take a set of columns in row in a vectorized row batch and serialized it. - private transient VectorSerializeRow keyVectorSerializeWrite; - - // The BinarySortable serialization of the current key. - private transient Output currentKeyOutput; + // Binary sortable multi-key serializer. 
+ private transient BinarySortableSerializeWrite binarySortableSerializeWrite; - // The BinarySortable serialization of the saved key for a possible series of equal keys. - private transient Output saveKeyOutput; + // The object that determines equal key series. + private transient VectorKeySeriesMultiFast serializedMultiKeySeries; //--------------------------------------------------------------------------- // Pass-thru constructors. @@ -110,12 +113,15 @@ public void process(Object row, int tag) throws HiveException { * Initialize Multi-Key members for this specialized class. */ - keyVectorSerializeWrite = new VectorSerializeRow( - new BinarySortableSerializeWrite(bigTableKeyColumnMap.length)); - keyVectorSerializeWrite.init(bigTableKeyTypeNames, bigTableKeyColumnMap); + binarySortableSerializeWrite = + new BinarySortableSerializeWrite(bigTableKeyColumnMap.length); - currentKeyOutput = new Output(); - saveKeyOutput = new Output(); + // For Multi-Key the Fast hash table computes the serialized key's hash code, + // so we don't use VectorKeySeriesMulti's hash code. + serializedMultiKeySeries = + new VectorKeySeriesMultiFast( + binarySortableSerializeWrite); + serializedMultiKeySeries.init(bigTableKeyTypeInfos, bigTableKeyColumnMap); needCommonSetup = false; } @@ -128,332 +134,113 @@ public void process(Object row, int tag) throws HiveException { * Get our Multi-Key hash map information for this specialized class. */ - hashMap = (VectorMapJoinBytesHashMap) vectorMapJoinHashTable; + hashMap = vectorMapJoinHashTableFind; needHashTableSetup = false; } batchCounter++; + inputRowCounter += batch.size; - final int inputLogicalSize = batch.size; + // Do the per-batch setup for an outer join. - if (inputLogicalSize == 0) { + if (!outerPerBatchSetup(batch)) { if (isLogDebugEnabled) { LOG.debug(CLASS_NAME + " batch #" + batchCounter + " empty"); } return; } - // Do the per-batch setup for an outer join. - - outerPerBatchSetup(batch); - - // For outer join, remember our input rows before ON expression filtering or before - // hash table matching so we can generate results for all rows (matching and non matching) - // later. - boolean inputSelectedInUse = batch.selectedInUse; - if (inputSelectedInUse) { - // if (!verifyMonotonicallyIncreasing(batch.selected, batch.size)) { - // throw new HiveException("batch.selected is not in sort order and unique"); - // } - System.arraycopy(batch.selected, 0, inputSelected, 0, inputLogicalSize); - } - - // Filtering for outer join just removes rows available for hash table matching. - boolean someRowsFilteredOut = false; - if (bigTableFilterExpressions.length > 0) { - // Since the input - for (VectorExpression ve : bigTableFilterExpressions) { - ve.evaluate(batch); - } - someRowsFilteredOut = (batch.size != inputLogicalSize); - if (isLogDebugEnabled) { - if (batch.selectedInUse) { - if (inputSelectedInUse) { - LOG.debug(CLASS_NAME + - " inputSelected " + intArrayToRangesString(inputSelected, inputLogicalSize) + - " filtered batch.selected " + intArrayToRangesString(batch.selected, batch.size)); - } else { - LOG.debug(CLASS_NAME + - " inputLogicalSize " + inputLogicalSize + - " filtered batch.selected " + intArrayToRangesString(batch.selected, batch.size)); - } - } - } - } - - // Perform any key expressions. Results will go into scratch columns. - if (bigTableKeyExpressions != null) { - for (VectorExpression ve : bigTableKeyExpressions) { - ve.evaluate(batch); - } - } - - /* - * Multi-Key specific declarations. - */ - - // None. 
- - /* - * Multi-Key Long check for repeating. - */ - - // If all BigTable input columns to key expressions are isRepeating, then - // calculate key once; lookup once. - // Also determine if any nulls are present since for a join that means no match. - boolean allKeyInputColumnsRepeating; - boolean someKeyInputColumnIsNull = false; // Only valid if allKeyInputColumnsRepeating is true. - if (bigTableKeyColumnMap.length == 0) { - allKeyInputColumnsRepeating = false; - } else { - allKeyInputColumnsRepeating = true; - for (int i = 0; i < bigTableKeyColumnMap.length; i++) { - ColumnVector colVector = batch.cols[bigTableKeyColumnMap[i]]; - if (!colVector.isRepeating) { - allKeyInputColumnsRepeating = false; - break; - } - if (!colVector.noNulls && colVector.isNull[0]) { - someKeyInputColumnIsNull = true; - } - } - } - - if (allKeyInputColumnsRepeating) { - - /* - * Repeating. - */ - - // All key input columns are repeating. Generate key once. Lookup once. - // Since the key is repeated, we must use entry 0 regardless of selectedInUse. - - /* - * Multi-Key specific repeated lookup. - */ - - JoinUtil.JoinResult joinResult; - if (batch.size == 0) { - // Whole repeated key batch was filtered out. - joinResult = JoinUtil.JoinResult.NOMATCH; - } else if (someKeyInputColumnIsNull) { - // Any (repeated) null key column is no match for whole batch. - joinResult = JoinUtil.JoinResult.NOMATCH; - } else { - - // All key input columns are repeating. Generate key once. Lookup once. - keyVectorSerializeWrite.setOutput(currentKeyOutput); - keyVectorSerializeWrite.serializeWrite(batch, 0); - byte[] keyBytes = currentKeyOutput.getData(); - int keyLength = currentKeyOutput.getLength(); - joinResult = hashMap.lookup(keyBytes, 0, keyLength, hashMapResults[0]); - } - - /* - * Common repeated join result processing. - */ - - if (isLogDebugEnabled) { - LOG.debug(CLASS_NAME + " batch #" + batchCounter + " repeated joinResult " + joinResult.name()); - } - finishOuterRepeated(batch, joinResult, hashMapResults[0], someRowsFilteredOut, - inputSelectedInUse, inputLogicalSize); - } else { - - /* - * NOT Repeating. - */ - - if (isLogDebugEnabled) { - LOG.debug(CLASS_NAME + " batch #" + batchCounter + " non-repeated"); - } - - int selected[] = batch.selected; - boolean selectedInUse = batch.selectedInUse; + if (filteredSize > 0) { + serializedMultiKeySeries.processBatch(batch); + int matchSeriesCount = 0; + int spillSeriesCount = 0; int hashMapResultCount = 0; - int allMatchCount = 0; - int equalKeySeriesCount = 0; - int spillCount = 0; - - boolean atLeastOneNonMatch = someRowsFilteredOut; - - /* - * Multi-Key specific variables. - */ - Output temp; + MapJoinHashMapResult hashMapResult; + MapJoinHashTableResult.MapJoinResult lookupResult; + do { + // Use the next hash map result entry. + hashMapResult = hashMapResults[hashMapResultCount]; - // We optimize performance by only looking up the first key in a series of equal keys. - boolean haveSaveKey = false; - JoinUtil.JoinResult saveJoinResult = JoinUtil.JoinResult.NOMATCH; - - // Logical loop over the rows in the batch since the batch may have selected in use. - for (int logical = 0; logical < batch.size; logical++) { - int batchIndex = (selectedInUse ? selected[logical] : logical); - - // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, taskName + ", " + getOperatorId() + " candidate " + CLASS_NAME + " batch"); - - /* - * Multi-Key outer null detection. 
+ /** + * NOTE: the usage of ~HasAnyNull because any null key in the multi-column key means + * NO_MATCH for Outer Join. */ + if (serializedMultiKeySeries.getCurrentKeyAllNull() || + serializedMultiKeySeries.getCurrentKeyHasAnyNulls()) { - // Generate binary sortable key for current row in vectorized row batch. - keyVectorSerializeWrite.setOutput(currentKeyOutput); - keyVectorSerializeWrite.serializeWrite(batch, batchIndex); - if (keyVectorSerializeWrite.getHasAnyNulls()) { - - // Have that the NULL does not interfere with the current equal key series, if there - // is one. We do not set saveJoinResult. - // - // Let a current MATCH equal key series keep going, or - // Let a current SPILL equal key series keep going, or - // Let a current NOMATCH keep not matching. + // CONSIDER: Add support for NullSafe option. - atLeastOneNonMatch = true; + lookupResult = MapJoinHashTableResult.MapJoinResult.NO_MATCH; + outerJoinNullKeyCounter += serializedMultiKeySeries.getCurrentDuplicateCount(); - // LOG.debug(CLASS_NAME + " logical " + logical + " batchIndex " + batchIndex + " NULL"); } else { - /* - * Multi-Key outer get key. - */ - - // Generated earlier to get possible null(s). - - /* - * Equal key series checking. - */ - - if (!haveSaveKey || !saveKeyOutput.arraysEquals(currentKeyOutput)) { - - // New key. - - if (haveSaveKey) { - // Move on with our counts. - switch (saveJoinResult) { - case MATCH: - hashMapResultCount++; - equalKeySeriesCount++; - break; - case SPILL: - hashMapResultCount++; - break; - case NOMATCH: - break; - } - } - - // Regardless of our matching result, we keep that information to make multiple use - // of it for a possible series of equal keys. - haveSaveKey = true; - - /* - * Multi-Key specific save key. - */ - - temp = saveKeyOutput; - saveKeyOutput = currentKeyOutput; - currentKeyOutput = temp; - - /* - * Multi-Key specific lookup key. - */ - - byte[] keyBytes = saveKeyOutput.getData(); - int keyLength = saveKeyOutput.getLength(); - saveJoinResult = hashMap.lookup(keyBytes, 0, keyLength, hashMapResults[hashMapResultCount]); - - /* - * Common outer join result processing. - */ - - switch (saveJoinResult) { - case MATCH: - equalKeySeriesHashMapResultIndices[equalKeySeriesCount] = hashMapResultCount; - equalKeySeriesAllMatchIndices[equalKeySeriesCount] = allMatchCount; - equalKeySeriesIsSingleValue[equalKeySeriesCount] = hashMapResults[hashMapResultCount].isSingleRow(); - equalKeySeriesDuplicateCounts[equalKeySeriesCount] = 1; - allMatchs[allMatchCount++] = batchIndex; - // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " MATCH isSingleValue " + equalKeySeriesIsSingleValue[equalKeySeriesCount] + " currentKey " + currentKey); - break; - - case SPILL: - spills[spillCount] = batchIndex; - spillHashMapResultIndices[spillCount] = hashMapResultCount; - spillCount++; - break; - - case NOMATCH: - atLeastOneNonMatch = true; - // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " NOMATCH" + " currentKey " + currentKey); - break; - } - } else { - // LOG.debug(CLASS_NAME + " logical " + logical + " batchIndex " + batchIndex + " Key Continues " + saveKey + " " + saveJoinResult.name()); - - // Series of equal keys. 
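As the NOTE above says, a NULL in any column of a multi-column join key cannot equal any small-table key under SQL equality semantics, so the outer operator counts the whole run as non-matching without probing the hash table. A trivial illustrative check (not the serializer's actual null tracking):

    public class CompositeKeyNullSketch {
      // True if any column of the composite join key is NULL; an outer join
      // then emits the big-table rows with NULL small-table values directly.
      static boolean hasAnyNullKeyColumn(Object[] keyColumns) {
        for (Object column : keyColumns) {
          if (column == null) {
            return true;
          }
        }
        return false;
      }

      public static void main(String[] args) {
        System.out.println(hasAnyNullKeyColumn(new Object[] {1L, "a"}));   // false
        System.out.println(hasAnyNullKeyColumn(new Object[] {1L, null}));  // true
      }
    }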
- - switch (saveJoinResult) { - case MATCH: - equalKeySeriesDuplicateCounts[equalKeySeriesCount]++; - allMatchs[allMatchCount++] = batchIndex; - // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " MATCH duplicate"); - break; - - case SPILL: - spills[spillCount] = batchIndex; - spillHashMapResultIndices[spillCount] = hashMapResultCount; - spillCount++; - break; - - case NOMATCH: - // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " NOMATCH duplicate"); - break; - } - } - // if (!verifyMonotonicallyIncreasing(allMatchs, allMatchCount)) { - // throw new HiveException("allMatchs is not in sort order and unique"); - // } + hashMap.hashMapLookup( + serializedMultiKeySeries.getSerializedBytes(), + serializedMultiKeySeries.getSerializedStart(), + serializedMultiKeySeries.getSerializedLength(), + serializedMultiKeySeries.getCurrentHashCode(), + hashMapResult); + lookupResult = hashMapResult.getMapJoinResult(); } - } - if (haveSaveKey) { - // Update our counts for the last key. - switch (saveJoinResult) { + /* + * Common inner join result processing. + */ + + switch (lookupResult) { case MATCH: + matchLogicalIndices[matchSeriesCount] = serializedMultiKeySeries.getCurrentLogical(); + matchDuplicateCounts[matchSeriesCount] = serializedMultiKeySeries.getCurrentDuplicateCount(); + matchIsSingleValue[matchSeriesCount] = hashMapResult.isSingleRow(); + matchHashMapResults[matchSeriesCount] = hashMapResult; + matchSeriesCount++; hashMapResultCount++; - equalKeySeriesCount++; break; + case SPILL: + spillLogicalIndices[spillSeriesCount] = serializedMultiKeySeries.getCurrentLogical(); + spillDuplicateCounts[spillSeriesCount] = serializedMultiKeySeries.getCurrentDuplicateCount(); + spillHashMapResults[spillSeriesCount] = hashMapResult; + spillSeriesCount++; hashMapResultCount++; + spilledRowCounter += serializedMultiKeySeries.getCurrentDuplicateCount(); break; - case NOMATCH: + + case NO_MATCH: break; + + default: + throw new RuntimeException("Unexpected lookup result " + lookupResult.name()); } - } + + if (!serializedMultiKeySeries.next()) { + break; + } + } while (true); if (isLogDebugEnabled) { - LOG.debug(CLASS_NAME + " batch #" + batchCounter + - " allMatchs " + intArrayToRangesString(allMatchs,allMatchCount) + - " equalKeySeriesHashMapResultIndices " + intArrayToRangesString(equalKeySeriesHashMapResultIndices, equalKeySeriesCount) + - " equalKeySeriesAllMatchIndices " + intArrayToRangesString(equalKeySeriesAllMatchIndices, equalKeySeriesCount) + - " equalKeySeriesIsSingleValue " + Arrays.toString(Arrays.copyOfRange(equalKeySeriesIsSingleValue, 0, equalKeySeriesCount)) + - " equalKeySeriesDuplicateCounts " + Arrays.toString(Arrays.copyOfRange(equalKeySeriesDuplicateCounts, 0, equalKeySeriesCount)) + - " atLeastOneNonMatch " + atLeastOneNonMatch + - " inputSelectedInUse " + inputSelectedInUse + + LOG.debug(CLASS_NAME + " batch #" + batchCounter + " batch info " + " inputLogicalSize " + inputLogicalSize + - " spills " + intArrayToRangesString(spills, spillCount) + - " spillHashMapResultIndices " + intArrayToRangesString(spillHashMapResultIndices, spillCount) + - " hashMapResults " + Arrays.toString(Arrays.copyOfRange(hashMapResults, 0, hashMapResultCount))); + " matchSeriesCount " + matchSeriesCount + + " matchLogicalIndices " + intArrayToRangesString(matchLogicalIndices, matchSeriesCount) + + " matchDuplicateCounts " + Arrays.toString(Arrays.copyOfRange(matchDuplicateCounts, 0, matchSeriesCount)) + + " matchHashMapResults " + 
Arrays.toString(Arrays.copyOfRange(matchHashMapResults, 0, matchSeriesCount)) + + " spillSeriesCount " + spillSeriesCount + + " spillLogicalIndices " + intArrayToRangesString(spillLogicalIndices, spillSeriesCount) + + " spillDuplicateCounts " + Arrays.toString(Arrays.copyOfRange(spillDuplicateCounts, 0, spillSeriesCount))); } - // We will generate results for all matching and non-matching rows. - finishOuter(batch, - allMatchCount, equalKeySeriesCount, atLeastOneNonMatch, - inputSelectedInUse, inputLogicalSize, - spillCount, hashMapResultCount); + finishOuter(batch, matchSeriesCount, spillSeriesCount); + } else { + if (isLogDebugEnabled) { + LOG.debug(getLoggingPrefix() + " batch #" + batchCounter + " batch info " + + " inputLogicalSize " + inputLogicalSize + " filtered out"); + } + finishOuter(batch, 0, 0); } if (batch.size > 0) { diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinOuterStringOperator.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinOuterStringOperator.java index 8ed1ed4..4dac3b5 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinOuterStringOperator.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinOuterStringOperator.java @@ -25,6 +25,9 @@ import org.slf4j.LoggerFactory; import org.apache.hadoop.hive.ql.CompilationOpContext; import org.apache.hadoop.hive.ql.exec.JoinUtil; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashMapResult; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableFind; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableResult; import org.apache.hadoop.hive.ql.exec.vector.VectorizationContext; import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpression; @@ -32,11 +35,7 @@ import org.apache.hadoop.hive.ql.plan.OperatorDesc; // Single-Column String hash table import. -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinBytesHashMap; - -// Single-Column String specific imports. -import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; -import org.apache.hadoop.hive.ql.exec.vector.expressions.StringExpr; +import org.apache.hadoop.hive.ql.exec.vector.keyseries.VectorKeySeriesBytes; /* * Specialized class for doing a vectorized map join that is an outer join on a Single-Column String @@ -45,8 +44,17 @@ public class VectorMapJoinOuterStringOperator extends VectorMapJoinOuterGenerateResultOperator { private static final long serialVersionUID = 1L; - private static final Logger LOG = LoggerFactory.getLogger(VectorMapJoinOuterStringOperator.class.getName()); + + //------------------------------------------------------------------------------------------------ + private static final String CLASS_NAME = VectorMapJoinOuterStringOperator.class.getName(); + private static final Logger LOG = LoggerFactory.getLogger(CLASS_NAME); + + protected String getLoggingPrefix() { + return super.getLoggingPrefix(CLASS_NAME); + } + + //------------------------------------------------------------------------------------------------ // (none) @@ -55,7 +63,7 @@ //--------------------------------------------------------------------------- // The hash map for this specialized class. 
- private transient VectorMapJoinBytesHashMap hashMap; + private transient MapJoinHashTableFind hashMap; //--------------------------------------------------------------------------- // Single-Column String specific members. @@ -64,6 +72,9 @@ // The column number for this one column join specialization. private transient int singleJoinColumn; + // The object that determines equal key series. + private transient VectorKeySeriesBytes bytesKeySeries; + //--------------------------------------------------------------------------- // Pass-thru constructors. // @@ -104,6 +115,8 @@ public void process(Object row, int tag) throws HiveException { singleJoinColumn = bigTableKeyColumnMap[0]; + bytesKeySeries = new VectorKeySeriesBytes(singleJoinColumn); + needCommonSetup = false; } @@ -115,315 +128,109 @@ public void process(Object row, int tag) throws HiveException { * Get our Single-Column String hash map information for this specialized class. */ - hashMap = (VectorMapJoinBytesHashMap) vectorMapJoinHashTable; + hashMap = vectorMapJoinHashTableFind; needHashTableSetup = false; } batchCounter++; + inputRowCounter += batch.size; - final int inputLogicalSize = batch.size; + // Do the per-batch setup for an outer join. - if (inputLogicalSize == 0) { + if (!outerPerBatchSetup(batch)) { if (isLogDebugEnabled) { LOG.debug(CLASS_NAME + " batch #" + batchCounter + " empty"); } return; } - // Do the per-batch setup for an outer join. - - outerPerBatchSetup(batch); - - // For outer join, remember our input rows before ON expression filtering or before - // hash table matching so we can generate results for all rows (matching and non matching) - // later. - boolean inputSelectedInUse = batch.selectedInUse; - if (inputSelectedInUse) { - // if (!verifyMonotonicallyIncreasing(batch.selected, batch.size)) { - // throw new HiveException("batch.selected is not in sort order and unique"); - // } - System.arraycopy(batch.selected, 0, inputSelected, 0, inputLogicalSize); - } - - // Filtering for outer join just removes rows available for hash table matching. - boolean someRowsFilteredOut = false; - if (bigTableFilterExpressions.length > 0) { - // Since the input - for (VectorExpression ve : bigTableFilterExpressions) { - ve.evaluate(batch); - } - someRowsFilteredOut = (batch.size != inputLogicalSize); - if (isLogDebugEnabled) { - if (batch.selectedInUse) { - if (inputSelectedInUse) { - LOG.debug(CLASS_NAME + - " inputSelected " + intArrayToRangesString(inputSelected, inputLogicalSize) + - " filtered batch.selected " + intArrayToRangesString(batch.selected, batch.size)); - } else { - LOG.debug(CLASS_NAME + - " inputLogicalSize " + inputLogicalSize + - " filtered batch.selected " + intArrayToRangesString(batch.selected, batch.size)); - } - } - } - } - - // Perform any key expressions. Results will go into scratch columns. - if (bigTableKeyExpressions != null) { - for (VectorExpression ve : bigTableKeyExpressions) { - ve.evaluate(batch); - } - } - - /* - * Single-Column String specific declarations. - */ - - // The one join column for this specialized class. - BytesColumnVector joinColVector = (BytesColumnVector) batch.cols[singleJoinColumn]; - byte[][] vector = joinColVector.vector; - int[] start = joinColVector.start; - int[] length = joinColVector.length; - - /* - * Single-Column String check for repeating. - */ - - // Check single column for repeating. - boolean allKeyInputColumnsRepeating = joinColVector.isRepeating; - - if (allKeyInputColumnsRepeating) { - - /* - * Repeating. 
- */ - - // All key input columns are repeating. Generate key once. Lookup once. - // Since the key is repeated, we must use entry 0 regardless of selectedInUse. - - /* - * Single-Column String specific repeated lookup. - */ - - JoinUtil.JoinResult joinResult; - if (batch.size == 0) { - // Whole repeated key batch was filtered out. - joinResult = JoinUtil.JoinResult.NOMATCH; - } else if (!joinColVector.noNulls && joinColVector.isNull[0]) { - // Any (repeated) null key column is no match for whole batch. - joinResult = JoinUtil.JoinResult.NOMATCH; - } else { - // Handle *repeated* join key, if found. - byte[] keyBytes = vector[0]; - int keyStart = start[0]; - int keyLength = length[0]; - joinResult = hashMap.lookup(keyBytes, keyStart, keyLength, hashMapResults[0]); - } - - /* - * Common repeated join result processing. - */ - - if (isLogDebugEnabled) { - LOG.debug(CLASS_NAME + " batch #" + batchCounter + " repeated joinResult " + joinResult.name()); - } - finishOuterRepeated(batch, joinResult, hashMapResults[0], someRowsFilteredOut, - inputSelectedInUse, inputLogicalSize); - } else { - - /* - * NOT Repeating. - */ - - if (isLogDebugEnabled) { - LOG.debug(CLASS_NAME + " batch #" + batchCounter + " non-repeated"); - } - - int selected[] = batch.selected; - boolean selectedInUse = batch.selectedInUse; + if (filteredSize > 0) { + bytesKeySeries.processBatch(batch); + int matchSeriesCount = 0; + int spillSeriesCount = 0; int hashMapResultCount = 0; - int allMatchCount = 0; - int equalKeySeriesCount = 0; - int spillCount = 0; - boolean atLeastOneNonMatch = someRowsFilteredOut; + MapJoinHashMapResult hashMapResult; + MapJoinHashTableResult.MapJoinResult lookupResult; + do { + // Use the next hash map result entry. + hashMapResult = hashMapResults[hashMapResultCount]; - /* - * Single-Column String specific variables. - */ + if (bytesKeySeries.getCurrentKeyAllNull()) { - int saveKeyBatchIndex = -1; + // CONSIDER: Add support for NullSafe option. - // We optimize performance by only looking up the first key in a series of equal keys. - boolean haveSaveKey = false; - JoinUtil.JoinResult saveJoinResult = JoinUtil.JoinResult.NOMATCH; + lookupResult = MapJoinHashTableResult.MapJoinResult.NO_MATCH; + outerJoinNullKeyCounter += bytesKeySeries.getCurrentDuplicateCount(); - // Logical loop over the rows in the batch since the batch may have selected in use. - for (int logical = 0; logical < batch.size; logical++) { - int batchIndex = (selectedInUse ? selected[logical] : logical); + } else { + + hashMap.hashMapLookup( + bytesKeySeries.getCurrentBytes(), + bytesKeySeries.getCurrentStart(), + bytesKeySeries.getCurrentLength(), + bytesKeySeries.getCurrentHashCode(), + hashMapResult); + lookupResult = hashMapResult.getMapJoinResult(); - // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, taskName + ", " + getOperatorId() + " candidate " + CLASS_NAME + " batch"); + } /* - * Single-Column String outer null detection. + * Common inner join result processing. */ - boolean isNull = !joinColVector.noNulls && joinColVector.isNull[batchIndex]; - - if (isNull) { - - // Have that the NULL does not interfere with the current equal key series, if there - // is one. We do not set saveJoinResult. - // - // Let a current MATCH equal key series keep going, or - // Let a current SPILL equal key series keep going, or - // Let a current NOMATCH keep not matching. 
- - atLeastOneNonMatch = true; - - // LOG.debug(CLASS_NAME + " logical " + logical + " batchIndex " + batchIndex + " NULL"); - } else { - - /* - * Single-Column String outer get key. - */ - - // Implicit -- use batchIndex. - - /* - * Equal key series checking. - */ - - if (!haveSaveKey || - StringExpr.equal(vector[saveKeyBatchIndex], start[saveKeyBatchIndex], length[saveKeyBatchIndex], - vector[batchIndex], start[batchIndex], length[batchIndex]) == false) { - // New key. - - if (haveSaveKey) { - // Move on with our counts. - switch (saveJoinResult) { - case MATCH: - hashMapResultCount++; - equalKeySeriesCount++; - break; - case SPILL: - hashMapResultCount++; - break; - case NOMATCH: - break; - } - } - - // Regardless of our matching result, we keep that information to make multiple use - // of it for a possible series of equal keys. - haveSaveKey = true; - - /* - * Single-Column String specific save key. - */ - - saveKeyBatchIndex = batchIndex; - - /* - * Single-Column Long specific lookup key. - */ - - byte[] keyBytes = vector[batchIndex]; - int keyStart = start[batchIndex]; - int keyLength = length[batchIndex]; - saveJoinResult = hashMap.lookup(keyBytes, keyStart, keyLength, hashMapResults[hashMapResultCount]); - - /* - * Common outer join result processing. - */ - - switch (saveJoinResult) { - case MATCH: - equalKeySeriesHashMapResultIndices[equalKeySeriesCount] = hashMapResultCount; - equalKeySeriesAllMatchIndices[equalKeySeriesCount] = allMatchCount; - equalKeySeriesIsSingleValue[equalKeySeriesCount] = hashMapResults[hashMapResultCount].isSingleRow(); - equalKeySeriesDuplicateCounts[equalKeySeriesCount] = 1; - allMatchs[allMatchCount++] = batchIndex; - // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " MATCH isSingleValue " + equalKeySeriesIsSingleValue[equalKeySeriesCount] + " currentKey " + currentKey); - break; - - case SPILL: - spills[spillCount] = batchIndex; - spillHashMapResultIndices[spillCount] = hashMapResultCount; - spillCount++; - break; - - case NOMATCH: - atLeastOneNonMatch = true; - // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " NOMATCH" + " currentKey " + currentKey); - break; - } - } else { - // LOG.debug(CLASS_NAME + " logical " + logical + " batchIndex " + batchIndex + " Key Continues " + saveKey + " " + saveJoinResult.name()); - - // Series of equal keys. - - switch (saveJoinResult) { - case MATCH: - equalKeySeriesDuplicateCounts[equalKeySeriesCount]++; - allMatchs[allMatchCount++] = batchIndex; - // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " MATCH duplicate"); - break; - - case SPILL: - spills[spillCount] = batchIndex; - spillHashMapResultIndices[spillCount] = hashMapResultCount; - spillCount++; - break; - - case NOMATCH: - // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " NOMATCH duplicate"); - break; - } - } - // if (!verifyMonotonicallyIncreasing(allMatchs, allMatchCount)) { - // throw new HiveException("allMatchs is not in sort order and unique"); - // } - } - } - - if (haveSaveKey) { - // Update our counts for the last key. 
- switch (saveJoinResult) { + switch (lookupResult) { case MATCH: + matchLogicalIndices[matchSeriesCount] = bytesKeySeries.getCurrentLogical(); + matchDuplicateCounts[matchSeriesCount] = bytesKeySeries.getCurrentDuplicateCount(); + matchIsSingleValue[matchSeriesCount] = hashMapResult.isSingleRow(); + matchHashMapResults[matchSeriesCount] = hashMapResult; + matchSeriesCount++; hashMapResultCount++; - equalKeySeriesCount++; break; + case SPILL: + spillLogicalIndices[spillSeriesCount] = bytesKeySeries.getCurrentLogical(); + spillDuplicateCounts[spillSeriesCount] = bytesKeySeries.getCurrentDuplicateCount(); + spillHashMapResults[spillSeriesCount] = hashMapResult; + spillSeriesCount++; hashMapResultCount++; + spilledRowCounter += bytesKeySeries.getCurrentDuplicateCount(); break; - case NOMATCH: + + case NO_MATCH: break; + + default: + throw new RuntimeException("Unexpected lookup result " + lookupResult.name()); } - } + + if (!bytesKeySeries.next()) { + break; + } + } while (true); if (isLogDebugEnabled) { - LOG.debug(CLASS_NAME + " batch #" + batchCounter + - " allMatchs " + intArrayToRangesString(allMatchs,allMatchCount) + - " equalKeySeriesHashMapResultIndices " + intArrayToRangesString(equalKeySeriesHashMapResultIndices, equalKeySeriesCount) + - " equalKeySeriesAllMatchIndices " + intArrayToRangesString(equalKeySeriesAllMatchIndices, equalKeySeriesCount) + - " equalKeySeriesIsSingleValue " + Arrays.toString(Arrays.copyOfRange(equalKeySeriesIsSingleValue, 0, equalKeySeriesCount)) + - " equalKeySeriesDuplicateCounts " + Arrays.toString(Arrays.copyOfRange(equalKeySeriesDuplicateCounts, 0, equalKeySeriesCount)) + - " atLeastOneNonMatch " + atLeastOneNonMatch + - " inputSelectedInUse " + inputSelectedInUse + + LOG.debug(CLASS_NAME + " batch #" + batchCounter + " batch info " + " inputLogicalSize " + inputLogicalSize + - " spills " + intArrayToRangesString(spills, spillCount) + - " spillHashMapResultIndices " + intArrayToRangesString(spillHashMapResultIndices, spillCount) + - " hashMapResults " + Arrays.toString(Arrays.copyOfRange(hashMapResults, 0, hashMapResultCount))); + " matchSeriesCount " + matchSeriesCount + + " matchLogicalIndices " + intArrayToRangesString(matchLogicalIndices, matchSeriesCount) + + " matchDuplicateCounts " + Arrays.toString(Arrays.copyOfRange(matchDuplicateCounts, 0, matchSeriesCount)) + + " matchHashMapResults " + Arrays.toString(Arrays.copyOfRange(matchHashMapResults, 0, matchSeriesCount)) + + " spillSeriesCount " + spillSeriesCount + + " spillLogicalIndices " + intArrayToRangesString(spillLogicalIndices, spillSeriesCount) + + " spillDuplicateCounts " + Arrays.toString(Arrays.copyOfRange(spillDuplicateCounts, 0, spillSeriesCount))); } - // We will generate results for all matching and non-matching rows. 
- finishOuter(batch, - allMatchCount, equalKeySeriesCount, atLeastOneNonMatch, - inputSelectedInUse, inputLogicalSize, - spillCount, hashMapResultCount); + finishOuter(batch, matchSeriesCount, spillSeriesCount); + } else { + if (isLogDebugEnabled) { + LOG.debug(getLoggingPrefix() + " batch #" + batchCounter + " batch info " + + " inputLogicalSize " + inputLogicalSize + " filtered out"); + } + finishOuter(batch, 0, 0); } if (batch.size > 0) { diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastBytesHashMap.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastBytesHashMap.java index 0ff98bd..ee50f93 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastBytesHashMap.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastBytesHashMap.java @@ -20,83 +20,214 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import org.apache.hadoop.hive.ql.exec.JoinUtil; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinBytesHashMap; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashMapResult; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashMapResult; import org.apache.hadoop.io.BytesWritable; -import org.apache.hive.common.util.HashCodeUtil; /* * An single byte array value hash map optimized for vector map join. */ -public abstract class VectorMapJoinFastBytesHashMap - extends VectorMapJoinFastBytesHashTable - implements VectorMapJoinBytesHashMap { +public abstract class VectorMapJoinFastBytesHashMap extends VectorMapJoinFastBytesHashTable { private static final Logger LOG = LoggerFactory.getLogger(VectorMapJoinFastBytesHashMap.class); - private VectorMapJoinFastValueStore valueStore; + private VectorMapJoinFastKeyAndValueStore keyAndValueStore; - @Override - public VectorMapJoinHashMapResult createHashMapResult() { - return new VectorMapJoinFastValueStore.HashMapResult(); - } + public void add(byte[] keyBytes, int keyStart, int keyLength, int hashCode, BytesWritable currentValue) { - @Override - public void assignSlot(int slot, byte[] keyBytes, int keyStart, int keyLength, - long hashCode, boolean isNewKey, BytesWritable currentValue) { + if (resizeThreshold <= keysAssigned) { + expandAndRehash(); + } + + int slot = (hashCode & logicalHashBucketMask); + long probeSlot = slot; + int i = 0; + int pairIndex; + long part1Word; + long part2Word = -1; + boolean isNewKey; + while (true) { + pairIndex = 2 * slot; + part1Word = slotPairs[pairIndex]; + if (part1Word == 0) { + isNewKey = true;; + break; + } + part2Word = slotPairs[pairIndex + 1]; + if (hashCode == VectorMapJoinFastKeyAndValueStore.getHashCode(part1Word) && + keyAndValueStore.equalKey(part1Word, part2Word, keyBytes, keyStart, keyLength)) { + isNewKey = false; + break; + } + + // Some other key (collision) - keep probing. + metricPutConflict++; + probeSlot += (++i); + slot = (int) (probeSlot & logicalHashBucketMask); + } + + if (largestNumberOfSteps < i) { + if (isLogDebugEnabled) { + LOG.debug("Probed " + i + " slots (the longest so far) to find space"); + } + largestNumberOfSteps = i; + // debugDumpKeyProbe(keyOffset, keyLength, hashCode, slot); + } byte[] valueBytes = currentValue.getBytes(); int valueLength = currentValue.getLength(); - int tripleIndex = 3 * slot; if (isNewKey) { // First entry. 
- slotTriples[tripleIndex] = keyStore.add(keyBytes, keyStart, keyLength); - slotTriples[tripleIndex + 1] = hashCode; - slotTriples[tripleIndex + 2] = valueStore.addFirst(valueBytes, 0, valueLength); - // LOG.debug("VectorMapJoinFastBytesHashMap add first keyRefWord " + Long.toHexString(slotTriples[tripleIndex]) + " hashCode " + Long.toHexString(slotTriples[tripleIndex + 1]) + " valueRefWord " + Long.toHexString(slotTriples[tripleIndex + 2])); + keyAndValueStore.addFirst(hashCode, keyBytes, keyStart, keyLength, + valueBytes, 0, valueLength); + + // Save 128 key and value reference that includes hash code. + slotPairs[pairIndex] = keyAndValueStore.addPart1Word; + slotPairs[pairIndex + 1] = keyAndValueStore.addPart2Word; keysAssigned++; } else { // Add another value. - // LOG.debug("VectorMapJoinFastBytesHashMap add more keyRefWord " + Long.toHexString(slotTriples[tripleIndex]) + " hashCode " + Long.toHexString(slotTriples[tripleIndex + 1]) + " valueRefWord " + Long.toHexString(slotTriples[tripleIndex + 2])); - slotTriples[tripleIndex + 2] = valueStore.addMore(slotTriples[tripleIndex + 2], valueBytes, 0, valueLength); - // LOG.debug("VectorMapJoinFastBytesHashMap add more new valueRefWord " + Long.toHexString(slotTriples[tripleIndex + 2])); + keyAndValueStore.addMore(part1Word, part2Word, valueBytes, 0, valueLength); + + // Update. + slotPairs[pairIndex] = keyAndValueStore.addPart1Word; } + numValues++; } @Override - public JoinUtil.JoinResult lookup(byte[] keyBytes, int keyStart, int keyLength, VectorMapJoinHashMapResult hashMapResult) { - VectorMapJoinFastValueStore.HashMapResult optimizedHashMapResult = - (VectorMapJoinFastValueStore.HashMapResult) hashMapResult; - - optimizedHashMapResult.forget(); + protected void expandAndRehashImpl(int capacity) { + + long expandTime = System.currentTimeMillis(); + int newLogicalHashBucketCount = capacity; + int newLogicalHashBucketMask = newLogicalHashBucketCount - 1; + int newMetricPutConflict = 0; + int newLargestNumberOfSteps = 0; + + int newSlotPairArraySize = newLogicalHashBucketCount * 2; + long[] newslotPairs = new long[newSlotPairArraySize]; + + for (int slot = 0; slot < logicalHashBucketCount; slot++) { + int pairIndex = slot * 2; + long part1Word = slotPairs[pairIndex]; + if (part1Word != 0) { + int hashCode = VectorMapJoinFastKeyAndValueStore.getHashCode(part1Word); + long part2Word = slotPairs[pairIndex + 1]; + + // Copy to new slot table. + int newSlot = hashCode & newLogicalHashBucketMask; + long newProbeSlot = newSlot; + int newPairIndex; + int i = 0; + while (true) { + newPairIndex = newSlot * 2; + long newPair1Word = newslotPairs[newPairIndex]; + if (newPair1Word == 0) { + break; + } + ++newMetricPutConflict; + // Some other key (collision) - keep probing. + newProbeSlot += (++i); + newSlot = (int)(newProbeSlot & newLogicalHashBucketMask); + } + + if (newLargestNumberOfSteps < i) { + if (isLogDebugEnabled) { + LOG.debug("Probed " + i + " slots (the longest so far) to find space"); + } + newLargestNumberOfSteps = i; + // debugDumpKeyProbe(keyOffset, keyLength, hashCode, slot); + } + + // Use old reference words. 
+ newslotPairs[newPairIndex] = part1Word; + newslotPairs[newPairIndex + 1] = part2Word; + } + } - long hashCode = HashCodeUtil.murmurHash(keyBytes, keyStart, keyLength); - long valueRefWord = findReadSlot(keyBytes, keyStart, keyLength, hashCode); - JoinUtil.JoinResult joinResult; - if (valueRefWord == -1) { - joinResult = JoinUtil.JoinResult.NOMATCH; - } else { - // LOG.debug("VectorMapJoinFastBytesHashMap lookup hashCode " + Long.toHexString(hashCode) + " valueRefWord " + Long.toHexString(valueRefWord) + " (valueStore != null) " + (valueStore != null)); + slotPairs = newslotPairs; + logicalHashBucketCount = newLogicalHashBucketCount; + logicalHashBucketMask = newLogicalHashBucketMask; + metricPutConflict = newMetricPutConflict; + largestNumberOfSteps = newLargestNumberOfSteps; + resizeThreshold = (int)(logicalHashBucketCount * loadFactor); + metricExpandsMs += (System.currentTimeMillis() - expandTime); + metricExpands++; + } - optimizedHashMapResult.set(valueStore, valueRefWord); + protected int getLongsPerSlot() { + return 2; + } - joinResult = JoinUtil.JoinResult.MATCH; - } + /* + * The hash table slots. For a bytes key hash table, each slot is 2 longs and the array is + * 2X sized. + * + * The slot pair is a 128 bit key and value reference that includes 32 bits for the hash code.. + */ + protected long[] slotPairs; - optimizedHashMapResult.setJoinResult(joinResult); + @Override + public void allocateBucketArray() { + int slotPairArraySize = 2 * logicalHashBucketCount; + slotPairs = new long[slotPairArraySize]; + } - return joinResult; + @Override + public void hashMapLookup(byte[] keyBytes, int keyStart, int keyLength, + int hashCode, MapJoinHashMapResult hashMapResult) { + + VectorMapJoinFastKeyAndValueStore.HashMapResult fastHashMapResult = + (VectorMapJoinFastKeyAndValueStore.HashMapResult) hashMapResult; + + fastHashMapResult.forget(); + + int slot = (hashCode & logicalHashBucketMask); + long probeSlot = slot; + int i = 0; + int pairIndex; + long part1Word; + long part2Word = -1; + while (true) { + pairIndex = slot * 2; + part1Word = slotPairs[pairIndex]; + if (part1Word != 0 && hashCode == VectorMapJoinFastKeyAndValueStore.getHashCode(part1Word)) { + + // Finally, verify the key bytes match. + part2Word = slotPairs[pairIndex + 1]; + fastHashMapResult.setKey(keyAndValueStore, part1Word, part2Word); + if (fastHashMapResult.equalKey(keyBytes, keyStart, keyLength)) { + fastHashMapResult.setMatch(); + return; + } + } + // Some other key (collision) - keep probing. + metricGetConflict++; + probeSlot += (++i); + if (i > largestNumberOfSteps) { + // We know we never went that far when we were inserting. + hashMapResult.setNoMatch(); + return; + } + slot = (int)(probeSlot & logicalHashBucketMask); + } } public VectorMapJoinFastBytesHashMap( - int initialCapacity, float loadFactor, int writeBuffersSize) { - super(initialCapacity, loadFactor, writeBuffersSize); + VectorMapJoinFastHashTableFactory mapJoinHashTableFactory, + int initialCapacity, float loadFactor, int writeBuffersSize, long memUsage) { + super(mapJoinHashTableFactory, initialCapacity, loadFactor, writeBuffersSize, memUsage); + + keyAndValueStore = new VectorMapJoinFastKeyAndValueStore(writeBuffersSize); + } - valueStore = new VectorMapJoinFastValueStore(writeBuffersSize); + @Override + public long memorySize() { + return keyAndValueStore.writeBuffers().size() + slotPairs.length * (Long.SIZE/Byte.SIZE) + 100; + } - // Share the same write buffers with our value store. 
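The new slot-pair layout keeps the key's 32-bit hash code inside the slot word, so both add and hashMapLookup can reject most collisions without touching the key bytes, and a lookup can stop as soon as it has probed farther than any insert ever did (largestNumberOfSteps). A standalone sketch of that probing scheme for plain long keys (illustrative only; the patch stores key/value store references rather than the keys themselves, and uses 0 as the empty-slot marker on the reference word):

    public class ProbeSketch {
      private final int[] hashCodes;          // 0 marks an empty slot (simplification)
      private final long[] keys;
      private final int mask;
      private int largestNumberOfSteps = 0;

      ProbeSketch(int capacityPowerOfTwo) {
        hashCodes = new int[capacityPowerOfTwo];
        keys = new long[capacityPowerOfTwo];
        mask = capacityPowerOfTwo - 1;
      }

      private static int hash(long key) {
        int h = (int) (key ^ (key >>> 32));
        return h == 0 ? 1 : h;                // reserve 0 for "empty"
      }

      void add(long key) {
        int hashCode = hash(key);
        int slot = hashCode & mask;
        long probeSlot = slot;
        int i = 0;
        // Probe while the slot is occupied by a different entry.
        while (hashCodes[slot] != 0
            && !(hashCodes[slot] == hashCode && keys[slot] == key)) {
          probeSlot += ++i;                   // triangular probing, as in the patch
          slot = (int) (probeSlot & mask);
        }
        hashCodes[slot] = hashCode;
        keys[slot] = key;
        largestNumberOfSteps = Math.max(largestNumberOfSteps, i);
      }

      boolean contains(long key) {
        int hashCode = hash(key);
        int slot = hashCode & mask;
        long probeSlot = slot;
        int i = 0;
        while (true) {
          // Cheap hash-code check first; only then compare the key itself.
          if (hashCodes[slot] == hashCode && keys[slot] == key) {
            return true;
          }
          probeSlot += ++i;
          if (i > largestNumberOfSteps) {
            return false;                     // never probed this far while inserting
          }
          slot = (int) (probeSlot & mask);
        }
      }

      public static void main(String[] args) {
        ProbeSketch table = new ProbeSketch(16);
        for (long k : new long[] {3, 19, 35, 7}) {  // 3, 19, 35 collide modulo 16
          table.add(k);
        }
        System.out.println(table.contains(35) + " " + table.contains(4)); // true false
      }
    }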
- keyStore = new VectorMapJoinFastKeyStore(valueStore.writeBuffers()); + @Override + public void clear() { + // UNDONE } } diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastBytesHashMultiSet.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastBytesHashMultiSet.java index 5d8ed2d..78bbe26 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastBytesHashMultiSet.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastBytesHashMultiSet.java @@ -20,75 +20,273 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import org.apache.hadoop.hive.ql.exec.JoinUtil; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinBytesHashMultiSet; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashMultiSetResult; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashMultiSetResult; import org.apache.hadoop.io.BytesWritable; -import org.apache.hive.common.util.HashCodeUtil; + +import com.google.common.base.Preconditions; /* * An single byte array value hash multi-set optimized for vector map join. */ -public abstract class VectorMapJoinFastBytesHashMultiSet - extends VectorMapJoinFastBytesHashTable - implements VectorMapJoinBytesHashMultiSet { +public abstract class VectorMapJoinFastBytesHashMultiSet extends VectorMapJoinFastBytesHashTable { - private static final Logger LOG = LoggerFactory.getLogger(VectorMapJoinFastBytesHashMultiSet.class); + private static final String CLASS_NAME = VectorMapJoinFastBytesHashMultiSet.class.getName(); + private static final Logger LOG = LoggerFactory.getLogger(CLASS_NAME); - @Override - public VectorMapJoinHashMultiSetResult createHashMultiSetResult() { - return new VectorMapJoinFastHashMultiSet.HashMultiSetResult(); + private VectorMapJoinFastKeyStore keyStore; + + private final static class HashCodeAndCounterWord { + + // Lowest field. + private final class HashCode { + private static final int bitLength = 32; + private static final long allBitsOn = (1L << bitLength) - 1; + private static final long bitMask = allBitsOn; + } + + public static int getHashCode(long hashCodeAndCounterWord) { + return (int) ((hashCodeAndCounterWord & HashCode.bitMask)); + } + + private final class Counter { + private static final int bitLength = 31; + private static final int allBitsOn = (1 << bitLength) - 1; + private static final int bitShift = HashCode.bitLength; + private static final long bitMask = ((long) allBitsOn) << bitShift; + } + + public static int getCounter(long hashCodeAndCounterWord) { + return (int) ((hashCodeAndCounterWord & Counter.bitMask) >> Counter.bitShift); + } + + // This bit should not be on for valid value references. We use -1 for a no value marker. 
+ private final class IsInvalidFlag { + private static final int bitShift = 63; + private static final long flagOnMask = 1L << bitShift; + } + + public static long newWord(int hashCode) { + long hashCodeAndCounterWord = ((long) hashCode) & HashCode.bitMask; + hashCodeAndCounterWord |= ((long) 1 << Counter.bitShift); + return hashCodeAndCounterWord; + } + + public static long incrementCounter(long hashCodeAndCounterWord) { + int counter = (int) ((hashCodeAndCounterWord & Counter.bitMask) >> Counter.bitShift); + hashCodeAndCounterWord &= ~Counter.bitMask; + hashCodeAndCounterWord |= ((long) counter + 1) << Counter.bitShift; + return hashCodeAndCounterWord; + } } @Override - public void assignSlot(int slot, byte[] keyBytes, int keyStart, int keyLength, - long hashCode, boolean isNewKey, BytesWritable currentValue) { + protected void add(byte[] keyBytes, int keyStart, int keyLength, int hashCode, + BytesWritable currentValue) { + + if (resizeThreshold <= keysAssigned) { + expandAndRehash(); + } + + int slot = (hashCode & logicalHashBucketMask); + long probeSlot = slot; + int i = 0; + int pairIndex; + long keyRef; + long hashCodeAndCounterWord = -1; + boolean isNewKey; + while (true) { + pairIndex = 2 * slot; + keyRef = slotPairs[pairIndex]; + if (keyRef == 0) { + isNewKey = true;; + break; + } + hashCodeAndCounterWord = slotPairs[pairIndex + 1]; + if (hashCode == HashCodeAndCounterWord.getHashCode(hashCodeAndCounterWord) && + keyStore.equalKey(keyRef, keyBytes, keyStart, keyLength)) { + isNewKey = false; + break; + } + + // Some other key (collision) - keep probing. + metricPutConflict++; + probeSlot += (++i); + slot = (int) (probeSlot & logicalHashBucketMask); + } + + if (largestNumberOfSteps < i) { + if (isLogDebugEnabled) { + LOG.debug("Probed " + i + " slots (the longest so far) to find space"); + } + largestNumberOfSteps = i; + } - int tripleIndex = 3 * slot; if (isNewKey) { - // First entry. - slotTriples[tripleIndex] = keyStore.add(keyBytes, keyStart, keyLength); - slotTriples[tripleIndex + 1] = hashCode; - slotTriples[tripleIndex + 2] = 1; // Count. - // LOG.debug("VectorMapJoinFastBytesHashMap add first keyRefWord " + Long.toHexString(slotTriples[tripleIndex]) + " hashCode " + Long.toHexString(slotTriples[tripleIndex + 1]) + " valueRefWord " + Long.toHexString(slotTriples[tripleIndex + 2])); + slotPairs[pairIndex] = keyStore.add(keyBytes, keyStart, keyLength); + slotPairs[pairIndex + 1] = HashCodeAndCounterWord.newWord(hashCode); keysAssigned++; } else { - // Add another value. 
-      // LOG.debug("VectorMapJoinFastBytesHashMap add more keyRefWord " + Long.toHexString(slotTriples[tripleIndex]) + " hashCode " + Long.toHexString(slotTriples[tripleIndex + 1]) + " valueRefWord " + Long.toHexString(slotTriples[tripleIndex + 2]));
-      slotTriples[tripleIndex + 2]++;
+      slotPairs[pairIndex + 1] = HashCodeAndCounterWord.incrementCounter(hashCodeAndCounterWord);
     }
+    numValues++;
   }

   @Override
-  public JoinUtil.JoinResult contains(byte[] keyBytes, int keyStart, int keyLength,
-      VectorMapJoinHashMultiSetResult hashMultiSetResult) {
+  protected void expandAndRehashImpl(int capacity) {

-    VectorMapJoinFastHashMultiSet.HashMultiSetResult optimizedHashMultiSetResult =
-        (VectorMapJoinFastHashMultiSet.HashMultiSetResult) hashMultiSetResult;
+    long expandTime = System.currentTimeMillis();
+    int newLogicalHashBucketCount = capacity;
+    int newLogicalHashBucketMask = newLogicalHashBucketCount - 1;
+    int newMetricPutConflict = 0;
+    int newLargestNumberOfSteps = 0;

-    optimizedHashMultiSetResult.forget();
+    int newSlotPairArraySize = newLogicalHashBucketCount * 2;
+    long[] newSlotPairs = new long[newSlotPairArraySize];

-    long hashCode = HashCodeUtil.murmurHash(keyBytes, keyStart, keyLength);
-    long count = findReadSlot(keyBytes, keyStart, keyLength, hashCode);
-    JoinUtil.JoinResult joinResult;
-    if (count == -1) {
-      joinResult = JoinUtil.JoinResult.NOMATCH;
-    } else {
+    int pairIndex;
+    long keyRef;
+    long hashCodeAndCounterWord = -1;
+    for (int slot = 0; slot < logicalHashBucketCount; slot++) {
+      pairIndex = slot * 2;
+      keyRef = slotPairs[pairIndex];
+      if (keyRef != 0) {
+
+        hashCodeAndCounterWord = slotPairs[pairIndex + 1];
+        Preconditions.checkState(
+            (hashCodeAndCounterWord & HashCodeAndCounterWord.IsInvalidFlag.flagOnMask) == 0);

-      optimizedHashMultiSetResult.set(count);
+        // Copy to new slot table.
+        int hashCode = HashCodeAndCounterWord.getHashCode(hashCodeAndCounterWord);
+        int newSlot = hashCode & newLogicalHashBucketMask;
+        long newProbeSlot = newSlot;
+        int newPairIndex;
+        int i = 0;
+        while (true) {
+          newPairIndex = newSlot * 2;
+          long newKeyRef = newSlotPairs[newPairIndex];
+          if (newKeyRef == 0) {
+            break;
+          }
+          ++newMetricPutConflict;
+          // Some other key (collision) - keep probing.
+          newProbeSlot += (++i);
+          newSlot = (int)(newProbeSlot & newLogicalHashBucketMask);
+        }

-      joinResult = JoinUtil.JoinResult.MATCH;
+        if (newLargestNumberOfSteps < i) {
+          if (isLogDebugEnabled) {
+            LOG.debug("Probed " + i + " slots (the longest so far) to find space");
+          }
+          newLargestNumberOfSteps = i;
+        }
+
+        // Use old words.
+        newSlotPairs[newPairIndex] = keyRef;
+        newSlotPairs[newPairIndex + 1] = hashCodeAndCounterWord;
+      }
     }

-    optimizedHashMultiSetResult.setJoinResult(joinResult);
+    slotPairs = newSlotPairs;
+    logicalHashBucketCount = newLogicalHashBucketCount;
+    logicalHashBucketMask = newLogicalHashBucketMask;
+    metricPutConflict = newMetricPutConflict;
+    largestNumberOfSteps = newLargestNumberOfSteps;
+    resizeThreshold = (int)(logicalHashBucketCount * loadFactor);
+    metricExpandsMs += (System.currentTimeMillis() - expandTime);
+    metricExpands++;
+  }

-    return joinResult;
+  protected int getLongsPerSlot() {
+    return 2;
+  }
+
+  /*
+   * The hash table slots. For a bytes key hash counter table, each slot is 2 longs and the
+   * array is 2X sized.
+   *
+   * The slot pair is a 64 bit key reference and the 2nd 64 bit word has the 32 bits for the
+   * hash code and a 31 bit counter.
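+   *
+   * As a small worked example of this layout: after the same key has been added three times,
+   *
+   *   slotPairs[pairIndex]                                           is the key store reference,
+   *   HashCodeAndCounterWord.getHashCode(slotPairs[pairIndex + 1])   is the key's murmur hash, and
+   *   HashCodeAndCounterWord.getCounter(slotPairs[pairIndex + 1])    is 3.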
+ */ + protected long[] slotPairs; + + @Override + public void allocateBucketArray() { + int slotPairArraySize = 2 * logicalHashBucketCount; + slotPairs = new long[slotPairArraySize]; + } + + public static String displayBytes(byte[] bytes, int start, int length) { + StringBuilder sb = new StringBuilder(); + for (int i = start; i < start + length; i++) { + char ch = (char) bytes[i]; + if (ch < ' ' || ch > '~') { + sb.append(String.format("\\%03d", bytes[i] & 0xff)); + } else { + sb.append(ch); + } + } + return sb.toString(); + } + + @Override + public void hashMultiSetContains(byte[] keyBytes, int keyStart, int keyLength, int hashCode, + MapJoinHashMultiSetResult hashMultiSetResult) { + + hashMultiSetResult.forget(); + + int slot = (hashCode & logicalHashBucketMask); + long probeSlot = slot; + int i = 0; + int pairIndex; + long keyRef; + long hashCodeAndCounterWord; + while (true) { + pairIndex = slot * 2; + keyRef = slotPairs[pairIndex]; + if (keyRef != 0) { + hashCodeAndCounterWord = slotPairs[pairIndex + 1]; + + LOG.info(CLASS_NAME + " hashMultiSetContains inputHashCode " + hashCode + + " slotHashCode " + HashCodeAndCounterWord.getHashCode(hashCodeAndCounterWord)); + + if (hashCode == HashCodeAndCounterWord.getHashCode(hashCodeAndCounterWord)) { + + // Finally, verify the key bytes match. + if (keyStore.equalKey(keyRef, keyBytes, keyStart, keyLength)) { + hashMultiSetResult.setMatch(HashCodeAndCounterWord.getCounter(hashCodeAndCounterWord)); + + LOG.info(CLASS_NAME + " key match " + displayBytes(keyBytes, keyStart, keyLength)); + return; + } + } + } + // Some other key (collision) - keep probing. + metricGetConflict++; + probeSlot += (++i); + if (i > largestNumberOfSteps) { + // We know we never went that far when we were inserting. + hashMultiSetResult.setNoMatch(); + return; + } + slot = (int)(probeSlot & logicalHashBucketMask); + } } public VectorMapJoinFastBytesHashMultiSet( - int initialCapacity, float loadFactor, int writeBuffersSize) { - super(initialCapacity, loadFactor, writeBuffersSize); + VectorMapJoinFastHashTableFactory mapJoinHashTableFactory, + int initialCapacity, float loadFactor, int writeBuffersSize, long memUsage) { + super(mapJoinHashTableFactory, initialCapacity, loadFactor, writeBuffersSize, memUsage); keyStore = new VectorMapJoinFastKeyStore(writeBuffersSize); } + + @Override + public long memorySize() { + return keyStore.writeBuffers().size() + slotPairs.length * (Long.SIZE/Byte.SIZE) + 100; + } + + @Override + public void clear() { + // UNDONE + } } diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastBytesHashSet.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastBytesHashSet.java index 990a2e5..2411e42 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastBytesHashSet.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastBytesHashSet.java @@ -20,67 +20,203 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import org.apache.hadoop.hive.ql.exec.JoinUtil; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinBytesHashSet; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashSetResult; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashSetResult; import org.apache.hadoop.io.BytesWritable; -import org.apache.hive.common.util.HashCodeUtil; /* * An single byte array value hash multi-set optimized for vector map join. 
 */
-public abstract class VectorMapJoinFastBytesHashSet
-     extends VectorMapJoinFastBytesHashTable
-     implements VectorMapJoinBytesHashSet {
+public abstract class VectorMapJoinFastBytesHashSet extends VectorMapJoinFastBytesHashTable {

   private static final Logger LOG = LoggerFactory.getLogger(VectorMapJoinFastBytesHashSet.class);

-  @Override
-  public VectorMapJoinHashSetResult createHashSetResult() {
-    return new VectorMapJoinFastHashSet.HashSetResult();
-  }
+  private VectorMapJoinFastKeyStore keyStore;

   @Override
-  public void assignSlot(int slot, byte[] keyBytes, int keyStart, int keyLength,
-      long hashCode, boolean isNewKey, BytesWritable currentValue) {
+  protected void add(byte[] keyBytes, int keyStart, int keyLength, int hashCode,
+      BytesWritable currentValue) {
+
+    if (resizeThreshold <= keysAssigned) {
+      expandAndRehash();
+    }
+
+    int slot = (hashCode & logicalHashBucketMask);
+    long probeSlot = slot;
+    int i = 0;
+    int pairIndex;
+    long keyRef;
+    boolean isNewKey;
+    long longHashCode = (long) hashCode;   // Allow sign extension.
+    while (true) {
+      pairIndex = 2 * slot;
+      keyRef = slotPairs[pairIndex];
+      if (keyRef == 0) {
+        isNewKey = true;
+        break;
+      }
+
+      if (longHashCode == slotPairs[pairIndex + 1] &&
+          keyStore.equalKey(keyRef, keyBytes, keyStart, keyLength)) {
+        isNewKey = false;
+        break;
+      }
+
+      // Some other key (collision) - keep probing.
+      metricPutConflict++;
+      probeSlot += (++i);
+      slot = (int) (probeSlot & logicalHashBucketMask);
+    }
+
+    if (largestNumberOfSteps < i) {
+      if (isLogDebugEnabled) {
+        LOG.debug("Probed " + i + " slots (the longest so far) to find space");
+      }
+      largestNumberOfSteps = i;
+    }

-    int tripleIndex = 3 * slot;
     if (isNewKey) {
-      // First entry.
-      slotTriples[tripleIndex] = keyStore.add(keyBytes, keyStart, keyLength);
-      slotTriples[tripleIndex + 1] = hashCode;
-      slotTriples[tripleIndex + 2] = 1; // Existence
+      slotPairs[pairIndex] = keyStore.add(keyBytes, keyStart, keyLength);
+      slotPairs[pairIndex + 1] = longHashCode;
       keysAssigned++;
+    } else {
+      // Entry exists.
     }
+    numValues++;
   }

   @Override
-  public JoinUtil.JoinResult contains(byte[] keyBytes, int keyStart, int keyLength,
-      VectorMapJoinHashSetResult hashSetResult) {
+  protected void expandAndRehashImpl(int capacity) {

-    VectorMapJoinFastHashSet.HashSetResult optimizedHashSetResult =
-        (VectorMapJoinFastHashSet.HashSetResult) hashSetResult;
+    long expandTime = System.currentTimeMillis();
+    int newLogicalHashBucketCount = capacity;
+    int newLogicalHashBucketMask = newLogicalHashBucketCount - 1;
+    int newMetricPutConflict = 0;
+    int newLargestNumberOfSteps = 0;

-    optimizedHashSetResult.forget();
+    int newSlotPairArraySize = newLogicalHashBucketCount * 2;
+    long[] newSlotPairs = new long[newSlotPairArraySize];

-    long hashCode = HashCodeUtil.murmurHash(keyBytes, keyStart, keyLength);
-    long existance = findReadSlot(keyBytes, keyStart, keyLength, hashCode);
-    JoinUtil.JoinResult joinResult;
-    if (existance == -1) {
-      joinResult = JoinUtil.JoinResult.NOMATCH;
-    } else {
-      joinResult = JoinUtil.JoinResult.MATCH;
+    int pairIndex;
+    long keyRef;
+    long longHashCode = -1;
+    int intHashCode = -1;
+    for (int slot = 0; slot < logicalHashBucketCount; slot++) {
+      pairIndex = slot * 2;
+      keyRef = slotPairs[pairIndex];
+      if (keyRef != 0) {
+
+        longHashCode = slotPairs[pairIndex + 1];
+        intHashCode = (int) longHashCode;
+
+        // Copy to new slot table.
+ int newSlot = intHashCode & newLogicalHashBucketMask; + long newProbeSlot = newSlot; + int newPairIndex; + int i = 0; + while (true) { + newPairIndex = newSlot * 2; + long newKeyRef = newSlotPairs[newPairIndex]; + if (newKeyRef == 0) { + break; + } + ++newMetricPutConflict; + // Some other key (collision) - keep probing. + newProbeSlot += (++i); + newSlot = (int)(newProbeSlot & newLogicalHashBucketMask); + } + + if (newLargestNumberOfSteps < i) { + if (isLogDebugEnabled) { + LOG.debug("Probed " + i + " slots (the longest so far) to find space"); + } + newLargestNumberOfSteps = i; + } + + // Use old words. + newSlotPairs[newPairIndex] = keyRef; + newSlotPairs[newPairIndex + 1] = longHashCode; + } } - optimizedHashSetResult.setJoinResult(joinResult); + slotPairs = newSlotPairs; + logicalHashBucketCount = newLogicalHashBucketCount; + logicalHashBucketMask = newLogicalHashBucketMask; + metricPutConflict = newMetricPutConflict; + largestNumberOfSteps = newLargestNumberOfSteps; + resizeThreshold = (int)(logicalHashBucketCount * loadFactor); + metricExpandsMs += (System.currentTimeMillis() - expandTime); + metricExpands++; + } + + protected int getLongsPerSlot() { + return 2; + } + + /* + * The hash table slots. For a bytes key hash counter table, each slot is 2 longs and the + * array is 2X sized. + * + * The slot pair is a 64 bit key reference and the 2nd 64 bit word has the 32 bits for the + * hash code. + */ + protected long[] slotPairs; - return joinResult; + @Override + public void allocateBucketArray() { + int slotPairArraySize = 2 * logicalHashBucketCount; + slotPairs = new long[slotPairArraySize]; + } + + @Override + public void hashSetContains(byte[] keyBytes, int keyStart, int keyLength, int hashCode, + MapJoinHashSetResult hashSetResult) { + + hashSetResult.forget(); + + int slot = (hashCode & logicalHashBucketMask); + long probeSlot = slot; + int i = 0; + int pairIndex; + long keyRef; + long longHashCode = (long) hashCode; // Allow sign extension. + while (true) { + pairIndex = slot * 2; + keyRef = slotPairs[pairIndex]; + if (keyRef != 0 && longHashCode == slotPairs[pairIndex + 1]) { + + // Finally, verify the key bytes match. + if (keyStore.equalKey(keyRef, keyBytes, keyStart, keyLength)) { + hashSetResult.setMatch(); + return; + } + } + // Some other key (collision) - keep probing. + metricGetConflict++; + probeSlot += (++i); + if (i > largestNumberOfSteps) { + // We know we never went that far when we were inserting. 
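+        // The probe advances from the home slot by the triangular offsets 1, 3, 6, 10, ...
+        // (probeSlot += (++i)), the same order used when inserting. For example, a key whose
+        // home slot is 5 in a 16 slot table is probed at slots 5, 6, 8, 11, 15, 4, ...
+        // Once i exceeds largestNumberOfSteps, no insert ever probed this far, so the key
+        // cannot be present.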
+ hashSetResult.setNoMatch(); + return; + } + slot = (int)(probeSlot & logicalHashBucketMask); + } } public VectorMapJoinFastBytesHashSet( - int initialCapacity, float loadFactor, int writeBuffersSize) { - super(initialCapacity, loadFactor, writeBuffersSize); + VectorMapJoinFastHashTableFactory mapJoinHashTableFactory, + int initialCapacity, float loadFactor, int writeBuffersSize, long memUsage) { + super(mapJoinHashTableFactory, initialCapacity, loadFactor, writeBuffersSize, memUsage); keyStore = new VectorMapJoinFastKeyStore(writeBuffersSize); } + + @Override + public long memorySize() { + return keyStore.writeBuffers().size() + slotPairs.length * (Long.SIZE/Byte.SIZE) + 100; + } + + @Override + public void clear() { + // UNDONE + } } diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastBytesHashTable.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastBytesHashTable.java index 6b536f0..1697ee6 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastBytesHashTable.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastBytesHashTable.java @@ -22,7 +22,6 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinBytesHashTable; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.io.BytesWritable; import org.apache.hive.common.util.HashCodeUtil; @@ -32,193 +31,31 @@ /* * An single byte array value hash map optimized for vector map join. */ -public abstract class VectorMapJoinFastBytesHashTable - extends VectorMapJoinFastHashTable - implements VectorMapJoinBytesHashTable { +public abstract class VectorMapJoinFastBytesHashTable extends VectorMapJoinFastHashTable { private static final Logger LOG = LoggerFactory.getLogger(VectorMapJoinFastBytesHashTable.class); - private final boolean isLogDebugEnabled = LOG.isDebugEnabled(); + public final boolean isLogDebugEnabled = LOG.isDebugEnabled(); - protected VectorMapJoinFastKeyStore keyStore; - - private BytesWritable testKeyBytesWritable; private BytesWritable testValueBytesWritable; - @Override - public void putRow(BytesWritable currentKey, BytesWritable currentValue) throws HiveException, IOException { - // No deserialization of key(s) here -- just get reference to bytes. 
- byte[] keyBytes = currentKey.getBytes(); - int keyLength = currentKey.getLength(); - add(keyBytes, 0, keyLength, currentValue); - } - @VisibleForTesting public void putRow(byte[] currentKey, byte[] currentValue) throws HiveException, IOException { - if (testKeyBytesWritable == null) { - testKeyBytesWritable = new BytesWritable(); + if (testValueBytesWritable == null) { testValueBytesWritable = new BytesWritable(); } - testKeyBytesWritable.set(currentKey, 0, currentKey.length); testValueBytesWritable.set(currentValue, 0, currentValue.length); - putRow(testKeyBytesWritable, testValueBytesWritable); + int hashCode = HashCodeUtil.murmurHash(currentKey, 0, currentKey.length); + add(currentKey, 0, currentKey.length, hashCode, testValueBytesWritable); } - protected abstract void assignSlot(int slot, byte[] keyBytes, int keyStart, int keyLength, - long hashCode, boolean isNewKey, BytesWritable currentValue); - - public void add(byte[] keyBytes, int keyStart, int keyLength, BytesWritable currentValue) { - - if (resizeThreshold <= keysAssigned) { - expandAndRehash(); - } - - long hashCode = HashCodeUtil.murmurHash(keyBytes, keyStart, keyLength); - int intHashCode = (int) hashCode; - int slot = (intHashCode & logicalHashBucketMask); - long probeSlot = slot; - int i = 0; - boolean isNewKey; - while (true) { - int tripleIndex = 3 * slot; - if (slotTriples[tripleIndex] == 0) { - // LOG.debug("VectorMapJoinFastBytesHashMap findWriteSlot slot " + slot + " tripleIndex " + tripleIndex + " empty"); - isNewKey = true;; - break; - } - if (hashCode == slotTriples[tripleIndex + 1] && - keyStore.equalKey(slotTriples[tripleIndex], keyBytes, keyStart, keyLength)) { - // LOG.debug("VectorMapJoinFastBytesHashMap findWriteSlot slot " + slot + " tripleIndex " + tripleIndex + " existing"); - isNewKey = false; - break; - } - // TODO - ++metricPutConflict; - // Some other key (collision) - keep probing. - probeSlot += (++i); - slot = (int) (probeSlot & logicalHashBucketMask); - } - - if (largestNumberOfSteps < i) { - if (isLogDebugEnabled) { - LOG.debug("Probed " + i + " slots (the longest so far) to find space"); - } - largestNumberOfSteps = i; - // debugDumpKeyProbe(keyOffset, keyLength, hashCode, slot); - } - - assignSlot(slot, keyBytes, keyStart, keyLength, hashCode, isNewKey, currentValue); - - if (isNewKey) { - keysAssigned++; - } - } - - private void expandAndRehash() { - - int newLogicalHashBucketCount = logicalHashBucketCount * 2; - int newLogicalHashBucketMask = newLogicalHashBucketCount - 1; - int newMetricPutConflict = 0; - int newLargestNumberOfSteps = 0; - - int newSlotTripleArraySize = newLogicalHashBucketCount * 3; - long[] newSlotTriples = new long[newSlotTripleArraySize]; - - for (int slot = 0; slot < logicalHashBucketCount; slot++) { - int tripleIndex = slot * 3; - long keyRef = slotTriples[tripleIndex]; - if (keyRef != 0) { - long hashCode = slotTriples[tripleIndex + 1]; - long valueRef = slotTriples[tripleIndex + 2]; - - // Copy to new slot table. - int intHashCode = (int) hashCode; - int newSlot = intHashCode & newLogicalHashBucketMask; - long newProbeSlot = newSlot; - int newTripleIndex; - int i = 0; - while (true) { - newTripleIndex = newSlot * 3; - long newKeyRef = newSlotTriples[newTripleIndex]; - if (newKeyRef == 0) { - break; - } - ++newMetricPutConflict; - // Some other key (collision) - keep probing. 
- newProbeSlot += (++i); - newSlot = (int)(newProbeSlot & newLogicalHashBucketMask); - } - - if (newLargestNumberOfSteps < i) { - if (isLogDebugEnabled) { - LOG.debug("Probed " + i + " slots (the longest so far) to find space"); - } - newLargestNumberOfSteps = i; - // debugDumpKeyProbe(keyOffset, keyLength, hashCode, slot); - } - - // Use old value reference word. - // LOG.debug("VectorMapJoinFastLongHashTable expandAndRehash key " + tableKey + " slot " + newSlot + " newPairIndex " + newPairIndex + " empty slot (i = " + i + ")"); - - newSlotTriples[newTripleIndex] = keyRef; - newSlotTriples[newTripleIndex + 1] = hashCode; - newSlotTriples[newTripleIndex + 2] = valueRef; - } - } - - slotTriples = newSlotTriples; - logicalHashBucketCount = newLogicalHashBucketCount; - logicalHashBucketMask = newLogicalHashBucketMask; - metricPutConflict = newMetricPutConflict; - largestNumberOfSteps = newLargestNumberOfSteps; - resizeThreshold = (int)(logicalHashBucketCount * loadFactor); - metricExpands++; - // LOG.debug("VectorMapJoinFastLongHashTable expandAndRehash new logicalHashBucketCount " + logicalHashBucketCount + " resizeThreshold " + resizeThreshold + " metricExpands " + metricExpands); - } - - protected long findReadSlot(byte[] keyBytes, int keyStart, int keyLength, long hashCode) { - - int intHashCode = (int) hashCode; - int slot = (intHashCode & logicalHashBucketMask); - long probeSlot = slot; - int i = 0; - while (true) { - int tripleIndex = slot * 3; - // LOG.debug("VectorMapJoinFastBytesHashMap findReadSlot slot keyRefWord " + Long.toHexString(slotTriples[tripleIndex]) + " hashCode " + Long.toHexString(hashCode) + " entry hashCode " + Long.toHexString(slotTriples[tripleIndex + 1]) + " valueRefWord " + Long.toHexString(slotTriples[tripleIndex + 2])); - if (slotTriples[tripleIndex] != 0 && hashCode == slotTriples[tripleIndex + 1]) { - // Finally, verify the key bytes match. - - if (keyStore.equalKey(slotTriples[tripleIndex], keyBytes, keyStart, keyLength)) { - return slotTriples[tripleIndex + 2]; - } - } - // Some other key (collision) - keep probing. - probeSlot += (++i); - if (i > largestNumberOfSteps) { - // We know we never went that far when we were inserting. - return -1; - } - slot = (int)(probeSlot & logicalHashBucketMask); - } - } - - /* - * The hash table slots. For a bytes key hash table, each slot is 3 longs and the array is - * 3X sized. - * - * The slot triple is 1) a non-zero reference word to the key bytes, 2) the key hash code, and - * 3) a non-zero reference word to the first value bytes. 
- */ - protected long[] slotTriples; - - private void allocateBucketArray() { - int slotTripleArraySize = 3 * logicalHashBucketCount; - slotTriples = new long[slotTripleArraySize]; - } + protected abstract void add(byte[] keyBytes, int keyStart, int keyLength, int hashCode, + BytesWritable currentValue); public VectorMapJoinFastBytesHashTable( - int initialCapacity, float loadFactor, int writeBuffersSize) { - super(initialCapacity, loadFactor, writeBuffersSize); + VectorMapJoinFastHashTableFactory mapJoinHashTableFactory, + int initialCapacity, float loadFactor, int writeBuffersSize, long memUsage) { + super(mapJoinHashTableFactory, initialCapacity, loadFactor, writeBuffersSize, memUsage); allocateBucketArray(); } } diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastBytesHashUtil.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastBytesHashUtil.java index 80126ad..d0e303e 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastBytesHashUtil.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastBytesHashUtil.java @@ -18,8 +18,6 @@ package org.apache.hadoop.hive.ql.exec.vector.mapjoin.fast; -import org.apache.hadoop.hive.serde2.WriteBuffers; - public class VectorMapJoinFastBytesHashUtil { public static String displayBytes(byte[] bytes, int start, int length) { diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastHashMap.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastHashMap.java index 262b619..836d0d8 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastHashMap.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastHashMap.java @@ -18,21 +18,12 @@ package org.apache.hadoop.hive.ql.exec.vector.mapjoin.fast; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashMap; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashMapResult; - -public abstract class VectorMapJoinFastHashMap - extends VectorMapJoinFastHashTable - implements VectorMapJoinHashMap { - - @Override - public VectorMapJoinHashMapResult createHashMapResult() { - return new VectorMapJoinFastValueStore.HashMapResult(); - } +public abstract class VectorMapJoinFastHashMap extends VectorMapJoinFastHashTable { public VectorMapJoinFastHashMap( - boolean isOuterJoin, - int initialCapacity, float loadFactor, int writeBuffersSize) { - super(initialCapacity, loadFactor, writeBuffersSize); + VectorMapJoinFastHashTableFactory mapJoinHashTableFactory, + boolean isOuterJoin, + int initialCapacity, float loadFactor, int writeBuffersSize, int maxProbeSize) { + super(mapJoinHashTableFactory, initialCapacity, loadFactor, writeBuffersSize, maxProbeSize); } } \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastHashMultiSet.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastHashMultiSet.java index 5f7c6a7..a0c11e1 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastHashMultiSet.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastHashMultiSet.java @@ -18,31 +18,12 @@ package org.apache.hadoop.hive.ql.exec.vector.mapjoin.fast; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashMultiSet; -import 
org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashMultiSetResult; - -public abstract class VectorMapJoinFastHashMultiSet - extends VectorMapJoinFastHashTable implements VectorMapJoinHashMultiSet { - - @Override - public VectorMapJoinHashMultiSetResult createHashMultiSetResult() { - return new HashMultiSetResult(); - } - - public static class HashMultiSetResult extends VectorMapJoinHashMultiSetResult { - - HashMultiSetResult() { - super(); - } - - public void set(long count) { - this.count = count; - } - } +public abstract class VectorMapJoinFastHashMultiSet extends VectorMapJoinFastHashTable { public VectorMapJoinFastHashMultiSet( - boolean isOuterJoin, - int initialCapacity, float loadFactor, int writeBuffersSize) { - super(initialCapacity, loadFactor, writeBuffersSize); + VectorMapJoinFastHashTableFactory mapJoinHashTableFactory, + boolean isOuterJoin, + int initialCapacity, float loadFactor, int writeBuffersSize, int maxProbeSize) { + super(mapJoinHashTableFactory, initialCapacity, loadFactor, writeBuffersSize, maxProbeSize); } } \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastHashSet.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastHashSet.java index 8509971..d85d5c8 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastHashSet.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastHashSet.java @@ -18,27 +18,12 @@ package org.apache.hadoop.hive.ql.exec.vector.mapjoin.fast; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashSet; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashSetResult; - -public abstract class VectorMapJoinFastHashSet - extends VectorMapJoinFastHashTable implements VectorMapJoinHashSet { - - @Override - public VectorMapJoinHashSetResult createHashSetResult() { - return new HashSetResult(); - } - - public static class HashSetResult extends VectorMapJoinHashSetResult { - - HashSetResult() { - super(); - } - } +public abstract class VectorMapJoinFastHashSet extends VectorMapJoinFastHashTable { public VectorMapJoinFastHashSet( - boolean isOuterJoin, - int initialCapacity, float loadFactor, int writeBuffersSize) { - super(initialCapacity, loadFactor, writeBuffersSize); + VectorMapJoinFastHashTableFactory mapJoinHashTableFactory, + boolean isOuterJoin, + int initialCapacity, float loadFactor, int writeBuffersSize, int maxProbeSize) { + super(mapJoinHashTableFactory, initialCapacity, loadFactor, writeBuffersSize, maxProbeSize); } } \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastHashTable.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastHashTable.java index 099f38e..2bb2306 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastHashTable.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastHashTable.java @@ -18,24 +18,69 @@ package org.apache.hadoop.hive.ql.exec.vector.mapjoin.fast; +import java.io.IOException; + import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashTable; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashMapResult; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashMultiSetResult; 
+import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashSetResult; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTable; +import org.apache.hadoop.hive.serde2.SerDeException; + +/** + * Different Implementation Variations for the fast Vector Map Join hash tables: + * + * Key Hash Table Kind Key and/or Value Class longs/Slot Comments + * --- --------------- ---------------------- ---------- --------------------------------- + * + * Bytes HashMap ~KeyAndValueStore 2 96-bit key and value store + * reference plus 32 bits for hash + * code. + * + * HashMultiSet ~KeyStore 2 64 bit key store reference. + * 2nd 64 bits has 32 bit hash code + * and 31 bit multi-set counter. + * + * HashSet ~KeyStore 2 64 bit key store reference. + * 2nd 64 bits just has 32 bit hash + * code. Existence for set is + * implicit with non-zero key store + * reference. + * + * Long HashMap ~ValueStore 2 64 bit value store reference. + * 2nd 64 bits has long key. + * + * HashMultiSet (none) 2 64 bits has 31 bit multi-set + * counter. 2nd 64 bits has long + * key. + * + * HashSet (none) 2 64 bits has 1 bit of existence + * for set. 2nd 64 bits has long + * key. + * + */ +public abstract class VectorMapJoinFastHashTable implements MapJoinHashTable { -public abstract class VectorMapJoinFastHashTable implements VectorMapJoinHashTable { public static final Logger LOG = LoggerFactory.getLogger(VectorMapJoinFastHashTable.class); + protected VectorMapJoinFastHashTableFactory mapJoinHashTableFactory; + protected int logicalHashBucketCount; protected int logicalHashBucketMask; protected float loadFactor; protected int writeBuffersSize; - protected int metricPutConflict; protected int largestNumberOfSteps; protected int keysAssigned; + protected int numValues; protected int resizeThreshold; + + protected int metricPutConflict; + protected int metricGetConflict; protected int metricExpands; + protected int metricExpandsMs; private static void validateCapacity(long capacity) { if (Long.bitCount(capacity) != 1) { @@ -46,12 +91,19 @@ private static void validateCapacity(long capacity) { } } - private static int nextHighestPowerOfTwo(int v) { + protected static int nextHighestPowerOfTwo(int v) { return Integer.highestOneBit(v) << 1; } + protected abstract int getLongsPerSlot(); + + protected abstract void allocateBucketArray(); + public VectorMapJoinFastHashTable( - int initialCapacity, float loadFactor, int writeBuffersSize) { + VectorMapJoinFastHashTableFactory mapJoinHashTableFactory, + int initialCapacity, float loadFactor, int writeBuffersSize, long memUsage) { + + this.mapJoinHashTableFactory = mapJoinHashTableFactory; initialCapacity = (Long.bitCount(initialCapacity) == 1) ? initialCapacity : nextHighestPowerOfTwo(initialCapacity); @@ -64,10 +116,151 @@ public VectorMapJoinFastHashTable( this.loadFactor = loadFactor; this.writeBuffersSize = writeBuffersSize; + + keysAssigned = 0; + numValues = 0; + + metricPutConflict = 0; + metricGetConflict = 0; + metricExpands = 0; + metricExpandsMs= 0; + + allocateBucketArray(); + } + + protected void expandAndRehash() { + expandAndRehashImpl(logicalHashBucketCount << 1); + } + + protected abstract void expandAndRehashImpl(int capacity); + + @Override + public void expandAndRehashToTarget(int estimateNewRowCount) { + int oldCount = logicalHashBucketCount; + int newCount = oldCount + estimateNewRowCount; + if (resizeThreshold <= newCount) { + newCount = + (Long.bitCount(newCount) == 1) ? 
estimateNewRowCount : nextHighestPowerOfTwo(newCount); + expandAndRehashImpl(newCount); + LOG.info("Expand and rehash to " + newCount + " from " + oldCount); + } } + /** + * Number of keys in the hashmap + * @return number of keys + */ @Override public int size() { return keysAssigned; } + + /** + * Number of values in the hashmap + * This is equal to or bigger than number of keys, since some values may share the same key + * @return number of values + */ + @Override + public int getNumValues() { + return numValues; + } + + @Override + public void seal() { + // Nothing to seal in base class. + } + + @Override + public void clear() { + // This will make the object completely unusable. Semantics of clear are not defined... + this.keysAssigned = 0; + this.numValues = 0; + } + + //---------------------------- COMMON LONG METHODS (Begin)---------------------------------------- + + @Override + public boolean useMinMax() { + throw new RuntimeException("Expected this method to be overriden"); + } + + @Override + public long min() { + throw new RuntimeException("Expected this method to be overriden"); + } + + @Override + public long max() { + throw new RuntimeException("Expected this method to be overriden"); + } + + //----------------------------- COMMON LONG METHODS (End)----------------------------------------- + + //-------------------------------- HASH MAP (Begin)----------------------------------------------- + + @Override + public void hashMapLookup(byte[] keyBytes, int keyStart, int keyLength, int hashCode, + MapJoinHashMapResult hashMapResult) + throws IOException { + throw new RuntimeException("Expected this method to be overriden"); + } + + @Override + public void hashMapLookup(long key, int hashCode, + MapJoinHashMapResult hashMapResult) throws IOException { + throw new RuntimeException("Expected this method to be overriden"); + } + + //-------------------------------- HASH MAP (End) ------------------------------------------------ + + //---------------------------- HASH MULTI-SET (Begin) ------------------------------------------- + + @Override + public void hashMultiSetContains(byte[] keyBytes, int keyStart, int keyLength, int hashCode, + MapJoinHashMultiSetResult hashMultiSetResult) + throws IOException { + throw new RuntimeException("Expected this method to be overriden"); + } + + @Override + public void hashMultiSetContains(long key, int hashCode, + MapJoinHashMultiSetResult hashMultiSetResult) throws IOException { + throw new RuntimeException("Expected this method to be overriden"); + } + + //----------------------------- HASH MULTI-SET (End) -------------------------------------------- + + //------------------------------- HASH SET (Begin) ---------------------------------------------- + + @Override + public void hashSetContains(byte[] keyBytes, int keyStart, int keyLength, int hashCode, + MapJoinHashSetResult hashSetResult) throws IOException { + throw new RuntimeException("Expected this method to be overriden"); + } + + @Override + public void hashSetContains(long key, int hashCode, + MapJoinHashSetResult hashSetResult) throws IOException { + throw new RuntimeException("Expected this method to be overriden"); + } + + //--------------------------------- HASH SET (End) ---------------------------------------------- + + @Override + public void put(KeyValuePut keyValuePut) throws SerDeException { + throw new RuntimeException("Expected this method to be overriden"); + } + + @Override + public long memorySize() { + throw new RuntimeException("Expected this method to be overriden"); + 
} + + @Override + public void debugDumpMetrics() { + LOG.info("Map metrics: keys allocated " + logicalHashBucketCount +", keys assigned " + keysAssigned + + ", write conflict " + metricPutConflict + ", write max dist " + largestNumberOfSteps + + ", read conflict " + metricGetConflict + + ", expanded " + metricExpands + " times in " + metricExpandsMs + "ms"); + } } diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastHashTableFactory.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastHashTableFactory.java new file mode 100644 index 0000000..b9af0d4 --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastHashTableFactory.java @@ -0,0 +1,233 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.vector.mapjoin.fast; + +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashMapResult; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashMultiSetResult; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashMultiSetResultImpl; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashSetResult; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashSetResultImpl; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTable; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableFactory; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableManage.KeyValuePut; +import org.apache.hadoop.hive.ql.plan.MapJoinDesc; +import org.apache.hadoop.hive.ql.plan.VectorMapJoinDesc; +import org.apache.hadoop.hive.ql.plan.VectorMapJoinDesc.HashTableKeyType; +import org.apache.hadoop.hive.ql.plan.VectorMapJoinDesc.HashTableKind; + +import com.google.common.annotations.VisibleForTesting; + +/* + * Factory for creating a fast vector map join hash table (which could be a hash map, hash multi-set, + * or hash set) with long, string, or multi-key key. + * + * And, associated objects (e.g. hash map result). + * + * Implements the standard map join interface for creating hash tables. 
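+ *
+ * A minimal usage sketch for the hash map case (the variable names here are illustrative),
+ * assuming the table has already been populated and the key's murmur hash code computed:
+ *
+ *   MapJoinHashTableFactory factory = new VectorMapJoinFastHashTableFactory(mapJoinDesc);
+ *   MapJoinHashTable table =
+ *       factory.createHashTable(initialCapacity, loadFactor, writeBuffersSize, memUsage);
+ *   MapJoinHashMapResult hashMapResult = factory.createHashMapResult();
+ *   table.hashMapLookup(keyBytes, 0, keyBytes.length, hashCode, hashMapResult);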
+ * + */ +public class VectorMapJoinFastHashTableFactory implements MapJoinHashTableFactory { + + private final boolean isOuterJoin; + private final HashTableKind hashTableKind; + private final HashTableKeyType hashTableKeyType; + private final boolean useMinMax; + + @VisibleForTesting + public VectorMapJoinFastHashTableFactory(HashTableKeyType hashTableKeyType) { + isOuterJoin = false; + hashTableKind = HashTableKind.HASH_MAP; + this.hashTableKeyType = hashTableKeyType; + useMinMax = false; + } + + public VectorMapJoinFastHashTableFactory(MapJoinDesc desc) { + + VectorMapJoinDesc vectorDesc = desc.getVectorDesc(); + + isOuterJoin = !desc.isNoOuterJoin(); + hashTableKind = vectorDesc.hashTableKind(); + hashTableKeyType = vectorDesc.hashTableKeyType(); + useMinMax = vectorDesc.minMaxEnabled() && + (hashTableKeyType == HashTableKeyType.BOOLEAN || + hashTableKeyType == HashTableKeyType.BYTE || + hashTableKeyType == HashTableKeyType.SHORT || + hashTableKeyType == HashTableKeyType.INT || + hashTableKeyType == HashTableKeyType.LONG); + } + + @Override + public MapJoinHashTable createHashTable(int initialCapacity, float loadFactor, + int writeBuffersSize, long memUsage) { + + MapJoinHashTable MapJoinHashTableFind = null; + switch (hashTableKeyType) { + case BOOLEAN: + case BYTE: + case SHORT: + case INT: + case LONG: + switch (hashTableKind) { + case HASH_MAP: + MapJoinHashTableFind = new VectorMapJoinFastLongHashMap( + this, + useMinMax, isOuterJoin, hashTableKeyType, + initialCapacity, loadFactor, writeBuffersSize, memUsage); + break; + case HASH_MULTISET: + MapJoinHashTableFind = new VectorMapJoinFastLongHashMultiSet( + this, + useMinMax, isOuterJoin, hashTableKeyType, + initialCapacity, loadFactor, writeBuffersSize, memUsage); + break; + case HASH_SET: + MapJoinHashTableFind = new VectorMapJoinFastLongHashSet( + this, + useMinMax, isOuterJoin, hashTableKeyType, + initialCapacity, loadFactor, writeBuffersSize, memUsage); + break; + default: + throw new RuntimeException("Unexpected vector map join hash table kind " + hashTableKind.name()); + } + break; + + case STRING: + switch (hashTableKind) { + case HASH_MAP: + MapJoinHashTableFind = new VectorMapJoinFastStringHashMap( + this, + isOuterJoin, + initialCapacity, loadFactor, writeBuffersSize, memUsage); + break; + case HASH_MULTISET: + MapJoinHashTableFind = new VectorMapJoinFastStringHashMultiSet( + this, + isOuterJoin, + initialCapacity, loadFactor, writeBuffersSize, memUsage); + break; + case HASH_SET: + MapJoinHashTableFind = new VectorMapJoinFastStringHashSet( + this, + isOuterJoin, + initialCapacity, loadFactor, writeBuffersSize, memUsage); + break; + default: + throw new RuntimeException("Unexpected vector map join hash table kind " + hashTableKind.name()); + } + break; + + case MULTI_KEY: + switch (hashTableKind) { + case HASH_MAP: + MapJoinHashTableFind = new VectorMapJoinFastMultiKeyHashMap( + this, + isOuterJoin, + initialCapacity, loadFactor, writeBuffersSize, memUsage); + break; + case HASH_MULTISET: + MapJoinHashTableFind = new VectorMapJoinFastMultiKeyHashMultiSet( + this, + isOuterJoin, + initialCapacity, loadFactor, writeBuffersSize, memUsage); + break; + case HASH_SET: + MapJoinHashTableFind = new VectorMapJoinFastMultiKeyHashSet( + this, + isOuterJoin, + initialCapacity, loadFactor, writeBuffersSize, memUsage); + break; + default: + throw new RuntimeException("Unexpected vector map join hash table kind " + hashTableKind.name()); + } + break; + default: + throw new RuntimeException("Unexpected vector map join hash table key 
type " + hashTableKeyType.name()); + } + + return MapJoinHashTableFind; + } + + /* + * @return A new hash map result implementation specific object. + * + * The object can be used to access the values when there is a match, or + * access spill information when the partition with the key is currently spilled. + */ + @Override + public MapJoinHashMapResult createHashMapResult() { + switch (hashTableKind) { + case HASH_MAP: + switch (hashTableKeyType) { + case BOOLEAN: + case BYTE: + case SHORT: + case INT: + case LONG: + return new VectorMapJoinFastValueStore.HashMapResult(); + case STRING: + case MULTI_KEY: + return new VectorMapJoinFastKeyAndValueStore.HashMapResult(); + default: + throw new RuntimeException("Unexpected vector map join hash table key type " + hashTableKeyType.name()); + } + case HASH_MULTISET: + case HASH_SET: + throw new RuntimeException("Hash Map result only for Hash Map tables"); + default: + throw new RuntimeException("Unexpected vector map join hash table kind " + hashTableKind.name()); + } + } + + /* + * @return A new hash multi-set result implementation specific object. + * + * The object can be used to access the *count* of values when the key is contained in the + * multi-set, or access spill information when the partition with the key is currently spilled. + */ + @Override + public MapJoinHashMultiSetResult createHashMultiSetResult() { + return new MapJoinHashMultiSetResultImpl(); + } + + /* + * @return A new hash set result implementation specific object. + * + * The object can be used to access access spill information when the partition with the key + * is currently spilled. + */ + @Override + public MapJoinHashSetResult createHashSetResult() { + return new MapJoinHashSetResultImpl(); + } + + @Override + public boolean keyValuePutHelperIsExternal() { + return true; + } + + @Override + public KeyValuePut createKeyValuePut() { + return new VectorMapJoinFastKeyValuePut(hashTableKind, hashTableKeyType, isOuterJoin); + } + + @Override + public boolean useMinMax() { + return useMinMax; + } +} diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastHashTableLoader.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastHashTableLoader.java deleted file mode 100644 index 49ecdd1..0000000 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastHashTableLoader.java +++ /dev/null @@ -1,114 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.hadoop.hive.ql.exec.vector.mapjoin.fast; - -import java.io.IOException; -import java.util.Collections; -import java.util.Map; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.hive.conf.HiveConf; -import org.apache.hadoop.hive.ql.exec.MapJoinOperator; -import org.apache.hadoop.hive.ql.exec.MapredContext; -import org.apache.hadoop.hive.ql.exec.mr.ExecMapperContext; -import org.apache.hadoop.hive.ql.exec.persistence.HashMapWrapper; -import org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainer; -import org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainerSerDe; -import org.apache.hadoop.hive.ql.exec.tez.TezContext; -import org.apache.hadoop.hive.ql.metadata.HiveException; -import org.apache.hadoop.hive.ql.plan.MapJoinDesc; -import org.apache.hadoop.hive.serde2.SerDeException; -import org.apache.hadoop.io.BytesWritable; -import org.apache.tez.runtime.api.Input; -import org.apache.tez.runtime.api.LogicalInput; -import org.apache.tez.runtime.library.api.KeyValueReader; - -/** - * HashTableLoader for Tez constructs the hashtable from records read from - * a broadcast edge. - */ -public class VectorMapJoinFastHashTableLoader implements org.apache.hadoop.hive.ql.exec.HashTableLoader { - - private static final Logger LOG = LoggerFactory.getLogger(VectorMapJoinFastHashTableLoader.class.getName()); - - private Configuration hconf; - protected MapJoinDesc desc; - private TezContext tezContext; - - @Override - public void init(ExecMapperContext context, MapredContext mrContext, - Configuration hconf, MapJoinOperator joinOp) { - this.tezContext = (TezContext) mrContext; - this.hconf = hconf; - this.desc = joinOp.getConf(); - } - - @Override - public void load(MapJoinTableContainer[] mapJoinTables, - MapJoinTableContainerSerDe[] mapJoinTableSerdes) - throws HiveException { - - Map parentToInput = desc.getParentToInput(); - Map parentKeyCounts = desc.getParentKeyCounts(); - - for (int pos = 0; pos < mapJoinTables.length; pos++) { - if (pos == desc.getPosBigTable()) { - continue; - } - - String inputName = parentToInput.get(pos); - LogicalInput input = tezContext.getInput(inputName); - - try { - input.start(); - tezContext.getTezProcessorContext().waitForAnyInputReady( - Collections. singletonList(input)); - } catch (Exception e) { - throw new HiveException(e); - } - - try { - KeyValueReader kvReader = (KeyValueReader) input.getReader(); - - Long keyCountObj = parentKeyCounts.get(pos); - long keyCount = (keyCountObj == null) ? -1 : keyCountObj.longValue(); - - VectorMapJoinFastTableContainer vectorMapJoinFastTableContainer = - new VectorMapJoinFastTableContainer(desc, hconf, keyCount); - - vectorMapJoinFastTableContainer.setSerde(null, null); // No SerDes here. 
- while (kvReader.next()) { - vectorMapJoinFastTableContainer.putRow((BytesWritable)kvReader.getCurrentKey(), - (BytesWritable)kvReader.getCurrentValue()); - } - - vectorMapJoinFastTableContainer.seal(); - mapJoinTables[pos] = (MapJoinTableContainer) vectorMapJoinFastTableContainer; - - } catch (IOException e) { - throw new HiveException(e); - } catch (SerDeException e) { - throw new HiveException(e); - } catch (Exception e) { - throw new HiveException(e); - } - } - } -} diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastKeyAndValueStore.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastKeyAndValueStore.java new file mode 100644 index 0000000..2a0506f --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastKeyAndValueStore.java @@ -0,0 +1,722 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.vector.mapjoin.fast; + +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashMapResult; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableResultImpl; +import org.apache.hadoop.hive.serde2.WriteBuffers; +import org.apache.hadoop.hive.serde2.WriteBuffers.ByteSegmentRef; +import org.apache.hadoop.hive.serde2.WriteBuffers.Position; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.google.common.base.Preconditions; + +/* + * Used by VectorMapJoinFastBytesHashMap to store the key and values for a hash map with a bytes + * key. + */ +public class VectorMapJoinFastKeyAndValueStore { + + private static final String CLASS_NAME = VectorMapJoinFastKeyAndValueStore.class.getName(); + private static final Logger LOG = LoggerFactory.getLogger(CLASS_NAME); + + private WriteBuffers writeBuffers; + + private WriteBuffers.Position readPos; + + /** + * A store for a key and a list of arbitrary length values in memory. + * + * The memory is a "infinite" byte array or WriteBuffers object. + * + * We give the client (i.e. hash map) a 128-bit key and value reference to keep that has + * the offset within the "infinite" byte array of the last value inserted in a list. The 128 bits + * includes the hash code. + * + * We optimize the common case when the key is short, the value is short, and the value list + * has 1 element and store that information in the 128 bits. + * + * We also support keeping the value count (up to a limit or cap) so help with join result + * generation algorithms. + * + * When there are more than 1 value, the zero padding is overwritten with a relative offset to + * the next value. The next value always includes the value length. 
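+ *
+ * For example, adding a key "abc" with a first value "v1" writes the 5 byte zero padding,
+ * then the key bytes, then the value bytes (case 1 below). Adding a second value "v2" for
+ * the same key appends a new value record and overwrites the first record's zero padding
+ * with the relative offset to that new record (case 3 below).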
+ * + * Cases: + * + * 1) One element when key and value lengths are small (and stored in reference words): + * + * Key and Value Reference + * | + * | absoluteOffset + * | + * v + * <5 0's Padding for Next Rel Offset> + * NEXT (NONE) KEY VALUE + * + * 2) One element, general: + * + * Key and Value Reference + * | + * | absoluteOffset + * | + * v + * <5 0's Padding for Next Rel Offset> [Big Key Len] [Big Value Len] + * NEXT (NONE) optional KEY optional VALUE + * + * 3) Two elements when key and value lengths are small (and stored in reference words): + * + * Key and Value Reference + * | + * | absoluteOffset + * | + * v + * + * | NEXT KEY VALUE + * | + * | first record absolute offset + relative offset + * | + * -------- + * | + * v + * <5 0's Padding for Next Value Ref> + * NEXT (NONE) + * + * 4) Three elements showing how first record updated to point to new value and + * new value points to most recent (additional) value: + * + * Key and Value Reference + * | + * | absoluteOffset + * | + * v + * + * | NEXT KEY VALUE + * | + * | first record absolute offset + relative offset + * | + * | + * | <5 0's Padding for Next Value Ref> + * | ^ NEXT (NONE) VALUE + * | | + * | ------ + * | | + * | | new record absolute offset - (minus) relative offset + * | | + * -----> + * NEXT VALUE + * + * + * 5) Four elements showing how first record is again updated to point to new value and + * new value points to most recent (additional) value: + * + * Key and Value Reference + * | + * | absoluteOffset + * | + * v + * + * | NEXT KEY VALUE + * | + * | first record absolute offset + relative offset + * | + * | + * | <5 0's Padding for Next Value Ref> + * | ^ NEXT (NONE) VALUE + * | | + * | ------ + * | | record absolute offset - (minus) relative offset + * | | + * | + * | ^ NEXT VALUE + * | | + * | ------ + * | | + * | | new record absolute offset - (minus) relative offset + * | | + * -----> + * NEXT VALUE + * + * + * You get the idea. + */ + + public WriteBuffers writeBuffers() { + return writeBuffers; + } + + /** + * A hash map result that can read values stored by the key and value store, one-by-one. + * It also has support routines for checking the hash code and key equality. + * + * It implements the standard map join hash map result interface. + * + */ + public static class HashMapResult extends MapJoinHashTableResultImpl + implements MapJoinHashMapResult { + + private VectorMapJoinFastKeyAndValueStore keyAndValueStore; + + private long absoluteOffset; + private int keyLength; + + private boolean hasRows; + private long part1Word; + private long part2Word; + private boolean isSingleRow; + private int cappedCount; + private long keyAbsoluteOffset; + private int firstValueLength; + private long firstValueAbsoluteOffset; + + private int readIndex; + private boolean isNextEof; + + long nextAbsoluteValueOffset; + + private ByteSegmentRef byteSegmentRef; + private Position readPos; + + public HashMapResult() { + super(); + part1Word = -1; + part2Word = -1; + hasRows = false; + byteSegmentRef = new ByteSegmentRef(); + readPos = new Position(); + } + + /** + * Setup for reading the key of an entry with the equalKey method. 
+ * @param keyAndValueStore + * @param part1Word + * @param part2Word + */ + public void setKey(VectorMapJoinFastKeyAndValueStore keyAndValueStore, long part1Word, + long part2Word) { + + Preconditions.checkState(!KeyAndValueRefPart1.getIsInvalidFlag(part1Word)); + Preconditions.checkState(!KeyAndValueRefPart2.getIsInvalidFlag(part2Word)); + + this.keyAndValueStore = keyAndValueStore; + + this.part1Word = part1Word; + this.part2Word = part2Word; + + absoluteOffset = KeyAndValueRefPart2.getAbsoluteOffset(part2Word); + + // Position after next relative offset (fixed length) to the key. + keyAndValueStore.writeBuffers.setReadPoint( + absoluteOffset + RelativeOffset.byteLength, readPos); + + keyLength = KeyAndValueRefPart1.getSmallKeyLength(part1Word); + boolean isKeyLengthSmall = (keyLength != KeyAndValueRefPart1.SmallKeyLength.allBitsOn); + if (!isKeyLengthSmall) { + // And, if current value is big we must read it. + keyLength = keyAndValueStore.writeBuffers.readVInt(readPos); + } + + // Reading is positioned before the key bytes. + keyAbsoluteOffset = keyAndValueStore.writeBuffers.getReadPoint(readPos); + } + + /** + * Compare a key with the key positioned with the setKey method. + * @param keyBytes + * @param keyStart + * @param keyLength + * @return + */ + public boolean equalKey(byte[] keyBytes, int keyStart, int keyLength) { + + if (this.keyLength != keyLength) { + return false; + } + + // Our reading was positioned to the key. + if (!keyAndValueStore.writeBuffers.isEqual(keyBytes, keyStart, readPos, keyLength)) { + return false; + } + + // NOTE: WriteBuffers.isEqual does not advance the read position... + + return true; + } + + /** + * Mark the key matched with equalKey as a match and set up for reading the values. + * Afterward, methods isSingleRow, cappedCount, first, next, etc may be called. + */ + public void setMatch() { + hasRows = true; + cappedCount = KeyAndValueRefPart1.getCappedCount(part1Word); + isSingleRow = (cappedCount == 1); + + // We must set the position since equalKey does not leave us posiitoned correctly. + keyAndValueStore.writeBuffers.setReadPoint( + keyAbsoluteOffset + keyLength, readPos); + + firstValueLength = KeyAndValueRefPart2.getSmallValueLength(part2Word); + boolean isFirstValueLengthSmall = + (firstValueLength != KeyAndValueRefPart2.SmallValueLength.allBitsOn); + if (!isFirstValueLengthSmall) { + + // And, if current value is big we must read it. + firstValueLength = keyAndValueStore.writeBuffers.readVInt(readPos); + } + + // Save first value absolute offset... + firstValueAbsoluteOffset = keyAndValueStore.writeBuffers.getReadPoint(readPos); + + // Position to beginning. + readIndex = 0; + isNextEof = false; + mapJoinResult = MapJoinResult.MATCH; + } + + @Override + public boolean hasRows() { + return hasRows; + } + + @Override + public boolean isSingleRow() { + if (!hasRows) { + return false; + } + + return isSingleRow; + } + + @Override + public boolean isCappedCountAvailable() { + return true; + } + + @Override + public int cappedCount() { + if (!hasRows) { + return 0; + } + + return cappedCount; + } + + @Override + public ByteSegmentRef first() { + if (!hasRows) { + return null; + } + + // Position to beginning. 
+ readIndex = 0; + isNextEof = false; + + return internalRead(); + } + + @Override + public ByteSegmentRef next() { + if (!hasRows || isNextEof) { + return null; + } + + return internalRead(); + } + + public ByteSegmentRef internalRead() { + + int nextValueLength; + + if (readIndex == 0) { + if (isSingleRow) { + isNextEof = true; + nextAbsoluteValueOffset = -1; + } else { + + // Read the next relative offset the last inserted value record. + keyAndValueStore.writeBuffers.setReadPoint(absoluteOffset, readPos); + long relativeNextValueOffset = + keyAndValueStore.writeBuffers.readNByteLong( + KeyAndValueRefPart2.AbsoluteOffset.byteLength, readPos); + Preconditions.checkState(relativeNextValueOffset != 0); + isNextEof = false; + + // Use positive relative offset from first record to last inserted value record. + nextAbsoluteValueOffset = absoluteOffset + relativeNextValueOffset; + } + + // Position past the key to first value. + keyAndValueStore.writeBuffers.setReadPoint(firstValueAbsoluteOffset, readPos); + nextValueLength = firstValueLength; + } else { + + // Position to the next value record. + Preconditions.checkState(nextAbsoluteValueOffset >= 0); + keyAndValueStore.writeBuffers.setReadPoint(nextAbsoluteValueOffset, readPos); + + // Read the next relative offset. + long relativeNextValueOffset = + keyAndValueStore.writeBuffers.readNByteLong( + RelativeOffset.byteLength, readPos); + if (relativeNextValueOffset == 0) { + isNextEof = true; + nextAbsoluteValueOffset = -1; + } else { + isNextEof = false; + + // The way we insert causes our chain to backwards from the last inserted value record... + nextAbsoluteValueOffset = nextAbsoluteValueOffset - relativeNextValueOffset; + } + nextValueLength = keyAndValueStore.writeBuffers.readVInt(readPos); + + // Now positioned to the value. + } + + // Capture a ByteSegmentRef to the current value position and length. + keyAndValueStore.writeBuffers.getByteSegmentRefToCurrent(byteSegmentRef, nextValueLength, readPos); + + readIndex++; + return byteSegmentRef; + } + + @Override + public boolean isEof() { + if (!hasRows) { + return true; + } + return isNextEof; + } + + @Override + public boolean isAliasFilterAvailable() { + return false; + } + + @Override + public byte aliasFilter() { + return 0; + } + + @Override + public String toString() { + StringBuilder sb = new StringBuilder(); + sb.append("(" + super.toString() + ", "); + sb.append("cappedCount " + cappedCount() + ")"); + return sb.toString(); + } + } + + /** + * Retrieve the hash code for an entry. + * @param part1Word + * @return + */ + public static int getHashCode(long part1Word) { + return KeyAndValueRefPart1.getHashCode(part1Word); + } + + public boolean equalKey(long part1Word, long part2Word, byte[] keyBytes, int keyStart, + int keyLength) { + + Preconditions.checkState((part1Word & KeyAndValueRefPart1.IsInvalidFlag.flagOnMask) == 0); + Preconditions.checkState((part2Word & KeyAndValueRefPart2.IsInvalidFlag.flagOnMask) == 0); + + long absoluteOffset = KeyAndValueRefPart2.getAbsoluteOffset(part2Word); + + // Position after next relative offset (fixed length) to the key. + writeBuffers.setReadPoint( + absoluteOffset + RelativeOffset.byteLength, readPos); + + int actualKeyLength = KeyAndValueRefPart1.getSmallKeyLength(part1Word); + boolean isKeyLengthSmall = (actualKeyLength != KeyAndValueRefPart1.SmallKeyLength.allBitsOn); + if (!isKeyLengthSmall) { + + // And, if current value is big we must read it. 
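A rough sketch of how a bytes hash map built on this store might drive the HashMapResult protocol above. The slot layout (two reference words per logical slot), the empty-slot test, and the probe loop are assumptions made purely for illustration; only the setKey / equalKey / setMatch / first / next calls and the static getHashCode helper mirror what this file defines.

// Sketch only; not the patch's lookup code. Assumed: slotWords holds two reference
// words per slot and an all-zero pair marks an empty slot.
void sketchLookup(VectorMapJoinFastKeyAndValueStore store, long[] slotWords, int slotMask,
    byte[] keyBytes, int keyStart, int keyLength, int hashCode,
    VectorMapJoinFastKeyAndValueStore.HashMapResult result) {
  int slot = hashCode & slotMask;
  long probeSlot = slot;
  for (int i = 0; ; ) {
    long part1 = slotWords[2 * slot];
    long part2 = slotWords[2 * slot + 1];
    if (part1 == 0 && part2 == 0) {
      return;                                              // empty slot: no match
    }
    if (VectorMapJoinFastKeyAndValueStore.getHashCode(part1) == hashCode) {
      result.setKey(store, part1, part2);                  // positions reading on the stored key
      if (result.equalKey(keyBytes, keyStart, keyLength)) {
        result.setMatch();                                  // values may now be iterated
        for (WriteBuffers.ByteSegmentRef ref = result.first(); ref != null; ref = result.next()) {
          // each ref covers one serialized value row of the matched key
        }
        return;
      }
    }
    probeSlot += (++i);                                     // triangular probing, as in the long hash table
    slot = (int) (probeSlot & slotMask);
  }
}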
+ actualKeyLength = writeBuffers.readVInt(readPos); + } + + if (actualKeyLength != keyLength) { + return false; + } + + // Our reading was positioned to the key. + if (!writeBuffers.isEqual(keyBytes, keyStart, readPos, keyLength)) { + return false; + } + + return true; + } + + /** + * Bit-length fields within a 128-bit (2 long) key and value reference that includes the + * 32 bit hash code. + * + * First 64 bit long (Part 1): + * + * Lowest field: The 32 bit hash code. + * + * 2nd field: A value count, up to a limit (a cap). Have a count helps the join result + * algorithms determine which optimization to use for M x N result cross products. + * A special constant indicates if the value count is >= the cap. + * + * 3rd field: For short keys, the length of the key. Otherwise, a special constant + * indicating a big value whose length is stored with the key and value. + * + * (Invalid flag field: high bit indicating whether the word is valid). + * + * Second 64 bit long (Part 2): + * + * Lowest field: An absolute byte offset to the key and value in the WriteBuffers. + * + * 2nd field: For short values, the length of the value. Otherwise, a special constant + * indicating a big value whose length is stored with the value. + * + * (Invalid flag field: high bit indicating whether the word is valid). + */ + private final static class KeyAndValueRefPart1 { + + // Lowest field. + private final class HashCode { + private static final int bitLength = 32; + private static final long allBitsOn = (1L << bitLength) - 1; + private static final long bitMask = allBitsOn; + } + + public static int getHashCode(long part1Word) { + return (int) ((part1Word & HashCode.bitMask)); + } + + private final class CappedCount { + private static final int bitLength = 10; + private static final int allBitsOn = (1 << bitLength) - 1; + private static final int limit = allBitsOn; + private static final int bitShift = HashCode.bitLength; + private static final long bitMask = ((long) allBitsOn) << bitShift; + } + + public static int getCappedCount(long part1Word) { + return (int) ((part1Word & CappedCount.bitMask) >> CappedCount.bitShift); + } + + private final class SmallKeyLength { + private static final int bitLength = 20; + private static final int allBitsOn = (1 << bitLength) - 1; + private static final int threshold = allBitsOn; // Lower this for big value testing. + private static final int bitShift = CappedCount.bitShift + CappedCount.bitLength; + private static final long bitMask = ((long) allBitsOn) << bitShift; + private static final long allBitsOnBitShifted = ((long) allBitsOn) << bitShift; + } + + public static int getSmallKeyLength(long part1Word) { + return (int) ((part1Word & SmallKeyLength.bitMask) >> SmallKeyLength.bitShift); + } + + // This bit should not be on for valid value references. We use -1 for a no value marker. + private final class IsInvalidFlag { + private static final int bitShift = 63; + private static final long flagOnMask = 1L << bitShift; + } + + public static boolean getIsInvalidFlag(long part1Word) { + return (part1Word & IsInvalidFlag.flagOnMask) != 0; + } + } + + private final static class KeyAndValueRefPart2 { + + // Lowest field. + private final class AbsoluteOffset { + private static final int bitLength = 40; + private static final int byteLength = (bitLength + Byte.SIZE -1) / Byte.SIZE; + private static final long allBitsOn = (1L << bitLength) - 1; + private static final long bitMask = allBitsOn; + + // Make it a power of 2. 
+ private static final long maxSize = 1L << (bitLength - 2); + } + + public static long getAbsoluteOffset(long part2Word) { + return (part2Word & KeyAndValueRefPart2.AbsoluteOffset.bitMask); + } + + private final class SmallValueLength { + private static final int bitLength = 20; + private static final int allBitsOn = (1 << bitLength) - 1; + private static final int threshold = allBitsOn; // Lower this for big value testing. + private static final int bitShift = AbsoluteOffset.bitLength; + private static final long bitMask = ((long) allBitsOn) << bitShift; + private static final long allBitsOnBitShifted = ((long) allBitsOn) << bitShift; + } + + public static int getSmallValueLength(long part2Word) { + return (int) ((part2Word & SmallValueLength.bitMask) >> SmallValueLength.bitShift); + } + + // This bit should not be on for valid value references. We use -1 for a no value marker. + private final class IsInvalidFlag { + private static final int bitShift = 63; + private static final long flagOnMask = 1L << bitShift; + } + + public static boolean getIsInvalidFlag(long part2Word) { + return (part2Word & IsInvalidFlag.flagOnMask) != 0; + } + } + + private final static class RelativeOffset { + private static final int byteLength = KeyAndValueRefPart2.AbsoluteOffset.byteLength; + + // Relative offset zero padding. + private static final byte[] zeroPadding = new byte[] { 0,0,0,0,0 }; + } + + public long addPart1Word; + public long addPart2Word; + + /** + * Two 64-bit long results will be placed in addPart1Word and addPart2Word. + * @param hashCode + * @param keyBytes + * @param keyStart + * @param keyLength + * @param valueBytes + * @param valueStart + * @param valueLength + */ + public void addFirst(int hashCode, byte[] keyBytes, int keyStart, int keyLength, byte[] valueBytes, + int valueStart, int valueLength) { + + long absoluteOffset = writeBuffers.getWritePoint(); + Preconditions.checkState(absoluteOffset >= 0); + + // Zero pad out bytes for fixed size next relative offset if more values are added later. + writeBuffers.write(RelativeOffset.zeroPadding); + + boolean isKeyLengthBig = (keyLength >= KeyAndValueRefPart1.SmallKeyLength.threshold); + if (isKeyLengthBig) { + writeBuffers.writeVInt(keyLength); + } + writeBuffers.write(keyBytes, keyStart, keyLength); + + boolean isValueLengthBig = (valueLength >= KeyAndValueRefPart2.SmallValueLength.threshold); + if (isValueLengthBig) { + writeBuffers.writeVInt(valueLength); + } + writeBuffers.write(valueBytes, valueStart, valueLength); + + /* + * Form part 1. + */ + addPart1Word = ((long) hashCode) & KeyAndValueRefPart1.HashCode.bitMask; + + addPart1Word |= ((long) 1 << KeyAndValueRefPart1.CappedCount.bitShift); + + if (isKeyLengthBig) { + addPart1Word |= KeyAndValueRefPart1.SmallKeyLength.allBitsOnBitShifted; + } else { + addPart1Word |= ((long) keyLength) << KeyAndValueRefPart1.SmallKeyLength.bitShift; + } + Preconditions.checkState(!KeyAndValueRefPart1.getIsInvalidFlag(addPart1Word)); + + /* + * Form part 2. + */ + addPart2Word = absoluteOffset; + + if (isValueLengthBig) { + addPart2Word |= KeyAndValueRefPart2.SmallValueLength.allBitsOnBitShifted; + } else { + addPart2Word |= ((long) valueLength) << KeyAndValueRefPart2.SmallValueLength.bitShift; + } + Preconditions.checkState(!KeyAndValueRefPart2.getIsInvalidFlag(addPart2Word)); + } + + /** + * The part1 64-bit long updated will be placed in addPart1Word. 
+ * @param part1Word + * @param part2Word + * @param valueBytes + * @param valueStart + * @param valueLength + */ + public void addMore(long part1Word, long part2Word, byte[] valueBytes, + int valueStart, int valueLength) { + + Preconditions.checkState(!KeyAndValueRefPart1.getIsInvalidFlag(part1Word)); + Preconditions.checkState(!KeyAndValueRefPart2.getIsInvalidFlag(part2Word)); + + /* + * Extract information from reference words. + */ + int oldCappedCount = KeyAndValueRefPart1.getCappedCount(part1Word); + + long absoluteOffset = KeyAndValueRefPart2.getAbsoluteOffset(part2Word); + + // Where the new value record will be written. + long nextAbsoluteValueOffset = writeBuffers.getWritePoint(); + + if (oldCappedCount == 1) { + // Write zeros to indicate no 3rd record. + writeBuffers.write(RelativeOffset.zeroPadding); + } else { + + // To insert next value record above count 2: + + // 1) Read next relative offset in first record (this is a positive relative offset) to + // last inserted value record. + long oldPrevRelativeValueOffset = + writeBuffers.readNByteLong( + absoluteOffset, RelativeOffset.byteLength, readPos); + + // 2) Relative offset is positive from first record to last inserted value record. + long prevAbsoluteValueOffset = absoluteOffset + oldPrevRelativeValueOffset; + + // 3) Since previous record is before the new one, subtract because we store relative offsets + // as unsigned. + long newPrevRelativeValueOffset = nextAbsoluteValueOffset - prevAbsoluteValueOffset; + Preconditions.checkState(newPrevRelativeValueOffset >= 0); + writeBuffers.writeFiveByteULong(newPrevRelativeValueOffset); + } + + writeBuffers.writeVInt(valueLength); + writeBuffers.write(valueBytes, valueStart, valueLength); + + // Overwrite relative offset in first record. + long newRelativeOffset = nextAbsoluteValueOffset - absoluteOffset; + Preconditions.checkState(newRelativeOffset >= 0); + writeBuffers.writeFiveByteULong(absoluteOffset, newRelativeOffset); + + // Update part1Word + addPart1Word = part1Word; + if (oldCappedCount < KeyAndValueRefPart1.CappedCount.limit) { + int newCappedCount = oldCappedCount + 1; + addPart1Word &= ~KeyAndValueRefPart1.CappedCount.bitMask; + addPart1Word |= ((long) newCappedCount) << KeyAndValueRefPart1.CappedCount.bitShift; + Preconditions.checkState(!KeyAndValueRefPart1.getIsInvalidFlag(addPart1Word)); + } + } + + public VectorMapJoinFastKeyAndValueStore(int writeBuffersSize) { + writeBuffers = new WriteBuffers(writeBuffersSize, KeyAndValueRefPart2.AbsoluteOffset.maxSize); + + readPos = new WriteBuffers.Position(); + } +} \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastKeyStore.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastKeyStore.java index efdcd43..dc33474 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastKeyStore.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastKeyStore.java @@ -22,11 +22,17 @@ import org.slf4j.LoggerFactory; import org.apache.hadoop.hive.serde2.WriteBuffers; -// Optimized for sequential key lookup. +import com.google.common.base.Preconditions; +/* + * Used by VectorMapJoinFastBytesHash{MultiSet|Set} to store the key for a hash multi-set or set + * with a bytes key. Those hash tables do not store values but instead store a count or existence + * (respectively) in the hash table slot array. 
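Stepping back to addFirst and addMore in VectorMapJoinFastKeyAndValueStore above, a concrete set of write offsets makes the relative-offset bookkeeping easier to follow. Only the arithmetic below is taken from the code; the offsets 100, 600 and 900 are invented.

// Worked example of the value-chain offsets; plain arithmetic, no WriteBuffers involved.
public final class ValueChainArithmetic {
  public static void main(String[] args) {
    long firstRecord = 100;   // addFirst: key + value 1 here; 5-byte next field left as zeros
    long second = 600;        // addMore #1: own next field = 0; first record's field = 600 - 100 = 500
    long third = 900;         // addMore #2: own next field = 900 - 600 = 300;
                              //             first record's field overwritten to 900 - 100 = 800

    // Reading, as in HashMapResult.internalRead(): value 1 sits right after the key, then
    // the chain is followed forward to the newest value and backwards from there.
    long next = firstRecord + 800;          // -> 900, the most recently inserted value
    System.out.println(next);
    next = next - 300;                      // -> 600, the value inserted before it
    System.out.println(next);
    // The record at 600 stores 0 in its next field, so the chain ends here. Values are
    // therefore returned as: first inserted, then newest to oldest.
  }
}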
+ */ public class VectorMapJoinFastKeyStore { - private static final Logger LOG = LoggerFactory.getLogger(VectorMapJoinFastKeyStore.class.getName()); + private static final String CLASS_NAME = VectorMapJoinFastKeyStore.class.getName(); + private static final Logger LOG = LoggerFactory.getLogger(CLASS_NAME); private WriteBuffers writeBuffers; @@ -58,99 +64,124 @@ * Last field: an always on bit to insure the key reference non-zero when the offset and * length are zero. */ + private final static class KeyRef { + /* + * The absolute offset to the beginning of the key within the WriteBuffers. + */ + private final class AbsoluteKeyOffset { + private static final int bitLength = 40; + private static final long allBitsOn = (((long) 1) << bitLength) - 1; + private static final long bitMask = allBitsOn; + + // Make it a power of 2 by backing down (i.e. the -2). + private static final long maxSize = ((long) 1) << (bitLength - 2); + } - /* - * The absolute offset to the beginning of the key within the WriteBuffers. - */ - private final class AbsoluteKeyOffset { - private static final int bitLength = 40; - private static final long allBitsOn = (((long) 1) << bitLength) - 1; - private static final long bitMask = allBitsOn; + public static long getAbsoluteKeyOffset(long keyRef) { + return (keyRef & AbsoluteKeyOffset.bitMask); + } - // Make it a power of 2 by backing down (i.e. the -2). - private static final long maxSize = ((long) 1) << (bitLength - 2); - } + /* + * The small key length. + * + * If the key is big (i.e. length >= allBitsOn), then the key length is stored in the + * WriteBuffers. + */ + private final class SmallKeyLength { + private static final int bitLength = 20; + private static final int allBitsOn = (1 << bitLength) - 1; + private static final int threshold = allBitsOn; // Lower this for big key testing. + private static final int bitShift = AbsoluteKeyOffset.bitLength; + private static final long bitMask = ((long) allBitsOn) << bitShift; + private static final long allBitsOnBitShifted = ((long) allBitsOn) << bitShift; + } - /* - * The small key length. - * - * If the key is big (i.e. length >= allBitsOn), then the key length is stored in the - * WriteBuffers. - */ - private final class SmallKeyLength { - private static final int bitLength = 20; - private static final int allBitsOn = (1 << bitLength) - 1; - private static final int threshold = allBitsOn; // Lower this for big key testing. - private static final int bitShift = AbsoluteKeyOffset.bitLength; - private static final long bitMask = ((long) allBitsOn) << bitShift; - private static final long allBitsOnBitShifted = ((long) allBitsOn) << bitShift; - } + public static int getSmallKeyLength(long keyRef) { + return (int) ((keyRef & SmallKeyLength.bitMask) >> SmallKeyLength.bitShift); + } - /* - * An always on bit to insure the key reference non-zero. - */ - private final class IsNonZeroFlag { - private static final int bitShift = SmallKeyLength.bitShift + SmallKeyLength.bitLength;; - private static final long flagOnMask = ((long) 1) << bitShift; + /* + * An always on bit to insure the key reference non-zero. + */ + private final class IsNonZeroFlag { + private static final int bitShift = SmallKeyLength.bitShift + SmallKeyLength.bitLength;; + private static final long flagOnMask = ((long) 1) << bitShift; + } + + public static boolean getIsNonZeroFlag(long keyRef) { + return (keyRef & IsNonZeroFlag.flagOnMask) != 0; + } + + // This bit should not be on for valid value references. We use -1 for a no value marker. 
+ private final class IsInvalidFlag { + private static final int bitShift = 63; + private static final long flagOnMask = 1L << bitShift; + } + + public static boolean getIsInvalidFlag(long keyRef) { + return (keyRef & IsInvalidFlag.flagOnMask) != 0; + } } public long add(byte[] keyBytes, int keyStart, int keyLength) { - boolean isKeyLengthBig = (keyLength >= SmallKeyLength.threshold); long absoluteKeyOffset = writeBuffers.getWritePoint(); + + boolean isKeyLengthBig = (keyLength >= KeyRef.SmallKeyLength.threshold); if (isKeyLengthBig) { writeBuffers.writeVInt(keyLength); } writeBuffers.write(keyBytes, keyStart, keyLength); - long keyRefWord = IsNonZeroFlag.flagOnMask; + long keyRefWord = KeyRef.IsNonZeroFlag.flagOnMask; if (isKeyLengthBig) { - keyRefWord |= SmallKeyLength.allBitsOnBitShifted; + keyRefWord |= KeyRef.SmallKeyLength.allBitsOnBitShifted; } else { - keyRefWord |= ((long) keyLength) << SmallKeyLength.bitShift; + keyRefWord |= ((long) keyLength) << KeyRef.SmallKeyLength.bitShift; } keyRefWord |= absoluteKeyOffset; - // LOG.debug("VectorMapJoinFastKeyStore add keyLength " + keyLength + " absoluteKeyOffset " + absoluteKeyOffset + " keyRefWord " + Long.toHexString(keyRefWord)); + Preconditions.checkState(KeyRef.getIsNonZeroFlag(keyRefWord)); + Preconditions.checkState(!KeyRef.getIsInvalidFlag(keyRefWord)); + return keyRefWord; } public boolean equalKey(long keyRefWord, byte[] keyBytes, int keyStart, int keyLength) { - int storedKeyLengthLength = - (int) ((keyRefWord & SmallKeyLength.bitMask) >> SmallKeyLength.bitShift); - boolean isKeyLengthSmall = (storedKeyLengthLength != SmallKeyLength.allBitsOn); + Preconditions.checkState(KeyRef.getIsNonZeroFlag(keyRefWord)); + Preconditions.checkState(!KeyRef.getIsInvalidFlag(keyRefWord)); - // LOG.debug("VectorMapJoinFastKeyStore equalKey keyLength " + keyLength + " isKeyLengthSmall " + isKeyLengthSmall + " storedKeyLengthLength " + storedKeyLengthLength + " keyRefWord " + Long.toHexString(keyRefWord)); + int storedKeyLengthLength = KeyRef.getSmallKeyLength(keyRefWord); + boolean isKeyLengthSmall = (storedKeyLengthLength != KeyRef.SmallKeyLength.allBitsOn); if (isKeyLengthSmall && storedKeyLengthLength != keyLength) { return false; } - long absoluteKeyOffset = - (keyRefWord & AbsoluteKeyOffset.bitMask); + long absoluteKeyOffset = KeyRef.getAbsoluteKeyOffset(keyRefWord); writeBuffers.setReadPoint(absoluteKeyOffset, readPos); if (!isKeyLengthSmall) { // Read big value length we wrote with the value. storedKeyLengthLength = writeBuffers.readVInt(readPos); if (storedKeyLengthLength != keyLength) { - // LOG.debug("VectorMapJoinFastKeyStore equalKey no match big length"); return false; } } // Our reading is positioned to the key. 
if (!writeBuffers.isEqual(keyBytes, keyStart, readPos, keyLength)) { - // LOG.debug("VectorMapJoinFastKeyStore equalKey no match on bytes"); return false; } - - // LOG.debug("VectorMapJoinFastKeyStore equalKey match on bytes"); return true; } + public WriteBuffers writeBuffers() { + return writeBuffers; + } + public VectorMapJoinFastKeyStore(int writeBuffersSize) { - writeBuffers = new WriteBuffers(writeBuffersSize, AbsoluteKeyOffset.maxSize); + writeBuffers = new WriteBuffers(writeBuffersSize, KeyRef.AbsoluteKeyOffset.maxSize); readPos = new WriteBuffers.Position(); } diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastKeyValuePut.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastKeyValuePut.java new file mode 100644 index 0000000..585dfee --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastKeyValuePut.java @@ -0,0 +1,215 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.vector.mapjoin.fast; + +import java.io.IOException; + +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableManage.KeyValuePut; +import org.apache.hadoop.hive.ql.plan.VectorMapJoinDesc.HashTableKeyType; +import org.apache.hadoop.hive.ql.plan.VectorMapJoinDesc.HashTableKind; +import org.apache.hadoop.hive.serde2.SerDeException; +import org.apache.hadoop.hive.serde2.binarysortable.fast.BinarySortableDeserializeRead; +import org.apache.hadoop.hive.serde2.fast.DeserializeRead.ReadStringResults; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; +import org.apache.hadoop.io.BytesWritable; +import org.apache.hadoop.io.Writable; +import org.apache.hive.common.util.HashCodeUtil; + +/* + * Helper object for putting a new key and value into a fast Vector Map Join hash table. + * + * One motivation for this object is fast Vector Map Join needs it for these + * purposes: + * + * 1) For a (single) integer key, it deserializes so the long value can be available for min/max + * determination during hash table load. Later, the fast Vector Map Join operators will be + * able to filter out long keys using min/max. + * + * And, compute the hash code on the long primitive. + * + * 2) For a (single) string key, it deserializes the string value so it can be saved in the hash + * table as a string instead of the serialized form. This improves performance in the + * fast Vector Map Join operator by allowing it to lookup it string key without having to + * serialize it. + * + * And, compute the hash code on the string. + * + * The class implements the standard Map Join interface for key value put. 
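The loading flow this helper implies looks roughly like the sketch below. The reader loop and the choice of a long hash map are only for concreteness (the loader changes themselves are not in this hunk); the setKeyValue and put(KeyValuePut) calls correspond to the methods introduced by this patch.

// Hedged sketch of feeding a fast hash table through the put helper; the loop shape
// is assumed, the helper/table calls follow this patch.
void loadSmallTableSketch(KeyValueReader kvReader, VectorMapJoinFastLongHashMap hashTable,
    HashTableKind hashTableKind, HashTableKeyType hashTableKeyType, boolean isOuterJoin)
    throws IOException, SerDeException {
  VectorMapJoinFastKeyValuePut keyValuePut =
      new VectorMapJoinFastKeyValuePut(hashTableKind, hashTableKeyType, isOuterJoin);
  while (kvReader.next()) {
    keyValuePut.setKeyValue((BytesWritable) kvReader.getCurrentKey(),
        (BytesWritable) kvReader.getCurrentValue());
    // The long tables read getLongKey()/getKeyHashCode(), the string tables read
    // getStringReadStringResults(), and the multi-key tables read getKeyBytesWritable();
    // put() itself skips rows whose single-column key deserialized to NULL.
    hashTable.put(keyValuePut);
  }
}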
+ * + */ +public class VectorMapJoinFastKeyValuePut implements KeyValuePut { + + private final HashTableKind hashTableKind; + private final HashTableKeyType hashTableKeyType; + private final boolean isOuterJoin; + + private final BinarySortableDeserializeRead keyBinarySortableDeserializeRead; + private final ReadStringResults readStringResults; + + private final boolean isLong; + + private BytesWritable keyBytesWritable; + private BytesWritable valueBytesWritable; + + private long longKey; + + private boolean hasHashCode; + private int hashCode; + + private boolean isNull; + + public VectorMapJoinFastKeyValuePut(HashTableKind hashTableKind, + HashTableKeyType hashTableKeyType, boolean isOuterJoin) { + this.hashTableKind = hashTableKind; + this.hashTableKeyType = hashTableKeyType; + this.isOuterJoin = isOuterJoin; + + switch (hashTableKeyType) { + case BOOLEAN: + case BYTE: + case SHORT: + case INT: + case LONG: + { + isLong = true; + TypeInfo typeInfo; + switch (hashTableKeyType) { + case BOOLEAN: + typeInfo = TypeInfoFactory.booleanTypeInfo; + break; + case BYTE: + typeInfo = TypeInfoFactory.byteTypeInfo; + break; + case SHORT: + typeInfo = TypeInfoFactory.shortTypeInfo; + break; + case INT: + typeInfo = TypeInfoFactory.intTypeInfo; + break; + case LONG: + typeInfo = TypeInfoFactory.longTypeInfo; + break; + default: + throw new RuntimeException("Unexpected vector map join hash table key type " + hashTableKeyType.name()); + } + TypeInfo[] typeInfos = { typeInfo }; + keyBinarySortableDeserializeRead = new BinarySortableDeserializeRead(typeInfos); + readStringResults = null; + } + break; + case STRING: + { + isLong = false; + TypeInfo[] typeInfos = { TypeInfoFactory.stringTypeInfo }; + keyBinarySortableDeserializeRead = new BinarySortableDeserializeRead(typeInfos); + readStringResults = keyBinarySortableDeserializeRead.createReadStringResults(); + } + break; + case MULTI_KEY: + isLong = false; + keyBinarySortableDeserializeRead = null; + readStringResults = null; + break; + default: + throw new RuntimeException("Unexpected vector map join hash table key type " + hashTableKeyType.name()); + } + + hasHashCode = false; + } + + @Override + public void setKeyValue(Writable keyWritable, Writable valueWritable) + throws SerDeException, IOException { + + keyBytesWritable = (BytesWritable) keyWritable; + valueBytesWritable = (BytesWritable) valueWritable; + isNull = false; // Assume. + hasHashCode = false; + + if (isLong) { + // Deserialized the single long column. + keyBinarySortableDeserializeRead.set(keyBytesWritable.getBytes(), 0, + keyBytesWritable.getLength()); + if (keyBinarySortableDeserializeRead.readCheckNull()) { + isNull = true; + return; + } + longKey = VectorMapJoinFastLongHashUtil.deserializeLongKey( + keyBinarySortableDeserializeRead, hashTableKeyType); + hashCode = HashCodeUtil.calculateLongHashCode(longKey); + } else { + switch (hashTableKeyType) { + case STRING: + { + // Deserialize the single string column. + keyBinarySortableDeserializeRead.set(keyBytesWritable.getBytes(), 0, + keyBytesWritable.getLength()); + if (keyBinarySortableDeserializeRead.readCheckNull()) { + isNull = true; + return; + } + keyBinarySortableDeserializeRead.readString(readStringResults); + hashCode = HashCodeUtil.murmurHash(readStringResults.bytes, readStringResults.start, + readStringResults.length); + } + break; + + case MULTI_KEY: + // Leave the multi-key unserialized. And, let all NULL entries into the small table. 
+ hashCode = HashCodeUtil.murmurHash(keyBytesWritable.getBytes(), 0, keyBytesWritable.getLength()); + break; + default: + throw new RuntimeException("Unexpected vector map join hash table key type " + hashTableKeyType.name()); + } + hasHashCode = true; + } + } + + @Override + public boolean hasHashCode() { + return hasHashCode; + } + + @Override + public int getKeyHashCode() throws SerDeException { + return hashCode; + } + + public boolean isNull() { + return isNull; + } + + @Override + public long getLongKey() { + return longKey; + } + + public ReadStringResults getStringReadStringResults() { + return readStringResults; + } + + public BytesWritable getKeyBytesWritable() { + return keyBytesWritable; + } + + public BytesWritable getValueBytesWritable() { + return valueBytesWritable; + } +} \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastLongHashMap.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastLongHashMap.java index 1384fc9..a78f840 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastLongHashMap.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastLongHashMap.java @@ -20,27 +20,30 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import org.apache.hadoop.hive.ql.exec.JoinUtil; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashMapResult; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinLongHashMap; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashMapResult; import org.apache.hadoop.hive.ql.plan.VectorMapJoinDesc.HashTableKeyType; +import org.apache.hadoop.hive.serde2.SerDeException; import org.apache.hadoop.io.BytesWritable; -import org.apache.hive.common.util.HashCodeUtil; /* * An single long value map optimized for vector map join. 
*/ -public class VectorMapJoinFastLongHashMap - extends VectorMapJoinFastLongHashTable - implements VectorMapJoinLongHashMap { +public class VectorMapJoinFastLongHashMap extends VectorMapJoinFastLongHashTable { public static final Logger LOG = LoggerFactory.getLogger(VectorMapJoinFastLongHashMap.class); protected VectorMapJoinFastValueStore valueStore; @Override - public VectorMapJoinHashMapResult createHashMapResult() { - return new VectorMapJoinFastValueStore.HashMapResult(); + public void put(KeyValuePut keyValuePut) throws SerDeException { + + VectorMapJoinFastKeyValuePut internalKeyValuePut = ((VectorMapJoinFastKeyValuePut) keyValuePut); + if (internalKeyValuePut.isNull()) { + return; + } + + add(internalKeyValuePut.getLongKey(), internalKeyValuePut.getKeyHashCode(), + internalKeyValuePut.getValueBytesWritable()); } @Override @@ -61,35 +64,38 @@ public void assignSlot(int slot, long key, boolean isNewKey, BytesWritable curre } @Override - public JoinUtil.JoinResult lookup(long key, VectorMapJoinHashMapResult hashMapResult) { + public void hashMapLookup(long key, int hashCode, MapJoinHashMapResult hashMapResult) { VectorMapJoinFastValueStore.HashMapResult optimizedHashMapResult = (VectorMapJoinFastValueStore.HashMapResult) hashMapResult; optimizedHashMapResult.forget(); - long hashCode = HashCodeUtil.calculateLongHashCode(key); // LOG.debug("VectorMapJoinFastLongHashMap lookup " + key + " hashCode " + hashCode); long valueRef = findReadSlot(key, hashCode); - JoinUtil.JoinResult joinResult; if (valueRef == -1) { - joinResult = JoinUtil.JoinResult.NOMATCH; + optimizedHashMapResult.setNoMatch(); } else { - optimizedHashMapResult.set(valueStore, valueRef); - - joinResult = JoinUtil.JoinResult.MATCH; + optimizedHashMapResult.setMatch(valueStore, valueRef); } - - optimizedHashMapResult.setJoinResult(joinResult); - - return joinResult; } public VectorMapJoinFastLongHashMap( + VectorMapJoinFastHashTableFactory mapJoinHashTableFactory, boolean minMaxEnabled, boolean isOuterJoin, HashTableKeyType hashTableKeyType, - int initialCapacity, float loadFactor, int writeBuffersSize) { - super(minMaxEnabled, isOuterJoin, hashTableKeyType, - initialCapacity, loadFactor, writeBuffersSize); + int initialCapacity, float loadFactor, int writeBuffersSize, long memUsage) { + super(mapJoinHashTableFactory, minMaxEnabled, isOuterJoin, hashTableKeyType, + initialCapacity, loadFactor, writeBuffersSize, memUsage); valueStore = new VectorMapJoinFastValueStore(writeBuffersSize); } + + @Override + public long memorySize() { + return valueStore.writeBuffers().size() + slotPairs.length * (Long.SIZE/Byte.SIZE) + 100; + } + + @Override + public void clear() { + // UNDONE + } } diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastLongHashMultiSet.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastLongHashMultiSet.java index 94bf706..e529c00 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastLongHashMultiSet.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastLongHashMultiSet.java @@ -18,31 +18,30 @@ package org.apache.hadoop.hive.ql.exec.vector.mapjoin.fast; -import java.io.IOException; - import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import org.apache.hadoop.hive.ql.exec.JoinUtil; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashMultiSetResult; -import 
org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinLongHashMultiSet; -import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashMultiSetResult; import org.apache.hadoop.hive.ql.plan.VectorMapJoinDesc.HashTableKeyType; import org.apache.hadoop.hive.serde2.SerDeException; import org.apache.hadoop.io.BytesWritable; -import org.apache.hive.common.util.HashCodeUtil; /* * An single long value multi-set optimized for vector map join. */ -public class VectorMapJoinFastLongHashMultiSet - extends VectorMapJoinFastLongHashTable - implements VectorMapJoinLongHashMultiSet { +public class VectorMapJoinFastLongHashMultiSet extends VectorMapJoinFastLongHashTable { public static final Logger LOG = LoggerFactory.getLogger(VectorMapJoinFastLongHashMultiSet.class); @Override - public VectorMapJoinHashMultiSetResult createHashMultiSetResult() { - return new VectorMapJoinFastHashMultiSet.HashMultiSetResult(); + public void put(KeyValuePut keyValuePut) throws SerDeException { + + VectorMapJoinFastKeyValuePut internalKeyValuePut = ((VectorMapJoinFastKeyValuePut) keyValuePut); + if (internalKeyValuePut.isNull()) { + return; + } + + add(internalKeyValuePut.getLongKey(), internalKeyValuePut.getKeyHashCode(), + internalKeyValuePut.getValueBytesWritable()); } @Override @@ -61,32 +60,33 @@ public void assignSlot(int slot, long key, boolean isNewKey, BytesWritable curre @Override - public JoinUtil.JoinResult contains(long key, VectorMapJoinHashMultiSetResult hashMultiSetResult) { + public void hashMultiSetContains(long key, int hashCode, MapJoinHashMultiSetResult hashMultiSetResult) { - VectorMapJoinFastHashMultiSet.HashMultiSetResult optimizedHashMultiSetResult = - (VectorMapJoinFastHashMultiSet.HashMultiSetResult) hashMultiSetResult; + hashMultiSetResult.forget(); - optimizedHashMultiSetResult.forget(); - - long hashCode = HashCodeUtil.calculateLongHashCode(key); long count = findReadSlot(key, hashCode); - JoinUtil.JoinResult joinResult; if (count == -1) { - joinResult = JoinUtil.JoinResult.NOMATCH; + hashMultiSetResult.setNoMatch(); } else { - optimizedHashMultiSetResult.set(count); - joinResult = JoinUtil.JoinResult.MATCH; + hashMultiSetResult.setMatch(count); } - - optimizedHashMultiSetResult.setJoinResult(joinResult); - - return joinResult; } public VectorMapJoinFastLongHashMultiSet( + VectorMapJoinFastHashTableFactory mapJoinHashTableFactory, boolean minMaxEnabled, boolean isOuterJoin, HashTableKeyType hashTableKeyType, - int initialCapacity, float loadFactor, int writeBuffersSize) { - super(minMaxEnabled, isOuterJoin, hashTableKeyType, - initialCapacity, loadFactor, writeBuffersSize); + int initialCapacity, float loadFactor, int writeBuffersSize, long memUsage) { + super(mapJoinHashTableFactory, minMaxEnabled, isOuterJoin, hashTableKeyType, + initialCapacity, loadFactor, writeBuffersSize, memUsage); + } + + @Override + public long memorySize() { + return slotPairs.length * (Long.SIZE/Byte.SIZE) + 100; + } + + @Override + public void clear() { + // UNDONE } } diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastLongHashSet.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastLongHashSet.java index 2cbc548..7dde167 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastLongHashSet.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastLongHashSet.java @@ -20,26 +20,28 @@ import 
org.slf4j.Logger; import org.slf4j.LoggerFactory; -import org.apache.hadoop.hive.ql.exec.JoinUtil; -import org.apache.hadoop.hive.ql.exec.JoinUtil.JoinResult; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashSetResult; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinLongHashSet; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashSetResult; import org.apache.hadoop.hive.ql.plan.VectorMapJoinDesc.HashTableKeyType; +import org.apache.hadoop.hive.serde2.SerDeException; import org.apache.hadoop.io.BytesWritable; -import org.apache.hive.common.util.HashCodeUtil; /* * An single long value multi-set optimized for vector map join. */ -public class VectorMapJoinFastLongHashSet - extends VectorMapJoinFastLongHashTable - implements VectorMapJoinLongHashSet { +public class VectorMapJoinFastLongHashSet extends VectorMapJoinFastLongHashTable { public static final Logger LOG = LoggerFactory.getLogger(VectorMapJoinFastLongHashSet.class); @Override - public VectorMapJoinHashSetResult createHashSetResult() { - return new VectorMapJoinFastHashSet.HashSetResult(); + public void put(KeyValuePut keyValuePut) throws SerDeException { + + VectorMapJoinFastKeyValuePut internalKeyValuePut = ((VectorMapJoinFastKeyValuePut) keyValuePut); + if (internalKeyValuePut.isNull()) { + return; + } + + add(internalKeyValuePut.getLongKey(), internalKeyValuePut.getKeyHashCode(), + internalKeyValuePut.getValueBytesWritable()); } @Override @@ -54,32 +56,34 @@ public void assignSlot(int slot, long key, boolean isNewKey, BytesWritable curre } @Override - public JoinResult contains(long key, VectorMapJoinHashSetResult hashSetResult) { + public void hashSetContains(long key, int hashCode, MapJoinHashSetResult hashSetResult) { - VectorMapJoinFastHashSet.HashSetResult optimizedHashSetResult = - (VectorMapJoinFastHashSet.HashSetResult) hashSetResult; + hashSetResult.forget(); - optimizedHashSetResult.forget(); - - long hashCode = HashCodeUtil.calculateLongHashCode(key); long existance = findReadSlot(key, hashCode); - JoinUtil.JoinResult joinResult; if (existance == -1) { - joinResult = JoinUtil.JoinResult.NOMATCH; + hashSetResult.setNoMatch(); } else { - joinResult = JoinUtil.JoinResult.MATCH; + hashSetResult.setMatch(); } - optimizedHashSetResult.setJoinResult(joinResult); - - return joinResult; - } public VectorMapJoinFastLongHashSet( + VectorMapJoinFastHashTableFactory mapJoinHashTableFactory, boolean minMaxEnabled, boolean isOuterJoin, HashTableKeyType hashTableKeyType, - int initialCapacity, float loadFactor, int writeBuffersSize) { - super(minMaxEnabled, isOuterJoin, hashTableKeyType, - initialCapacity, loadFactor, writeBuffersSize); + int initialCapacity, float loadFactor, int writeBuffersSize, long memUsage) { + super(mapJoinHashTableFactory, minMaxEnabled, isOuterJoin, hashTableKeyType, + initialCapacity, loadFactor, writeBuffersSize, memUsage); + } + + @Override + public long memorySize() { + return slotPairs.length * (Long.SIZE/Byte.SIZE) + 100; + } + + @Override + public void clear() { + // UNDONE } } diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastLongHashTable.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastLongHashTable.java index f37f056..0677f53 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastLongHashTable.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastLongHashTable.java @@ 
-22,10 +22,6 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import org.apache.hadoop.hive.ql.exec.JoinUtil; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashMap; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinLongHashMap; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinLongHashTable; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.plan.VectorMapJoinDesc.HashTableKeyType; import org.apache.hadoop.hive.serde2.binarysortable.fast.BinarySortableDeserializeRead; @@ -33,16 +29,13 @@ import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; import org.apache.hadoop.io.BytesWritable; import org.apache.hive.common.util.HashCodeUtil; -import org.apache.tez.runtime.library.api.KeyValueReader; import com.google.common.annotations.VisibleForTesting; /* * An single long value map optimized for vector map join. */ -public abstract class VectorMapJoinFastLongHashTable - extends VectorMapJoinFastHashTable - implements VectorMapJoinLongHashTable { +public abstract class VectorMapJoinFastLongHashTable extends VectorMapJoinFastHashTable { public static final Logger LOG = LoggerFactory.getLogger(VectorMapJoinFastLongHashTable.class); @@ -75,43 +68,26 @@ public long max() { return max; } - @Override - public void putRow(BytesWritable currentKey, BytesWritable currentValue) throws HiveException, IOException { - byte[] keyBytes = currentKey.getBytes(); - int keyLength = currentKey.getLength(); - keyBinarySortableDeserializeRead.set(keyBytes, 0, keyLength); - if (keyBinarySortableDeserializeRead.readCheckNull()) { - return; - } - - long key = VectorMapJoinFastLongHashUtil.deserializeLongKey( - keyBinarySortableDeserializeRead, hashTableKeyType); - - add(key, currentValue); - } - - @VisibleForTesting public void putRow(long currentKey, byte[] currentValue) throws HiveException, IOException { if (testValueBytesWritable == null) { testValueBytesWritable = new BytesWritable(); } testValueBytesWritable.set(currentValue, 0, currentValue.length); - add(currentKey, testValueBytesWritable); + int hashCode = HashCodeUtil.calculateLongHashCode(currentKey); + add(currentKey, hashCode, testValueBytesWritable); } protected abstract void assignSlot(int slot, long key, boolean isNewKey, BytesWritable currentValue); - public void add(long key, BytesWritable currentValue) { + public void add(long key, int hashCode, BytesWritable currentValue) { if (resizeThreshold <= keysAssigned) { expandAndRehash(); } - long hashCode = HashCodeUtil.calculateLongHashCode(key); - int intHashCode = (int) hashCode; - int slot = (intHashCode & logicalHashBucketMask); + int slot = (hashCode & logicalHashBucketMask); long probeSlot = slot; int i = 0; boolean isNewKey; @@ -119,18 +95,17 @@ public void add(long key, BytesWritable currentValue) { int pairIndex = 2 * slot; long valueRef = slotPairs[pairIndex]; if (valueRef == 0) { - // LOG.debug("VectorMapJoinFastLongHashTable add key " + key + " slot " + slot + " pairIndex " + pairIndex + " empty slot (i = " + i + ")"); isNewKey = true; break; } long tableKey = slotPairs[pairIndex + 1]; if (key == tableKey) { - // LOG.debug("VectorMapJoinFastLongHashTable add key " + key + " slot " + slot + " pairIndex " + pairIndex + " found key (i = " + i + ")"); isNewKey = false; break; } - ++metricPutConflict; + // Some other key (collision) - keep probing. 
+ metricPutConflict++; probeSlot += (++i); slot = (int)(probeSlot & logicalHashBucketMask); } @@ -160,8 +135,10 @@ public void add(long key, BytesWritable currentValue) { } } - private void expandAndRehash() { + @Override + protected void expandAndRehashImpl(int capacity) { + long expandTime = System.currentTimeMillis(); int newLogicalHashBucketCount = logicalHashBucketCount * 2; int newLogicalHashBucketMask = newLogicalHashBucketCount - 1; int newMetricPutConflict = 0; @@ -177,9 +154,9 @@ private void expandAndRehash() { long tableKey = slotPairs[pairIndex + 1]; // Copy to new slot table. - long hashCode = HashCodeUtil.calculateLongHashCode(tableKey); - int intHashCode = (int) hashCode; - int newSlot = intHashCode & newLogicalHashBucketMask; + int hashCode = HashCodeUtil.calculateLongHashCode(tableKey); + + int newSlot = hashCode & newLogicalHashBucketMask; long newProbeSlot = newSlot; int newPairIndex; int i = 0; @@ -200,11 +177,9 @@ private void expandAndRehash() { LOG.debug("Probed " + i + " slots (the longest so far) to find space"); } newLargestNumberOfSteps = i; - // debugDumpKeyProbe(keyOffset, keyLength, hashCode, slot); } // Use old value reference word. - // LOG.debug("VectorMapJoinFastLongHashTable expandAndRehash key " + tableKey + " slot " + newSlot + " newPairIndex " + newPairIndex + " empty slot (i = " + i + ")"); newSlotPairs[newPairIndex] = valueRef; newSlotPairs[newPairIndex + 1] = tableKey; @@ -217,14 +192,13 @@ private void expandAndRehash() { metricPutConflict = newMetricPutConflict; largestNumberOfSteps = newLargestNumberOfSteps; resizeThreshold = (int)(logicalHashBucketCount * loadFactor); + metricExpandsMs += (System.currentTimeMillis() - expandTime); metricExpands++; - // LOG.debug("VectorMapJoinFastLongHashTable expandAndRehash new logicalHashBucketCount " + logicalHashBucketCount + " resizeThreshold " + resizeThreshold + " metricExpands " + metricExpands); } - protected long findReadSlot(long key, long hashCode) { + protected long findReadSlot(long key, int hashCode) { - int intHashCode = (int) hashCode; - int slot = intHashCode & logicalHashBucketMask; + int slot = hashCode & logicalHashBucketMask; long probeSlot = slot; int i = 0; @@ -233,7 +207,6 @@ protected long findReadSlot(long key, long hashCode) { long valueRef = slotPairs[pairIndex]; if (valueRef == 0) { // Given that we do not delete, an empty slot means no match. - // LOG.debug("VectorMapJoinFastLongHashTable findReadSlot key " + key + " slot " + slot + " pairIndex " + pairIndex + " empty slot (i = " + i + ")"); return -1; } long tableKey = slotPairs[pairIndex + 1]; @@ -246,13 +219,16 @@ protected long findReadSlot(long key, long hashCode) { if (i > largestNumberOfSteps) { // LOG.debug("VectorMapJoinFastLongHashTable findReadSlot returning not found"); // We know we never went that far when we were inserting. - // LOG.debug("VectorMapJoinFastLongHashTable findReadSlot key " + key + " slot " + slot + " pairIndex " + pairIndex + " largestNumberOfSteps " + largestNumberOfSteps + " (i = " + i + ")"); return -1; } slot = (int)(probeSlot & logicalHashBucketMask); } } + protected int getLongsPerSlot() { + return 2; + } + /* * The hash table slots. For a long key hash table, each slot is 2 longs and the array is * 2X sized. 
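The probe order used by add() and findReadSlot() above (probeSlot += (++i) under a power-of-two mask) visits the home slot and then offsets of 1, 3, 6, 10, ... — the triangular numbers. A small standalone illustration, with a hypothetical bucket count and hash code:

// Standalone illustration of the probe sequence; the bucket count and hash code are invented.
public final class ProbeSequenceSketch {
  public static void main(String[] args) {
    int logicalHashBucketCount = 16;                 // must be a power of two
    int logicalHashBucketMask = logicalHashBucketCount - 1;
    int hashCode = 0x5A;                             // any int hash code

    int slot = hashCode & logicalHashBucketMask;
    long probeSlot = slot;
    System.out.print("probe order:");
    for (int i = 0; i < 8; ) {
      System.out.print(" " + slot);
      probeSlot += (++i);                            // add 1, then 2, then 3, ... cumulatively
      slot = (int) (probeSlot & logicalHashBucketMask);
    }
    System.out.println();
    // For hash code 0x5A and 16 slots the home slot is 10, so this prints:
    // probe order: 10 11 13 0 4 9 15 6
  }
}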
@@ -261,15 +237,16 @@ protected long findReadSlot(long key, long hashCode) { */ protected long[] slotPairs; - private void allocateBucketArray() { + protected void allocateBucketArray() { int slotPairArraySize = 2 * logicalHashBucketCount; slotPairs = new long[slotPairArraySize]; } public VectorMapJoinFastLongHashTable( - boolean minMaxEnabled, boolean isOuterJoin, HashTableKeyType hashTableKeyType, - int initialCapacity, float loadFactor, int writeBuffersSize) { - super(initialCapacity, loadFactor, writeBuffersSize); + VectorMapJoinFastHashTableFactory mapJoinHashTableFactory, + boolean minMaxEnabled, boolean isOuterJoin, HashTableKeyType hashTableKeyType, + int initialCapacity, float loadFactor, int writeBuffersSize, long memUsage) { + super(mapJoinHashTableFactory, initialCapacity, loadFactor, writeBuffersSize, memUsage); this.isOuterJoin = isOuterJoin; this.hashTableKeyType = hashTableKeyType; PrimitiveTypeInfo[] primitiveTypeInfos = { TypeInfoFactory.longTypeInfo }; @@ -279,4 +256,9 @@ public VectorMapJoinFastLongHashTable( min = Long.MAX_VALUE; max = Long.MIN_VALUE; } + + @Override + public void clear() { + // UNDONE + } } diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastMultiKeyHashMap.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastMultiKeyHashMap.java index 9a9fb8d..6efb677 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastMultiKeyHashMap.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastMultiKeyHashMap.java @@ -18,22 +18,37 @@ package org.apache.hadoop.hive.ql.exec.vector.mapjoin.fast; +import org.apache.hadoop.hive.serde2.SerDeException; +import org.apache.hadoop.io.BytesWritable; + import com.google.common.annotations.VisibleForTesting; /* * An multi-key value hash map optimized for vector map join. 
*/ -public class VectorMapJoinFastMultiKeyHashMap - extends VectorMapJoinFastBytesHashMap { +public class VectorMapJoinFastMultiKeyHashMap extends VectorMapJoinFastBytesHashMap { + + @Override + public void put(KeyValuePut keyValuePut) throws SerDeException { + VectorMapJoinFastKeyValuePut internalKeyValuePut = ((VectorMapJoinFastKeyValuePut) keyValuePut); + + BytesWritable keyBytesWritable = internalKeyValuePut.getKeyBytesWritable(); + add(keyBytesWritable.getBytes(), 0, keyBytesWritable.getLength(), + internalKeyValuePut.getKeyHashCode(), + internalKeyValuePut.getValueBytesWritable()); + } @VisibleForTesting - public VectorMapJoinFastMultiKeyHashMap(int initialCapacity, float loadFactor, int wbSize) { - this(false, initialCapacity, loadFactor, wbSize); + public VectorMapJoinFastMultiKeyHashMap( + VectorMapJoinFastHashTableFactory mapJoinHashTableFactory, + int initialCapacity, float loadFactor, int wbSize, long memUsage) { + this(mapJoinHashTableFactory, false, initialCapacity, loadFactor, wbSize, memUsage); } public VectorMapJoinFastMultiKeyHashMap( - boolean isOuterJoin, - int initialCapacity, float loadFactor, int writeBuffersSize) { - super(initialCapacity, loadFactor, writeBuffersSize); + VectorMapJoinFastHashTableFactory mapJoinHashTableFactory, + boolean isOuterJoin, + int initialCapacity, float loadFactor, int writeBuffersSize, long memUsage) { + super(mapJoinHashTableFactory, initialCapacity, loadFactor, writeBuffersSize, memUsage); } } \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastMultiKeyHashMultiSet.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastMultiKeyHashMultiSet.java index a8744a5..d86b585 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastMultiKeyHashMultiSet.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastMultiKeyHashMultiSet.java @@ -18,15 +18,34 @@ package org.apache.hadoop.hive.ql.exec.vector.mapjoin.fast; +import org.apache.hadoop.hive.serde2.SerDeException; +import org.apache.hadoop.io.BytesWritable; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + /* * An multi-key value hash multi-set optimized for vector map join. 
*/ public class VectorMapJoinFastMultiKeyHashMultiSet extends VectorMapJoinFastBytesHashMultiSet { + private static final String CLASS_NAME = VectorMapJoinFastMultiKeyHashMultiSet.class.getName(); + private static final Logger LOG = LoggerFactory.getLogger(CLASS_NAME); + + @Override + public void put(KeyValuePut keyValuePut) throws SerDeException { + VectorMapJoinFastKeyValuePut internalKeyValuePut = ((VectorMapJoinFastKeyValuePut) keyValuePut); + + BytesWritable keyBytesWritable = internalKeyValuePut.getKeyBytesWritable(); + add(keyBytesWritable.getBytes(), 0, keyBytesWritable.getLength(), + internalKeyValuePut.getKeyHashCode(), + internalKeyValuePut.getValueBytesWritable()); + } + public VectorMapJoinFastMultiKeyHashMultiSet( - boolean isOuterJoin, - int initialCapacity, float loadFactor, int writeBuffersSize) { - super(initialCapacity, loadFactor, writeBuffersSize); + VectorMapJoinFastHashTableFactory mapJoinHashTableFactory, + boolean isOuterJoin, + int initialCapacity, float loadFactor, int writeBuffersSize, long memUsage) { + super(mapJoinHashTableFactory, initialCapacity, loadFactor, writeBuffersSize, memUsage); } } \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastMultiKeyHashSet.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastMultiKeyHashSet.java index a8048e5..42c9e52 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastMultiKeyHashSet.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastMultiKeyHashSet.java @@ -18,15 +18,29 @@ package org.apache.hadoop.hive.ql.exec.vector.mapjoin.fast; +import org.apache.hadoop.hive.serde2.SerDeException; +import org.apache.hadoop.io.BytesWritable; + /* * An multi-key value hash set optimized for vector map join. */ public class VectorMapJoinFastMultiKeyHashSet extends VectorMapJoinFastBytesHashSet { + @Override + public void put(KeyValuePut keyValuePut) throws SerDeException { + VectorMapJoinFastKeyValuePut internalKeyValuePut = ((VectorMapJoinFastKeyValuePut) keyValuePut); + + BytesWritable keyBytesWritable = internalKeyValuePut.getKeyBytesWritable(); + add(keyBytesWritable.getBytes(), 0, keyBytesWritable.getLength(), + internalKeyValuePut.getKeyHashCode(), + internalKeyValuePut.getValueBytesWritable()); + } + public VectorMapJoinFastMultiKeyHashSet( - boolean isOuterJoin, - int initialCapacity, float loadFactor, int writeBuffersSize) { - super(initialCapacity, loadFactor, writeBuffersSize); + VectorMapJoinFastHashTableFactory mapJoinHashTableFactory, + boolean isOuterJoin, + int initialCapacity, float loadFactor, int writeBuffersSize, long memUsage) { + super(mapJoinHashTableFactory, initialCapacity, loadFactor, writeBuffersSize, memUsage); } } \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastStringCommon.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastStringCommon.java deleted file mode 100644 index adb8044..0000000 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastStringCommon.java +++ /dev/null @@ -1,66 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.hive.ql.exec.vector.mapjoin.fast; - -import java.io.IOException; - -import org.apache.hadoop.hive.ql.metadata.HiveException; -import org.apache.hadoop.hive.serde2.binarysortable.fast.BinarySortableDeserializeRead; -import org.apache.hadoop.hive.serde2.fast.DeserializeRead.ReadStringResults; -import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; -import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; -import org.apache.hadoop.io.BytesWritable; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/* - * An single byte array value hash map optimized for vector map join. - */ -public class VectorMapJoinFastStringCommon { - - public static final Logger LOG = LoggerFactory.getLogger(VectorMapJoinFastStringCommon.class); - - private boolean isOuterJoin; - - private BinarySortableDeserializeRead keyBinarySortableDeserializeRead; - - private ReadStringResults readStringResults; - - public void adaptPutRow(VectorMapJoinFastBytesHashTable hashTable, - BytesWritable currentKey, BytesWritable currentValue) throws HiveException, IOException { - - byte[] keyBytes = currentKey.getBytes(); - int keyLength = currentKey.getLength(); - keyBinarySortableDeserializeRead.set(keyBytes, 0, keyLength); - if (keyBinarySortableDeserializeRead.readCheckNull()) { - return; - } - keyBinarySortableDeserializeRead.readString(readStringResults); - - hashTable.add(readStringResults.bytes, readStringResults.start, readStringResults.length, - currentValue); - } - - public VectorMapJoinFastStringCommon(boolean isOuterJoin) { - this.isOuterJoin = isOuterJoin; - PrimitiveTypeInfo[] primitiveTypeInfos = { TypeInfoFactory.stringTypeInfo }; - keyBinarySortableDeserializeRead = new BinarySortableDeserializeRead(primitiveTypeInfos); - readStringResults = keyBinarySortableDeserializeRead.createReadStringResults(); - } -} \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastStringHashMap.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastStringHashMap.java index 6f181b2..a067408 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastStringHashMap.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastStringHashMap.java @@ -18,27 +18,32 @@ package org.apache.hadoop.hive.ql.exec.vector.mapjoin.fast; -import java.io.IOException; - -import org.apache.hadoop.hive.ql.metadata.HiveException; -import org.apache.hadoop.io.BytesWritable; +import org.apache.hadoop.hive.serde2.SerDeException; +import org.apache.hadoop.hive.serde2.fast.DeserializeRead.ReadStringResults; /* * An single byte array value hash map optimized for vector map join. 
*/ public class VectorMapJoinFastStringHashMap extends VectorMapJoinFastBytesHashMap { - private VectorMapJoinFastStringCommon stringCommon; - @Override - public void putRow(BytesWritable currentKey, BytesWritable currentValue) throws HiveException, IOException { - stringCommon.adaptPutRow(this, currentKey, currentValue); + public void put(KeyValuePut keyValuePut) throws SerDeException { + + VectorMapJoinFastKeyValuePut internalKeyValuePut = ((VectorMapJoinFastKeyValuePut) keyValuePut); + if (internalKeyValuePut.isNull()) { + return; + } + + ReadStringResults readStringResults = internalKeyValuePut.getStringReadStringResults(); + add(readStringResults.bytes, readStringResults.start, readStringResults.length, + internalKeyValuePut.getKeyHashCode(), + internalKeyValuePut.getValueBytesWritable()); } public VectorMapJoinFastStringHashMap( + VectorMapJoinFastHashTableFactory mapJoinHashTableFactory, boolean isOuterJoin, - int initialCapacity, float loadFactor, int writeBuffersSize) { - super(initialCapacity, loadFactor, writeBuffersSize); - stringCommon = new VectorMapJoinFastStringCommon(isOuterJoin); + int initialCapacity, float loadFactor, int writeBuffersSize, long memUsage) { + super(mapJoinHashTableFactory, initialCapacity, loadFactor, writeBuffersSize, memUsage); } } \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastStringHashMultiSet.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastStringHashMultiSet.java index 9653b71..502af43 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastStringHashMultiSet.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastStringHashMultiSet.java @@ -18,27 +18,32 @@ package org.apache.hadoop.hive.ql.exec.vector.mapjoin.fast; -import java.io.IOException; - -import org.apache.hadoop.hive.ql.metadata.HiveException; -import org.apache.hadoop.io.BytesWritable; +import org.apache.hadoop.hive.serde2.SerDeException; +import org.apache.hadoop.hive.serde2.fast.DeserializeRead.ReadStringResults; /* * An single byte array value hash map optimized for vector map join. 
*/ public class VectorMapJoinFastStringHashMultiSet extends VectorMapJoinFastBytesHashMultiSet { - private VectorMapJoinFastStringCommon stringCommon; - @Override - public void putRow(BytesWritable currentKey, BytesWritable currentValue) throws HiveException, IOException { - stringCommon.adaptPutRow(this, currentKey, currentValue); + public void put(KeyValuePut keyValuePut) throws SerDeException { + + VectorMapJoinFastKeyValuePut internalKeyValuePut = ((VectorMapJoinFastKeyValuePut) keyValuePut); + if (internalKeyValuePut.isNull()) { + return; + } + + ReadStringResults readStringResults = internalKeyValuePut.getStringReadStringResults(); + add(readStringResults.bytes, readStringResults.start, readStringResults.length, + internalKeyValuePut.getKeyHashCode(), + internalKeyValuePut.getValueBytesWritable()); } public VectorMapJoinFastStringHashMultiSet( + VectorMapJoinFastHashTableFactory mapJoinHashTableFactory, boolean isOuterJoin, - int initialCapacity, float loadFactor, int writeBuffersSize) { - super(initialCapacity, loadFactor, writeBuffersSize); - stringCommon = new VectorMapJoinFastStringCommon(isOuterJoin); + int initialCapacity, float loadFactor, int writeBuffersSize, long memUsage) { + super(mapJoinHashTableFactory, initialCapacity, loadFactor, writeBuffersSize, memUsage); } } \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastStringHashSet.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastStringHashSet.java index 6419a0b..25b33e2 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastStringHashSet.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastStringHashSet.java @@ -18,27 +18,32 @@ package org.apache.hadoop.hive.ql.exec.vector.mapjoin.fast; -import java.io.IOException; - -import org.apache.hadoop.hive.ql.metadata.HiveException; -import org.apache.hadoop.io.BytesWritable; +import org.apache.hadoop.hive.serde2.SerDeException; +import org.apache.hadoop.hive.serde2.fast.DeserializeRead.ReadStringResults; /* * An single byte array value hash map optimized for vector map join. 
*/ public class VectorMapJoinFastStringHashSet extends VectorMapJoinFastBytesHashSet { - private VectorMapJoinFastStringCommon stringCommon; - @Override - public void putRow(BytesWritable currentKey, BytesWritable currentValue) throws HiveException, IOException { - stringCommon.adaptPutRow(this, currentKey, currentValue); + public void put(KeyValuePut keyValuePut) throws SerDeException { + + VectorMapJoinFastKeyValuePut internalKeyValuePut = ((VectorMapJoinFastKeyValuePut) keyValuePut); + if (internalKeyValuePut.isNull()) { + return; + } + + ReadStringResults readStringResults = internalKeyValuePut.getStringReadStringResults(); + add(readStringResults.bytes, readStringResults.start, readStringResults.length, + internalKeyValuePut.getKeyHashCode(), + internalKeyValuePut.getValueBytesWritable()); } public VectorMapJoinFastStringHashSet( + VectorMapJoinFastHashTableFactory mapJoinHashTableFactory, boolean isOuterJoin, - int initialCapacity, float loadFactor, int writeBuffersSize) { - super(initialCapacity, loadFactor, writeBuffersSize); - stringCommon = new VectorMapJoinFastStringCommon(isOuterJoin); + int initialCapacity, float loadFactor, int writeBuffersSize, long memUsage) { + super(mapJoinHashTableFactory, initialCapacity, loadFactor, writeBuffersSize, memUsage); } } \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastTableContainer.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastTableContainer.java deleted file mode 100644 index 3b73f7d..0000000 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastTableContainer.java +++ /dev/null @@ -1,233 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.hadoop.hive.ql.exec.vector.mapjoin.fast; - -import java.io.IOException; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.hive.conf.HiveConf; -import org.apache.hadoop.hive.ql.exec.persistence.HashMapWrapper; -import org.apache.hadoop.hive.ql.exec.persistence.MapJoinKey; -import org.apache.hadoop.hive.ql.exec.persistence.MapJoinObjectSerDeContext; -import org.apache.hadoop.hive.ql.exec.tez.HashTableLoader; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashTable; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinTableContainer; -import org.apache.hadoop.hive.ql.metadata.HiveException; -import org.apache.hadoop.hive.ql.plan.MapJoinDesc; -import org.apache.hadoop.hive.ql.plan.VectorMapJoinDesc; -import org.apache.hadoop.hive.ql.plan.VectorMapJoinDesc.HashTableImplementationType; -import org.apache.hadoop.hive.ql.plan.VectorMapJoinDesc.HashTableKeyType; -import org.apache.hadoop.hive.ql.plan.VectorMapJoinDesc.HashTableKind; -import org.apache.hadoop.hive.serde2.SerDeException; -import org.apache.hadoop.io.BytesWritable; -import org.apache.hadoop.io.Writable; -import org.apache.tez.runtime.library.api.KeyValueReader; - -/** - * HashTableLoader for Tez constructs the hashtable from records read from - * a broadcast edge. - */ -public class VectorMapJoinFastTableContainer implements VectorMapJoinTableContainer { - - private static final Logger LOG = LoggerFactory.getLogger(HashTableLoader.class.getName()); - - private final MapJoinDesc desc; - private final Configuration hconf; - - private final float keyCountAdj; - private final int threshold; - private final float loadFactor; - private final int wbSize; - private final long keyCount; - - - private final VectorMapJoinFastHashTable VectorMapJoinFastHashTable; - - public VectorMapJoinFastTableContainer(MapJoinDesc desc, Configuration hconf, - long keyCount) throws SerDeException { - - this.desc = desc; - this.hconf = hconf; - - keyCountAdj = HiveConf.getFloatVar(hconf, HiveConf.ConfVars.HIVEHASHTABLEKEYCOUNTADJUSTMENT); - threshold = HiveConf.getIntVar(hconf, HiveConf.ConfVars.HIVEHASHTABLETHRESHOLD); - loadFactor = HiveConf.getFloatVar(hconf, HiveConf.ConfVars.HIVEHASHTABLELOADFACTOR); - wbSize = HiveConf.getIntVar(hconf, HiveConf.ConfVars.HIVEHASHTABLEWBSIZE); - - this.keyCount = keyCount; - - // LOG.info("VectorMapJoinFastTableContainer load keyCountAdj " + keyCountAdj); - // LOG.info("VectorMapJoinFastTableContainer load threshold " + threshold); - // LOG.info("VectorMapJoinFastTableContainer load loadFactor " + loadFactor); - // LOG.info("VectorMapJoinFastTableContainer load wbSize " + wbSize); - - int newThreshold = HashMapWrapper.calculateTableSize( - keyCountAdj, threshold, loadFactor, keyCount); - - // LOG.debug("VectorMapJoinFastTableContainer load newThreshold " + newThreshold); - - VectorMapJoinFastHashTable = createHashTable(newThreshold); - } - - @Override - public VectorMapJoinHashTable vectorMapJoinHashTable() { - return VectorMapJoinFastHashTable; - } - - private VectorMapJoinFastHashTable createHashTable(int newThreshold) { - - boolean isOuterJoin = !desc.isNoOuterJoin(); - VectorMapJoinDesc vectorDesc = desc.getVectorDesc(); - HashTableImplementationType hashTableImplementationType = vectorDesc.hashTableImplementationType(); - HashTableKind hashTableKind = vectorDesc.hashTableKind(); - HashTableKeyType hashTableKeyType = vectorDesc.hashTableKeyType(); - boolean 
minMaxEnabled = vectorDesc.minMaxEnabled(); - - int writeBufferSize = HiveConf.getIntVar(hconf, HiveConf.ConfVars.HIVEHASHTABLEWBSIZE); - - VectorMapJoinFastHashTable hashTable = null; - - switch (hashTableKeyType) { - case BOOLEAN: - case BYTE: - case SHORT: - case INT: - case LONG: - switch (hashTableKind) { - case HASH_MAP: - hashTable = new VectorMapJoinFastLongHashMap( - minMaxEnabled, isOuterJoin, hashTableKeyType, - newThreshold, loadFactor, writeBufferSize); - break; - case HASH_MULTISET: - hashTable = new VectorMapJoinFastLongHashMultiSet( - minMaxEnabled, isOuterJoin, hashTableKeyType, - newThreshold, loadFactor, writeBufferSize); - break; - case HASH_SET: - hashTable = new VectorMapJoinFastLongHashSet( - minMaxEnabled, isOuterJoin, hashTableKeyType, - newThreshold, loadFactor, writeBufferSize); - break; - } - break; - - case STRING: - switch (hashTableKind) { - case HASH_MAP: - hashTable = new VectorMapJoinFastStringHashMap( - isOuterJoin, - newThreshold, loadFactor, writeBufferSize); - break; - case HASH_MULTISET: - hashTable = new VectorMapJoinFastStringHashMultiSet( - isOuterJoin, - newThreshold, loadFactor, writeBufferSize); - break; - case HASH_SET: - hashTable = new VectorMapJoinFastStringHashSet( - isOuterJoin, - newThreshold, loadFactor, writeBufferSize); - break; - } - break; - - case MULTI_KEY: - switch (hashTableKind) { - case HASH_MAP: - hashTable = new VectorMapJoinFastMultiKeyHashMap( - isOuterJoin, - newThreshold, loadFactor, writeBufferSize); - break; - case HASH_MULTISET: - hashTable = new VectorMapJoinFastMultiKeyHashMultiSet( - isOuterJoin, - newThreshold, loadFactor, writeBufferSize); - break; - case HASH_SET: - hashTable = new VectorMapJoinFastMultiKeyHashSet( - isOuterJoin, - newThreshold, loadFactor, writeBufferSize); - break; - } - break; - } - - return hashTable; - } - - @Override - public MapJoinKey putRow(Writable currentKey, Writable currentValue) - throws SerDeException, HiveException, IOException { - - // We are not using the key and value contexts, nor do we support a MapJoinKey. - VectorMapJoinFastHashTable.putRow((BytesWritable) currentKey, (BytesWritable) currentValue); - return null; - } - - @Override - public void seal() { - // Do nothing - } - - @Override - public ReusableGetAdaptor createGetter(MapJoinKey keyTypeFromLoader) { - throw new RuntimeException("Not applicable"); - } - - @Override - public void clear() { - // Do nothing - } - - @Override - public MapJoinKey getAnyKey() { - throw new RuntimeException("Not applicable"); - } - - @Override - public void dumpMetrics() { - // TODO - } - - @Override - public boolean hasSpill() { - return false; - } - - @Override - public int size() { - return VectorMapJoinFastHashTable.size(); - } - - @Override - public void setSerde(MapJoinObjectSerDeContext keyCtx, MapJoinObjectSerDeContext valCtx) - throws SerDeException { - // Do nothing in this case. 
- - } - - /* - @Override - public com.esotericsoftware.kryo.io.Output getHybridBigTableSpillOutput(int partitionId) { - throw new RuntimeException("Not applicable"); - } - */ -} diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastValueStore.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastValueStore.java index 570a747..888ac72 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastValueStore.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastValueStore.java @@ -20,141 +20,197 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashMapResult; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashMapResult; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableResultImpl; import org.apache.hadoop.hive.serde2.WriteBuffers; import org.apache.hadoop.hive.serde2.WriteBuffers.ByteSegmentRef; -import org.apache.hadoop.hive.serde2.WriteBuffers.Position;; +import org.apache.hadoop.hive.serde2.WriteBuffers.Position; +import com.google.common.base.Preconditions; -// Supports random access. - +/* + * Used by VectorMapJoinFastLongHashMap to store the values for a hash map. (The long key is + * stored in the hash table slot array). + */ public class VectorMapJoinFastValueStore { - private static final Logger LOG = LoggerFactory.getLogger(VectorMapJoinFastValueStore.class.getName()); + private static final String CLASS_NAME = VectorMapJoinFastValueStore.class.getName(); + private static final Logger LOG = LoggerFactory.getLogger(CLASS_NAME); private WriteBuffers writeBuffers; + private WriteBuffers.Position readPos; /** - * A store for "lists" of arbitrary length values in memory. + * A store for a list of arbitrary length values in memory. * * The memory is a "infinite" byte array or WriteBuffers object. * - * We give the client a 64-bit (long) value reference to keep that has the offset within - * the "infinite" byte array of the last value inserted in a "list". + * We give the client (i.e. hash table) a 64-bit value reference to keep that has the offset + * within the "infinite" byte array of the last value inserted in a list. + * + * We optimize the common case when the the value is short, and the value list + * has 1 element and store that information in the 64 bits. * - * We optimize the common case when "list"s are 1 element and values are short and store the - * value length in the value reference word. + * We also support keeping the value count (up to a limit or cap) so help with join result + * generation algorithms. * - * We also support keeping a count (up to a limit or cap) so help with join result generation - * algorithms. + * When there are more than 1 value, the zero padding is overwritten with a relative offset to + * the next value. The next value always includes the value length. * - * If the last value is big, the big length will be encoded as an integer at the beginning - * of the value followed by the big value bytes. + * Cases: + * 1) One element when key and value lengths are small (and stored in reference words): * - * Due to optimizing by keeping the last value's length in the value reference, when we have - * more than one value, a new value will need to keep the small value length of the next - * value. 
+ * Value Reference + * | + * | absoluteValueOffset + * | + * v + * <5 0's Padding for Next Rel Offset> + * NEXT (NONE) VALUE * - * So, values after the first value have 4 parts: a relative offset word with flags, an - * optional length if the current value is big, an optional next value length if it is small, - * and the value bytes. + * 2) One element, general: + * + * Value Reference + * | + * | absoluteValueOffset + * | + * v + * <5 0's Padding for Next Rel Offset> [Big Value Len] + * | NEXT VALUE + * | + * | first record absolute offset + relative offset + * | + * -------- + * | + * v + * <5 0's Padding for Next Value Ref> + * NEXT (NONE) VALUE * - * Value Reference -------------- - * | - * | - * v - * {Big Value Len} {Big Value Bytes} + * 4) Three elements showing how first record updated to point to new value and + * new value points to most recent (additional) value: * - * 1) Multiple elements: + * Value Reference + * | + * | absoluteValueOffset + * | + * v + * + * | NEXT VALUE + * | + * | first record absolute offset + relative offset + * | + * | + * | <5 0's Padding for Next Value Ref> + * | ^ NEXT (NONE) VALUE + * | | + * | ------ + * | | + * | | new record absolute offset - (minus) relative offset + * | | + * -----> + * NEXT VALUE * - * Value Reference ---------------- - * | - * | // Last value added. - * | - * v - * {Rel Offset Word} [Big Value Len] [Next Value Small Len] {Value Bytes} - * | optional optional - * | - * | - * --- . . . --- - * | - * | // 0 or more - * | - * v - * {Rel Offset Word} [Big Value Len] [Next Value Small Len] {Value Bytes} - * | optional optional - * | - * | - * -------------------- - * | - * | - * v - * [Big Value Length] {Value Bytes} - * optional * - * // First value added without Relative Offset Word, etc. + * 5) Four elements showing how first record is again updated to point to new value and + * new value points to most recent (additional) value: + * + * Value Reference + * | + * | absoluteValueOffset + * | + * v + * + * | NEXT VALUE + * | + * | first record absolute offset + relative offset + * | + * | + * | <5 0's Padding for Next Value Ref> + * | ^ NEXT (NONE) VALUE + * | | + * | ------ + * | | record absolute offset - (minus) relative offset + * | | + * | + * | ^ NEXT VALUE + * | | + * | ------ + * | | + * | | new record absolute offset - (minus) relative offset + * | | + * -----> + * NEXT VALUE + * + * You get the idea. */ - public WriteBuffers writeBuffers() { return writeBuffers; } - public static class HashMapResult extends VectorMapJoinHashMapResult { + /** + * A hash map result that can read values stored by the value store, one-by-one. + * + * It implements the standard map join hash map result interface. 
+ * + */ + public static class HashMapResult extends MapJoinHashTableResultImpl + implements MapJoinHashMapResult { private VectorMapJoinFastValueStore valueStore; private boolean hasRows; - private long valueRefWord; private boolean isSingleRow; private int cappedCount; - private boolean haveReadCurrent; - private int readIndex; - private boolean isEof; + private long absoluteValueOffset; + private int firstValueLength; + private boolean isFirstValueLengthSmall; + private int readIndex; private boolean isNextEof; - private boolean isNextLast; + long nextAbsoluteValueOffset; - boolean isNextValueLengthSmall; - int nextSmallValueLength; private ByteSegmentRef byteSegmentRef; private Position readPos; public HashMapResult() { super(); - valueRefWord = -1; hasRows = false; byteSegmentRef = new ByteSegmentRef(); readPos = new Position(); } - public void set(VectorMapJoinFastValueStore valueStore, long valueRefWord) { - // LOG.debug("VectorMapJoinFastValueStore set valueRefWord " + Long.toHexString(valueRefWord)); + public void setMatch(VectorMapJoinFastValueStore valueStore, long valueRefWord) { this.valueStore = valueStore; - this.valueRefWord = valueRefWord; hasRows = true; - isSingleRow = ((valueRefWord & IsLastFlag.flagOnMask) != 0); - cappedCount = - (int) ((valueRefWord & CappedCount.bitMask) >> CappedCount.bitShift); + cappedCount = ValueRef.getCappedCount(valueRefWord); + isSingleRow = (cappedCount == 1); + + absoluteValueOffset = ValueRef.getAbsoluteValueOffset(valueRefWord); + + firstValueLength = ValueRef.getSmallValueLength(valueRefWord); + isFirstValueLengthSmall = + (firstValueLength != ValueRef.SmallValueLength.allBitsOn); + // Position to beginning. - haveReadCurrent = false; readIndex = 0; - isEof = false; + isNextEof = false; + mapJoinResult = MapJoinResult.MATCH; } @Override @@ -192,16 +248,15 @@ public ByteSegmentRef first() { } // Position to beginning. - haveReadCurrent = false; readIndex = 0; - isEof = false; + isNextEof = false; return internalRead(); } @Override public ByteSegmentRef next() { - if (!hasRows) { + if (!hasRows || isNextEof) { return null; } @@ -211,111 +266,60 @@ public ByteSegmentRef next() { public ByteSegmentRef internalRead() { - long absoluteValueOffset; - - int valueLength; + int nextValueLength; if (readIndex == 0) { - /* - * Extract information from reference word from slot table. - */ - absoluteValueOffset = - (valueRefWord & AbsoluteValueOffset.bitMask); - - // Position before the last written value. - valueStore.writeBuffers.setReadPoint(absoluteValueOffset, readPos); - if (isSingleRow) { - isNextEof = true; - valueLength = - (int) ((valueRefWord & SmallValueLength.bitMask) >> SmallValueLength.bitShift); - boolean isValueLengthSmall = (valueLength != SmallValueLength.allBitsOn); - if (!isValueLengthSmall) { - // And, if current value is big we must read it. - valueLength = valueStore.writeBuffers.readVInt(readPos); - } + // Skip next relative value offset + valueStore.writeBuffers.setReadPoint( + absoluteValueOffset + RelativeOffset.byteLength, readPos); + isNextEof = true; + nextAbsoluteValueOffset = -1; } else { - isNextEof = false; - - // 2nd and beyond records have a relative offset word at the beginning. 
- long relativeOffsetWord = valueStore.writeBuffers.readVLong(readPos); - long relativeOffset = - (relativeOffsetWord & NextRelativeValueOffset.bitMask) >> NextRelativeValueOffset.bitShift; - - nextAbsoluteValueOffset = absoluteValueOffset - relativeOffset; - - isNextLast = ((relativeOffsetWord & IsNextValueLastFlag.flagOnMask) != 0); - isNextValueLengthSmall = - ((relativeOffsetWord & IsNextValueLengthSmallFlag.flagOnMask) != 0); - } + // Read the next relative offset at beginning. + valueStore.writeBuffers.setReadPoint(absoluteValueOffset, readPos); + long relativeNextValueOffset = + valueStore.writeBuffers.readNByteLong(ValueRef.AbsoluteValueOffset.byteLength, readPos); + Preconditions.checkState(relativeNextValueOffset != 0); + isNextEof = false; - valueLength = - (int) ((valueRefWord & SmallValueLength.bitMask) >> SmallValueLength.bitShift); - boolean isValueLengthSmall = (valueLength != SmallValueLength.allBitsOn); - if (!isValueLengthSmall) { - // And, if current value is big we must read it. - valueLength = valueStore.writeBuffers.readVInt(readPos); + // Use positive relative offset from first record to last inserted value record. + nextAbsoluteValueOffset = absoluteValueOffset + relativeNextValueOffset; } - - // 2nd and beyond have the next value's small length in the current record. - if (isNextValueLengthSmall) { - nextSmallValueLength = valueStore.writeBuffers.readVInt(readPos); + if (isFirstValueLengthSmall) { + nextValueLength = firstValueLength; } else { - nextSmallValueLength = -1; + nextValueLength = valueStore.writeBuffers.readVInt(readPos); } - } else { - if (isNextEof) { - return null; - } - - absoluteValueOffset = nextAbsoluteValueOffset; - // Position before the last written value. - valueStore.writeBuffers.setReadPoint(absoluteValueOffset, readPos); + // Position to the next value record. + Preconditions.checkState(nextAbsoluteValueOffset >= 0); + valueStore.writeBuffers.setReadPoint(nextAbsoluteValueOffset, readPos); - if (isNextLast) { + // Read the next relative offset. + long relativeNextValueOffset = + valueStore.writeBuffers.readNByteLong( + RelativeOffset.byteLength, readPos); + if (relativeNextValueOffset == 0) { isNextEof = true; - - if (isNextValueLengthSmall) { - valueLength = nextSmallValueLength; - } else { - valueLength = (int) valueStore.writeBuffers.readVLong(readPos); - } + nextAbsoluteValueOffset = -1; } else { isNextEof = false; - // 2nd and beyond records have a relative offset word at the beginning. - long relativeOffsetWord = valueStore.writeBuffers.readVLong(readPos); - - // Read current value's big length now, if necessary. - if (isNextValueLengthSmall) { - valueLength = nextSmallValueLength; - } else { - valueLength = (int) valueStore.writeBuffers.readVLong(readPos); - } - - long relativeOffset = - (relativeOffsetWord & NextRelativeValueOffset.bitMask) >> NextRelativeValueOffset.bitShift; - - nextAbsoluteValueOffset = absoluteValueOffset - relativeOffset; - - isNextLast = ((relativeOffsetWord & IsNextValueLastFlag.flagOnMask) != 0); - isNextValueLengthSmall = - ((relativeOffsetWord & IsNextValueLengthSmallFlag.flagOnMask) != 0); - if (isNextValueLengthSmall) { - // TODO: Write readVInt - nextSmallValueLength = (int) valueStore.writeBuffers.readVLong(readPos); - } else { - nextSmallValueLength = -1; - } + // The way we insert causes our chain to backwards from the last inserted value record... 
+ nextAbsoluteValueOffset = nextAbsoluteValueOffset - relativeNextValueOffset; + Preconditions.checkState(nextAbsoluteValueOffset >= 0); } + nextValueLength = valueStore.writeBuffers.readVInt(readPos); + + // Now positioned to the value. } - // Our reading is positioned to the value. - valueStore.writeBuffers.getByteSegmentRefToCurrent(byteSegmentRef, valueLength, readPos); + // Capture a ByteSegmentRef to the current value position and length. + valueStore.writeBuffers.getByteSegmentRefToCurrent(byteSegmentRef, nextValueLength, readPos); readIndex++; return byteSegmentRef; @@ -326,13 +330,18 @@ public boolean isEof() { if (!hasRows) { return true; } - return isEof; + return isNextEof; } @Override - public void forget() { + public boolean isAliasFilterAvailable() { + return false; } + @Override + public byte aliasFilter() { + return 0; + } @Override public String toString() { @@ -357,201 +366,150 @@ public String toString() { * * Last field: an bit indicating whether there is only one value. */ + private final static class ValueRef { - // Lowest field. - private final class AbsoluteValueOffset { - private static final int bitLength = 40; - private static final long allBitsOn = (1L << bitLength) - 1; - private static final long bitMask = allBitsOn; + // Lowest field. + private final class AbsoluteValueOffset { + private static final int bitLength = 40; + private static final int byteLength = (bitLength + Byte.SIZE -1) / Byte.SIZE; + private static final long allBitsOn = (1L << bitLength) - 1; + private static final long bitMask = allBitsOn; - // Make it a power of 2. - private static final long maxSize = 1L << (bitLength - 2); - } + // Make it a power of 2. + private static final long maxSize = 1L << (bitLength - 2); + } - private final class SmallValueLength { - private static final int bitLength = 10; - private static final int allBitsOn = (1 << bitLength) - 1; - private static final int threshold = allBitsOn; // Lower this for big value testing. - private static final int bitShift = AbsoluteValueOffset.bitLength; - private static final long bitMask = ((long) allBitsOn) << bitShift; - private static final long allBitsOnBitShifted = ((long) allBitsOn) << bitShift; - } + public static long getAbsoluteValueOffset(long valueRef) { + return (valueRef & AbsoluteValueOffset.bitMask); + } - private final class CappedCount { - private static final int bitLength = 10; - private static final int allBitsOn = (1 << bitLength) - 1; - private static final int limit = allBitsOn; - private static final int bitShift = SmallValueLength.bitShift + SmallValueLength.bitLength; - private static final long bitMask = ((long) allBitsOn) << bitShift; - } + private final class SmallValueLength { + private static final int bitLength = 10; + private static final int allBitsOn = (1 << bitLength) - 1; + private static final int threshold = allBitsOn; // Lower this for big value testing. + private static final int bitShift = AbsoluteValueOffset.bitLength; + private static final long bitMask = ((long) allBitsOn) << bitShift; + private static final long allBitsOnBitShifted = ((long) allBitsOn) << bitShift; + } - private final class IsLastFlag { - private static final int bitShift = CappedCount.bitShift + CappedCount.bitLength;; - private static final long flagOnMask = 1L << bitShift; - } + public static int getSmallValueLength(long valueRef) { + return (int) ((valueRef & SmallValueLength.bitMask) >> SmallValueLength.bitShift); + } - // This bit should not be on for valid value references. We use -1 for a no value marker. 
- private final class IsInvalidFlag { - private static final int bitShift = 63; - private static final long flagOnMask = 1L << bitShift; - } + private final class CappedCount { + private static final int bitLength = 10; + private static final int allBitsOn = (1 << bitLength) - 1; + private static final int limit = allBitsOn; + private static final int bitShift = SmallValueLength.bitShift + SmallValueLength.bitLength; + private static final long bitMask = ((long) allBitsOn) << bitShift; + } - /** - * Relative Offset Word stored at the beginning of all but the last value that has a - * relative offset and 2 flags. - * - * We put the flags at the low end of the word so the variable length integer will - * encode smaller. - * - * First bit is a flag indicating if the next value (not the current value) has a small length. - * When the first value is added and it has a small length, that length is stored in the - * value reference and not with the value. So, when we have multiple values, we need a way to - * know to keep the next value's small length with the current value. - * - * Second bit is a flag indicating if the next value (not the current value) is the last value. - * - * The relative offset *backwards* to the next value. - */ + public static int getCappedCount(long part1Word) { + return (int) ((part1Word & CappedCount.bitMask) >> CappedCount.bitShift); + } - private final class IsNextValueLengthSmallFlag { - private static final int bitLength = 1; - private static final long flagOnMask = 1L; - } + // This bit should not be on for valid value references. We use -1 for a no value marker. + private final class IsInvalidFlag { + private static final int bitShift = 63; + private static final long flagOnMask = 1L << bitShift; + } - private final class IsNextValueLastFlag { - private static final int bitLength = 1; - private static final int bitShift = IsNextValueLengthSmallFlag.bitLength; - private static final long flagOnMask = 1L << bitShift; + public static boolean getIsInvalidFlag(long valueRef) { + return (valueRef & IsInvalidFlag.flagOnMask) != 0; + } } - private final class NextRelativeValueOffset { - private static final int bitLength = 40; - private static final long allBitsOn = (1L << bitLength) - 1; - private static final int bitShift = IsNextValueLastFlag.bitShift + IsNextValueLastFlag.bitLength; - private static final long bitMask = allBitsOn << bitShift; + private final static class RelativeOffset { + private static final int byteLength = ValueRef.AbsoluteValueOffset.byteLength; + + // Relative offset zero padding. + private static final byte[] zeroPadding = new byte[] { 0,0,0,0,0 }; } public long addFirst(byte[] valueBytes, int valueStart, int valueLength) { - // First value is written without: next relative offset, next value length, is next value last - // flag, is next value length small flag, etc. - - /* - * We build up the Value Reference Word we will return that will be kept by the caller. - */ - - long valueRefWord = IsLastFlag.flagOnMask; - - valueRefWord |= ((long) 1 << CappedCount.bitShift); - - long newAbsoluteOffset; - if (valueLength < SmallValueLength.threshold) { - - // Small case: Just write the value bytes only. - - if (valueLength == 0) { - // We don't write a first empty value. - // Get an offset to reduce the relative offset later if there are more than 1 value. 
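Aside for readers of this patch (not part of the change itself): the new ValueRef nested classes above pack three fields into one 64-bit reference word -- a 40-bit absolute value offset, a 10-bit small-value length (all bits on meaning "big value, length written inline"), and a 10-bit capped count -- leaving bit 63 free as the invalid-reference flag. A minimal standalone sketch of that packing, using the same field widths but illustrative class and method names, might look like:

// Illustrative sketch only; field widths follow the ValueRef constants above.
public class ValueRefPackingSketch {
  static final int  OFFSET_BITS      = 40;
  static final long OFFSET_MASK      = (1L << OFFSET_BITS) - 1;
  static final int  SMALL_LEN_BITS   = 10;
  static final int  SMALL_LEN_ALL_ON = (1 << SMALL_LEN_BITS) - 1;   // marker for "big" values
  static final int  SMALL_LEN_SHIFT  = OFFSET_BITS;
  static final int  COUNT_BITS       = 10;
  static final int  COUNT_LIMIT      = (1 << COUNT_BITS) - 1;
  static final int  COUNT_SHIFT      = SMALL_LEN_SHIFT + SMALL_LEN_BITS;

  static long pack(long absoluteValueOffset, int valueLength, int cappedCount) {
    long ref = absoluteValueOffset & OFFSET_MASK;
    // Lengths at or above the all-bits-on threshold are flagged "big"; their real length is written inline.
    int smallLen = valueLength < SMALL_LEN_ALL_ON ? valueLength : SMALL_LEN_ALL_ON;
    ref |= ((long) smallLen) << SMALL_LEN_SHIFT;
    ref |= ((long) Math.min(cappedCount, COUNT_LIMIT)) << COUNT_SHIFT;
    return ref;
  }

  static long absoluteValueOffset(long ref) { return ref & OFFSET_MASK; }
  static int  smallValueLength(long ref)    { return (int) ((ref >>> SMALL_LEN_SHIFT) & SMALL_LEN_ALL_ON); }
  static int  cappedCount(long ref)         { return (int) ((ref >>> COUNT_SHIFT) & COUNT_LIMIT); }
}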
- newAbsoluteOffset = writeBuffers.getWritePoint(); - } else { - newAbsoluteOffset = writeBuffers.getWritePoint(); - writeBuffers.write(valueBytes, valueStart, valueLength); - } - - // The caller remembers the small value length. - valueRefWord |= ((long) valueLength) << SmallValueLength.bitShift; - } else { + long absoluteValueOffset = writeBuffers.getWritePoint(); + Preconditions.checkState(absoluteValueOffset >= 0); - // Big case: write the length as a VInt and then the value bytes. - - newAbsoluteOffset = writeBuffers.getWritePoint(); + // Zero pad out bytes for fixed size next relative offset. + writeBuffers.write(RelativeOffset.zeroPadding); + boolean isValueLengthBig = (valueLength >= ValueRef.SmallValueLength.threshold); + if (isValueLengthBig) { writeBuffers.writeVInt(valueLength); - writeBuffers.write(valueBytes, valueStart, valueLength); - - // Use magic length value to indicate big. - valueRefWord |= SmallValueLength.allBitsOnBitShifted; } + writeBuffers.write(valueBytes, valueStart, valueLength); - // LOG.debug("VectorMapJoinFastValueStore addFirst valueLength " + valueLength + " newAbsoluteOffset " + newAbsoluteOffset + " valueRefWord " + Long.toHexString(valueRefWord)); + long valueRefWord = absoluteValueOffset; + valueRefWord |= ((long) 1 << ValueRef.CappedCount.bitShift); - // The lower bits are the absolute value offset. - valueRefWord |= newAbsoluteOffset; + if (isValueLengthBig) { + valueRefWord |= ValueRef.SmallValueLength.allBitsOnBitShifted; + } else { + valueRefWord |= ((long) valueLength) << ValueRef.SmallValueLength.bitShift; + } return valueRefWord; } - public long addMore(long oldValueRef, byte[] valueBytes, int valueStart, int valueLength) { + public long addMore(long valueRefWord, byte[] valueBytes, int valueStart, int valueLength) { + + Preconditions.checkState(!ValueRef.getIsInvalidFlag(valueRefWord)); - if ((oldValueRef & IsInvalidFlag.flagOnMask) != 0) { - throw new RuntimeException("Invalid optimized hash table reference"); - } /* * Extract information about the old value. */ - long oldAbsoluteValueOffset = - (oldValueRef & AbsoluteValueOffset.bitMask); - int oldSmallValueLength = - (int) ((oldValueRef & SmallValueLength.bitMask) >> SmallValueLength.bitShift); - boolean isOldValueLengthSmall = (oldSmallValueLength != SmallValueLength.allBitsOn); - int oldCappedCount = - (int) ((oldValueRef & CappedCount.bitMask) >> CappedCount.bitShift); - boolean isOldValueLast = - ((oldValueRef & IsLastFlag.flagOnMask) != 0); - - // LOG.debug("VectorMapJoinFastValueStore addMore isOldValueLast " + isOldValueLast + " oldSmallValueLength " + oldSmallValueLength + " oldAbsoluteValueOffset " + oldAbsoluteValueOffset + " oldValueRef " + Long.toHexString(oldValueRef)); + int oldCappedCount = ValueRef.getCappedCount(valueRefWord); - /* - * Write information about the old value (which becomes our next) at the beginning - * of our new value. 
- */ - long newAbsoluteOffset = writeBuffers.getWritePoint(); + long absoluteValueOffset = ValueRef.getAbsoluteValueOffset(valueRefWord); - long relativeOffsetWord = 0; - if (isOldValueLengthSmall) { - relativeOffsetWord |= IsNextValueLengthSmallFlag.flagOnMask; - } - if (isOldValueLast) { - relativeOffsetWord |= IsNextValueLastFlag.flagOnMask; - } - int newCappedCount = oldCappedCount; - if (newCappedCount < CappedCount.limit) { - newCappedCount++; - } - long relativeOffset = newAbsoluteOffset - oldAbsoluteValueOffset; - relativeOffsetWord |= (relativeOffset << NextRelativeValueOffset.bitShift); + long nextAbsoluteValueOffset = writeBuffers.getWritePoint(); + Preconditions.checkState(nextAbsoluteValueOffset >= 0); - writeBuffers.writeVLong(relativeOffsetWord); + if (oldCappedCount == 1) { + // Write zeros to indicate no 3rd record. + writeBuffers.write(RelativeOffset.zeroPadding); + } else { - // When the next value is small it was not recorded with the old (i.e. next) value and we - // have to remember it. - if (isOldValueLengthSmall) { - writeBuffers.writeVInt(oldSmallValueLength); - } + // To insert next value record above count 2: - // Now, we have written all information about the next value, work on the *new* value. + // 1) Read next relative offset in first record (this is a positive relative offset) to + // last inserted value record. + long oldPrevRelativeValueOffset = + writeBuffers.readNByteLong( + absoluteValueOffset, RelativeOffset.byteLength, readPos); - long newValueRef = ((long) newCappedCount) << CappedCount.bitShift; - boolean isNewValueSmall = (valueLength < SmallValueLength.threshold); - if (!isNewValueSmall) { - // Use magic value to indicating we are writing the big value length. - newValueRef |= ((long) SmallValueLength.allBitsOn << SmallValueLength.bitShift); - writeBuffers.writeVInt(valueLength); - } else { - // Caller must remember small value length. - newValueRef |= ((long) valueLength) << SmallValueLength.bitShift; - } - writeBuffers.write(valueBytes, valueStart, valueLength); + // 2) Relative offset is positive from first record to last inserted value record. + long prevAbsoluteValueOffset = absoluteValueOffset + oldPrevRelativeValueOffset; - // The lower bits are the absolute value offset. - newValueRef |= newAbsoluteOffset; + // 3) Since previous record is before the new one, subtract because we store relative offsets + // as unsigned. + long newPrevRelativeValueOffset = nextAbsoluteValueOffset - prevAbsoluteValueOffset; + Preconditions.checkState(newPrevRelativeValueOffset >= 0); + writeBuffers.writeFiveByteULong(newPrevRelativeValueOffset); + } - // LOG.debug("VectorMapJoinFastValueStore addMore valueLength " + valueLength + " newAbsoluteOffset " + newAbsoluteOffset + " newValueRef " + Long.toHexString(newValueRef)); + writeBuffers.writeVInt(valueLength); + writeBuffers.write(valueBytes, valueStart, valueLength); - return newValueRef; + // Overwrite beginning of first record to insert new value. 
+ long newRelativeOffset = nextAbsoluteValueOffset - absoluteValueOffset; + Preconditions.checkState(newRelativeOffset >= 0); + writeBuffers.writeFiveByteULong(absoluteValueOffset, newRelativeOffset); + + // Update valueRefWord + if (oldCappedCount < ValueRef.CappedCount.limit) { + int newCappedCount = oldCappedCount + 1; + valueRefWord &= ~ValueRef.CappedCount.bitMask; + valueRefWord |= ((long) newCappedCount) << ValueRef.CappedCount.bitShift; + Preconditions.checkState(!ValueRef.getIsInvalidFlag(valueRefWord)); + } + return valueRefWord; } public VectorMapJoinFastValueStore(int writeBuffersSize) { - writeBuffers = new WriteBuffers(writeBuffersSize, AbsoluteValueOffset.maxSize); + writeBuffers = new WriteBuffers(writeBuffersSize, ValueRef.AbsoluteValueOffset.maxSize); + + readPos = new WriteBuffers.Position(); } } diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/hashtable/VectorMapJoinBytesHashMap.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/hashtable/VectorMapJoinBytesHashMap.java deleted file mode 100644 index 512db1b..0000000 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/hashtable/VectorMapJoinBytesHashMap.java +++ /dev/null @@ -1,51 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable; - -import java.io.IOException; - -import org.apache.hadoop.hive.ql.exec.JoinUtil; - -/* - * The interface for a single byte array key hash map lookup method. - */ -public interface VectorMapJoinBytesHashMap - extends VectorMapJoinBytesHashTable, VectorMapJoinHashMap { - - /* - * Lookup a byte array key in the hash map. - * - * @param keyBytes - * A byte array containing the key within a range. - * @param keyStart - * The offset the beginning of the key. - * @param keyLength - * The length of the key. - * @param hashMapResult - * The object to receive small table value(s) information on a MATCH. - * Or, for SPILL, it has information on where to spill the big table row. - * - * @return - * Whether the lookup was a match, no match, or spill (the partition with the key - * is currently spilled). 
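Aside for readers of this patch (not part of the change itself): in the VectorMapJoinFastValueStore changes above, each value record starts with a fixed 5-byte relative-offset header. The first record's header points forward to the most recently inserted record, every later record's header points backward to the record inserted before it, and a zero header ends the chain, so readers visit values in the order first, newest, ..., second. A rough standalone model of that bookkeeping, with array indexes standing in for WriteBuffers offsets and illustrative names throughout (the prior value count, which the patch keeps in the capped-count bits, is passed in explicitly here):

import java.util.ArrayList;
import java.util.List;

// Illustrative sketch only; models the chain built by addFirst()/addMore() above.
public class ValueChainSketch {

  static class Record {
    long nextRelative;   // 0 == no next record (the 5-byte zero padding in the patch)
    String value;
  }

  private final List<Record> store = new ArrayList<>();

  long addFirst(String value) {
    Record r = new Record();
    r.value = value;
    store.add(r);
    return store.size() - 1;                        // "absolute offset" of the first record
  }

  void addMore(long firstOffset, String value, int oldCount) {
    long newOffset = store.size();
    Record r = new Record();
    r.value = value;
    if (oldCount > 1) {
      // Point the new record backward at the previously newest record.
      long prevOffset = firstOffset + store.get((int) firstOffset).nextRelative;
      r.nextRelative = newOffset - prevOffset;
    }
    store.add(r);
    // Re-point the first record forward at the new record.
    store.get((int) firstOffset).nextRelative = newOffset - firstOffset;
  }

  List<String> readAll(long firstOffset) {
    List<String> out = new ArrayList<>();
    Record first = store.get((int) firstOffset);
    out.add(first.value);
    if (first.nextRelative == 0) {
      return out;                                   // single value
    }
    long at = firstOffset + first.nextRelative;     // forward hop to the newest record
    while (true) {
      Record r = store.get((int) at);
      out.add(r.value);
      if (r.nextRelative == 0) {
        break;                                      // reached the second-inserted record
      }
      at -= r.nextRelative;                         // backward hops from here on
    }
    return out;
  }
}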
- */ - JoinUtil.JoinResult lookup(byte[] keyBytes, int keyStart, int keyLength, - VectorMapJoinHashMapResult hashMapResult) throws IOException; - -} \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/hashtable/VectorMapJoinBytesHashMultiSet.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/hashtable/VectorMapJoinBytesHashMultiSet.java deleted file mode 100644 index 196403d..0000000 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/hashtable/VectorMapJoinBytesHashMultiSet.java +++ /dev/null @@ -1,51 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable; - -import java.io.IOException; - -import org.apache.hadoop.hive.ql.exec.JoinUtil; - -/* - * The interface for a single byte array key hash multi-set contains method. - */ -public interface VectorMapJoinBytesHashMultiSet - extends VectorMapJoinBytesHashTable, VectorMapJoinHashMultiSet { - - /* - * Lookup an byte array key in the hash multi-set. - * - * @param keyBytes - * A byte array containing the key within a range. - * @param keyStart - * The offset the beginning of the key. - * @param keyLength - * The length of the key. - * @param hashMultiSetResult - * The object to receive small table value(s) information on a MATCH. - * Or, for SPILL, it has information on where to spill the big table row. - * - * @return - * Whether the lookup was a match, no match, or spilled (the partition with the key - * is currently spilled). - */ - JoinUtil.JoinResult contains(byte[] keyBytes, int keyStart, int keyLength, - VectorMapJoinHashMultiSetResult hashMultiSetResult) throws IOException; - -} \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/hashtable/VectorMapJoinBytesHashSet.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/hashtable/VectorMapJoinBytesHashSet.java deleted file mode 100644 index a0c93e5..0000000 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/hashtable/VectorMapJoinBytesHashSet.java +++ /dev/null @@ -1,51 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable; - -import java.io.IOException; - -import org.apache.hadoop.hive.ql.exec.JoinUtil; - -/* - * The interface for a single byte array key hash multi-set contains method. - */ -public interface VectorMapJoinBytesHashSet - extends VectorMapJoinBytesHashTable, VectorMapJoinHashSet { - - /* - * Lookup an byte array key in the hash set. - * - * @param keyBytes - * A byte array containing the key within a range. - * @param keyStart - * The offset the beginning of the key. - * @param keyLength - * The length of the key. - * @param hashSetResult - * The object to receive small table value(s) information on a MATCH. - * Or, for SPILL, it has information on where to spill the big table row. - * - * @return - * Whether the lookup was a match, no match, or spilled (the partition with the key - * is currently spilled). - */ - JoinUtil.JoinResult contains(byte[] keyBytes, int keyStart, int keyLength, - VectorMapJoinHashSetResult hashSetResult) throws IOException; - -} \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/hashtable/VectorMapJoinBytesHashTable.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/hashtable/VectorMapJoinBytesHashTable.java deleted file mode 100644 index 7494e1d..0000000 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/hashtable/VectorMapJoinBytesHashTable.java +++ /dev/null @@ -1,26 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable; - -/* - * Interface for a vector map join hash table (which could be a hash map, hash multi-set, or - * hash set) for a single byte array key. - */ -public interface VectorMapJoinBytesHashTable extends VectorMapJoinHashTable { -} \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/hashtable/VectorMapJoinHashMap.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/hashtable/VectorMapJoinHashMap.java deleted file mode 100644 index 7abe2c8..0000000 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/hashtable/VectorMapJoinHashMap.java +++ /dev/null @@ -1,34 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. 
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable; - -/* - * The root interface for a vector map join hash map. - */ -public interface VectorMapJoinHashMap extends VectorMapJoinHashTable { - - /* - * @return A new hash map result implementation specific object. - * - * The object can be used to access the values when there is a match, or - * access spill information when the partition with the key is currently spilled. - */ - VectorMapJoinHashMapResult createHashMapResult(); - -} diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/hashtable/VectorMapJoinHashMapResult.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/hashtable/VectorMapJoinHashMapResult.java deleted file mode 100644 index fa6dedb..0000000 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/hashtable/VectorMapJoinHashMapResult.java +++ /dev/null @@ -1,63 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable; - -import org.apache.hadoop.hive.serde2.WriteBuffers.ByteSegmentRef; - -/* - * Abstract class for a hash map result. For reading the values, one-by-one. - */ -public abstract class VectorMapJoinHashMapResult extends VectorMapJoinHashTableResult { - - /** - * @return Whether there are any rows (i.e. true for match). - */ - public abstract boolean hasRows(); - - /** - * @return Whether there is 1 value row. - */ - public abstract boolean isSingleRow(); - - /** - * @return Whether there is a capped count available from cappedCount. - */ - public abstract boolean isCappedCountAvailable(); - - /** - * @return The count of values, up to a arbitrary cap limit. When available, the capped - * count can be used to make decisions on how to optimally generate join results. - */ - public abstract int cappedCount(); - - /** - * @return A reference to the first value, or null if there are no values. - */ - public abstract ByteSegmentRef first(); - - /** - * @return The next value, or null if there are no more values to be read. 
- */ - public abstract ByteSegmentRef next(); - - /** - * @return Whether reading is at the end. - */ - public abstract boolean isEof(); -} diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/hashtable/VectorMapJoinHashMultiSet.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/hashtable/VectorMapJoinHashMultiSet.java deleted file mode 100644 index 210597d..0000000 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/hashtable/VectorMapJoinHashMultiSet.java +++ /dev/null @@ -1,31 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable; - -public interface VectorMapJoinHashMultiSet extends VectorMapJoinHashTable { - - /* - * @return A new hash multi-set result implementation specific object. - * - * The object can be used to access the *count* of values when the key is contained in the - * multi-set, or access spill information when the partition with the key is currently spilled. - */ - VectorMapJoinHashMultiSetResult createHashMultiSetResult(); - -} diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/hashtable/VectorMapJoinHashMultiSetResult.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/hashtable/VectorMapJoinHashMultiSetResult.java deleted file mode 100644 index 0728f78..0000000 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/hashtable/VectorMapJoinHashMultiSetResult.java +++ /dev/null @@ -1,34 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable; - -/* - * Abstract class for a hash multi-set result. - */ -public abstract class VectorMapJoinHashMultiSetResult extends VectorMapJoinHashTableResult { - - protected long count; - - /* - * @return The multi-set count for the lookup key. 
- */ - public long count() { - return count; - } -} diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/hashtable/VectorMapJoinHashSet.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/hashtable/VectorMapJoinHashSet.java deleted file mode 100644 index a26f997..0000000 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/hashtable/VectorMapJoinHashSet.java +++ /dev/null @@ -1,34 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable; - -/* - * The root interface for a vector map join hash set. - */ -public interface VectorMapJoinHashSet extends VectorMapJoinHashTable { - - /* - * @return A new hash set result implementation specific object. - * - * The object can be used to access access spill information when the partition with the key - * is currently spilled. - */ - VectorMapJoinHashSetResult createHashSetResult(); - -} \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/hashtable/VectorMapJoinHashSetResult.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/hashtable/VectorMapJoinHashSetResult.java deleted file mode 100644 index 467c4c1..0000000 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/hashtable/VectorMapJoinHashSetResult.java +++ /dev/null @@ -1,28 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable; - -/* - * Abstract class for a hash set result. - */ -public abstract class VectorMapJoinHashSetResult extends VectorMapJoinHashTableResult { - - // Nothing currently available for hash sets. 
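A minimal consumer sketch of the result-iteration contract deleted above (hasRows/first/next/forget), assuming a VectorMapJoinBytesHashMap built against these pre-patch interfaces; the countValues helper and its key arguments are hypothetical placeholders, not part of the patch.

import java.io.IOException;
import org.apache.hadoop.hive.ql.exec.JoinUtil;
import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinBytesHashMap;
import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashMapResult;
import org.apache.hadoop.hive.serde2.WriteBuffers.ByteSegmentRef;

public class HashMapResultScanSketch {
  // Hypothetical helper: counts the small-table values found for one serialized key.
  static int countValues(VectorMapJoinBytesHashMap hashMap,
      byte[] keyBytes, int keyOffset, int keyLength) throws IOException {
    VectorMapJoinHashMapResult result = hashMap.createHashMapResult();
    int n = 0;
    if (hashMap.lookup(keyBytes, keyOffset, keyLength, result) == JoinUtil.JoinResult.MATCH) {
      // first()/next() return one ByteSegmentRef per value until exhausted (null),
      // per the VectorMapJoinHashMapResult contract removed by this patch.
      for (ByteSegmentRef ref = result.first(); ref != null; ref = result.next()) {
        n++;
      }
    }
    result.forget();  // clear lookup state so the result object can be reused
    return n;
  }
}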
- -} diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/hashtable/VectorMapJoinHashTable.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/hashtable/VectorMapJoinHashTable.java deleted file mode 100644 index c7e585c..0000000 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/hashtable/VectorMapJoinHashTable.java +++ /dev/null @@ -1,47 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable; - -import java.io.IOException; - -import org.apache.hadoop.hive.ql.metadata.HiveException; -import org.apache.hadoop.hive.serde2.SerDeException; -import org.apache.hadoop.io.BytesWritable; - -/* - * Root interface for a vector map join hash table (which could be a hash map, hash multi-set, or - * hash set). - */ -public interface VectorMapJoinHashTable { - - - /* - * @param currentKey - * The current key. - * @param currentValue - * The current value. - */ - void putRow(BytesWritable currentKey, BytesWritable currentValue) - throws SerDeException, HiveException, IOException; - - /** - * Get hash table size - */ - int size(); -} diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/hashtable/VectorMapJoinHashTableResult.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/hashtable/VectorMapJoinHashTableResult.java deleted file mode 100644 index ce598e3..0000000 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/hashtable/VectorMapJoinHashTableResult.java +++ /dev/null @@ -1,81 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable; - -import org.apache.hadoop.hive.ql.exec.JoinUtil; - -/* - * Root abstract class for a hash table result. 
- */ -public abstract class VectorMapJoinHashTableResult { - - private JoinUtil.JoinResult joinResult; - - private int spillPartitionId; - - public VectorMapJoinHashTableResult() { - joinResult = JoinUtil.JoinResult.NOMATCH; - spillPartitionId = -1; - } - - /** - * @return The join result from the most recent hash map match, or hash multi-set / set contains - * call. - */ - public JoinUtil.JoinResult joinResult() { - return joinResult; - } - - /** - * Set the current join result. - * @param joinResult - * The new join result. - */ - public void setJoinResult(JoinUtil.JoinResult joinResult) { - this.joinResult = joinResult; - } - - /** - * Forget about the most recent hash table lookup or contains call. - */ - public void forget() { - joinResult = JoinUtil.JoinResult.NOMATCH; - } - - /** - * Set the spill partition id. - */ - public void setSpillPartitionId(int spillPartitionId) { - this.spillPartitionId = spillPartitionId; - } - - /** - * @return The Hybrid Grace spill partition id. - */ - public int spillPartitionId() { - return spillPartitionId; - } - - @Override - public String toString() { - StringBuilder sb = new StringBuilder(); - sb.append("joinResult " + joinResult.name()); - return sb.toString(); - } -} diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/hashtable/VectorMapJoinLongHashMap.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/hashtable/VectorMapJoinLongHashMap.java deleted file mode 100644 index f180d02..0000000 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/hashtable/VectorMapJoinLongHashMap.java +++ /dev/null @@ -1,46 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable; - -import java.io.IOException; - -import org.apache.hadoop.hive.ql.exec.JoinUtil; - -/* - * The interface for a single long key hash map lookup method. - */ -public interface VectorMapJoinLongHashMap - extends VectorMapJoinLongHashTable, VectorMapJoinHashMap { - - /* - * Lookup an long in the hash map. - * - * @param key - * The long key. - * @param hashMapResult - * The object to receive small table value(s) information on a MATCH. - * Or, for SPILL, it has information on where to spill the big table row. - * - * @return - * Whether the lookup was a match, no match, or spilled (the partition with the key - * is currently spilled). 
- */ - JoinUtil.JoinResult lookup(long key, VectorMapJoinHashMapResult hashMapResult) throws IOException; - -} \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/hashtable/VectorMapJoinLongHashMultiSet.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/hashtable/VectorMapJoinLongHashMultiSet.java deleted file mode 100644 index 7477584..0000000 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/hashtable/VectorMapJoinLongHashMultiSet.java +++ /dev/null @@ -1,46 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable; - -import java.io.IOException; - -import org.apache.hadoop.hive.ql.exec.JoinUtil; - -/* - * The interface for a single long key hash multi-set contains method. - */ -public interface VectorMapJoinLongHashMultiSet - extends VectorMapJoinLongHashTable, VectorMapJoinHashMultiSet { - - /* - * Lookup an long in the hash multi-set. - * - * @param key - * The long key. - * @param hashMultiSetResult - * The object to receive small table value(s) information on a MATCH. - * Or, for SPILL, it has information on where to spill the big table row. - * - * @return - * Whether the lookup was a match, no match, or spilled (the partition with the key - * is currently spilled). - */ - JoinUtil.JoinResult contains(long key, VectorMapJoinHashMultiSetResult hashMultiSetResult) throws IOException; - -} diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/hashtable/VectorMapJoinLongHashSet.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/hashtable/VectorMapJoinLongHashSet.java deleted file mode 100644 index 8c28bff..0000000 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/hashtable/VectorMapJoinLongHashSet.java +++ /dev/null @@ -1,46 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
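A sketch of the MATCH / SPILL / NOMATCH handling implied by the long-key lookup contract above; the probe method and its arguments are hypothetical, and spill handling is reduced to reading the Hybrid Grace partition id.

import java.io.IOException;
import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashMapResult;
import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinLongHashMap;

public class LongKeyProbeSketch {
  static void probe(VectorMapJoinLongHashMap hashMap, long key) throws IOException {
    VectorMapJoinHashMapResult result = hashMap.createHashMapResult();
    switch (hashMap.lookup(key, result)) {
    case MATCH: {
      // result.first()/next() now expose the matching small-table values.
      break;
    }
    case SPILL: {
      // The partition holding this key is currently on disk; the big-table row
      // would be routed to the spill partition identified below.
      int spillPartitionId = result.spillPartitionId();
      break;
    }
    case NOMATCH:
    default:
      break;
    }
    result.forget();
  }
}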
- */ - -package org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable; - -import java.io.IOException; - -import org.apache.hadoop.hive.ql.exec.JoinUtil; - -/* - * The interface adds the single long key hash multi-set contains method. - */ -public interface VectorMapJoinLongHashSet - extends VectorMapJoinLongHashTable, VectorMapJoinHashSet { - - /* - * Lookup an long in the hash set. - * - * @param key - * The long key. - * @param hashSetResult - * The object to receive small table value(s) information on a MATCH. - * Or, for SPILL, it has information on where to spill the big table row. - * - * @return - * Whether the lookup was a match, no match, or spilled (the partition with the key - * is currently spilled). - */ - JoinUtil.JoinResult contains(long key, VectorMapJoinHashSetResult hashSetResult) throws IOException; - -} diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/hashtable/VectorMapJoinLongHashTable.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/hashtable/VectorMapJoinLongHashTable.java deleted file mode 100644 index 046a403..0000000 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/hashtable/VectorMapJoinLongHashTable.java +++ /dev/null @@ -1,31 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable; - -/* - * Interface for a vector map join hash table (which could be a hash map, hash multi-set, or - * hash set) for a single long. - */ -public interface VectorMapJoinLongHashTable extends VectorMapJoinHashTable { - - boolean useMinMax(); - long min(); - long max(); - -} \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/hashtable/VectorMapJoinTableContainer.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/hashtable/VectorMapJoinTableContainer.java deleted file mode 100644 index 09631e4..0000000 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/hashtable/VectorMapJoinTableContainer.java +++ /dev/null @@ -1,28 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable; - -import org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainer; - -public interface VectorMapJoinTableContainer extends MapJoinTableContainer { - - VectorMapJoinHashTable vectorMapJoinHashTable(); - - // com.esotericsoftware.kryo.io.Output getHybridBigTableSpillOutput(int partitionId); -} diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/optimized/VectorMapJoinOptimizedCreateHashTable.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/optimized/VectorMapJoinOptimizedCreateHashTable.java deleted file mode 100644 index f34b1cd..0000000 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/optimized/VectorMapJoinOptimizedCreateHashTable.java +++ /dev/null @@ -1,129 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.hadoop.hive.ql.exec.vector.mapjoin.optimized; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import org.apache.hadoop.hive.ql.exec.persistence.MapJoinKey; -import org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainer; -import org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainer.ReusableGetAdaptor; -import org.apache.hadoop.hive.ql.plan.MapJoinDesc; -import org.apache.hadoop.hive.ql.plan.VectorMapJoinDesc; -import org.apache.hadoop.hive.ql.plan.VectorMapJoinDesc.HashTableKeyType; -import org.apache.hadoop.hive.ql.plan.VectorMapJoinDesc.HashTableKind; - -/** - */ -public class VectorMapJoinOptimizedCreateHashTable { - - private static final Logger LOG = LoggerFactory.getLogger(VectorMapJoinOptimizedCreateHashTable.class.getName()); - - public static VectorMapJoinOptimizedHashTable createHashTable(MapJoinDesc desc, - MapJoinTableContainer mapJoinTableContainer) { - - MapJoinKey refKey = mapJoinTableContainer.getAnyKey(); - ReusableGetAdaptor hashMapRowGetter = mapJoinTableContainer.createGetter(refKey); - - boolean isOuterJoin = !desc.isNoOuterJoin(); - VectorMapJoinDesc vectorDesc = desc.getVectorDesc(); - HashTableKind hashTableKind = vectorDesc.hashTableKind(); - HashTableKeyType hashTableKeyType = vectorDesc.hashTableKeyType(); - boolean minMaxEnabled = vectorDesc.minMaxEnabled(); - - VectorMapJoinOptimizedHashTable hashTable = null; - - switch (hashTableKeyType) { - case BOOLEAN: - case BYTE: - case SHORT: - case INT: - case LONG: - switch (hashTableKind) { - case HASH_MAP: - hashTable = new VectorMapJoinOptimizedLongHashMap( - minMaxEnabled, isOuterJoin, hashTableKeyType, - mapJoinTableContainer, hashMapRowGetter); - break; - case HASH_MULTISET: - hashTable = new VectorMapJoinOptimizedLongHashMultiSet( - minMaxEnabled, isOuterJoin, 
hashTableKeyType, - mapJoinTableContainer, hashMapRowGetter); - break; - case HASH_SET: - hashTable = new VectorMapJoinOptimizedLongHashSet( - minMaxEnabled, isOuterJoin, hashTableKeyType, - mapJoinTableContainer, hashMapRowGetter); - break; - } - break; - - case STRING: - switch (hashTableKind) { - case HASH_MAP: - hashTable = new VectorMapJoinOptimizedStringHashMap( - isOuterJoin, - mapJoinTableContainer, hashMapRowGetter); - break; - case HASH_MULTISET: - hashTable = new VectorMapJoinOptimizedStringHashMultiSet( - isOuterJoin, - mapJoinTableContainer, hashMapRowGetter); - break; - case HASH_SET: - hashTable = new VectorMapJoinOptimizedStringHashSet( - isOuterJoin, - mapJoinTableContainer, hashMapRowGetter); - break; - } - break; - - case MULTI_KEY: - switch (hashTableKind) { - case HASH_MAP: - hashTable = new VectorMapJoinOptimizedMultiKeyHashMap( - isOuterJoin, - mapJoinTableContainer, hashMapRowGetter); - break; - case HASH_MULTISET: - hashTable = new VectorMapJoinOptimizedMultiKeyHashMultiSet( - isOuterJoin, - mapJoinTableContainer, hashMapRowGetter); - break; - case HASH_SET: - hashTable = new VectorMapJoinOptimizedMultiKeyHashSet( - isOuterJoin, - mapJoinTableContainer, hashMapRowGetter); - break; - } - break; - } - return hashTable; - } - - /* - @Override - public com.esotericsoftware.kryo.io.Output getHybridBigTableSpillOutput(int partitionId) { - - HybridHashTableContainer ht = (HybridHashTableContainer) mapJoinTableContainer; - - HashPartition hp = ht.getHashPartitions()[partitionId]; - - return hp.getMatchfileOutput(); - } - */ -} diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/optimized/VectorMapJoinOptimizedHashMap.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/optimized/VectorMapJoinOptimizedHashMap.java deleted file mode 100644 index e56c821..0000000 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/optimized/VectorMapJoinOptimizedHashMap.java +++ /dev/null @@ -1,128 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.hadoop.hive.ql.exec.vector.mapjoin.optimized; - -import java.io.IOException; - -import org.apache.hadoop.hive.ql.exec.JoinUtil; -import org.apache.hadoop.hive.ql.exec.persistence.BytesBytesMultiHashMap; -import org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainer; -import org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainer.ReusableGetAdaptor; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinBytesHashMap; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashMapResult; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashTableResult; -import org.apache.hadoop.hive.serde2.WriteBuffers.ByteSegmentRef; - -public class VectorMapJoinOptimizedHashMap - extends VectorMapJoinOptimizedHashTable - implements VectorMapJoinBytesHashMap { - - @Override - public VectorMapJoinHashMapResult createHashMapResult() { - return new HashMapResult(); - } - - public static class HashMapResult extends VectorMapJoinHashMapResult { - - private BytesBytesMultiHashMap.Result bytesBytesMultiHashMapResult; - - public HashMapResult() { - super(); - bytesBytesMultiHashMapResult = new BytesBytesMultiHashMap.Result(); - } - - public BytesBytesMultiHashMap.Result bytesBytesMultiHashMapResult() { - return bytesBytesMultiHashMapResult; - } - - @Override - public boolean hasRows() { - return (joinResult() == JoinUtil.JoinResult.MATCH); - } - - @Override - public boolean isSingleRow() { - if (joinResult() != JoinUtil.JoinResult.MATCH) { - throw new RuntimeException("HashMapResult is not a match"); - } - return bytesBytesMultiHashMapResult.isSingleRow(); - } - - @Override - public boolean isCappedCountAvailable() { - return false; - } - - @Override - public int cappedCount() { - return 0; - } - - @Override - public ByteSegmentRef first() { - if (joinResult() != JoinUtil.JoinResult.MATCH) { - throw new RuntimeException("HashMapResult is not a match"); - } - return bytesBytesMultiHashMapResult.first(); - } - - @Override - public ByteSegmentRef next() { - return bytesBytesMultiHashMapResult.next(); - } - - @Override - public boolean isEof() { - return bytesBytesMultiHashMapResult.isEof(); - } - - @Override - public void forget() { - bytesBytesMultiHashMapResult.forget(); - super.forget(); - } - - @Override - public String toString() { - StringBuilder sb = new StringBuilder(); - sb.append("(" + super.toString() + ", "); - sb.append("isSingleRow " + (joinResult() == JoinUtil.JoinResult.MATCH ? 
isSingleRow() : "") + ")"); - return sb.toString(); - } - } - - @Override - public JoinUtil.JoinResult lookup(byte[] keyBytes, int keyOffset, int keyLength, - VectorMapJoinHashMapResult hashMapResult) throws IOException { - - HashMapResult implementationHashMapResult = (HashMapResult) hashMapResult; - - JoinUtil.JoinResult joinResult = - doLookup(keyBytes, keyOffset, keyLength, - implementationHashMapResult.bytesBytesMultiHashMapResult(), - (VectorMapJoinHashTableResult) hashMapResult); - - return joinResult; - } - - public VectorMapJoinOptimizedHashMap( - MapJoinTableContainer originalTableContainer, ReusableGetAdaptor hashMapRowGetter) { - super(originalTableContainer, hashMapRowGetter); - } -} \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/optimized/VectorMapJoinOptimizedHashMultiSet.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/optimized/VectorMapJoinOptimizedHashMultiSet.java deleted file mode 100644 index 34de7e1..0000000 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/optimized/VectorMapJoinOptimizedHashMultiSet.java +++ /dev/null @@ -1,103 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.hive.ql.exec.vector.mapjoin.optimized; - -import java.io.IOException; - -import org.apache.hadoop.hive.ql.exec.JoinUtil; -import org.apache.hadoop.hive.ql.exec.persistence.BytesBytesMultiHashMap; -import org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainer; -import org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainer.ReusableGetAdaptor; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinBytesHashMultiSet; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashMultiSetResult; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashTableResult; -import org.apache.hadoop.hive.serde2.WriteBuffers.ByteSegmentRef; - -public class VectorMapJoinOptimizedHashMultiSet - extends VectorMapJoinOptimizedHashTable - implements VectorMapJoinBytesHashMultiSet { - - @Override - public VectorMapJoinHashMultiSetResult createHashMultiSetResult() { - return new HashMultiSetResult(); - } - - public static class HashMultiSetResult extends VectorMapJoinHashMultiSetResult { - - private BytesBytesMultiHashMap.Result bytesBytesMultiHashMapResult; - - private boolean haveCount; - - public HashMultiSetResult() { - super(); - bytesBytesMultiHashMapResult = new BytesBytesMultiHashMap.Result(); - } - - public BytesBytesMultiHashMap.Result bytesBytesMultiHashMapResult() { - return bytesBytesMultiHashMapResult; - } - - /* - * @return The multi-set count for the lookup key. 
- */ - @Override - public long count() { - if (!haveCount) { - if (bytesBytesMultiHashMapResult.isSingleRow()) { - count = 1; - } else { - count = 0; - ByteSegmentRef byteSegmentRef = bytesBytesMultiHashMapResult.first(); - while (byteSegmentRef != null) { - count++; - byteSegmentRef = bytesBytesMultiHashMapResult.next(); - } - } - haveCount = true; - } - return count; - } - - @Override - public void forget() { - haveCount = false; - bytesBytesMultiHashMapResult.forget(); - super.forget(); - } - } - - @Override - public JoinUtil.JoinResult contains(byte[] keyBytes, int keyOffset, int keyLength, - VectorMapJoinHashMultiSetResult hashMultiSetResult) throws IOException { - - HashMultiSetResult implementationHashMultiSetResult = (HashMultiSetResult) hashMultiSetResult; - - JoinUtil.JoinResult joinResult = - doLookup(keyBytes, keyOffset, keyLength, - implementationHashMultiSetResult.bytesBytesMultiHashMapResult(), - (VectorMapJoinHashTableResult) hashMultiSetResult); - - return joinResult; - } - - public VectorMapJoinOptimizedHashMultiSet( - MapJoinTableContainer originalTableContainer, ReusableGetAdaptor hashMapRowGetter) { - super(originalTableContainer, hashMapRowGetter); - } -} \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/optimized/VectorMapJoinOptimizedHashSet.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/optimized/VectorMapJoinOptimizedHashSet.java deleted file mode 100644 index 93a89d7..0000000 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/optimized/VectorMapJoinOptimizedHashSet.java +++ /dev/null @@ -1,78 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.hadoop.hive.ql.exec.vector.mapjoin.optimized; - -import java.io.IOException; - -import org.apache.hadoop.hive.ql.exec.JoinUtil; -import org.apache.hadoop.hive.ql.exec.persistence.BytesBytesMultiHashMap; -import org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainer; -import org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainer.ReusableGetAdaptor; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinBytesHashSet; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashSetResult; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashTableResult; - -public class VectorMapJoinOptimizedHashSet - extends VectorMapJoinOptimizedHashTable - implements VectorMapJoinBytesHashSet { - - @Override - public VectorMapJoinHashSetResult createHashSetResult() { - return new HashSetResult(); - } - - public static class HashSetResult extends VectorMapJoinHashSetResult { - - private BytesBytesMultiHashMap.Result bytesBytesMultiHashMapResult; - - public HashSetResult() { - super(); - bytesBytesMultiHashMapResult = new BytesBytesMultiHashMap.Result(); - } - - public BytesBytesMultiHashMap.Result bytesBytesMultiHashMapResult() { - return bytesBytesMultiHashMapResult; - } - - @Override - public void forget() { - bytesBytesMultiHashMapResult.forget(); - super.forget(); - } - } - - @Override - public JoinUtil.JoinResult contains(byte[] keyBytes, int keyOffset, int keyLength, - VectorMapJoinHashSetResult hashSetResult) throws IOException { - - HashSetResult implementationHashSetResult = (HashSetResult) hashSetResult; - - JoinUtil.JoinResult joinResult = - doLookup(keyBytes, keyOffset, keyLength, - implementationHashSetResult.bytesBytesMultiHashMapResult(), - (VectorMapJoinHashTableResult) hashSetResult); - - return joinResult; - } - - public VectorMapJoinOptimizedHashSet( - MapJoinTableContainer originalTableContainer, ReusableGetAdaptor hashMapRowGetter) { - super(originalTableContainer, hashMapRowGetter); - } -} \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/optimized/VectorMapJoinOptimizedHashTable.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/optimized/VectorMapJoinOptimizedHashTable.java deleted file mode 100644 index 5fe7861..0000000 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/optimized/VectorMapJoinOptimizedHashTable.java +++ /dev/null @@ -1,99 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.hadoop.hive.ql.exec.vector.mapjoin.optimized; - -import java.io.IOException; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import org.apache.hadoop.hive.ql.exec.JoinUtil; -import org.apache.hadoop.hive.ql.exec.persistence.BytesBytesMultiHashMap; -import org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainer; -import org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainerDirectAccess; -import org.apache.hadoop.hive.ql.exec.persistence.ReusableGetAdaptorDirectAccess; -import org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainer.ReusableGetAdaptor; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashTable; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashTableResult; -import org.apache.hadoop.hive.ql.metadata.HiveException; -import org.apache.hadoop.hive.serde2.SerDeException; -import org.apache.hadoop.io.BytesWritable; -import org.apache.hadoop.io.Writable; - -/* - * Root interface for a vector map join hash table (which could be a hash map, hash multi-set, or - * hash set). - */ -public abstract class VectorMapJoinOptimizedHashTable implements VectorMapJoinHashTable { - - private static final Logger LOG = LoggerFactory.getLogger(VectorMapJoinOptimizedMultiKeyHashMap.class.getName()); - - protected final MapJoinTableContainer originalTableContainer; - protected final MapJoinTableContainerDirectAccess containerDirectAccess; - protected final ReusableGetAdaptorDirectAccess adapatorDirectAccess; - - public static class SerializedBytes { - byte[] bytes; - int offset; - int length; - } - - @Override - public void putRow(BytesWritable currentKey, BytesWritable currentValue) - throws SerDeException, HiveException, IOException { - - putRowInternal(currentKey, currentValue); - } - - protected void putRowInternal(BytesWritable key, BytesWritable value) - throws SerDeException, HiveException, IOException { - - containerDirectAccess.put((Writable) key, (Writable) value); - } - - public JoinUtil.JoinResult doLookup(byte[] keyBytes, int keyOffset, int keyLength, - BytesBytesMultiHashMap.Result bytesBytesMultiHashMapResult, - VectorMapJoinHashTableResult hashTableResult) { - - hashTableResult.forget(); - - JoinUtil.JoinResult joinResult = - adapatorDirectAccess.setDirect(keyBytes, keyOffset, keyLength, - bytesBytesMultiHashMapResult); - if (joinResult == JoinUtil.JoinResult.SPILL) { - hashTableResult.setSpillPartitionId(adapatorDirectAccess.directSpillPartitionId()); - } - - hashTableResult.setJoinResult(joinResult); - - return joinResult; - } - - public VectorMapJoinOptimizedHashTable( - MapJoinTableContainer originalTableContainer, ReusableGetAdaptor hashMapRowGetter) { - - this.originalTableContainer = originalTableContainer; - containerDirectAccess = (MapJoinTableContainerDirectAccess) originalTableContainer; - adapatorDirectAccess = (ReusableGetAdaptorDirectAccess) hashMapRowGetter; - } - - @Override - public int size() { - return originalTableContainer.size(); - } -} diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/optimized/VectorMapJoinOptimizedLongCommon.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/optimized/VectorMapJoinOptimizedLongCommon.java deleted file mode 100644 index a84de89..0000000 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/optimized/VectorMapJoinOptimizedLongCommon.java +++ /dev/null @@ -1,171 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor 
license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.hive.ql.exec.vector.mapjoin.optimized; - -import java.io.IOException; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.optimized.VectorMapJoinOptimizedHashTable.SerializedBytes; -import org.apache.hadoop.hive.ql.plan.VectorMapJoinDesc.HashTableKeyType; -import org.apache.hadoop.hive.serde2.ByteStream.Output; -import org.apache.hadoop.hive.serde2.binarysortable.fast.BinarySortableSerializeWrite; - -/* - * An single long value hash map based on the BytesBytesMultiHashMap. - * - * We serialize the long key into BinarySortable format into an output buffer accepted by - * BytesBytesMultiHashMap. - */ -public class VectorMapJoinOptimizedLongCommon { - - private static final Logger LOG = LoggerFactory.getLogger(VectorMapJoinOptimizedLongCommon.class.getName()); - - private boolean isOuterJoin; - - private HashTableKeyType hashTableKeyType; - - // private BinarySortableDeserializeRead keyBinarySortableDeserializeRead; - - private BinarySortableSerializeWrite keyBinarySortableSerializeWrite; - - private transient Output output; - - private transient SerializedBytes serializedBytes; - - // protected boolean useMinMax; - protected long min; - protected long max; - - public boolean useMinMax() { - return false; - } - - public long min() { - return min; - } - - public long max() { - return max; - } - - /* - * For now, just use MapJoinBytesTableContainer / HybridHashTableContainer directly. - - public void adaptPutRow(VectorMapJoinOptimizedHashTable hashTable, - BytesWritable currentKey, BytesWritable currentValue) - throws SerDeException, HiveException, IOException { - - if (useMinMax) { - // Peek at the BinarySortable key to extract the long so we can determine min and max. - byte[] keyBytes = currentKey.getBytes(); - int keyLength = currentKey.getLength(); - keyBinarySortableDeserializeRead.set(keyBytes, 0, keyLength); - if (keyBinarySortableDeserializeRead.readCheckNull()) { - if (isOuterJoin) { - return; - } else { - // For inner join, we expect all NULL values to have been filtered out before now. - throw new HiveException("Unexpected NULL"); - } - } - long key = 0; - switch (hashTableKeyType) { - case BOOLEAN: - key = (keyBinarySortableDeserializeRead.readBoolean() ? 
1 : 0); - break; - case BYTE: - key = (long) keyBinarySortableDeserializeRead.readByte(); - break; - case SHORT: - key = (long) keyBinarySortableDeserializeRead.readShort(); - break; - case INT: - key = (long) keyBinarySortableDeserializeRead.readInt(); - break; - case LONG: - key = keyBinarySortableDeserializeRead.readLong(); - break; - default: - throw new RuntimeException("Unexpected hash table key type " + hashTableKeyType.name()); - } - if (key < min) { - min = key; - } - if (key > max) { - max = key; - } - - // byte[] bytes = Arrays.copyOf(currentKey.get(), currentKey.getLength()); - // LOG.debug("VectorMapJoinOptimizedLongCommon adaptPutRow key " + key + " min " + min + " max " + max + " hashTableKeyType " + hashTableKeyType.name() + " hex " + Hex.encodeHexString(bytes)); - - } - - hashTable.putRowInternal(currentKey, currentValue); - } - */ - - public SerializedBytes serialize(long key) throws IOException { - keyBinarySortableSerializeWrite.reset(); - - switch (hashTableKeyType) { - case BOOLEAN: - keyBinarySortableSerializeWrite.writeBoolean(key == 1); - break; - case BYTE: - keyBinarySortableSerializeWrite.writeByte((byte) key); - break; - case SHORT: - keyBinarySortableSerializeWrite.writeShort((short) key); - break; - case INT: - keyBinarySortableSerializeWrite.writeInt((int) key); - break; - case LONG: - keyBinarySortableSerializeWrite.writeLong(key); - break; - default: - throw new RuntimeException("Unexpected hash table key type " + hashTableKeyType.name()); - } - - // byte[] bytes = Arrays.copyOf(output.getData(), output.getLength()); - // LOG.debug("VectorMapJoinOptimizedLongCommon serialize key " + key + " hashTableKeyType " + hashTableKeyType.name() + " hex " + Hex.encodeHexString(bytes)); - - serializedBytes.bytes = output.getData(); - serializedBytes.offset = 0; - serializedBytes.length = output.getLength(); - - return serializedBytes; - } - - public VectorMapJoinOptimizedLongCommon( - boolean minMaxEnabled, boolean isOuterJoin, HashTableKeyType hashTableKeyType) { - this.isOuterJoin = isOuterJoin; - // useMinMax = minMaxEnabled; - min = Long.MAX_VALUE; - max = Long.MIN_VALUE; - this.hashTableKeyType = hashTableKeyType; - // PrimitiveTypeInfo[] primitiveTypeInfos = { TypeInfoFactory.longTypeInfo }; - // keyBinarySortableDeserializeRead = new BinarySortableDeserializeRead(primitiveTypeInfos); - keyBinarySortableSerializeWrite = new BinarySortableSerializeWrite(1); - output = new Output(); - keyBinarySortableSerializeWrite.set(output); - serializedBytes = new SerializedBytes(); - } -} diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/optimized/VectorMapJoinOptimizedLongHashMap.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/optimized/VectorMapJoinOptimizedLongHashMap.java deleted file mode 100644 index 403d265..0000000 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/optimized/VectorMapJoinOptimizedLongHashMap.java +++ /dev/null @@ -1,82 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
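The long-key variants above all funnel through the BinarySortable encoding shown in VectorMapJoinOptimizedLongCommon.serialize; a standalone sketch of that step for the LONG case follows, with the copy into a fresh array added purely for illustration (the deleted code hands back the Output buffer directly).

import java.io.IOException;
import org.apache.hadoop.hive.serde2.ByteStream.Output;
import org.apache.hadoop.hive.serde2.binarysortable.fast.BinarySortableSerializeWrite;

public class LongKeyEncoderSketch {
  private final BinarySortableSerializeWrite serializeWrite =
      new BinarySortableSerializeWrite(1);  // single key column
  private final Output output = new Output();

  public LongKeyEncoderSketch() {
    serializeWrite.set(output);
  }

  // Encodes one long key in BinarySortable format, the key layout accepted by
  // BytesBytesMultiHashMap for the lookup.
  public byte[] encode(long key) throws IOException {
    serializeWrite.reset();
    serializeWrite.writeLong(key);
    byte[] bytes = new byte[output.getLength()];
    System.arraycopy(output.getData(), 0, bytes, 0, output.getLength());
    return bytes;
  }
}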
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.hive.ql.exec.vector.mapjoin.optimized; - -import java.io.IOException; - -import org.apache.hadoop.hive.ql.exec.JoinUtil.JoinResult; -import org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainer; -import org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainer.ReusableGetAdaptor; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashMapResult; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinLongHashMap; -import org.apache.hadoop.hive.ql.plan.VectorMapJoinDesc.HashTableKeyType; - -/* - * An single long value hash map based on the BytesBytesMultiHashMap. - * - * We serialize the long key into BinarySortable format into an output buffer accepted by - * BytesBytesMultiHashMap. - */ -public class VectorMapJoinOptimizedLongHashMap - extends VectorMapJoinOptimizedHashMap - implements VectorMapJoinLongHashMap { - - private VectorMapJoinOptimizedLongCommon longCommon; - - @Override - public boolean useMinMax() { - return longCommon.useMinMax(); - } - - @Override - public long min() { - return longCommon.min(); - } - - @Override - public long max() { - return longCommon.max(); - } - - /* - @Override - public void putRow(BytesWritable currentKey, BytesWritable currentValue) - throws SerDeException, HiveException, IOException { - - longCommon.adaptPutRow((VectorMapJoinOptimizedHashTable) this, currentKey, currentValue); - } - */ - - @Override - public JoinResult lookup(long key, - VectorMapJoinHashMapResult hashMapResult) throws IOException { - - SerializedBytes serializedBytes = longCommon.serialize(key); - - return super.lookup(serializedBytes.bytes, serializedBytes.offset, serializedBytes.length, - hashMapResult); - } - - public VectorMapJoinOptimizedLongHashMap( - boolean minMaxEnabled, boolean isOuterJoin, HashTableKeyType hashTableKeyType, - MapJoinTableContainer originalTableContainer, ReusableGetAdaptor hashMapRowGetter) { - super(originalTableContainer, hashMapRowGetter); - longCommon = new VectorMapJoinOptimizedLongCommon(minMaxEnabled, isOuterJoin, hashTableKeyType); - } -} \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/optimized/VectorMapJoinOptimizedLongHashMultiSet.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/optimized/VectorMapJoinOptimizedLongHashMultiSet.java deleted file mode 100644 index 5fb8c3a..0000000 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/optimized/VectorMapJoinOptimizedLongHashMultiSet.java +++ /dev/null @@ -1,83 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.hive.ql.exec.vector.mapjoin.optimized; - -import java.io.IOException; - -import org.apache.hadoop.hive.ql.exec.JoinUtil.JoinResult; -import org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainer; -import org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainer.ReusableGetAdaptor; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashMultiSetResult; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinLongHashMultiSet; -import org.apache.hadoop.hive.ql.plan.VectorMapJoinDesc.HashTableKeyType; - -/* - * An single long value hash map based on the BytesBytesMultiHashMultiSet. - * - * We serialize the long key into BinarySortable format into an output buffer accepted by - * BytesBytesMultiHashMultiSet. - */ -public class VectorMapJoinOptimizedLongHashMultiSet - extends VectorMapJoinOptimizedHashMultiSet - implements VectorMapJoinLongHashMultiSet { - - private VectorMapJoinOptimizedLongCommon longCommon; - - @Override - public boolean useMinMax() { - return longCommon.useMinMax(); - } - - @Override - public long min() { - return longCommon.min(); - } - - @Override - public long max() { - return longCommon.max(); - } - - /* - @Override - public void putRow(BytesWritable currentKey, BytesWritable currentValue) - throws SerDeException, HiveException, IOException { - - longCommon.adaptPutRow((VectorMapJoinOptimizedHashTable) this, currentKey, currentValue); - } - */ - - @Override - public JoinResult contains(long key, - VectorMapJoinHashMultiSetResult hashMultiSetResult) throws IOException { - - SerializedBytes serializedBytes = longCommon.serialize(key); - - return super.contains(serializedBytes.bytes, serializedBytes.offset, serializedBytes.length, - hashMultiSetResult); - - } - - public VectorMapJoinOptimizedLongHashMultiSet( - boolean minMaxEnabled, boolean isOuterJoin, HashTableKeyType hashTableKeyType, - MapJoinTableContainer originalTableContainer, ReusableGetAdaptor hashMapRowGetter) { - super(originalTableContainer, hashMapRowGetter); - longCommon = new VectorMapJoinOptimizedLongCommon(minMaxEnabled, isOuterJoin, hashTableKeyType); - } -} \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/optimized/VectorMapJoinOptimizedLongHashSet.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/optimized/VectorMapJoinOptimizedLongHashSet.java deleted file mode 100644 index c41505a..0000000 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/optimized/VectorMapJoinOptimizedLongHashSet.java +++ /dev/null @@ -1,83 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.hive.ql.exec.vector.mapjoin.optimized; - -import java.io.IOException; - -import org.apache.hadoop.hive.ql.exec.JoinUtil.JoinResult; -import org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainer; -import org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainer.ReusableGetAdaptor; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashSetResult; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinLongHashSet; -import org.apache.hadoop.hive.ql.plan.VectorMapJoinDesc.HashTableKeyType; - -/* - * An single long value hash map based on the BytesBytesMultiHashSet. - * - * We serialize the long key into BinarySortable format into an output buffer accepted by - * BytesBytesMultiHashSet. - */ -public class VectorMapJoinOptimizedLongHashSet - extends VectorMapJoinOptimizedHashSet - implements VectorMapJoinLongHashSet { - - private VectorMapJoinOptimizedLongCommon longCommon; - - @Override - public boolean useMinMax() { - return longCommon.useMinMax(); - } - - @Override - public long min() { - return longCommon.min(); - } - - @Override - public long max() { - return longCommon.max(); - } - - /* - @Override - public void putRow(BytesWritable currentKey, BytesWritable currentValue) - throws SerDeException, HiveException, IOException { - - longCommon.adaptPutRow((VectorMapJoinOptimizedHashTable) this, currentKey, currentValue); - } - */ - - @Override - public JoinResult contains(long key, - VectorMapJoinHashSetResult hashSetResult) throws IOException { - - SerializedBytes serializedBytes = longCommon.serialize(key); - - return super.contains(serializedBytes.bytes, serializedBytes.offset, serializedBytes.length, - hashSetResult); - - } - - public VectorMapJoinOptimizedLongHashSet( - boolean minMaxEnabled, boolean isOuterJoin, HashTableKeyType hashTableKeyType, - MapJoinTableContainer originalTableContainer, ReusableGetAdaptor hashMapRowGetter) { - super(originalTableContainer, hashMapRowGetter); - longCommon = new VectorMapJoinOptimizedLongCommon(minMaxEnabled, isOuterJoin, hashTableKeyType); - } -} \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/optimized/VectorMapJoinOptimizedMultiKeyHashMap.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/optimized/VectorMapJoinOptimizedMultiKeyHashMap.java deleted file mode 100644 index 4f3e20e..0000000 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/optimized/VectorMapJoinOptimizedMultiKeyHashMap.java +++ /dev/null @@ -1,36 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.hive.ql.exec.vector.mapjoin.optimized; - -import org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainer; -import org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainer.ReusableGetAdaptor; - -/* - * An multi-key hash map based on the BytesBytesMultiHashMap. - */ -public class VectorMapJoinOptimizedMultiKeyHashMap - extends VectorMapJoinOptimizedHashMap { - - // UNDONE: How to look for all NULLs in a multi-key????? Let nulls through for now. - - public VectorMapJoinOptimizedMultiKeyHashMap(boolean isOuterJoin, - MapJoinTableContainer originalTableContainer, ReusableGetAdaptor hashMapRowGetter) { - super(originalTableContainer, hashMapRowGetter); - } -} \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/optimized/VectorMapJoinOptimizedMultiKeyHashMultiSet.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/optimized/VectorMapJoinOptimizedMultiKeyHashMultiSet.java deleted file mode 100644 index b95a2dd..0000000 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/optimized/VectorMapJoinOptimizedMultiKeyHashMultiSet.java +++ /dev/null @@ -1,36 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.hive.ql.exec.vector.mapjoin.optimized; - -import org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainer; -import org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainer.ReusableGetAdaptor; - -/* - * An multi-key hash map based on the BytesBytesMultiHashMultiSet. - */ -public class VectorMapJoinOptimizedMultiKeyHashMultiSet - extends VectorMapJoinOptimizedHashMultiSet { - - // UNDONE: How to look for all NULLs in a multi-key????? Let nulls through for now. 
- - public VectorMapJoinOptimizedMultiKeyHashMultiSet(boolean isOuterJoin, - MapJoinTableContainer originalTableContainer, ReusableGetAdaptor hashMapRowGetter) { - super(originalTableContainer, hashMapRowGetter); - } -} \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/optimized/VectorMapJoinOptimizedMultiKeyHashSet.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/optimized/VectorMapJoinOptimizedMultiKeyHashSet.java deleted file mode 100644 index 35ecc2a..0000000 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/optimized/VectorMapJoinOptimizedMultiKeyHashSet.java +++ /dev/null @@ -1,36 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.hive.ql.exec.vector.mapjoin.optimized; - -import org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainer; -import org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainer.ReusableGetAdaptor; - -/* - * An multi-key hash map based on the BytesBytesMultiHashSet. - */ -public class VectorMapJoinOptimizedMultiKeyHashSet - extends VectorMapJoinOptimizedHashSet { - - // UNDONE: How to look for all NULLs in a multi-key????? Let nulls through for now. - - public VectorMapJoinOptimizedMultiKeyHashSet(boolean isOuterJoin, - MapJoinTableContainer originalTableContainer, ReusableGetAdaptor hashMapRowGetter) { - super(originalTableContainer, hashMapRowGetter); - } -} \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/optimized/VectorMapJoinOptimizedStringCommon.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/optimized/VectorMapJoinOptimizedStringCommon.java deleted file mode 100644 index 39c2d49..0000000 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/optimized/VectorMapJoinOptimizedStringCommon.java +++ /dev/null @@ -1,98 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.hadoop.hive.ql.exec.vector.mapjoin.optimized; - -import java.io.IOException; - -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.optimized.VectorMapJoinOptimizedHashTable.SerializedBytes; -import org.apache.hadoop.hive.serde2.ByteStream.Output; -import org.apache.hadoop.hive.serde2.binarysortable.fast.BinarySortableSerializeWrite; - -/* - * An single byte array value hash map based on the BytesBytesMultiHashMap. - * - * Since BytesBytesMultiHashMap does not interpret the key as BinarySortable we optimize - * this case and just reference the byte array key directly for the lookup instead of serializing - * the byte array into BinarySortable. We rely on it just doing byte array equality comparisons. - */ -public class VectorMapJoinOptimizedStringCommon { - - // private boolean isOuterJoin; - - // private BinarySortableDeserializeRead keyBinarySortableDeserializeRead; - - // private ReadStringResults readStringResults; - - private BinarySortableSerializeWrite keyBinarySortableSerializeWrite; - - private transient Output output; - - private transient SerializedBytes serializedBytes; - - /* - private BytesWritable bytesWritable; - - public void adaptPutRow(VectorMapJoinOptimizedHashTable hashTable, - BytesWritable currentKey, BytesWritable currentValue) - throws SerDeException, HiveException, IOException { - - byte[] keyBytes = currentKey.getBytes(); - int keyLength = currentKey.getLength(); - keyBinarySortableDeserializeRead.set(keyBytes, 0, keyLength); - if (keyBinarySortableDeserializeRead.readCheckNull()) { - if (isOuterJoin) { - return; - } else { - // For inner join, we expect all NULL values to have been filtered out before now. - throw new HiveException("Unexpected NULL"); - } - } - keyBinarySortableDeserializeRead.readString(readStringResults); - - bytesWritable.set(readStringResults.bytes, readStringResults.start, readStringResults.length); - - hashTable.putRowInternal(bytesWritable, currentValue); - } - */ - - public SerializedBytes serialize(byte[] keyBytes, int keyStart, int keyLength) throws IOException { - - keyBinarySortableSerializeWrite.reset(); - keyBinarySortableSerializeWrite.writeString(keyBytes, keyStart, keyLength); - - serializedBytes.bytes = output.getData(); - serializedBytes.offset = 0; - serializedBytes.length = output.getLength(); - - return serializedBytes; - - } - - public VectorMapJoinOptimizedStringCommon(boolean isOuterJoin) { - // this.isOuterJoin = isOuterJoin; - // PrimitiveTypeInfo[] primitiveTypeInfos = { TypeInfoFactory.stringTypeInfo }; - // keyBinarySortableDeserializeRead = new BinarySortableDeserializeRead(primitiveTypeInfos); - // readStringResults = keyBinarySortableDeserializeRead.createReadStringResults(); - // bytesWritable = new BytesWritable(); - keyBinarySortableSerializeWrite = new BinarySortableSerializeWrite(1); - output = new Output(); - keyBinarySortableSerializeWrite.set(output); - serializedBytes = new SerializedBytes(); - } -} \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/optimized/VectorMapJoinOptimizedStringHashMap.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/optimized/VectorMapJoinOptimizedStringHashMap.java deleted file mode 100644 index 220c05e..0000000 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/optimized/VectorMapJoinOptimizedStringHashMap.java +++ /dev/null @@ -1,63 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. 
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.hive.ql.exec.vector.mapjoin.optimized; - -import java.io.IOException; - -import org.apache.hadoop.hive.ql.exec.JoinUtil.JoinResult; -import org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainer; -import org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainer.ReusableGetAdaptor; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinBytesHashMap; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashMapResult; - -/* - * An multi-key hash map based on the BytesBytesMultiHashMap. - */ -public class VectorMapJoinOptimizedStringHashMap - extends VectorMapJoinOptimizedHashMap - implements VectorMapJoinBytesHashMap { - - private VectorMapJoinOptimizedStringCommon stringCommon; - - /* - @Override - public void putRow(BytesWritable currentKey, BytesWritable currentValue) - throws SerDeException, HiveException, IOException { - - stringCommon.adaptPutRow((VectorMapJoinOptimizedHashTable) this, currentKey, currentValue); - } - */ - - @Override - public JoinResult lookup(byte[] keyBytes, int keyStart, int keyLength, - VectorMapJoinHashMapResult hashMapResult) throws IOException { - - SerializedBytes serializedBytes = stringCommon.serialize(keyBytes, keyStart, keyLength); - - return super.lookup(serializedBytes.bytes, serializedBytes.offset, serializedBytes.length, - hashMapResult); - - } - - public VectorMapJoinOptimizedStringHashMap(boolean isOuterJoin, - MapJoinTableContainer originalTableContainer, ReusableGetAdaptor hashMapRowGetter) { - super(originalTableContainer, hashMapRowGetter); - stringCommon = new VectorMapJoinOptimizedStringCommon(isOuterJoin); - } -} \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/optimized/VectorMapJoinOptimizedStringHashMultiSet.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/optimized/VectorMapJoinOptimizedStringHashMultiSet.java deleted file mode 100644 index b6c6958..0000000 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/optimized/VectorMapJoinOptimizedStringHashMultiSet.java +++ /dev/null @@ -1,64 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.hive.ql.exec.vector.mapjoin.optimized; - -import java.io.IOException; - -import org.apache.hadoop.hive.ql.exec.JoinUtil.JoinResult; -import org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainer; -import org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainer.ReusableGetAdaptor; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinBytesHashMultiSet; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashMultiSetResult; - -/* - * An multi-key hash map based on the BytesBytesMultiHashMultiSet. - */ -public class VectorMapJoinOptimizedStringHashMultiSet - extends VectorMapJoinOptimizedHashMultiSet - implements VectorMapJoinBytesHashMultiSet { - - private VectorMapJoinOptimizedStringCommon stringCommon; - - /* - @Override - public void putRow(BytesWritable currentKey, BytesWritable currentValue) - throws SerDeException, HiveException, IOException { - - stringCommon.adaptPutRow((VectorMapJoinOptimizedHashTable) this, currentKey, currentValue); - } - */ - - @Override - public JoinResult contains(byte[] keyBytes, int keyStart, int keyLength, - VectorMapJoinHashMultiSetResult hashMultiSetResult) throws IOException { - - SerializedBytes serializedBytes = stringCommon.serialize(keyBytes, keyStart, keyLength); - - return super.contains(serializedBytes.bytes, serializedBytes.offset, serializedBytes.length, - hashMultiSetResult); - - - } - - public VectorMapJoinOptimizedStringHashMultiSet(boolean isOuterJoin, - MapJoinTableContainer originalTableContainer, ReusableGetAdaptor hashMapRowGetter) { - super(originalTableContainer, hashMapRowGetter); - stringCommon = new VectorMapJoinOptimizedStringCommon(isOuterJoin); - } -} \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/optimized/VectorMapJoinOptimizedStringHashSet.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/optimized/VectorMapJoinOptimizedStringHashSet.java deleted file mode 100644 index f921b9c..0000000 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/optimized/VectorMapJoinOptimizedStringHashSet.java +++ /dev/null @@ -1,63 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.hadoop.hive.ql.exec.vector.mapjoin.optimized; - -import java.io.IOException; - -import org.apache.hadoop.hive.ql.exec.JoinUtil.JoinResult; -import org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainer; -import org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainer.ReusableGetAdaptor; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinBytesHashSet; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashSetResult; - -/* - * An multi-key hash map based on the BytesBytesMultiHashSet. - */ -public class VectorMapJoinOptimizedStringHashSet - extends VectorMapJoinOptimizedHashSet - implements VectorMapJoinBytesHashSet { - - private VectorMapJoinOptimizedStringCommon stringCommon; - - /* - @Override - public void putRow(BytesWritable currentKey, BytesWritable currentValue) - throws SerDeException, HiveException, IOException { - - stringCommon.adaptPutRow((VectorMapJoinOptimizedHashTable) this, currentKey, currentValue); - } - */ - - @Override - public JoinResult contains(byte[] keyBytes, int keyStart, int keyLength, - VectorMapJoinHashSetResult hashSetResult) throws IOException { - - SerializedBytes serializedBytes = stringCommon.serialize(keyBytes, keyStart, keyLength); - - return super.contains(serializedBytes.bytes, serializedBytes.offset, serializedBytes.length, - hashSetResult); - - } - - public VectorMapJoinOptimizedStringHashSet(boolean isOuterJoin, - MapJoinTableContainer originalTableContainer, ReusableGetAdaptor hashMapRowGetter) { - super(originalTableContainer, hashMapRowGetter); - stringCommon = new VectorMapJoinOptimizedStringCommon(isOuterJoin); - } -} \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/reducesink/VectorReduceSinkCommonOperator.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/reducesink/VectorReduceSinkCommonOperator.java index 8133aef..b01cbf1 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/reducesink/VectorReduceSinkCommonOperator.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/reducesink/VectorReduceSinkCommonOperator.java @@ -33,9 +33,10 @@ import org.apache.hadoop.hive.ql.exec.vector.VectorSerializeRow; import org.apache.hadoop.hive.ql.exec.vector.VectorizationContext; import org.apache.hadoop.hive.ql.exec.vector.VectorizationContextRegion; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedBatchUtil; import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpression; -import org.apache.hadoop.hive.ql.exec.vector.keyseries.VectorKeySeriesSerialized; +import org.apache.hadoop.hive.ql.exec.vector.keyseries.fast.VectorKeySeriesFast; import org.apache.hadoop.hive.ql.io.HiveKey; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.plan.BaseWork; @@ -67,6 +68,32 @@ private static final String CLASS_NAME = VectorReduceSinkCommonOperator.class.getName(); private static final Log LOG = LogFactory.getLog(CLASS_NAME); + protected abstract String getLoggingPrefix(); + + // For debug tracing: information about the map or reduce task, operator, operator class, etc. + protected transient String loggingPrefix; + + protected String getLoggingPrefix(String className) { + if (loggingPrefix == null) { + initLoggingPrefix(className); + } + return loggingPrefix; + } + + protected void initLoggingPrefix(String className) { + if (hconf == null) { + // Constructor time... 
+ loggingPrefix = className; + } else { + // Determine the name of our map or reduce task for debug tracing. + BaseWork work = Utilities.getMapWork(hconf); + if (work == null) { + work = Utilities.getReduceWork(hconf); + } + loggingPrefix = className + " " + work.getName() + " " + getOperatorId(); + } + } + protected VectorReduceSinkDesc vectorDesc; /** @@ -84,6 +111,7 @@ // This is map of which vectorized row batch columns are the key columns. // And, their types. protected int[] reduceSinkKeyColumnMap; + protected String[] reduceSinkKeyColumnNames; protected TypeInfo[] reduceSinkKeyTypeInfos; // Optional vectorized key expressions that need to be run on each batch. @@ -92,6 +120,7 @@ // This is map of which vectorized row batch columns are the value columns. // And, their types. protected int[] reduceSinkValueColumnMap; + protected String[] reduceSinkValueColumnNames; protected TypeInfo[] reduceSinkValueTypeInfos; // Optional vectorized value expressions that need to be run on each batch. @@ -101,6 +130,8 @@ // transient. //--------------------------------------------------------------------------- + protected transient Configuration hconf; + // Whether there is to be a tag added to the end of each key and the tag value. private transient boolean reduceSkipTag; private transient byte reduceTagByte; @@ -130,7 +161,7 @@ private transient OutputCollector out; // The object that determines equal key series. - protected transient VectorKeySeriesSerialized serializedKeySeries; + protected transient VectorKeySeriesFast serializedKeySeriesFast; private transient long numRows = 0; private transient long cntr = 1; @@ -167,12 +198,26 @@ public VectorReduceSinkCommonOperator(CompilationOpContext ctx, // Since a key expression can be a calculation and the key will go into a scratch column, // we need the mapping and type information. 
reduceSinkKeyColumnMap = vectorReduceSinkInfo.getReduceSinkKeyColumnMap(); + reduceSinkKeyColumnNames = vectorReduceSinkInfo.getReduceSinkKeyColumnNames(); reduceSinkKeyTypeInfos = vectorReduceSinkInfo.getReduceSinkKeyTypeInfos(); reduceSinkKeyExpressions = vectorReduceSinkInfo.getReduceSinkKeyExpressions(); reduceSinkValueColumnMap = vectorReduceSinkInfo.getReduceSinkValueColumnMap(); + reduceSinkValueColumnNames = vectorReduceSinkInfo.getReduceSinkValueColumnNames(); reduceSinkValueTypeInfos = vectorReduceSinkInfo.getReduceSinkValueTypeInfos(); reduceSinkValueExpressions = vectorReduceSinkInfo.getReduceSinkValueExpressions(); + + if (isLogDebugEnabled) { + LOG.debug(getLoggingPrefix() + " VectorReduceSinkCommonOperator constructor reduceSinkKeyColumnMap " + Arrays.toString(reduceSinkKeyColumnMap)); + LOG.debug(getLoggingPrefix() + " VectorReduceSinkCommonOperator constructor reduceSinkKeyColumnNames " + Arrays.toString(reduceSinkKeyColumnNames)); + LOG.debug(getLoggingPrefix() + " VectorReduceSinkCommonOperator constructor reduceSinkKeyTypeInfos " + Arrays.toString(reduceSinkKeyTypeInfos)); + LOG.debug(getLoggingPrefix() + " VectorReduceSinkCommonOperator constructor reduceSinkKeyExpressions " + Arrays.toString(reduceSinkKeyExpressions)); + + LOG.debug(getLoggingPrefix() + " VectorReduceSinkCommonOperator constructor reduceSinkValueColumnMap " + Arrays.toString(reduceSinkValueColumnMap)); + LOG.debug(getLoggingPrefix() + " VectorReduceSinkCommonOperator constructor reduceSinkValueColumnNames " + Arrays.toString(reduceSinkValueColumnNames)); + LOG.debug(getLoggingPrefix() + " VectorReduceSinkCommonOperator constructor reduceSinkValueTypeInfos " + Arrays.toString(reduceSinkValueTypeInfos)); + LOG.debug(getLoggingPrefix() + " VectorReduceSinkCommonOperator constructor reduceSinkValueExpressions " + Arrays.toString(reduceSinkValueExpressions)); + } } // Get the sort order @@ -245,6 +290,7 @@ public VectorReduceSinkCommonOperator(CompilationOpContext ctx, @Override protected void initializeOp(Configuration hconf) throws HiveException { + this.hconf = hconf; super.initializeOp(hconf); if (LOG.isDebugEnabled()) { @@ -295,7 +341,8 @@ protected void initializeOp(Configuration hconf) throws HiveException { throw new HiveException(e); } - valueLazyBinarySerializeWrite = new LazyBinarySerializeWrite(reduceSinkValueColumnMap.length); + valueLazyBinarySerializeWrite = + new LazyBinarySerializeWrite(reduceSinkValueColumnMap.length); valueVectorSerializeRow = new VectorSerializeRow( @@ -322,7 +369,7 @@ public void process(Object row, int tag) throws HiveException { if (batch.size == 0) { if (LOG.isDebugEnabled()) { - LOG.debug(CLASS_NAME + " batch #" + batchCounter + " empty"); + LOG.debug(getLoggingPrefix() + " batch #" + batchCounter + " empty"); } return; } @@ -341,7 +388,7 @@ public void process(Object row, int tag) throws HiveException { } } - serializedKeySeries.processBatch(batch); + serializedKeySeriesFast.processBatch(batch); boolean selectedInUse = batch.selectedInUse; int[] selected = batch.selected; @@ -351,7 +398,7 @@ public void process(Object row, int tag) throws HiveException { int end; int batchIndex; do { - if (serializedKeySeries.getCurrentIsAllNull()) { + if (serializedKeySeriesFast.getCurrentKeyAllNull()) { // Use the same logic as ReduceSinkOperator.toHiveKey. // @@ -368,24 +415,24 @@ public void process(Object row, int tag) throws HiveException { } else { // One serialized key for 1 or more rows for the duplicate keys. 
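
A minimal sketch of the key-series contract the process() loop above relies on: getCurrentLogical(), getCurrentDuplicateCount(), and next() report runs of duplicate keys so the key is serialized (and hashed) once per run instead of once per row. ToyKeySeries and its methods are invented names for illustration, not the VectorKeySeriesFast classes in this patch.

    // Toy key series over a plain int[] of grouped key values.
    final class ToyKeySeries {
      private final int[] keys;     // key value per logical batch position
      private final int size;
      private int logical;          // start of the current run
      private int duplicateCount;   // length of the current run

      ToyKeySeries(int[] keys, int size) { this.keys = keys; this.size = size; }

      boolean first() { logical = 0; return advance(); }
      boolean next()  { logical += duplicateCount; return advance(); }

      private boolean advance() {
        if (logical >= size) {
          return false;
        }
        int end = logical + 1;
        while (end < size && keys[end] == keys[logical]) {
          end++;
        }
        duplicateCount = end - logical;
        return true;
      }

      int getCurrentLogical() { return logical; }
      int getCurrentDuplicateCount() { return duplicateCount; }
      int getCurrentKey() { return keys[logical]; }
    }

    // Usage: serialize the key once per run, emit one (key, value) per row in the run.
    //   ToyKeySeries series = new ToyKeySeries(new int[] {7, 7, 7, 9, 9, 12}, 6);
    //   for (boolean more = series.first(); more; more = series.next()) {
    //     int key = series.getCurrentKey();   // "serialize" once for the whole run
    //     int end = series.getCurrentLogical() + series.getCurrentDuplicateCount();
    //     for (int i = series.getCurrentLogical(); i < end; i++) {
    //       // collect(key, valueOfRow(i));
    //     }
    //   }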
- // LOG.info("reduceSkipTag " + reduceSkipTag + " tag " + tag + " reduceTagByte " + (int) reduceTagByte + " keyLength " + serializedKeySeries.getSerializedLength()); - // LOG.info("process offset " + serializedKeySeries.getSerializedStart() + " length " + serializedKeySeries.getSerializedLength()); - keyLength = serializedKeySeries.getSerializedLength(); + // LOG.info(getLoggingPrefix() + " reduceSkipTag " + reduceSkipTag + " tag " + tag + " reduceTagByte " + (int) reduceTagByte + " keyLength " + serializedKeySeriesFast.getSerializedLength()); + // LOG.info(getLoggingPrefix() + " process offset " + serializedKeySeriesFast.getSerializedStart() + " length " + serializedKeySeriesFast.getSerializedLength()); + keyLength = serializedKeySeriesFast.getSerializedLength(); if (tag == -1 || reduceSkipTag) { - keyWritable.set(serializedKeySeries.getSerializedBytes(), - serializedKeySeries.getSerializedStart(), keyLength); + keyWritable.set(serializedKeySeriesFast.getSerializedBytes(), + serializedKeySeriesFast.getSerializedStart(), keyLength); } else { keyWritable.setSize(keyLength + 1); - System.arraycopy(serializedKeySeries.getSerializedBytes(), - serializedKeySeries.getSerializedStart(), keyWritable.get(), 0, keyLength); + System.arraycopy(serializedKeySeriesFast.getSerializedBytes(), + serializedKeySeriesFast.getSerializedStart(), keyWritable.get(), 0, keyLength); keyWritable.get()[keyLength] = reduceTagByte; } keyWritable.setDistKeyLength(keyLength); - keyWritable.setHashCode(serializedKeySeries.getCurrentHashCode()); + keyWritable.setHashCode(serializedKeySeriesFast.getCurrentHashCode()); } - logical = serializedKeySeries.getCurrentLogical(); - end = logical + serializedKeySeries.getCurrentDuplicateCount(); + logical = serializedKeySeriesFast.getCurrentLogical(); + end = logical + serializedKeySeriesFast.getCurrentDuplicateCount(); do { batchIndex = (selectedInUse ? 
selected[logical] : logical); @@ -396,8 +443,8 @@ public void process(Object row, int tag) throws HiveException { collect(keyWritable, valueBytesWritable); } while (++logical < end); - - if (!serializedKeySeries.next()) { + + if (!serializedKeySeriesFast.next()) { break; } } while (true); @@ -424,7 +471,7 @@ protected void collect(BytesWritable keyWritable, Writable valueWritable) throws } // BytesWritable valueBytesWritable = (BytesWritable) valueWritable; - // LOG.info("VectorReduceSinkCommonOperator collect keyWritable " + keyWritable.getLength() + " " + + // LOG.info(getLoggingPrefix() + " collect keyWritable " + keyWritable.getLength() + " " + // VectorizedBatchUtil.displayBytes(keyWritable.getBytes(), 0, keyWritable.getLength()) + // " valueWritable " + valueBytesWritable.getLength() + // VectorizedBatchUtil.displayBytes(valueBytesWritable.getBytes(), 0, valueBytesWritable.getLength())); diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/reducesink/VectorReduceSinkLongOperator.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/reducesink/VectorReduceSinkLongOperator.java index 325f773..6a84523 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/reducesink/VectorReduceSinkLongOperator.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/reducesink/VectorReduceSinkLongOperator.java @@ -23,7 +23,7 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hive.ql.CompilationOpContext; import org.apache.hadoop.hive.ql.exec.vector.VectorizationContext; -import org.apache.hadoop.hive.ql.exec.vector.keyseries.VectorKeySeriesLongSerialized; +import org.apache.hadoop.hive.ql.exec.vector.keyseries.fast.VectorKeySeriesLongFast; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.plan.OperatorDesc; import org.apache.hadoop.hive.serde2.binarysortable.fast.BinarySortableSerializeWrite; @@ -38,6 +38,10 @@ private static final String CLASS_NAME = VectorReduceSinkLongOperator.class.getName(); private static final Log LOG = LogFactory.getLog(CLASS_NAME); + protected String getLoggingPrefix() { + return super.getLoggingPrefix(CLASS_NAME); + } + // The column number and type information for this one column long reduce key. 
private transient int singleKeyColumn; private transient PrimitiveTypeInfo singleKeyColumnPrimitiveTypeInfo; @@ -71,8 +75,8 @@ protected void initializeOp(Configuration hconf) throws HiveException { singleKeyColumn = reduceSinkKeyColumnMap[0]; singleKeyColumnPrimitiveTypeInfo = (PrimitiveTypeInfo) reduceSinkKeyTypeInfos[0]; - serializedKeySeries = - new VectorKeySeriesLongSerialized( + serializedKeySeriesFast = + new VectorKeySeriesLongFast( singleKeyColumn, singleKeyColumnPrimitiveTypeInfo, keyBinarySortableSerializeWrite); } } \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/reducesink/VectorReduceSinkMultiKeyOperator.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/reducesink/VectorReduceSinkMultiKeyOperator.java index 2027187..927b569 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/reducesink/VectorReduceSinkMultiKeyOperator.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/reducesink/VectorReduceSinkMultiKeyOperator.java @@ -23,7 +23,7 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hive.ql.CompilationOpContext; import org.apache.hadoop.hive.ql.exec.vector.VectorizationContext; -import org.apache.hadoop.hive.ql.exec.vector.keyseries.VectorKeySeriesMultiSerialized; +import org.apache.hadoop.hive.ql.exec.vector.keyseries.fast.VectorKeySeriesMultiFast; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.plan.OperatorDesc; import org.apache.hadoop.hive.serde2.binarysortable.fast.BinarySortableSerializeWrite; @@ -38,6 +38,10 @@ private static final String CLASS_NAME = VectorReduceSinkMultiKeyOperator.class.getName(); private static final Log LOG = LogFactory.getLog(CLASS_NAME); + protected String getLoggingPrefix() { + return super.getLoggingPrefix(CLASS_NAME); + } + // The above members are initialized by the constructor and must not be // transient. 
//--------------------------------------------------------------------------- @@ -64,11 +68,11 @@ public VectorReduceSinkMultiKeyOperator(CompilationOpContext ctx, protected void initializeOp(Configuration hconf) throws HiveException { super.initializeOp(hconf); - VectorKeySeriesMultiSerialized serializedMultiKeySeries = - new VectorKeySeriesMultiSerialized( + VectorKeySeriesMultiFast serializedMultiKeySeries = + new VectorKeySeriesMultiFast( keyBinarySortableSerializeWrite); serializedMultiKeySeries.init(reduceSinkKeyTypeInfos, reduceSinkKeyColumnMap); - serializedKeySeries = serializedMultiKeySeries; + serializedKeySeriesFast = serializedMultiKeySeries; } } \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/reducesink/VectorReduceSinkStringOperator.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/reducesink/VectorReduceSinkStringOperator.java index b655e6e..45797e1e 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/reducesink/VectorReduceSinkStringOperator.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/reducesink/VectorReduceSinkStringOperator.java @@ -23,11 +23,10 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hive.ql.CompilationOpContext; import org.apache.hadoop.hive.ql.exec.vector.VectorizationContext; -import org.apache.hadoop.hive.ql.exec.vector.keyseries.VectorKeySeriesBytesSerialized; +import org.apache.hadoop.hive.ql.exec.vector.keyseries.fast.VectorKeySeriesBytesFast; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.plan.OperatorDesc; import org.apache.hadoop.hive.serde2.binarysortable.fast.BinarySortableSerializeWrite; -import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; /* * Specialized class for native vectorized reduce sink that is reducing on a single long key column. @@ -38,6 +37,10 @@ private static final String CLASS_NAME = VectorReduceSinkStringOperator.class.getName(); private static final Log LOG = LogFactory.getLog(CLASS_NAME); + protected String getLoggingPrefix() { + return super.getLoggingPrefix(CLASS_NAME); + } + // The column number and type information for this one column string reduce key. 
private transient int singleKeyColumn; @@ -69,8 +72,8 @@ protected void initializeOp(Configuration hconf) throws HiveException { singleKeyColumn = reduceSinkKeyColumnMap[0]; - serializedKeySeries = - new VectorKeySeriesBytesSerialized( + serializedKeySeriesFast = + new VectorKeySeriesBytesFast( singleKeyColumn, keyBinarySortableSerializeWrite); } } \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java index d806b97..e781a3d 100644 --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java @@ -41,7 +41,6 @@ import org.apache.hadoop.hive.metastore.api.hive_metastoreConstants; import org.apache.hadoop.hive.ql.exec.*; import org.apache.hadoop.hive.ql.exec.mr.MapRedTask; -import org.apache.hadoop.hive.ql.exec.persistence.MapJoinKey; import org.apache.hadoop.hive.ql.exec.spark.SparkTask; import org.apache.hadoop.hive.ql.exec.tez.TezTask; import org.apache.hadoop.hive.ql.exec.vector.VectorExpressionDescriptor; @@ -61,6 +60,9 @@ import org.apache.hadoop.hive.ql.exec.vector.reducesink.VectorReduceSinkMultiKeyOperator; import org.apache.hadoop.hive.ql.exec.vector.reducesink.VectorReduceSinkStringOperator; import org.apache.hadoop.hive.ql.exec.vector.ColumnVector.Type; +import org.apache.hadoop.hive.ql.exec.vector.ColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.VectorColumnOutputMapping; +import org.apache.hadoop.hive.ql.exec.vector.VectorColumnSourceMapping; import org.apache.hadoop.hive.ql.exec.vector.VectorMapJoinOperator; import org.apache.hadoop.hive.ql.exec.vector.VectorMapJoinOuterFilteredOperator; import org.apache.hadoop.hive.ql.exec.vector.VectorSMBMapJoinOperator; @@ -113,6 +115,7 @@ import org.apache.hadoop.hive.ql.plan.VectorMapJoinDesc.HashTableImplementationType; import org.apache.hadoop.hive.ql.plan.VectorMapJoinDesc.HashTableKeyType; import org.apache.hadoop.hive.ql.plan.VectorMapJoinDesc.HashTableKind; +import org.apache.hadoop.hive.ql.plan.VectorMapJoinInfo; import org.apache.hadoop.hive.ql.plan.VectorReduceSinkDesc; import org.apache.hadoop.hive.ql.plan.VectorReduceSinkInfo; import org.apache.hadoop.hive.ql.plan.VectorPartitionDesc; @@ -167,8 +170,10 @@ import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils; +import com.google.common.base.Preconditions; import com.google.common.base.Joiner; public class Vectorizer implements PhysicalPlanResolver { @@ -1661,15 +1666,15 @@ private boolean validateAggregationIsPrimitive(VectorAggregateExpression vectorA return (outputObjInspector.getCategory() == ObjectInspector.Category.PRIMITIVE); } - private boolean validateAggregationDesc(AggregationDesc aggDesc, boolean isReduceMergePartial, + private boolean validateAggregationDesc(AggregationDesc aggrDesc, boolean isReduceMergePartial, boolean hasKeys) { - String udfName = aggDesc.getGenericUDAFName().toLowerCase(); + String udfName = aggrDesc.getGenericUDAFName().toLowerCase(); if (!supportedAggregationUdfs.contains(udfName)) { LOG.info("Cannot vectorize groupby aggregate expression: UDF " + udfName + " not supported"); return false; } - if (aggDesc.getParameters() != null && 
!validateExprNodeDesc(aggDesc.getParameters())) { + if (aggrDesc.getParameters() != null && !validateExprNodeDesc(aggrDesc.getParameters())) { LOG.info("Cannot vectorize groupby aggregate expression: UDF parameters not supported"); return false; } @@ -1678,7 +1683,7 @@ private boolean validateAggregationDesc(AggregationDesc aggDesc, boolean isReduc VectorizationContext vc = new ValidatorVectorizationContext(); VectorAggregateExpression vectorAggrExpr; try { - vectorAggrExpr = vc.getAggregatorExpression(aggDesc, isReduceMergePartial); + vectorAggrExpr = vc.getAggregatorExpression(aggrDesc, isReduceMergePartial); } catch (Exception e) { // We should have already attempted to vectorize in validateAggregationDesc. if (LOG.isDebugEnabled()) { @@ -1769,7 +1774,8 @@ private boolean isBigTableOnlyResults(MapJoinDesc desc) { } Operator specializeMapJoinOperator(Operator op, - VectorizationContext vContext, MapJoinDesc desc) throws HiveException { + VectorizationContext vContext, MapJoinDesc desc, VectorMapJoinInfo vectorMapJoinInfo) + throws HiveException { Operator vectorOp = null; Class> opClass = null; @@ -1777,14 +1783,7 @@ private boolean isBigTableOnlyResults(MapJoinDesc desc) { VectorMapJoinDesc.HashTableKind hashTableKind = HashTableKind.NONE; VectorMapJoinDesc.HashTableKeyType hashTableKeyType = HashTableKeyType.NONE; - if (HiveConf.getBoolVar(hiveConf, - HiveConf.ConfVars.HIVE_VECTORIZATION_MAPJOIN_NATIVE_FAST_HASHTABLE_ENABLED)) { - hashTableImplementationType = HashTableImplementationType.FAST; - } else { - // Restrict to using BytesBytesMultiHashMap via MapJoinBytesTableContainer or - // HybridHashTableContainer. - hashTableImplementationType = HashTableImplementationType.OPTIMIZED; - } + hashTableImplementationType = HashTableImplementationType.FAST; int joinType = desc.getConds()[0].getType(); @@ -1804,20 +1803,31 @@ private boolean isBigTableOnlyResults(MapJoinDesc desc) { Map> keyExprs = desc.getKeys(); List bigTableKeyExprs = keyExprs.get(posBigTable); if (bigTableKeyExprs.size() == 1) { - String typeName = bigTableKeyExprs.get(0).getTypeString(); - LOG.info("Vectorizer vectorizeOperator map join typeName " + typeName); - if (typeName.equals("boolean")) { + TypeInfo typeInfo = bigTableKeyExprs.get(0).getTypeInfo(); + LOG.info("Vectorizer vectorizeOperator map join typeName " + typeInfo.getTypeName()); + switch (((PrimitiveTypeInfo) typeInfo).getPrimitiveCategory()) { + case BOOLEAN: hashTableKeyType = HashTableKeyType.BOOLEAN; - } else if (typeName.equals("tinyint")) { + break; + case BYTE: hashTableKeyType = HashTableKeyType.BYTE; - } else if (typeName.equals("smallint")) { + break; + case SHORT: hashTableKeyType = HashTableKeyType.SHORT; - } else if (typeName.equals("int")) { + break; + case INT: hashTableKeyType = HashTableKeyType.INT; - } else if (typeName.equals("bigint") || typeName.equals("long")) { + break; + case LONG: hashTableKeyType = HashTableKeyType.LONG; - } else if (VectorizationContext.isStringFamily(typeName)) { + break; + case STRING: + case CHAR: + case VARCHAR: + case BINARY: hashTableKeyType = HashTableKeyType.STRING; + default: + // Stay with multi-key. 
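
A minimal restatement of what the switch above decides: every string-family single key (string, char, varchar, binary) is handled by the single bytes-key STRING specialization, the integer family maps to its own key types, and anything else stays on the generic multi-key path. The sketch below uses plain type-name strings and a local enum instead of PrimitiveCategory/HashTableKeyType; KeyTypeClassifier and KeyType are invented names.

    final class KeyTypeClassifier {
      enum KeyType { NONE, BOOLEAN, BYTE, SHORT, INT, LONG, STRING }

      // Classify a single-column join key by its (simplified) type name.
      static KeyType classifySingleKey(String typeName) {
        switch (typeName) {
          case "boolean":  return KeyType.BOOLEAN;
          case "tinyint":  return KeyType.BYTE;
          case "smallint": return KeyType.SHORT;
          case "int":      return KeyType.INT;
          case "bigint":   return KeyType.LONG;
          case "string":
          case "char":
          case "varchar":
          case "binary":   return KeyType.STRING;
          default:         return KeyType.NONE;   // stay with multi-key
        }
      }
    }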
} } } @@ -1908,12 +1918,10 @@ private boolean isBigTableOnlyResults(MapJoinDesc desc) { throw new HiveException("Unknown join type " + joinType); } break; + default: + throw new RuntimeException("Unexpected hash table key type " + hashTableKeyType.name()); } - vectorOp = OperatorFactory.getVectorOperator( - opClass, op.getCompilationOpContext(), op.getConf(), vContext); - LOG.info("Vectorizer vectorizeOperator map join class " + vectorOp.getClass().getSimpleName()); - boolean minMaxEnabled = HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVE_VECTORIZATION_MAPJOIN_NATIVE_MINMAX_ENABLED); @@ -1922,6 +1930,12 @@ private boolean isBigTableOnlyResults(MapJoinDesc desc) { vectorDesc.setHashTableKind(hashTableKind); vectorDesc.setHashTableKeyType(hashTableKeyType); vectorDesc.setMinMaxEnabled(minMaxEnabled); + vectorDesc.setVectorMapJoinInfo(vectorMapJoinInfo); + + vectorOp = OperatorFactory.getVectorOperator( + opClass, op.getCompilationOpContext(), op.getConf(), vContext); + LOG.info("Vectorizer vectorizeOperator map join class " + vectorOp.getClass().getSimpleName()); + return vectorOp; } @@ -1939,53 +1953,297 @@ private boolean onExpressionHasNullSafes(MapJoinDesc desc) { } private boolean canSpecializeMapJoin(Operator op, MapJoinDesc desc, - boolean isTez) { - - boolean specialize = false; + boolean isTez, VectorizationContext vContext, VectorMapJoinInfo vectorMapJoinInfo) + throws HiveException { - if (op instanceof MapJoinOperator && - HiveConf.getBoolVar(hiveConf, + if (!(op instanceof MapJoinOperator) || + !HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVE_VECTORIZATION_MAPJOIN_NATIVE_ENABLED)) { + return false; + } - // Currently, only under Tez and non-N-way joins. - if (isTez && desc.getConds().length == 1 && !onExpressionHasNullSafes(desc)) { + // Currently, only under Tez and non-N-way joins. + if (!isTez || desc.getConds().length != 1 || onExpressionHasNullSafes(desc)) { + return false; + } - // Ok, all basic restrictions satisfied so far... - specialize = true; + /* + * Populate vectorMapJoininfo. + */ - if (!HiveConf.getBoolVar(hiveConf, - HiveConf.ConfVars.HIVE_VECTORIZATION_MAPJOIN_NATIVE_FAST_HASHTABLE_ENABLED)) { + byte posBigTable = (byte) desc.getPosBigTable(); - // We are using the optimized hash table we have further - // restrictions (using optimized and key type). + List keyDesc = desc.getKeys().get(posBigTable); + VectorExpression[] allBigTableKeyExpressions = vContext.getVectorExpressions(keyDesc); - if (!HiveConf.getBoolVar(hiveConf, - HiveConf.ConfVars.HIVEMAPJOINUSEOPTIMIZEDTABLE)) { - specialize = false; - } else { - byte posBigTable = (byte) desc.getPosBigTable(); - Map> keyExprs = desc.getKeys(); - List bigTableKeyExprs = keyExprs.get(posBigTable); - for (ExprNodeDesc exprNodeDesc : bigTableKeyExprs) { - String typeName = exprNodeDesc.getTypeString(); - if (!MapJoinKey.isSupportedField(typeName)) { - specialize = false; - break; - } + // Since a key expression can be a calculation and the key will go into a scratch column, + // we need the mapping and type information. 
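
A minimal sketch of the mapping pattern this comment describes and the loop that follows implements: for each key, record the batch column its vectorized expression writes to (a real input column for a pass-through reference, a scratch column for a computed key), and keep only the non-pass-through expressions for per-batch evaluation. Expr and KeyColumnMapping below are stand-ins for VectorExpression and IdentityExpression.isColumnOnly, not the real classes.

    final class KeyColumnMapping {
      interface Expr {
        int outputColumn();       // batch column the result lands in
        boolean isColumnOnly();   // true if this is just a pass-through column reference
      }

      // Returns the per-key output column map; computed keys are added to toEvaluate.
      static int[] buildColumnMap(Expr[] keyExprs, java.util.List<Expr> toEvaluate) {
        int[] columnMap = new int[keyExprs.length];
        for (int i = 0; i < keyExprs.length; i++) {
          columnMap[i] = keyExprs[i].outputColumn();
          if (!keyExprs[i].isColumnOnly()) {
            toEvaluate.add(keyExprs[i]);   // only computed keys need per-batch evaluation
          }
        }
        return columnMap;
      }
    }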
+ int[] bigTableKeyColumnMap = new int[allBigTableKeyExpressions.length]; + String[] bigTableKeyColumnNames = new String[allBigTableKeyExpressions.length]; + TypeInfo[] bigTableKeyTypeInfos = new TypeInfo[allBigTableKeyExpressions.length]; + ArrayList bigTableKeyExpressionsList = new ArrayList(); + VectorExpression[] bigTableKeyExpressions; + for (int i = 0; i < bigTableKeyColumnMap.length; i++) { + VectorExpression ve = allBigTableKeyExpressions[i]; + if (!IdentityExpression.isColumnOnly(ve)) { + bigTableKeyExpressionsList.add(ve); + } + bigTableKeyColumnMap[i] = ve.getOutputColumn(); + + ExprNodeDesc exprNode = keyDesc.get(i); + bigTableKeyColumnNames[i] = exprNode.toString(); + bigTableKeyTypeInfos[i] = exprNode.getTypeInfo(); + } + if (bigTableKeyExpressionsList.size() == 0) { + bigTableKeyExpressions = null; + } else { + bigTableKeyExpressions = bigTableKeyExpressionsList.toArray(new VectorExpression[0]); + } + + List bigTableExprs = desc.getExprs().get(posBigTable); + VectorExpression[] allBigTableValueExpressions = vContext.getVectorExpressions(bigTableExprs); + + /* + * Similarly, we need a mapping since a value expression can be a calculation and the value + * will go into a scratch column. + */ + int[] bigTableValueColumnMap = new int[allBigTableValueExpressions.length]; + String[] bigTableValueColumnNames = new String[allBigTableValueExpressions.length]; + TypeInfo[] bigTableValueTypeInfos = new TypeInfo[allBigTableValueExpressions.length]; + ArrayList bigTableValueExpressionsList = new ArrayList(); + VectorExpression[] bigTableValueExpressions; + for (int i = 0; i < bigTableValueColumnMap.length; i++) { + VectorExpression ve = allBigTableValueExpressions[i]; + if (!IdentityExpression.isColumnOnly(ve)) { + bigTableValueExpressionsList.add(ve); + } + bigTableValueColumnMap[i] = ve.getOutputColumn(); + + ExprNodeDesc exprNode = bigTableExprs.get(i); + bigTableValueColumnNames[i] = exprNode.toString(); + bigTableValueTypeInfos[i] = exprNode.getTypeInfo(); + } + if (bigTableValueExpressionsList.size() == 0) { + bigTableValueExpressions = null; + } else { + bigTableValueExpressions = bigTableValueExpressionsList.toArray(new VectorExpression[0]); + } + + vectorMapJoinInfo.setBigTableKeyColumnMap(bigTableKeyColumnMap); + vectorMapJoinInfo.setBigTableKeyColumnNames(bigTableKeyColumnNames); + vectorMapJoinInfo.setBigTableKeyTypeInfos(bigTableKeyTypeInfos); + vectorMapJoinInfo.setBigTableKeyExpressions(bigTableKeyExpressions); + + vectorMapJoinInfo.setBigTableValueColumnMap(bigTableValueColumnMap); + vectorMapJoinInfo.setBigTableValueColumnNames(bigTableValueColumnNames); + vectorMapJoinInfo.setBigTableValueTypeInfos(bigTableValueTypeInfos); + vectorMapJoinInfo.setBigTableValueExpressions(bigTableValueExpressions); + + /* + * Small table information. + */ + VectorColumnOutputMapping bigTableRetainedMapping = + new VectorColumnOutputMapping("Big Table Retained Mapping"); + + VectorColumnOutputMapping bigTableOuterKeyMapping = + new VectorColumnOutputMapping("Big Table Outer Key Mapping"); + + // The order of the fields in the LazyBinary small table value must be used, so + // we use the source ordering flavor for the mapping. + VectorColumnSourceMapping smallTableMapping = + new VectorColumnSourceMapping("Small Table Mapping"); + + Byte[] order = desc.getTagOrder(); + Byte posSingleVectorMapJoinSmallTable = (order[0] == posBigTable ? order[1] : order[0]); + boolean isOuterJoin = !desc.getNoOuterJoin(); + + /* + * Gather up big and small table output result information from the MapJoinDesc. 
+ */ + List bigTableRetainList = desc.getRetainList().get(posBigTable); + int bigTableRetainSize = bigTableRetainList.size(); + + int[] smallTableIndices; + int smallTableIndicesSize; + List smallTableExprs = desc.getExprs().get(posSingleVectorMapJoinSmallTable); + if (desc.getValueIndices() != null && desc.getValueIndices().get(posSingleVectorMapJoinSmallTable) != null) { + smallTableIndices = desc.getValueIndices().get(posSingleVectorMapJoinSmallTable); + smallTableIndicesSize = smallTableIndices.length; + } else { + smallTableIndices = null; + smallTableIndicesSize = 0; + } + + List smallTableRetainList = desc.getRetainList().get(posSingleVectorMapJoinSmallTable); + int smallTableRetainSize = smallTableRetainList.size(); + + int smallTableResultSize = 0; + if (smallTableIndicesSize > 0) { + smallTableResultSize = smallTableIndicesSize; + } else if (smallTableRetainSize > 0) { + smallTableResultSize = smallTableRetainSize; + } + + /* + * Determine the big table retained mapping first so we can optimize out (with + * projection) copying inner join big table keys in the subsequent small table results section. + */ + + // We use a mapping object here so we can build the projection in any order and + // get the ordered by 0 to n-1 output columns at the end. + // + // Also, to avoid copying a big table key into the small table result area for inner joins, + // we reference it with the projection so there can be duplicate output columns + // in the projection. + VectorColumnSourceMapping projectionMapping = new VectorColumnSourceMapping("Projection Mapping"); + + int nextOutputColumn = (order[0] == posBigTable ? 0 : smallTableResultSize); + for (int i = 0; i < bigTableRetainSize; i++) { + + // Since bigTableValueExpressions may do a calculation and produce a scratch column, we + // need to map to the right batch column. + + int retainColumn = bigTableRetainList.get(i); + int batchColumnIndex = bigTableValueColumnMap[retainColumn]; + TypeInfo typeInfo = bigTableValueTypeInfos[i]; + + // With this map we project the big table batch to make it look like an output batch. + projectionMapping.add(nextOutputColumn, batchColumnIndex, typeInfo); + + // Collect columns we copy from the big table batch to the overflow batch. + if (!bigTableRetainedMapping.containsOutputColumn(batchColumnIndex)) { + // Tolerate repeated use of a big table column. + bigTableRetainedMapping.add(batchColumnIndex, batchColumnIndex, typeInfo); + } + + nextOutputColumn++; + } + + /* + * Now determine the small table results. + */ + int firstSmallTableOutputColumn; + firstSmallTableOutputColumn = (order[0] == posBigTable ? bigTableRetainSize : 0); + int smallTableOutputCount = 0; + nextOutputColumn = firstSmallTableOutputColumn; + + // Small table indices has more information (i.e. keys) than retain, so use it if it exists... + String[] bigTableRetainedNames; + if (smallTableIndicesSize > 0) { + smallTableOutputCount = smallTableIndicesSize; + bigTableRetainedNames = new String[smallTableOutputCount]; + + for (int i = 0; i < smallTableIndicesSize; i++) { + if (smallTableIndices[i] >= 0) { + + // Zero and above numbers indicate a big table key is needed for + // small table result "area". + + int keyIndex = smallTableIndices[i]; + + // Since bigTableKeyExpressions may do a calculation and produce a scratch column, we + // need to map the right column. 
+ int batchKeyColumn = bigTableKeyColumnMap[keyIndex]; + bigTableRetainedNames[i] = bigTableKeyColumnNames[keyIndex]; + TypeInfo typeInfo = bigTableKeyTypeInfos[keyIndex]; + + if (!isOuterJoin) { + + // Optimize inner join keys of small table results. + + // Project the big table key into the small table result "area". + projectionMapping.add(nextOutputColumn, batchKeyColumn, typeInfo); + + if (!bigTableRetainedMapping.containsOutputColumn(batchKeyColumn)) { + // If necessary, copy the big table key into the overflow batch's small table + // result "area". + bigTableRetainedMapping.add(batchKeyColumn, batchKeyColumn, typeInfo); } + } else { + + // For outer joins, since the small table key can be null when there is no match, + // we must have a physical (scratch) column for those keys. We cannot use the + // projection optimization used by inner joins above. + + int scratchColumn = vContext.allocateScratchColumn(typeInfo.getTypeName()); + projectionMapping.add(nextOutputColumn, scratchColumn, typeInfo); + + bigTableRetainedMapping.add(batchKeyColumn, scratchColumn, typeInfo); + + bigTableOuterKeyMapping.add(batchKeyColumn, scratchColumn, typeInfo); } } else { - // With the fast hash table implementation, we currently do not support - // Hybrid Grace Hash Join. + // Negative numbers indicate a column to be (deserialize) read from the small table's + // LazyBinary value row. + int smallTableValueIndex = -smallTableIndices[i] - 1; - if (desc.isHybridHashJoin()) { - specialize = false; + ExprNodeDesc smallTableExprNode = smallTableExprs.get(i); + if (!validateExprNodeDesc(smallTableExprNode)) { + return false; } + + bigTableRetainedNames[i] = smallTableExprNode.toString(); + + TypeInfo typeInfo = smallTableExprNode.getTypeInfo(); + + // Make a new big table scratch column for the small table value. + int scratchColumn = vContext.allocateScratchColumn(typeInfo.getTypeName()); + projectionMapping.add(nextOutputColumn, scratchColumn, typeInfo); + + smallTableMapping.add(smallTableValueIndex, scratchColumn, typeInfo); } + nextOutputColumn++; } + } else if (smallTableRetainSize > 0) { + smallTableOutputCount = smallTableRetainSize; + bigTableRetainedNames = new String[smallTableOutputCount]; + + // Only small table values appear in join output result. + + for (int i = 0; i < smallTableRetainSize; i++) { + int smallTableValueIndex = smallTableRetainList.get(i); + + ExprNodeDesc smallTableExprNode = smallTableExprs.get(i); + if (!validateExprNodeDesc(smallTableExprNode)) { + return false; + } + + bigTableRetainedNames[i] = smallTableExprNode.toString(); + + // Make a new big table scratch column for the small table value. + TypeInfo typeInfo = smallTableExprNode.getTypeInfo(); + int scratchColumn = vContext.allocateScratchColumn(typeInfo.getTypeName()); + + projectionMapping.add(nextOutputColumn, scratchColumn, typeInfo); + + smallTableMapping.add(smallTableValueIndex, scratchColumn, typeInfo); + nextOutputColumn++; + } + } else { + bigTableRetainedNames = new String[0]; } - return specialize; + + // Convert dynamic arrays and maps to simple arrays. + + bigTableRetainedMapping.finalize(); + + bigTableOuterKeyMapping.finalize(); + + smallTableMapping.finalize(); + + vectorMapJoinInfo.setBigTableRetainedMapping(bigTableRetainedMapping); + vectorMapJoinInfo.setBigTableOuterKeyMapping(bigTableOuterKeyMapping); + vectorMapJoinInfo.setSmallTableMapping(smallTableMapping); + + projectionMapping.finalize(); + + // Verify we added an entry for each output. 
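
A minimal decoder for the smallTableIndices encoding used in the section above: a non-negative entry i means that output position carries big-table join key #i (projected directly for inner joins, copied into a scratch column for outer joins because the key is null when there is no match), while a negative entry n means field (-n - 1) of the small table's LazyBinary value is deserialized into a scratch column. SmallTableIndexDecoder is an invented name for illustration only.

    final class SmallTableIndexDecoder {
      static void describe(int[] smallTableIndices) {
        for (int i = 0; i < smallTableIndices.length; i++) {
          int idx = smallTableIndices[i];
          if (idx >= 0) {
            // Output position i is the big-table key with index idx.
            System.out.println("output " + i + ": big-table key #" + idx);
          } else {
            // Output position i is a field read from the small table's value row.
            System.out.println("output " + i + ": small-table value field #" + (-idx - 1));
          }
        }
      }
    }

    // Example: describe(new int[] {0, -1, -2}) prints
    //   output 0: big-table key #0
    //   output 1: small-table value field #0
    //   output 2: small-table value field #1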
+ assert projectionMapping.isSourceSequenceGood(); + + vectorMapJoinInfo.setProjectionMapping(projectionMapping); + + return true; } private Operator specializeReduceSinkOperator( @@ -2115,6 +2373,7 @@ private boolean canSpecializeReduceSink(ReduceSinkDesc desc, // Since a key expression can be a calculation and the key will go into a scratch column, // we need the mapping and type information. int[] reduceSinkKeyColumnMap = new int[allKeyExpressions.length]; + String[] reduceSinkKeyColumnNames = new String[allKeyExpressions.length]; TypeInfo[] reduceSinkKeyTypeInfos = new TypeInfo[allKeyExpressions.length]; Type[] reduceSinkKeyColumnVectorTypes = new Type[allKeyExpressions.length]; ArrayList groupByKeyExpressionsList = new ArrayList(); @@ -2122,12 +2381,15 @@ private boolean canSpecializeReduceSink(ReduceSinkDesc desc, for (int i = 0; i < reduceSinkKeyColumnMap.length; i++) { VectorExpression ve = allKeyExpressions[i]; reduceSinkKeyColumnMap[i] = ve.getOutputColumn(); - reduceSinkKeyTypeInfos[i] = keysDescs.get(i).getTypeInfo(); - reduceSinkKeyColumnVectorTypes[i] = - VectorizationContext.getColumnVectorTypeFromTypeInfo(reduceSinkKeyTypeInfos[i]); if (!IdentityExpression.isColumnOnly(ve)) { groupByKeyExpressionsList.add(ve); } + + ExprNodeDesc exprNode = keysDescs.get(i); + reduceSinkKeyColumnNames[i] = exprNode.toString(); + reduceSinkKeyTypeInfos[i] = exprNode.getTypeInfo(); + reduceSinkKeyColumnVectorTypes[i] = + VectorizationContext.getColumnVectorTypeFromTypeInfo(reduceSinkKeyTypeInfos[i]); } if (groupByKeyExpressionsList.size() == 0) { reduceSinkKeyExpressions = null; @@ -2138,33 +2400,39 @@ private boolean canSpecializeReduceSink(ReduceSinkDesc desc, ArrayList valueDescs = desc.getValueCols(); VectorExpression[] allValueExpressions = vContext.getVectorExpressions(valueDescs); - int[] reduceSinkValueColumnMap = new int[valueDescs.size()]; - TypeInfo[] reduceSinkValueTypeInfos = new TypeInfo[valueDescs.size()]; - Type[] reduceSinkValueColumnVectorTypes = new Type[valueDescs.size()]; + int[] reduceSinkValueColumnMap = new int[allValueExpressions.length]; + String[] reduceSinkValueColumnNames = new String[allValueExpressions.length]; + TypeInfo[] reduceSinkValueTypeInfos = new TypeInfo[allValueExpressions.length]; + Type[] reduceSinkValueColumnVectorTypes = new Type[allValueExpressions.length]; ArrayList reduceSinkValueExpressionsList = new ArrayList(); VectorExpression[] reduceSinkValueExpressions; for (int i = 0; i < valueDescs.size(); ++i) { VectorExpression ve = allValueExpressions[i]; reduceSinkValueColumnMap[i] = ve.getOutputColumn(); - reduceSinkValueTypeInfos[i] = valueDescs.get(i).getTypeInfo(); - reduceSinkValueColumnVectorTypes[i] = - VectorizationContext.getColumnVectorTypeFromTypeInfo(reduceSinkValueTypeInfos[i]); if (!IdentityExpression.isColumnOnly(ve)) { reduceSinkValueExpressionsList.add(ve); } + + ExprNodeDesc exprNode = valueDescs.get(i); + reduceSinkValueColumnNames[i] = exprNode.toString(); + reduceSinkValueTypeInfos[i] = exprNode.getTypeInfo(); + reduceSinkValueColumnVectorTypes[i] = + VectorizationContext.getColumnVectorTypeFromTypeInfo(reduceSinkValueTypeInfos[i]); } if (reduceSinkValueExpressionsList.size() == 0) { reduceSinkValueExpressions = null; } else { reduceSinkValueExpressions = reduceSinkValueExpressionsList.toArray(new VectorExpression[0]); } - + vectorReduceSinkInfo.setReduceSinkKeyColumnMap(reduceSinkKeyColumnMap); + vectorReduceSinkInfo.setReduceSinkKeyColumnNames(reduceSinkKeyColumnNames); 
vectorReduceSinkInfo.setReduceSinkKeyTypeInfos(reduceSinkKeyTypeInfos); vectorReduceSinkInfo.setReduceSinkKeyColumnVectorTypes(reduceSinkKeyColumnVectorTypes); vectorReduceSinkInfo.setReduceSinkKeyExpressions(reduceSinkKeyExpressions); vectorReduceSinkInfo.setReduceSinkValueColumnMap(reduceSinkValueColumnMap); + vectorReduceSinkInfo.setReduceSinkValueColumnNames(reduceSinkValueColumnNames); vectorReduceSinkInfo.setReduceSinkValueTypeInfos(reduceSinkValueTypeInfos); vectorReduceSinkInfo.setReduceSinkValueColumnVectorTypes(reduceSinkValueColumnVectorTypes); vectorReduceSinkInfo.setReduceSinkValueExpressions(reduceSinkValueExpressions); @@ -2179,8 +2447,9 @@ private boolean canSpecializeReduceSink(ReduceSinkDesc desc, switch (op.getType()) { case MAPJOIN: { + VectorMapJoinInfo vectorMapJoinInfo = new VectorMapJoinInfo(); MapJoinDesc desc = (MapJoinDesc) op.getConf(); - boolean specialize = canSpecializeMapJoin(op, desc, isTez || isSpark); + boolean specialize = canSpecializeMapJoin(op, desc, isTez || isSpark, vContext, vectorMapJoinInfo); if (!specialize) { @@ -2205,15 +2474,11 @@ private boolean canSpecializeReduceSink(ReduceSinkDesc desc, } else { - // TEMPORARY Until Native Vector Map Join with Hybrid passes tests... - // HiveConf.setBoolVar(physicalContext.getConf(), - // HiveConf.ConfVars.HIVEUSEHYBRIDGRACEHASHJOIN, false); - - vectorOp = specializeMapJoinOperator(op, vContext, desc); + vectorOp = specializeMapJoinOperator(op, vContext, desc, vectorMapJoinInfo); } } break; - + case REDUCESINK: { VectorReduceSinkInfo vectorReduceSinkInfo = new VectorReduceSinkInfo(); diff --git ql/src/java/org/apache/hadoop/hive/ql/plan/VectorMapJoinDesc.java ql/src/java/org/apache/hadoop/hive/ql/plan/VectorMapJoinDesc.java index e1bf1f4..e5e3a54 100644 --- ql/src/java/org/apache/hadoop/hive/ql/plan/VectorMapJoinDesc.java +++ ql/src/java/org/apache/hadoop/hive/ql/plan/VectorMapJoinDesc.java @@ -18,6 +18,8 @@ package org.apache.hadoop.hive.ql.plan; +import com.google.common.base.Preconditions; + /** * VectorGroupByDesc. 
* @@ -32,7 +34,6 @@ public static enum HashTableImplementationType { NONE, - OPTIMIZED, FAST } @@ -59,11 +60,14 @@ private HashTableKeyType hashTableKeyType; private boolean minMaxEnabled; + private VectorMapJoinInfo vectorMapJoinInfo; + public VectorMapJoinDesc() { hashTableImplementationType = HashTableImplementationType.NONE; hashTableKind = HashTableKind.NONE; hashTableKeyType = HashTableKeyType.NONE; minMaxEnabled = false; + vectorMapJoinInfo = null; } public VectorMapJoinDesc(VectorMapJoinDesc clone) { @@ -71,6 +75,7 @@ public VectorMapJoinDesc(VectorMapJoinDesc clone) { this.hashTableKind = clone.hashTableKind; this.hashTableKeyType = clone.hashTableKeyType; this.minMaxEnabled = clone.minMaxEnabled; + this.vectorMapJoinInfo = clone.vectorMapJoinInfo; } public HashTableImplementationType hashTableImplementationType() { @@ -104,4 +109,13 @@ public boolean minMaxEnabled() { public void setMinMaxEnabled(boolean minMaxEnabled) { this.minMaxEnabled = minMaxEnabled; } + + public void setVectorMapJoinInfo(VectorMapJoinInfo vectorMapJoinInfo) { + Preconditions.checkState(vectorMapJoinInfo != null); + this.vectorMapJoinInfo = vectorMapJoinInfo; + } + + public VectorMapJoinInfo getVectorMapJoinInfo() { + return vectorMapJoinInfo; + } } diff --git ql/src/java/org/apache/hadoop/hive/ql/plan/VectorMapJoinInfo.java ql/src/java/org/apache/hadoop/hive/ql/plan/VectorMapJoinInfo.java new file mode 100644 index 0000000..2cf2e72 --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/plan/VectorMapJoinInfo.java @@ -0,0 +1,169 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.plan; + +import org.apache.hadoop.hive.ql.exec.vector.VectorColumnOutputMapping; +import org.apache.hadoop.hive.ql.exec.vector.VectorColumnSourceMapping; +import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpression; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; + +/** + * VectorMapJoinInfo. + * + * A convenience data structure that has information needed to vectorize map join. + * + * It is created by the Vectorizer when it is determining whether it can specialize so the + * information doesn't have to be recreated again and again by the VectorMapJoinOperator's + * constructors and later during execution. 
+ */ +public class VectorMapJoinInfo { + + private static long serialVersionUID = 1L; + + private int[] bigTableKeyColumnMap; + private String[] bigTableKeyColumnNames; + private TypeInfo[] bigTableKeyTypeInfos; + private VectorExpression[] bigTableKeyExpressions; + + private int[] bigTableValueColumnMap; + private String[] bigTableValueColumnNames; + private TypeInfo[] bigTableValueTypeInfos; + private VectorExpression[] bigTableValueExpressions; + + private VectorColumnOutputMapping bigTableRetainedMapping; + private VectorColumnOutputMapping bigTableOuterKeyMapping; + private VectorColumnSourceMapping smallTableMapping; + + private VectorColumnSourceMapping projectionMapping; + + public VectorMapJoinInfo() { + bigTableKeyColumnMap = null; + bigTableKeyColumnNames = null; + bigTableKeyTypeInfos = null; + bigTableKeyExpressions = null; + + bigTableValueColumnMap = null; + bigTableValueColumnNames = null; + bigTableValueTypeInfos = null; + bigTableValueExpressions = null; + + bigTableRetainedMapping = null; + bigTableOuterKeyMapping = null; + smallTableMapping = null; + + projectionMapping = null; + } + + public int[] getBigTableKeyColumnMap() { + return bigTableKeyColumnMap; + } + + public void setBigTableKeyColumnMap(int[] bigTableKeyColumnMap) { + this.bigTableKeyColumnMap = bigTableKeyColumnMap; + } + + public String[] getBigTableKeyColumnNames() { + return bigTableKeyColumnNames; + } + + public void setBigTableKeyColumnNames(String[] bigTableKeyColumnNames) { + this.bigTableKeyColumnNames = bigTableKeyColumnNames; + } + + public TypeInfo[] getBigTableKeyTypeInfos() { + return bigTableKeyTypeInfos; + } + + public void setBigTableKeyTypeInfos(TypeInfo[] bigTableKeyTypeInfos) { + this.bigTableKeyTypeInfos = bigTableKeyTypeInfos; + } + + public VectorExpression[] getBigTableKeyExpressions() { + return bigTableKeyExpressions; + } + + public void setBigTableKeyExpressions(VectorExpression[] bigTableKeyExpressions) { + this.bigTableKeyExpressions = bigTableKeyExpressions; + } + + + public int[] getBigTableValueColumnMap() { + return bigTableValueColumnMap; + } + + public void setBigTableValueColumnMap(int[] bigTableValueColumnMap) { + this.bigTableValueColumnMap = bigTableValueColumnMap; + } + + public String[] getBigTableValueColumnNames() { + return bigTableValueColumnNames; + } + + public void setBigTableValueColumnNames(String[] bigTableValueColumnNames) { + this.bigTableValueColumnNames = bigTableValueColumnNames; + } + + public TypeInfo[] getBigTableValueTypeInfos() { + return bigTableValueTypeInfos; + } + + public void setBigTableValueTypeInfos(TypeInfo[] bigTableValueTypeInfos) { + this.bigTableValueTypeInfos = bigTableValueTypeInfos; + } + + public VectorExpression[] getBigTableValueExpressions() { + return bigTableValueExpressions; + } + + public void setBigTableValueExpressions(VectorExpression[] bigTableValueExpressions) { + this.bigTableValueExpressions = bigTableValueExpressions; + } + + public void setBigTableRetainedMapping(VectorColumnOutputMapping bigTableRetainedMapping) { + this.bigTableRetainedMapping = bigTableRetainedMapping; + } + + public VectorColumnOutputMapping getBigTableRetainedMapping() { + return bigTableRetainedMapping; + } + + public void setBigTableOuterKeyMapping(VectorColumnOutputMapping bigTableOuterKeyMapping) { + this.bigTableOuterKeyMapping = bigTableOuterKeyMapping; + } + + public VectorColumnOutputMapping getBigTableOuterKeyMapping() { + return bigTableOuterKeyMapping; + } + + public void setSmallTableMapping(VectorColumnSourceMapping 
smallTableMapping) { + this.smallTableMapping = smallTableMapping; + } + + public VectorColumnSourceMapping getSmallTableMapping() { + return smallTableMapping; + } + + public void setProjectionMapping(VectorColumnSourceMapping projectionMapping) { + this.projectionMapping = projectionMapping; + } + + public VectorColumnSourceMapping getProjectionMapping() { + return projectionMapping; + } +} \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/plan/VectorReduceSinkDesc.java ql/src/java/org/apache/hadoop/hive/ql/plan/VectorReduceSinkDesc.java index c56bff6..487dd1c 100644 --- ql/src/java/org/apache/hadoop/hive/ql/plan/VectorReduceSinkDesc.java +++ ql/src/java/org/apache/hadoop/hive/ql/plan/VectorReduceSinkDesc.java @@ -31,10 +31,16 @@ private static long serialVersionUID = 1L; public static enum ReduceSinkKeyType { - NONE, - LONG, - STRING, - MULTI_KEY + NONE("none"), + LONG("long"), + STRING("string"), + MULTI_KEY("multi-key"); + + final String displayName; + + ReduceSinkKeyType(String displayName) { + this.displayName = displayName; + } } private ReduceSinkKeyType reduceSinkKeyType; @@ -61,4 +67,12 @@ public void setVectorReduceSinkInfo(VectorReduceSinkInfo vectorReduceSinkInfo) { public VectorReduceSinkInfo getVectorReduceSinkInfo() { return vectorReduceSinkInfo; } + + @Override + public String toString() { + StringBuilder sb = new StringBuilder(); + sb.append("key type: "); + sb.append(reduceSinkKeyType.displayName); + return sb.toString(); + } } diff --git ql/src/java/org/apache/hadoop/hive/ql/plan/VectorReduceSinkInfo.java ql/src/java/org/apache/hadoop/hive/ql/plan/VectorReduceSinkInfo.java index 8c35415..807c26d 100644 --- ql/src/java/org/apache/hadoop/hive/ql/plan/VectorReduceSinkInfo.java +++ ql/src/java/org/apache/hadoop/hive/ql/plan/VectorReduceSinkInfo.java @@ -23,12 +23,12 @@ import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; /** - * VectorGroupByAggregrationInfo. + * VectorReduceSinkInfo. * * A convenience data structure that has information needed to vectorize reduce sink. * * It is created by the Vectorizer when it is determining whether it can specialize so the - * information doesn't have to be recreated again and agains by the VectorReduceSinkOperator's + * information doesn't have to be recreated again and again by the VectorReduceSinkOperator's * constructors and later during execution. 
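+ *
+ * Illustrative sketch only (a hypothetical single-key, single-value layout; the Vectorizer
+ * derives the real column maps and names from the ReduceSinkDesc key and value expressions):
+ *
+ *   VectorReduceSinkInfo vectorReduceSinkInfo = new VectorReduceSinkInfo();
+ *   vectorReduceSinkInfo.setReduceSinkKeyColumnMap(new int[] {0});
+ *   vectorReduceSinkInfo.setReduceSinkKeyColumnNames(new String[] {"key"});
+ *   vectorReduceSinkInfo.setReduceSinkValueColumnMap(new int[] {1});
+ *   vectorReduceSinkInfo.setReduceSinkValueColumnNames(new String[] {"value"});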
*/ public class VectorReduceSinkInfo { @@ -36,22 +36,26 @@ private static long serialVersionUID = 1L; private int[] reduceSinkKeyColumnMap; + private String[] reduceSinkKeyColumnNames; private TypeInfo[] reduceSinkKeyTypeInfos; private Type[] reduceSinkKeyColumnVectorTypes; private VectorExpression[] reduceSinkKeyExpressions; private int[] reduceSinkValueColumnMap; + private String[] reduceSinkValueColumnNames; private TypeInfo[] reduceSinkValueTypeInfos; private Type[] reduceSinkValueColumnVectorTypes; private VectorExpression[] reduceSinkValueExpressions; public VectorReduceSinkInfo() { reduceSinkKeyColumnMap = null; + reduceSinkKeyColumnNames = null; reduceSinkKeyTypeInfos = null; reduceSinkKeyColumnVectorTypes = null; reduceSinkKeyExpressions = null; reduceSinkValueColumnMap = null; + reduceSinkValueColumnNames = null; reduceSinkValueTypeInfos = null; reduceSinkValueColumnVectorTypes = null; reduceSinkValueExpressions = null; @@ -65,6 +69,14 @@ public void setReduceSinkKeyColumnMap(int[] reduceSinkKeyColumnMap) { this.reduceSinkKeyColumnMap = reduceSinkKeyColumnMap; } + public String[] getReduceSinkKeyColumnNames() { + return reduceSinkKeyColumnNames; + } + + public void setReduceSinkKeyColumnNames(String[] reduceSinkKeyColumnNames) { + this.reduceSinkKeyColumnNames = reduceSinkKeyColumnNames; + } + public TypeInfo[] getReduceSinkKeyTypeInfos() { return reduceSinkKeyTypeInfos; } @@ -97,6 +109,14 @@ public void setReduceSinkValueColumnMap(int[] reduceSinkValueColumnMap) { this.reduceSinkValueColumnMap = reduceSinkValueColumnMap; } + public String[] getReduceSinkValueColumnNames() { + return reduceSinkValueColumnNames; + } + + public void setReduceSinkValueColumnNames(String[] reduceSinkValueColumnNames) { + this.reduceSinkValueColumnNames = reduceSinkValueColumnNames; + } + public TypeInfo[] getReduceSinkValueTypeInfos() { return reduceSinkValueTypeInfos; } diff --git ql/src/test/org/apache/hadoop/hive/ql/exec/persistence/TestBytesBytesMultiHashMap.java ql/src/test/org/apache/hadoop/hive/ql/exec/persistence/TestBytesBytesMultiHashMap.java index aed9214..63cef14 100644 --- ql/src/test/org/apache/hadoop/hive/ql/exec/persistence/TestBytesBytesMultiHashMap.java +++ ql/src/test/org/apache/hadoop/hive/ql/exec/persistence/TestBytesBytesMultiHashMap.java @@ -25,11 +25,16 @@ import java.util.List; import java.util.Random; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashMapResult; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableManage.KeyValuePutWriter; import org.apache.hadoop.hive.serde2.ByteStream.RandomAccessOutput; import org.apache.hadoop.hive.serde2.lazybinary.LazyBinaryUtils; import org.apache.hadoop.hive.serde2.SerDeException; import org.apache.hadoop.hive.serde2.WriteBuffers; +import org.apache.hadoop.io.Writable; +import org.apache.hive.common.util.HashCodeUtil; import org.junit.Test; + import static org.junit.Assert.*; public class TestBytesBytesMultiHashMap { @@ -52,10 +57,10 @@ public void testCapacityValidation() { public void testPutGetOne() throws Exception { BytesBytesMultiHashMap map = new BytesBytesMultiHashMap(CAPACITY, LOAD_FACTOR, WB_SIZE); RandomKvSource kv = new RandomKvSource(0, 0); - map.put(kv, -1); + map.put(kv); verifyHashMapResult(map, kv.getLastKey(), kv.getLastValue()); kv = new RandomKvSource(10, 100); - map.put(kv, -1); + map.put(kv); verifyHashMapResult(map, kv.getLastKey(), kv.getLastValue()); } @@ -63,12 +68,12 @@ public void testPutGetOne() throws Exception { public void testPutGetMultiple() 
throws Exception { BytesBytesMultiHashMap map = new BytesBytesMultiHashMap(CAPACITY, LOAD_FACTOR, WB_SIZE); RandomKvSource kv = new RandomKvSource(0, 100); - map.put(kv, -1); + map.put(kv); verifyHashMapResult(map, kv.getLastKey(), kv.getLastValue()); FixedKeyKvSource kv2 = new FixedKeyKvSource(kv.getLastKey(), 0, 100); kv2.values.add(kv.getLastValue()); for (int i = 0; i < 3; ++i) { - map.put(kv2, -1); + map.put(kv2); verifyHashMapResult(map, kv2.key, kv2.values.toArray(new byte[kv2.values.size()][])); } } @@ -76,17 +81,21 @@ public void testPutGetMultiple() throws Exception { @Test public void testGetNonExistent() throws Exception { BytesBytesMultiHashMap map = new BytesBytesMultiHashMap(CAPACITY, LOAD_FACTOR, WB_SIZE); + BytesBytesMultiHashMapFactory factory = new BytesBytesMultiHashMapFactory(); RandomKvSource kv = new RandomKvSource(1, 100); - map.put(kv, -1); + map.put(kv); byte[] key = kv.getLastKey(); key[0] = (byte)(key[0] + 1); FixedKeyKvSource kv2 = new FixedKeyKvSource(kv.getLastKey(), 0, 100); - map.put(kv2, -1); + map.put(kv2); key[0] = (byte)(key[0] + 1); - BytesBytesMultiHashMap.Result hashMapResult = new BytesBytesMultiHashMap.Result(); - map.getValueResult(key, 0, key.length, hashMapResult); + MapJoinHashMapResult hashMapResult = factory.createHashMapResult(); + int hashCode = HashCodeUtil.murmurHash(key, 0, key.length); + map.hashMapLookup(key, 0, key.length, hashCode, hashMapResult); assertTrue(!hashMapResult.hasRows()); - map.getValueResult(key, 0, 0, hashMapResult); + + hashCode = HashCodeUtil.murmurHash(key, 0, 0); + map.hashMapLookup(key, 0, 0, hashCode, hashMapResult); assertTrue(!hashMapResult.hasRows()); } @@ -94,17 +103,20 @@ public void testGetNonExistent() throws Exception { public void testPutWithFullMap() throws Exception { // Make sure the map does not expand; should be able to find space. BytesBytesMultiHashMap map = new BytesBytesMultiHashMap(CAPACITY, 1f, WB_SIZE); + BytesBytesMultiHashMapFactory factory = new BytesBytesMultiHashMapFactory(); UniqueKeysKvSource kv = new UniqueKeysKvSource(); for (int i = 0; i < CAPACITY; ++i) { - map.put(kv, -1); + map.put(kv); } for (int i = 0; i < kv.keys.size(); ++i) { verifyHashMapResult(map, kv.keys.get(i), kv.values.get(i)); } assertEquals(CAPACITY, map.getCapacity()); // Get of non-existent key should terminate.. - BytesBytesMultiHashMap.Result hashMapResult = new BytesBytesMultiHashMap.Result(); - map.getValueResult(new byte[0], 0, 0, hashMapResult); + MapJoinHashMapResult hashMapResult = factory.createHashMapResult(); + byte[] key = new byte[0]; + int hashCode = HashCodeUtil.murmurHash(key, 0, 0); + map.hashMapLookup(key, 0, 0, hashCode, hashMapResult); } @Test @@ -113,7 +125,7 @@ public void testExpand() throws Exception { BytesBytesMultiHashMap map = new BytesBytesMultiHashMap(1, 0.0000001f, WB_SIZE); UniqueKeysKvSource kv = new UniqueKeysKvSource(); for (int i = 0; i < 18; ++i) { - map.put(kv, -1); + map.put(kv); for (int j = 0; j <= i; ++j) { verifyHashMapResult(map, kv.keys.get(j), kv.values.get(j)); } @@ -121,9 +133,14 @@ public void testExpand() throws Exception { assertEquals(1 << 18, map.getCapacity()); } - private void verifyHashMapResult(BytesBytesMultiHashMap map, byte[] key, byte[]... values) { - BytesBytesMultiHashMap.Result hashMapResult = new BytesBytesMultiHashMap.Result(); - byte state = map.getValueResult(key, 0, key.length, hashMapResult); + private void verifyHashMapResult(BytesBytesMultiHashMap map, byte[] key, byte[]... 
values) + throws IOException { + BytesBytesMultiHashMapFactory factory = new BytesBytesMultiHashMapFactory(); + MapJoinHashMapResult hashMapResult = factory.createHashMapResult(); + + int hashCode = HashCodeUtil.murmurHash(key, 0, key.length); + map.hashMapLookup(key, 0, key.length, hashCode, hashMapResult); + byte state = hashMapResult.aliasFilter(); HashSet hs = new HashSet(); int count = 0; if (hashMapResult.hasRows()) { @@ -197,7 +214,7 @@ public void writeValue(RandomAccessOutput dest) throws SerDeException { } } - private static class RandomKvSource implements BytesBytesMultiHashMap.KvSource { + private static class RandomKvSource implements KeyValuePutWriter { private int minLength, maxLength; private final Random rdm = new Random(43); public List keys = new ArrayList(), values = new ArrayList(); @@ -241,5 +258,26 @@ public void writeValue(RandomAccessOutput dest) throws SerDeException { public byte updateStateByte(Byte previousValue) { return (byte)(previousValue == null ? 1 : previousValue + 1); } + + @Override + public void setKeyValue(Writable key, Writable value) + throws SerDeException, IOException { + throw new RuntimeException("Not used"); + } + + @Override + public boolean hasHashCode() { + return false; + } + + @Override + public int getKeyHashCode() throws SerDeException { + return 0; + } + + @Override + public long getLongKey() { + throw new RuntimeException("Not used"); + } } } diff --git ql/src/test/org/apache/hadoop/hive/ql/exec/persistence/TestHashPartition.java ql/src/test/org/apache/hadoop/hive/ql/exec/persistence/TestHashPartition.java index efabd2b..66976e9 100644 --- ql/src/test/org/apache/hadoop/hive/ql/exec/persistence/TestHashPartition.java +++ ql/src/test/org/apache/hadoop/hive/ql/exec/persistence/TestHashPartition.java @@ -24,7 +24,8 @@ @Test public void testHashPartition() throws Exception { - // TODO: wtf? 
- HashPartition hashPartition = new HashPartition(1024, (float) 0.75, 524288, 1, true, null); + // TODO + HashPartition hashPartition = new HashPartition(1024, (float) 0.75, 524288, 1, true, + null, new BytesBytesMultiHashMapFactory()); } } diff --git ql/src/test/org/apache/hadoop/hive/ql/exec/vector/RandomRowObjectSource.java ql/src/test/org/apache/hadoop/hive/ql/exec/vector/RandomRowObjectSource.java index 2d4baa0..3f6ae80 100644 --- ql/src/test/org/apache/hadoop/hive/ql/exec/vector/RandomRowObjectSource.java +++ ql/src/test/org/apache/hadoop/hive/ql/exec/vector/RandomRowObjectSource.java @@ -18,6 +18,7 @@ package org.apache.hadoop.hive.ql.exec.vector; +import java.io.IOException; import java.sql.Date; import java.sql.Timestamp; import java.util.ArrayList; @@ -34,7 +35,20 @@ import org.apache.hadoop.hive.common.type.HiveIntervalYearMonth; import org.apache.hadoop.hive.common.type.HiveVarchar; import org.apache.hadoop.hive.common.type.RandomTypeUtil; +import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.serde.serdeConstants; +import org.apache.hadoop.hive.serde2.ByteStream.Output; +import org.apache.hadoop.hive.serde2.fast.SerializeWrite; +import org.apache.hadoop.hive.serde2.io.ByteWritable; +import org.apache.hadoop.hive.serde2.io.DateWritable; +import org.apache.hadoop.hive.serde2.io.DoubleWritable; +import org.apache.hadoop.hive.serde2.io.HiveCharWritable; +import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable; +import org.apache.hadoop.hive.serde2.io.HiveIntervalDayTimeWritable; +import org.apache.hadoop.hive.serde2.io.HiveIntervalYearMonthWritable; +import org.apache.hadoop.hive.serde2.io.HiveVarcharWritable; +import org.apache.hadoop.hive.serde2.io.ShortWritable; +import org.apache.hadoop.hive.serde2.io.TimestampWritable; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory; @@ -61,6 +75,11 @@ import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils; import org.apache.hadoop.hive.serde2.typeinfo.VarcharTypeInfo; import org.apache.hadoop.io.BooleanWritable; +import org.apache.hadoop.io.BytesWritable; +import org.apache.hadoop.io.FloatWritable; +import org.apache.hadoop.io.IntWritable; +import org.apache.hadoop.io.LongWritable; +import org.apache.hadoop.io.Text; import org.apache.hive.common.util.DateUtils; /** diff --git ql/src/test/org/apache/hadoop/hive/ql/exec/vector/RandomRowUtil.java ql/src/test/org/apache/hadoop/hive/ql/exec/vector/RandomRowUtil.java new file mode 100644 index 0000000..2db749c --- /dev/null +++ ql/src/test/org/apache/hadoop/hive/ql/exec/vector/RandomRowUtil.java @@ -0,0 +1,242 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.exec.vector;
+
+import java.io.IOException;
+import java.sql.Date;
+import java.sql.Timestamp;
+import java.util.Random;
+
+import org.apache.hadoop.hive.common.type.HiveChar;
+import org.apache.hadoop.hive.common.type.HiveDecimal;
+import org.apache.hadoop.hive.common.type.HiveIntervalDayTime;
+import org.apache.hadoop.hive.common.type.HiveIntervalYearMonth;
+import org.apache.hadoop.hive.common.type.HiveVarchar;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.serde2.ByteStream.Output;
+import org.apache.hadoop.hive.serde2.fast.SerializeWrite;
+import org.apache.hadoop.hive.serde2.io.ByteWritable;
+import org.apache.hadoop.hive.serde2.io.DateWritable;
+import org.apache.hadoop.hive.serde2.io.DoubleWritable;
+import org.apache.hadoop.hive.serde2.io.HiveCharWritable;
+import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable;
+import org.apache.hadoop.hive.serde2.io.HiveIntervalDayTimeWritable;
+import org.apache.hadoop.hive.serde2.io.HiveIntervalYearMonthWritable;
+import org.apache.hadoop.hive.serde2.io.HiveVarcharWritable;
+import org.apache.hadoop.hive.serde2.io.ShortWritable;
+import org.apache.hadoop.hive.serde2.io.TimestampWritable;
+import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory;
+import org.apache.hadoop.hive.serde2.typeinfo.DecimalTypeInfo;
+import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
+import org.apache.hadoop.io.BooleanWritable;
+import org.apache.hadoop.io.BytesWritable;
+import org.apache.hadoop.io.FloatWritable;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.Text;
+
+/**
+ * Test utilities for serializing random rows and randomly perturbing rows and batches
+ * (nulls, duplicate values, selection vectors) used by the vectorization tests.
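+ *
+ * Typical usage (sketch mirroring the serde row tests; "serializeWrite", "vectorDeserializeRow",
+ * and "batch" are whatever instances the calling test already set up):
+ *
+ *   Output output = RandomRowUtil.serializeRow(row, source, serializeWrite);
+ *   vectorDeserializeRow.setBytes(output.getData(), 0, output.getLength());
+ *   vectorDeserializeRow.deserializeByValue(batch, batch.size);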
+ */ +public class RandomRowUtil { + + public static Output serializeRow(Object[] row, RandomRowObjectSource source, + SerializeWrite serializeWrite) throws HiveException, IOException { + Output output = new Output(); + serializeWrite.set(output); + PrimitiveTypeInfo[] primitiveTypeInfos = source.primitiveTypeInfos(); + for (int i = 0; i < primitiveTypeInfos.length; i++) { + Object object = row[i]; + if (object == null) { + serializeWrite.writeNull(); + continue; + } + PrimitiveCategory primitiveCategory = primitiveTypeInfos[i].getPrimitiveCategory(); + switch (primitiveCategory) { + case BOOLEAN: + { + BooleanWritable expectedWritable = (BooleanWritable) object; + boolean value = expectedWritable.get(); + serializeWrite.writeBoolean(value); + } + break; + case BYTE: + { + ByteWritable expectedWritable = (ByteWritable) object; + byte value = expectedWritable.get(); + serializeWrite.writeByte(value); + } + break; + case SHORT: + { + ShortWritable expectedWritable = (ShortWritable) object; + short value = expectedWritable.get(); + serializeWrite.writeShort(value); + } + break; + case INT: + { + IntWritable expectedWritable = (IntWritable) object; + int value = expectedWritable.get(); + serializeWrite.writeInt(value); + } + break; + case LONG: + { + LongWritable expectedWritable = (LongWritable) object; + long value = expectedWritable.get(); + serializeWrite.writeLong(value); + } + break; + case DATE: + { + DateWritable expectedWritable = (DateWritable) object; + Date value = expectedWritable.get(); + serializeWrite.writeDate(value); + } + break; + case FLOAT: + { + FloatWritable expectedWritable = (FloatWritable) object; + float value = expectedWritable.get(); + serializeWrite.writeFloat(value); + } + break; + case DOUBLE: + { + DoubleWritable expectedWritable = (DoubleWritable) object; + double value = expectedWritable.get(); + serializeWrite.writeDouble(value); + } + break; + case STRING: + { + Text text = (Text) object; + serializeWrite.writeString(text.getBytes(), 0, text.getLength()); + } + break; + case CHAR: + { + HiveCharWritable expectedWritable = (HiveCharWritable) object; + HiveChar value = expectedWritable.getHiveChar(); + serializeWrite.writeHiveChar(value); + } + break; + case VARCHAR: + { + HiveVarcharWritable expectedWritable = (HiveVarcharWritable) object; + HiveVarchar value = expectedWritable.getHiveVarchar(); + serializeWrite.writeHiveVarchar(value); + } + break; + case BINARY: + { + BytesWritable expectedWritable = (BytesWritable) object; + byte[] bytes = expectedWritable.getBytes(); + int length = expectedWritable.getLength(); + serializeWrite.writeBinary(bytes, 0, length); + } + break; + case TIMESTAMP: + { + TimestampWritable expectedWritable = (TimestampWritable) object; + Timestamp value = expectedWritable.getTimestamp(); + serializeWrite.writeTimestamp(value); + } + break; + case INTERVAL_YEAR_MONTH: + { + HiveIntervalYearMonthWritable expectedWritable = (HiveIntervalYearMonthWritable) object; + HiveIntervalYearMonth value = expectedWritable.getHiveIntervalYearMonth(); + serializeWrite.writeHiveIntervalYearMonth(value); + } + break; + case INTERVAL_DAY_TIME: + { + HiveIntervalDayTimeWritable expectedWritable = (HiveIntervalDayTimeWritable) object; + HiveIntervalDayTime value = expectedWritable.getHiveIntervalDayTime(); + serializeWrite.writeHiveIntervalDayTime(value); + } + break; + case DECIMAL: + { + HiveDecimalWritable expectedWritable = (HiveDecimalWritable) object; + HiveDecimal value = expectedWritable.getHiveDecimal(); + 
serializeWrite.writeHiveDecimal(value, ((DecimalTypeInfo)primitiveTypeInfos[i]).scale()); + } + break; + default: + throw new HiveException("Unexpected primitive category " + primitiveCategory); + } + } + return output; + } + + public static void addRandomNulls(Random rand, Object[][] randomRows) { + int randomRowsLength = randomRows.length; + int columnCount = randomRows[0].length; + int randomPasses = (1 + rand.nextInt(randomRowsLength)) * (1 + rand.nextInt(columnCount)); + for (int p = 0; p < randomPasses; p++) { + int column = rand.nextInt(columnCount); + int row = rand.nextInt(randomRowsLength); + randomRows[row][column] = null; + } + } + + public static void addRandomDuplicates(Random rand, Object[][] randomRows) { + int randomRowsLength = randomRows.length; + int columnCount = randomRows[0].length; + int randomPasses = (1 + rand.nextInt(randomRowsLength)) * (1 + rand.nextInt(columnCount)); + for (int p = 0; p < randomPasses; p++) { + int column = rand.nextInt(columnCount); + int row = rand.nextInt(randomRowsLength); + int duplicateCount = 1 + rand.nextInt(20); + Object duplicateObject = randomRows[row][column]; + for (int d = 1; d < duplicateCount; d++) { + int duplicateRow = row + d; + if (duplicateRow >= randomRowsLength) { + break; + } + randomRows[duplicateRow][column] = duplicateObject; + } + } + } + + public static void addRandomSelectToBatch(Random rand, VectorizedRowBatch batch) { + int factor = rand.nextInt(4); + batch.selectedInUse = true; + int numSel = 0; + for (int i = 0; i < batch.size; i++) { + if (rand.nextInt(4) >= factor) { + batch.selected[numSel++] = i; + } + } + batch.size = numSel; + } + + public static void addRandomNullsToColumnVector(Random rand, ColumnVector colVector, int batchSize) { + int factor = rand.nextInt(4); + for (int i = 0; i < batchSize; i++) { + if (rand.nextInt(4) >= factor) { + colVector.noNulls = false; + colVector.isNull[i] = true; + } + } + } +} diff --git ql/src/test/org/apache/hadoop/hive/ql/exec/vector/TestVectorSerDeRow.java ql/src/test/org/apache/hadoop/hive/ql/exec/vector/TestVectorSerDeRow.java index 7c0c8d1..9d18367 100644 --- ql/src/test/org/apache/hadoop/hive/ql/exec/vector/TestVectorSerDeRow.java +++ ql/src/test/org/apache/hadoop/hive/ql/exec/vector/TestVectorSerDeRow.java @@ -403,133 +403,6 @@ void examineBatch(VectorizedRowBatch batch, VectorExtractRowSameBatch vectorExtr } } - private Output serializeRow(Object[] row, RandomRowObjectSource source, SerializeWrite serializeWrite) throws HiveException, IOException { - Output output = new Output(); - serializeWrite.set(output); - PrimitiveTypeInfo[] primitiveTypeInfos = source.primitiveTypeInfos(); - for (int i = 0; i < primitiveTypeInfos.length; i++) { - Object object = row[i]; - PrimitiveCategory primitiveCategory = primitiveTypeInfos[i].getPrimitiveCategory(); - switch (primitiveCategory) { - case BOOLEAN: - { - BooleanWritable expectedWritable = (BooleanWritable) object; - boolean value = expectedWritable.get(); - serializeWrite.writeBoolean(value); - } - break; - case BYTE: - { - ByteWritable expectedWritable = (ByteWritable) object; - byte value = expectedWritable.get(); - serializeWrite.writeByte(value); - } - break; - case SHORT: - { - ShortWritable expectedWritable = (ShortWritable) object; - short value = expectedWritable.get(); - serializeWrite.writeShort(value); - } - break; - case INT: - { - IntWritable expectedWritable = (IntWritable) object; - int value = expectedWritable.get(); - serializeWrite.writeInt(value); - } - break; - case LONG: - { - LongWritable 
expectedWritable = (LongWritable) object; - long value = expectedWritable.get(); - serializeWrite.writeLong(value); - } - break; - case DATE: - { - DateWritable expectedWritable = (DateWritable) object; - Date value = expectedWritable.get(); - serializeWrite.writeDate(value); - } - break; - case FLOAT: - { - FloatWritable expectedWritable = (FloatWritable) object; - float value = expectedWritable.get(); - serializeWrite.writeFloat(value); - } - break; - case DOUBLE: - { - DoubleWritable expectedWritable = (DoubleWritable) object; - double value = expectedWritable.get(); - serializeWrite.writeDouble(value); - } - break; - case STRING: - { - Text text = (Text) object; - serializeWrite.writeString(text.getBytes(), 0, text.getLength()); - } - break; - case CHAR: - { - HiveCharWritable expectedWritable = (HiveCharWritable) object; - HiveChar value = expectedWritable.getHiveChar(); - serializeWrite.writeHiveChar(value); - } - break; - case VARCHAR: - { - HiveVarcharWritable expectedWritable = (HiveVarcharWritable) object; - HiveVarchar value = expectedWritable.getHiveVarchar(); - serializeWrite.writeHiveVarchar(value); - } - break; - case BINARY: - { - BytesWritable expectedWritable = (BytesWritable) object; - byte[] bytes = expectedWritable.getBytes(); - int length = expectedWritable.getLength(); - serializeWrite.writeBinary(bytes, 0, length); - } - break; - case TIMESTAMP: - { - TimestampWritable expectedWritable = (TimestampWritable) object; - Timestamp value = expectedWritable.getTimestamp(); - serializeWrite.writeTimestamp(value); - } - break; - case INTERVAL_YEAR_MONTH: - { - HiveIntervalYearMonthWritable expectedWritable = (HiveIntervalYearMonthWritable) object; - HiveIntervalYearMonth value = expectedWritable.getHiveIntervalYearMonth(); - serializeWrite.writeHiveIntervalYearMonth(value); - } - break; - case INTERVAL_DAY_TIME: - { - HiveIntervalDayTimeWritable expectedWritable = (HiveIntervalDayTimeWritable) object; - HiveIntervalDayTime value = expectedWritable.getHiveIntervalDayTime(); - serializeWrite.writeHiveIntervalDayTime(value); - } - break; - case DECIMAL: - { - HiveDecimalWritable expectedWritable = (HiveDecimalWritable) object; - HiveDecimal value = expectedWritable.getHiveDecimal(); - serializeWrite.writeHiveDecimal(value, ((DecimalTypeInfo)primitiveTypeInfos[i]).scale()); - } - break; - default: - throw new HiveException("Unexpected primitive category " + primitiveCategory); - } - } - return output; - } - private Properties createProperties(String fieldNames, String fieldTypes) { Properties tbl = new Properties(); @@ -612,7 +485,7 @@ void testVectorDeserializeRow(int caseNum, Random r, SerializationType serializa for (int i = 0; i < randomRows.length; i++) { Object[] row = randomRows[i]; - Output output = serializeRow(row, source, serializeWrite); + Output output = RandomRowUtil.serializeRow(row, source, serializeWrite); vectorDeserializeRow.setBytes(output.getData(), 0, output.getLength()); vectorDeserializeRow.deserializeByValue(batch, batch.size); batch.size++; diff --git ql/src/test/org/apache/hadoop/hive/ql/exec/vector/keyseries/TestVectorKeySeries.java ql/src/test/org/apache/hadoop/hive/ql/exec/vector/keyseries/TestVectorKeySeries.java new file mode 100644 index 0000000..c7dec0e --- /dev/null +++ ql/src/test/org/apache/hadoop/hive/ql/exec/vector/keyseries/TestVectorKeySeries.java @@ -0,0 +1,307 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.vector.keyseries; + +import java.io.IOException; +import java.util.Arrays; +import java.util.Random; + +import org.apache.hadoop.hive.common.type.RandomTypeUtil; +import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.RandomRowUtil; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; +import org.apache.hadoop.hive.ql.exec.vector.expressions.StringExpr; +import org.apache.hadoop.hive.ql.exec.vector.keyseries.VectorKeySeriesLong; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; + +import junit.framework.TestCase; + +/** + * Unit test for the vectorized key series. + */ +public class TestVectorKeySeries extends TestCase { + + private void validateKeySeriesLongArray(VectorizedRowBatch batch, long[] test) throws IOException { + VectorKeySeriesLong longKeySeries = + new VectorKeySeriesLong(0, TypeInfoFactory.longTypeInfo); + longKeySeries.processBatch(batch); + int logicalIndex = 0; + for (int k = 0; k < longKeySeries.getKeyCount(); k++) { + assertTrue(logicalIndex == longKeySeries.getCurrentLogical()); + int duplicateCount = longKeySeries.getCurrentDuplicateCount(); + if (!longKeySeries.getCurrentKeyAllNull()) { + long currentKey = longKeySeries.getCurrentKey(); + + for (int d = 0; d < duplicateCount; d++) { + int batchIndex = batch.selectedInUse ? 
batch.selected[logicalIndex + d] : logicalIndex + d; + assertTrue(batch.cols[0].noNulls || !batch.cols[0].isNull[batchIndex]); + assertTrue(test[batchIndex] == currentKey); + } + } + logicalIndex += duplicateCount; + boolean isNext = longKeySeries.next(); + if (k + 1 < longKeySeries.getKeyCount()) { + assertTrue(isNext); + } else { + assertTrue(!isNext); + } + } + assertEquals(logicalIndex, batch.size); + } + + private void validateLongArray(Random rand, long[] test) throws IOException { + VectorizedRowBatch batch = new VectorizedRowBatch(1); + + LongColumnVector longColVector = new LongColumnVector(VectorizedRowBatch.DEFAULT_SIZE); + batch.cols[0] = longColVector; + System.arraycopy(test, 0, longColVector.vector, 0, test.length); + batch.size = test.length; + + validateKeySeriesLongArray(batch, test); + int saveBatchSize = batch.size; + RandomRowUtil.addRandomSelectToBatch(rand, batch); + if (batch.size > 0) { + validateKeySeriesLongArray(batch, test); + } + batch.selectedInUse = false; + batch.size = saveBatchSize; + RandomRowUtil.addRandomNullsToColumnVector(rand, batch.cols[0], batch.size); + validateKeySeriesLongArray(batch, test); + RandomRowUtil.addRandomSelectToBatch(rand, batch); + if (batch.size > 0) { + validateKeySeriesLongArray(batch, test); + } + } + + private void validateLongRepeating(Random rand, int count, int value) throws IOException { + VectorizedRowBatch batch = new VectorizedRowBatch(1); + + LongColumnVector longColVector = new LongColumnVector(VectorizedRowBatch.DEFAULT_SIZE); + batch.cols[0] = longColVector; + longColVector.isRepeating = true; + longColVector.vector[0] = value; + batch.size = count; + long[] test = new long[count]; + Arrays.fill(test, value); + + validateKeySeriesLongArray(batch, test); + RandomRowUtil.addRandomSelectToBatch(rand, batch); + if (batch.size > 0) { + validateKeySeriesLongArray(batch, test); + } + } + public void testVectorKeySeriesLong() throws Throwable { + + Random rand = new Random(812); + + long[] test = {1906L, -7838598833900584960L, 1165L, -7456869587112255488L, 2013L, 7333512171174223872L, -7571293705217687552L, -8523434203900674048L, 8099215208813903872L, 9040958359122640896L, -7356685674003021824L, 2072L, 2073L, 871L, -7551394356730339328L, -8172827216441573376L, -8082793390939193344L, 8854495099223375872L, -8358130693961195520L, 9050032047355125760L, -7162299524557471744L, 809L, -8946656952763777024L, 1053L, 482L, -6968892545529896960L, 203L, 1614L, -8593419958317056000L, 8190967051000659968L, 808L, 1L, 412L, 8656571350884048896L, 8769199243315814400L, -8546758906409312256L, 8829545979081744384L, 7545689659010949120L, 618L, 8573305425181941760L, 94L, -7266719102957125632L, 2772L, 379L, 8302473563519950848L, -7802538500225777664L, -7273694358642851840L, 8987827141270880256L, 914L, 723L, -7104310188119834624L, 154L, -8494118409594650624L, 3781L, 1466L, 724L, -7270034223527993344L, 913L, 1704L, 1L, -7883252982752665600L, 412L, 4024L, 7226360892091416576L, -8244657976255889408L, 2862L, 1521L, 1L, 7386087924003676160L, 3907L, 7818464507324121088L, -8293833565967810560L, -7892780594910871552L, 1509L, 7592440105065308160L, -7600138468036386816L, 9064847977742032896L, -8379964450833367040L, 7217123582035116032L, -7879864376629567488L, 2878L, 2412L, 524L, 784L, -7046180371529351168L, 471L, 612L, 8368012468775608320L, -7547245548870025216L, 3841L, 8752150411997356032L, -8623965248051789824L, 7637152193832886272L, 9191943992860327936L, 2700L, 9180098147855769600L, 1775L, 797L, -7773957003968675840L, -8660149447361404928L, 
8641221723991433216L, 392L, 1L, 8489735221193138176L, 7944741547145502720L, 6933731240564056064L, 9083704659251798016L, -9084940280061485056L, 8222714144797368320L, 8817665768680906752L, 1995L, 1561L, 2485L, 1826L, 845L, 8376440110255243264L, 9075404705968840704L, -8379109122834997248L, -6938706403992854528L, 961L, 1422L, 9149216169284091904L, 2752L, 2255L, -9080568167841226752L, 1046L, 7926898770090491904L, 7784489776013295616L, 6991316084916879360L, 1566L, 1671L, -8543982423727128576L, -8832750849949892608L, 6963217546192322560L, 236L, 7086206629592252416L, 9053187076403060736L, -8067243114610532352L, 1751L, 2502L, 294L, 7892281003266408448L, 8577096957495025664L, -8665764757143658496L, 2855L, 2811L, 8785153741735616512L, 1726L, 7186401810812059648L, -7603569103205916672L, 4018L, 3566L, 2725L, 1234L, 346L, 7961515985722605568L, 7274777328897802240L, -6933565857643814912L, -8330233444291084288L, 34L, 7080269176324218880L, 2941L, 9117063974299148288L, -6917607783359897600L, -8566940231897874432L, -8710298418608619520L, 1520L, 3728L, -8835408234247168000L, 7705445437881278464L, 6926925215281774592L, 835L, 1L, 3232L, -7840338174858199040L, 7748799008146366464L, 7410096605330227200L, 188L, 1L, -7709958788604936192L, -6920172215209426944L, -9109392978217484288L, 3608L, -8214462866994339840L, 2306L, -7759238919361888256L, -8922409715403112448L, 3664L, -9203942396257984512L, 8116738401948377088L, 1791L, -7419068456205385728L, 8795069490394882048L, 3043L, 3174L, 7625728883085025280L, -8585134536083660800L, 8558000156325707776L, -8572949572756774912L, 661L, 2393L, -7800879252150779904L, 7534549597202194432L, -7642381493746483200L, -7330413050756235264L, 7596563216912211968L, 3307L, 2971L, 2285L, 1880L, 4088L, 743L, -8317591428117274624L, 8854715632851345408L, 7768984605670604800L, 2900L, 7062605127422894080L, 7394967727502467072L, 1781L, 7238339720750948352L, 1638L, 1L, -8522878384019169280L, -8051587217208967168L, -7425160895830573056L, 7344029858387820544L, -8013397854633648128L, 8808467247666241536L, -8768744394742235136L, 9185458640237641728L, -7686220526274502656L, -8203075743525806080L, 3462L, 6964585306125008896L, 3418L, 3366L, -7867219225874571264L, 8367680396909404160L, 7524958388842078208L, 2897L, 8391785334471589888L, -8581979259158929408L, 587L, 130L, 1030L, 8362046808797306880L, 3691L, 7454632396542074880L, 7125231541858205696L, 2580L, 2512L, 7061498706968428544L, -7255686273677328384L, 9048002942653710336L, 8868529429494071296L, 8815398225009967104L, 7128222874437238784L, 8371939471056470016L, -8335810316927213568L, -7144791190333546496L, 1L, 1L, -7572962089372991488L, 8850055384477401088L, 2626L, 3599L, 213L, 2232L, -8297230235506343936L, 3430L, 391L, -7395343938785738752L, 9038087402564657152L, -9013952631912325120L, 3446L, -8703026916864802816L, -7833618000492109824L, 1541L, 8759184090543857664L, -7042183597114081280L, -7147490721376591872L, 3725L, 7961909238130270208L, -8930307926221807616L, 2719L, -6988970700649168896L, -7155539549555105792L, 3625L, 8113585123802529792L, 9207927479837319168L, -8387347109404286976L, 1L, 8896237972875370496L, 8372408423196270592L, 922L, 7255302164215013376L, -8585966098173870080L, 8424515140664360960L, -6997233584896229376L, 8087737899452432384L, 1493L, 8779711700787298304L, 2533L, 1L, 8017403886247927808L, 1282L, 2177L, -8632237187473088512L, 8109381965028548608L, 1157L, 7378993334503694336L, 1L, 2560L, 4037L, -8562524688907485184L, 2325L, 6962726713896484864L, 8120593157178228736L, 6924820982050758656L, -7366430883634929664L, 
-7209060152494817280L, -8689606130068611072L, 3190L, 3725L, -8581765103969312768L, 1L, 3542L, 8553195689344991232L, 1789L, 8698055291501543424L, 296L, -9095689235523264512L, 7998687089080467456L, 8160569434550403072L, 489L, -9175038118837149696L, 8571268359622172672L, -7916510129632296960L, 8323460620425330688L, 346L, 3980L, -7707242953271500800L, 1811L, 2803L, 7370078518278397952L, 7497276415392407552L, 2323L, 8467976965865799680L, 691L, 1914L, 6982145326341423104L, -9203804401302323200L, 7823874904139849728L, 7534145866886782976L, 9085434340468473856L, 8579974641030365184L, 8536948829863198720L, 341L, -9102482277760983040L, 658L, 1L, 2843L, 7584007864107778048L, 590L, 8899122608190930944L, 3588L, 3609L, 3824L, 7690986322714066944L, 7765456790394871808L, -8649711322250362880L, 1948L, -9101953184875757568L, 2463L, 1813L, 7054271419461812224L, 7548958830580563968L, -9206329156028112896L, 2637L, -7661250850555633664L, 664L, 2487L, 8221561626658881536L, 8169878743136043008L, 6927260280037097472L, 342L, -7501803640821456896L, 2745L, 677L, 8435912708683087872L, 7412924364686458880L, 3563L, 1L, 7153922334283776000L, 8849475396952514560L, 2977L, -7910019233726242816L, 2835L, 2335L, 1L, 2515L, -7617860842651017216L, -7637755520917741568L, 2647L, 707L, 8856674723376668672L, 7857878068300898304L, -8887058200926093312L, 108L, 2762L, 3622L, 868L, 138L, 1786L, 9116137265342169088L, 7955126053367119872L, 491L, -7778829032042790912L, -7753051494275432448L, 8962097525980225536L, 8163948965373386752L, 1145L, -8438554249514491904L, 522L, 1785L, 1545L, 999L, 1941L, 1L, 7454442625055145984L, 3510L, 2373L, -8127494999848919040L, 1643L, -7819437864839495680L, -7822452149325094912L, 7411793502161182720L, 2274L, 8783241818558193664L, 8316336224427483136L, -7669169138124275712L, 2984L, -7772064021830574080L, 3397L, 1L, 8523972434954510336L, -7127548949860818944L, 8286706213485297664L, 3147L, -7536330682873937920L, -7115054815375073280L, -7319315187617587200L, 1099L, -8989473881707921408L, 2816L, -6986178228432322560L, -7759425383684849664L, -7893577088764174336L, 8091421389575282688L, -7409653086454030336L, 7348598907182800896L, -7362189611124563968L, 1L, 2465L, 350L, 2619L, 3722L, 898L, 782L, 1780L, 2186L, -6921654334727036928L, 4020L, 8325227661920133120L, -7036607470351654912L, 7304839835188609024L, 8792059919353348096L, -8856821118526734336L, 8720504651219001344L, 1055L, 1368L, 8736061027343859712L, 7919597361814577152L, 7381659098423926784L, 8731960288562044928L, -7594824008626372608L, -9178166810751909888L, 3083L, -8948335470186373120L, 2569L, 823L, 259L, 8461498293348065280L, -8961059046745669632L, -8607195685207408640L, -8754966081778565120L, -8418913260807217152L, -8877053610728161280L, -6935548339131138048L, -8219876839318716416L, 1132L, 1337L, 1341L, 976L, -7557017910095650816L, 1L, -8683802826440105984L, 1845L, 1965L, -8104684579106914304L, 1835L, 7345991518378442752L, 3212L, -7081500255163727872L, 1074L, 8372588378498777088L, -7593363318079610880L, -7451660755269853184L, 1983L, 8514851182589771776L, 1864L, 8463868417649524736L, 3094L, -8858063395050110976L, 1981L, -8140349174954893312L, -7041362811802148864L, 8972161729142095872L, 7989119273552158720L, 2469L, 1481L, -8566856504746352640L, 8272001752345690112L, -7094827141662539776L, 8396433451610652672L, -7679894005808693248L, 8613562211893919744L, 3407L, 7686992843032010752L, 1048L, 3507L, 7784169796350730240L, 8551446856960942080L, 3467L, 1458L, 213L, 735L, 9190466190353661952L, -8280276629934981120L, -7895991410072928256L, -9145593811310010368L, 
8059284960252731392L, 367L, 7614435638888210432L, 9174894805640142848L, -8941201923743703040L, 1075L, 7492436934952574976L, -8714995808835444736L, 7782245855193874432L, 8525894870444638208L, -7661192563533062144L, 1L, 8995562121346260992L, 7626715182847090688L, 8146492373537660928L, 7682327310082531328L, 2968L, 7309156463509061632L, 1955L, 1L, 7022349041913978880L, 7045967493826387968L, 3006L, 65L, 8168742078705262592L, 7212016545671348224L, 8079573715140485120L, 3965L, 8555933456197828608L, 2903L, 7648729477297987584L, 1L, 8223732800007864320L, -7412431471807283200L, 2560L, 2988L, 1243L, 1837L, 7014537632150224896L, 3747L, 2682L, 8073733016154431488L, 2938L, 1312L, 7006803044329021440L, 7701723309715685376L, 7528074274555305984L, -7532751268425261056L, 8000440057238052864L, -7964801953178091520L, 2846L, 8723248113030782976L, 7440265908266827776L, 927L, -7063777488249085952L, 9194388393453060096L, 7720187583697502208L, 8557218322962644992L, 950L, 2189L, 1371L, 7370803940448305152L, -8914039133569400832L, 3663L, 2341L, -8877431933441327104L, 8171188598958407680L, 8525336514806317056L, 1608L, -7094189393339678720L, 1752L, 3084L, 3673L, 9169248521377374208L, -7866079955473989632L, -9004892183139811328L, 1892L, 6928080429732536320L, -7623047151287754752L, 2492L, -7695491171376291840L, -7797151404935618560L, 8208354137450766336L, -7395553021620731904L, -8453491903284994048L, -7140008543769042944L, 2724L, 3443L, -7512297136103800832L, 9136234417125007360L, 8192304692696383488L, 8199513544090730496L, 311L, -8488247955875618816L, 1L, 2540L, 586L, -7444070205513138176L, 1141L, -8076479329071955968L, 3103L, -7629401308029976576L, -7507424948896415744L, 2821L, 2017L, 1134L, 347L, -7246123871306244096L, 2020L, 1693L, 2020L, 8570983266408103936L, 2919L, 2283L, 7534042483076857856L, 1L, 8991442360387584000L, -7240213957902663680L, 3365L, 1899L, 7199539820886958080L, 7165364563962191872L, 8407869317250220032L, 1489L, 2400L, -7037375807670501376L, 7235109456886816768L, 8569030475428511744L, 2067L, 8332670681629106176L, 168L, 1L, -83}; + + validateLongArray(rand, test); + + validateLongRepeating(rand, 1, 5); + validateLongRepeating(rand, 20, 0); + validateLongRepeating(rand, 1024, 9383844); + } + + private void validateKeySeriesDoubleArray(VectorizedRowBatch batch, double[] test) throws IOException { + VectorKeySeriesDouble doubleKeySeries = new VectorKeySeriesDouble(0, TypeInfoFactory.doubleTypeInfo); + doubleKeySeries.processBatch(batch); + int logicalIndex = 0; + for (int k = 0; k < doubleKeySeries.getKeyCount(); k++) { + assertTrue(logicalIndex == doubleKeySeries.getCurrentLogical()); + int duplicateCount = doubleKeySeries.getCurrentDuplicateCount(); + if (!doubleKeySeries.getCurrentKeyAllNull()) { + double currentKey = doubleKeySeries.getCurrentDoubleKey(); + + for (int d = 0; d < duplicateCount; d++) { + int batchIndex = batch.selectedInUse ? 
batch.selected[logicalIndex + d] : logicalIndex + d; + assertTrue(batch.cols[0].noNulls || !batch.cols[0].isNull[batchIndex]); + assertTrue(test[batchIndex] == currentKey); + } + } + logicalIndex += duplicateCount; + boolean isNext = doubleKeySeries.next(); + if (k + 1 < doubleKeySeries.getKeyCount()) { + assertTrue(isNext); + } else { + assertTrue(!isNext); + } + } + assertEquals(logicalIndex, batch.size); + } + + private void validateDoubleArray(Random rand, double[] test) throws IOException { + VectorizedRowBatch batch = new VectorizedRowBatch(1); + + DoubleColumnVector doubleColVector = new DoubleColumnVector(VectorizedRowBatch.DEFAULT_SIZE); + batch.cols[0] = doubleColVector; + System.arraycopy(test, 0, doubleColVector.vector, 0, test.length); + batch.size = test.length; + + validateKeySeriesDoubleArray(batch, test); + int saveBatchSize = batch.size; + RandomRowUtil.addRandomSelectToBatch(rand, batch); + if (batch.size > 0) { + validateKeySeriesDoubleArray(batch, test); + } + batch.selectedInUse = false; + batch.size = saveBatchSize; + RandomRowUtil.addRandomNullsToColumnVector(rand, batch.cols[0], batch.size); + validateKeySeriesDoubleArray(batch, test); + RandomRowUtil.addRandomSelectToBatch(rand, batch); + if (batch.size > 0) { + validateKeySeriesDoubleArray(batch, test); + } + } + + private void validateDoubleRepeating(Random rand, int count, int value) throws IOException { + VectorizedRowBatch batch = new VectorizedRowBatch(1); + + DoubleColumnVector doubleColVector = new DoubleColumnVector(VectorizedRowBatch.DEFAULT_SIZE); + batch.cols[0] = doubleColVector; + doubleColVector.isRepeating = true; + doubleColVector.vector[0] = value; + batch.size = count; + double[] test = new double[count]; + Arrays.fill(test, value); + + validateKeySeriesDoubleArray(batch, test); + RandomRowUtil.addRandomSelectToBatch(rand, batch); + if (batch.size > 0) { + validateKeySeriesDoubleArray(batch, test); + } + } + + public void testVectorKeySeriesDouble() throws Throwable { + + Random rand = new Random(3452); + + double[] test = {1906L, -7838598833900584960L, 1165L, -7456869587112255488L, 2013L, 7333512171174223872L, -7571293705217687552L, -8523434203900674048L, 8099215208813903872L, 9040958359122640896L, -7356685674003021824L, 2072L, 2073L, 871L, -7551394356730339328L, -8172827216441573376L, -8082793390939193344L, 8854495099223375872L, -8358130693961195520L, 9050032047355125760L, -7162299524557471744L, 809L, -8946656952763777024L, 1053L, 482L, -6968892545529896960L, 203L, 1614L, -8593419958317056000L, 8190967051000659968L, 808L, 1L, 412L, 8656571350884048896L, 8769199243315814400L, -8546758906409312256L, 8829545979081744384L, 7545689659010949120L, 618L, 8573305425181941760L, 94L, -7266719102957125632L, 2772L, 379L, 8302473563519950848L, -7802538500225777664L, -7273694358642851840L, 8987827141270880256L, 914L, 723L, -7104310188119834624L, 154L, -8494118409594650624L, 3781L, 1466L, 724L, -7270034223527993344L, 913L, 1704L, 1L, -7883252982752665600L, 412L, 4024L, 7226360892091416576L, -8244657976255889408L, 2862L, 1521L, 1L, 7386087924003676160L, 3907L, 7818464507324121088L, -8293833565967810560L, -7892780594910871552L, 1509L, 7592440105065308160L, -7600138468036386816L, 9064847977742032896L, -8379964450833367040L, 7217123582035116032L, -7879864376629567488L, 2878L, 2412L, 524L, 784L, -7046180371529351168L, 471L, 612L, 8368012468775608320L, -7547245548870025216L, 3841L, 8752150411997356032L, -8623965248051789824L, 7637152193832886272L, 9191943992860327936L, 2700L, 9180098147855769600L, 1775L, 797L, 
-7773957003968675840L, -8660149447361404928L, 8641221723991433216L, 392L, 1L, 8489735221193138176L, 7944741547145502720L, 6933731240564056064L, 9083704659251798016L, -9084940280061485056L, 8222714144797368320L, 8817665768680906752L, 1995L, 1561L, 2485L, 1826L, 845L, 8376440110255243264L, 9075404705968840704L, -8379109122834997248L, -6938706403992854528L, 961L, 1422L, 9149216169284091904L, 2752L, 2255L, -9080568167841226752L, 1046L, 7926898770090491904L, 7784489776013295616L, 6991316084916879360L, 1566L, 1671L, -8543982423727128576L, -8832750849949892608L, 6963217546192322560L, 236L, 7086206629592252416L, 9053187076403060736L, -8067243114610532352L, 1751L, 2502L, 294L, 7892281003266408448L, 8577096957495025664L, -8665764757143658496L, 2855L, 2811L, 8785153741735616512L, 1726L, 7186401810812059648L, -7603569103205916672L, 4018L, 3566L, 2725L, 1234L, 346L, 7961515985722605568L, 7274777328897802240L, -6933565857643814912L, -8330233444291084288L, 34L, 7080269176324218880L, 2941L, 9117063974299148288L, -6917607783359897600L, -8566940231897874432L, -8710298418608619520L, 1520L, 3728L, -8835408234247168000L, 7705445437881278464L, 6926925215281774592L, 835L, 1L, 3232L, -7840338174858199040L, 7748799008146366464L, 7410096605330227200L, 188L, 1L, -7709958788604936192L, -6920172215209426944L, -9109392978217484288L, 3608L, -8214462866994339840L, 2306L, -7759238919361888256L, -8922409715403112448L, 3664L, -9203942396257984512L, 8116738401948377088L, 1791L, -7419068456205385728L, 8795069490394882048L, 3043L, 3174L, 7625728883085025280L, -8585134536083660800L, 8558000156325707776L, -8572949572756774912L, 661L, 2393L, -7800879252150779904L, 7534549597202194432L, -7642381493746483200L, -7330413050756235264L, 7596563216912211968L, 3307L, 2971L, 2285L, 1880L, 4088L, 743L, -8317591428117274624L, 8854715632851345408L, 7768984605670604800L, 2900L, 7062605127422894080L, 7394967727502467072L, 1781L, 7238339720750948352L, 1638L, 1L, -8522878384019169280L, -8051587217208967168L, -7425160895830573056L, 7344029858387820544L, -8013397854633648128L, 8808467247666241536L, -8768744394742235136L, 9185458640237641728L, -7686220526274502656L, -8203075743525806080L, 3462L, 6964585306125008896L, 3418L, 3366L, -7867219225874571264L, 8367680396909404160L, 7524958388842078208L, 2897L, 8391785334471589888L, -8581979259158929408L, 587L, 130L, 1030L, 8362046808797306880L, 3691L, 7454632396542074880L, 7125231541858205696L, 2580L, 2512L, 7061498706968428544L, -7255686273677328384L, 9048002942653710336L, 8868529429494071296L, 8815398225009967104L, 7128222874437238784L, 8371939471056470016L, -8335810316927213568L, -7144791190333546496L, 1L, 1L, -7572962089372991488L, 8850055384477401088L, 2626L, 3599L, 213L, 2232L, -8297230235506343936L, 3430L, 391L, -7395343938785738752L, 9038087402564657152L, -9013952631912325120L, 3446L, -8703026916864802816L, -7833618000492109824L, 1541L, 8759184090543857664L, -7042183597114081280L, -7147490721376591872L, 3725L, 7961909238130270208L, -8930307926221807616L, 2719L, -6988970700649168896L, -7155539549555105792L, 3625L, 8113585123802529792L, 9207927479837319168L, -8387347109404286976L, 1L, 8896237972875370496L, 8372408423196270592L, 922L, 7255302164215013376L, -8585966098173870080L, 8424515140664360960L, -6997233584896229376L, 8087737899452432384L, 1493L, 8779711700787298304L, 2533L, 1L, 8017403886247927808L, 1282L, 2177L, -8632237187473088512L, 8109381965028548608L, 1157L, 7378993334503694336L, 1L, 2560L, 4037L, -8562524688907485184L, 2325L, 6962726713896484864L, 8120593157178228736L, 
6924820982050758656L, -7366430883634929664L, -7209060152494817280L, -8689606130068611072L, 3190L, 3725L, -8581765103969312768L, 1L, 3542L, 8553195689344991232L, 1789L, 8698055291501543424L, 296L, -9095689235523264512L, 7998687089080467456L, 8160569434550403072L, 489L, -9175038118837149696L, 8571268359622172672L, -7916510129632296960L, 8323460620425330688L, 346L, 3980L, -7707242953271500800L, 1811L, 2803L, 7370078518278397952L, 7497276415392407552L, 2323L, 8467976965865799680L, 691L, 1914L, 6982145326341423104L, -9203804401302323200L, 7823874904139849728L, 7534145866886782976L, 9085434340468473856L, 8579974641030365184L, 8536948829863198720L, 341L, -9102482277760983040L, 658L, 1L, 2843L, 7584007864107778048L, 590L, 8899122608190930944L, 3588L, 3609L, 3824L, 7690986322714066944L, 7765456790394871808L, -8649711322250362880L, 1948L, -9101953184875757568L, 2463L, 1813L, 7054271419461812224L, 7548958830580563968L, -9206329156028112896L, 2637L, -7661250850555633664L, 664L, 2487L, 8221561626658881536L, 8169878743136043008L, 6927260280037097472L, 342L, -7501803640821456896L, 2745L, 677L, 8435912708683087872L, 7412924364686458880L, 3563L, 1L, 7153922334283776000L, 8849475396952514560L, 2977L, -7910019233726242816L, 2835L, 2335L, 1L, 2515L, -7617860842651017216L, -7637755520917741568L, 2647L, 707L, 8856674723376668672L, 7857878068300898304L, -8887058200926093312L, 108L, 2762L, 3622L, 868L, 138L, 1786L, 9116137265342169088L, 7955126053367119872L, 491L, -7778829032042790912L, -7753051494275432448L, 8962097525980225536L, 8163948965373386752L, 1145L, -8438554249514491904L, 522L, 1785L, 1545L, 999L, 1941L, 1L, 7454442625055145984L, 3510L, 2373L, -8127494999848919040L, 1643L, -7819437864839495680L, -7822452149325094912L, 7411793502161182720L, 2274L, 8783241818558193664L, 8316336224427483136L, -7669169138124275712L, 2984L, -7772064021830574080L, 3397L, 1L, 8523972434954510336L, -7127548949860818944L, 8286706213485297664L, 3147L, -7536330682873937920L, -7115054815375073280L, -7319315187617587200L, 1099L, -8989473881707921408L, 2816L, -6986178228432322560L, -7759425383684849664L, -7893577088764174336L, 8091421389575282688L, -7409653086454030336L, 7348598907182800896L, -7362189611124563968L, 1L, 2465L, 350L, 2619L, 3722L, 898L, 782L, 1780L, 2186L, -6921654334727036928L, 4020L, 8325227661920133120L, -7036607470351654912L, 7304839835188609024L, 8792059919353348096L, -8856821118526734336L, 8720504651219001344L, 1055L, 1368L, 8736061027343859712L, 7919597361814577152L, 7381659098423926784L, 8731960288562044928L, -7594824008626372608L, -9178166810751909888L, 3083L, -8948335470186373120L, 2569L, 823L, 259L, 8461498293348065280L, -8961059046745669632L, -8607195685207408640L, -8754966081778565120L, -8418913260807217152L, -8877053610728161280L, -6935548339131138048L, -8219876839318716416L, 1132L, 1337L, 1341L, 976L, -7557017910095650816L, 1L, -8683802826440105984L, 1845L, 1965L, -8104684579106914304L, 1835L, 7345991518378442752L, 3212L, -7081500255163727872L, 1074L, 8372588378498777088L, -7593363318079610880L, -7451660755269853184L, 1983L, 8514851182589771776L, 1864L, 8463868417649524736L, 3094L, -8858063395050110976L, 1981L, -8140349174954893312L, -7041362811802148864L, 8972161729142095872L, 7989119273552158720L, 2469L, 1481L, -8566856504746352640L, 8272001752345690112L, -7094827141662539776L, 8396433451610652672L, -7679894005808693248L, 8613562211893919744L, 3407L, 7686992843032010752L, 1048L, 3507L, 7784169796350730240L, 8551446856960942080L, 3467L, 1458L, 213L, 735L, 9190466190353661952L, -8280276629934981120L, 
-7895991410072928256L, -9145593811310010368L, 8059284960252731392L, 367L, 7614435638888210432L, 9174894805640142848L, -8941201923743703040L, 1075L, 7492436934952574976L, -8714995808835444736L, 7782245855193874432L, 8525894870444638208L, -7661192563533062144L, 1L, 8995562121346260992L, 7626715182847090688L, 8146492373537660928L, 7682327310082531328L, 2968L, 7309156463509061632L, 1955L, 1L, 7022349041913978880L, 7045967493826387968L, 3006L, 65L, 8168742078705262592L, 7212016545671348224L, 8079573715140485120L, 3965L, 8555933456197828608L, 2903L, 7648729477297987584L, 1L, 8223732800007864320L, -7412431471807283200L, 2560L, 2988L, 1243L, 1837L, 7014537632150224896L, 3747L, 2682L, 8073733016154431488L, 2938L, 1312L, 7006803044329021440L, 7701723309715685376L, 7528074274555305984L, -7532751268425261056L, 8000440057238052864L, -7964801953178091520L, 2846L, 8723248113030782976L, 7440265908266827776L, 927L, -7063777488249085952L, 9194388393453060096L, 7720187583697502208L, 8557218322962644992L, 950L, 2189L, 1371L, 7370803940448305152L, -8914039133569400832L, 3663L, 2341L, -8877431933441327104L, 8171188598958407680L, 8525336514806317056L, 1608L, -7094189393339678720L, 1752L, 3084L, 3673L, 9169248521377374208L, -7866079955473989632L, -9004892183139811328L, 1892L, 6928080429732536320L, -7623047151287754752L, 2492L, -7695491171376291840L, -7797151404935618560L, 8208354137450766336L, -7395553021620731904L, -8453491903284994048L, -7140008543769042944L, 2724L, 3443L, -7512297136103800832L, 9136234417125007360L, 8192304692696383488L, 8199513544090730496L, 311L, -8488247955875618816L, 1L, 2540L, 586L, -7444070205513138176L, 1141L, -8076479329071955968L, 3103L, -7629401308029976576L, -7507424948896415744L, 2821L, 2017L, 1134L, 347L, -7246123871306244096L, 2020L, 1693L, 2020L, 8570983266408103936L, 2919L, 2283L, 7534042483076857856L, 1L, 8991442360387584000L, -7240213957902663680L, 3365L, 1899L, 7199539820886958080L, 7165364563962191872L, 8407869317250220032L, 1489L, 2400L, -7037375807670501376L, 7235109456886816768L, 8569030475428511744L, 2067L, 8332670681629106176L, 168L, 1L, -83}; + + validateDoubleArray(rand, test); + + validateDoubleRepeating(rand, 1, 5); + validateDoubleRepeating(rand, 20, 0); + validateDoubleRepeating(rand, 1024, 9383844); + } + + private void validateKeySeriesBytesArray(VectorizedRowBatch batch, byte[][] test) throws IOException { + VectorKeySeriesBytes bytesKeySeries = new VectorKeySeriesBytes(0); + bytesKeySeries.processBatch(batch); + int logicalIndex = 0; + for (int k = 0; k < bytesKeySeries.getKeyCount(); k++) { + assertTrue(logicalIndex == bytesKeySeries.getCurrentLogical()); + int duplicateCount = bytesKeySeries.getCurrentDuplicateCount(); + if (!bytesKeySeries.getCurrentKeyAllNull()) { + byte[] currentKey = bytesKeySeries.getCurrentBytes(); + int start = bytesKeySeries.getCurrentStart(); + int length = bytesKeySeries.getCurrentLength(); + + for (int d = 0; d < duplicateCount; d++) { + int batchIndex = batch.selectedInUse ? 
batch.selected[logicalIndex + d] : logicalIndex + d; + assertTrue(batch.cols[0].noNulls || !batch.cols[0].isNull[batchIndex]); + byte[] testBytes = test[batchIndex]; + if (!StringExpr.equal(testBytes, 0, testBytes.length, currentKey, start, length)) { + assertTrue(false); + } + } + } + logicalIndex += duplicateCount; + boolean isNext = bytesKeySeries.next(); + if (k + 1 < bytesKeySeries.getKeyCount()) { + assertTrue(isNext); + } else { + assertTrue(!isNext); + } + } + assertEquals(logicalIndex, batch.size); + } + + private void validateBytesArray(Random rand, byte[][] test) throws IOException { + VectorizedRowBatch batch = new VectorizedRowBatch(1); + + BytesColumnVector bytesColVector = new BytesColumnVector(VectorizedRowBatch.DEFAULT_SIZE); + batch.cols[0] = bytesColVector; + for (int i = 0; i < test.length; i++) { + bytesColVector.start[i] = 0; + bytesColVector.length[i] = test[i].length; + bytesColVector.vector[i] = test[i]; + } + System.arraycopy(test, 0, bytesColVector.vector, 0, test.length); + batch.size = test.length; + + validateKeySeriesBytesArray(batch, test); + int saveBatchSize = batch.size; + RandomRowUtil.addRandomSelectToBatch(rand, batch); + if (batch.size > 0) { + validateKeySeriesBytesArray(batch, test); + } + batch.selectedInUse = false; + batch.size = saveBatchSize; + RandomRowUtil.addRandomNullsToColumnVector(rand, batch.cols[0], batch.size); + validateKeySeriesBytesArray(batch, test); + RandomRowUtil.addRandomSelectToBatch(rand, batch); + if (batch.size > 0) { + validateKeySeriesBytesArray(batch, test); + } + } + + private void validateBytesRepeating(Random rand, int count, byte[] value) throws IOException { + VectorizedRowBatch batch = new VectorizedRowBatch(1); + + BytesColumnVector bytesColVector = new BytesColumnVector(VectorizedRowBatch.DEFAULT_SIZE); + batch.cols[0] = bytesColVector; + bytesColVector.isRepeating = true; + bytesColVector.vector[0] = value; + bytesColVector.start[0] = 0; + bytesColVector.length[0] = value.length; + batch.size = count; + byte[][] test = new byte[count][]; + for (int i = 0; i < count; i++) { + test[i] = value; + } + + validateKeySeriesBytesArray(batch, test); + RandomRowUtil.addRandomSelectToBatch(rand, batch); + if (batch.size > 0) { + validateKeySeriesBytesArray(batch, test); + } + } + + public void testVectorKeySeriesBytes() throws Throwable { + Random rand = new Random(933); + + byte[][] test = new byte[200][]; + for (int i = 0; i < test.length; i++) { + if (i > 0 && rand.nextInt(10) == 2) { + test[i] = test[i - 1]; + continue; + } + test[i] = RandomTypeUtil.getRandString(rand, null, rand.nextInt(100)).getBytes(); + } + + validateBytesArray(rand, test); + + validateBytesRepeating(rand, 1, RandomTypeUtil.getRandString(rand, null, rand.nextInt(100)).getBytes()); + validateBytesRepeating(rand, 20, RandomTypeUtil.getRandString(rand, null, rand.nextInt(100)).getBytes()); + validateBytesRepeating(rand, 1024, RandomTypeUtil.getRandString(rand, null, rand.nextInt(100)).getBytes()); + } +} \ No newline at end of file diff --git ql/src/test/org/apache/hadoop/hive/ql/exec/vector/keyseries/TestVectorKeySeriesFast.java ql/src/test/org/apache/hadoop/hive/ql/exec/vector/keyseries/TestVectorKeySeriesFast.java new file mode 100644 index 0000000..4d3a146 --- /dev/null +++ ql/src/test/org/apache/hadoop/hive/ql/exec/vector/keyseries/TestVectorKeySeriesFast.java @@ -0,0 +1,438 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.vector.keyseries; + +import java.io.IOException; +import java.util.Arrays; +import java.util.Random; + +import org.apache.hadoop.hive.common.type.RandomTypeUtil; +import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.ColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.RandomRowObjectSource; +import org.apache.hadoop.hive.ql.exec.vector.RandomRowUtil; +import org.apache.hadoop.hive.ql.exec.vector.VectorAssignRowSameBatch; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatchCtx; +import org.apache.hadoop.hive.ql.exec.vector.expressions.StringExpr; +import org.apache.hadoop.hive.ql.exec.vector.keyseries.fast.VectorKeySeriesBytesFast; +import org.apache.hadoop.hive.ql.exec.vector.keyseries.fast.VectorKeySeriesLongFast; +import org.apache.hadoop.hive.ql.exec.vector.keyseries.fast.VectorKeySeriesMultiFast; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.serde2.ByteStream.Output; +import org.apache.hadoop.hive.serde2.fast.DeserializeRead.ReadStringResults; +import org.apache.hadoop.hive.serde2.lazybinary.fast.LazyBinaryDeserializeRead; +import org.apache.hadoop.hive.serde2.lazybinary.fast.LazyBinarySerializeWrite; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; + +import junit.framework.TestCase; + +/** + * Unit test for the vectorized key series. + */ +public class TestVectorKeySeriesFast extends TestCase { + + private void validateKeySeriesLongArray(VectorizedRowBatch batch, long[] test) throws IOException { + // Lazy binary key serializer. 
+ LazyBinarySerializeWrite keyLazyBinarySerializeWrite = + new LazyBinarySerializeWrite(1); + LazyBinaryDeserializeRead keyLazyBinarySerializeRead = + new LazyBinaryDeserializeRead(new TypeInfo[] {TypeInfoFactory.longTypeInfo}); + + VectorKeySeriesLongFast longKeySeriesFast = + new VectorKeySeriesLongFast( + 0, TypeInfoFactory.longTypeInfo, keyLazyBinarySerializeWrite); + + longKeySeriesFast.processBatch(batch); + int logicalIndex = 0; + for (int k = 0; k < longKeySeriesFast.getKeyCount(); k++) { + assertTrue(logicalIndex == longKeySeriesFast.getCurrentLogical()); + int duplicateCount = longKeySeriesFast.getCurrentDuplicateCount(); + + if (!longKeySeriesFast.getCurrentKeyAllNull()) { + byte[] serializedBytes = longKeySeriesFast.getSerializedBytes(); + int serializedStart = longKeySeriesFast.getSerializedStart(); + int serializedLength = longKeySeriesFast.getSerializedLength(); + + keyLazyBinarySerializeRead.set(serializedBytes, serializedStart, serializedLength); + assertTrue(!keyLazyBinarySerializeRead.readCheckNull()); + + long currentKey = keyLazyBinarySerializeRead.readLong(); + + for (int d = 0; d < duplicateCount; d++) { + int batchIndex = batch.selectedInUse ? batch.selected[logicalIndex + d] : logicalIndex + d; + assertTrue(batch.cols[0].noNulls || !batch.cols[0].isNull[batchIndex]); + assertTrue(test[batchIndex] == currentKey); + } + } + logicalIndex += duplicateCount; + boolean isNext = longKeySeriesFast.next(); + if (k + 1 < longKeySeriesFast.getKeyCount()) { + assertTrue(isNext); + } else { + assertTrue(!isNext); + } + } + assertEquals(logicalIndex, batch.size); + } + + private void validateLongArray(Random rand, long[] test) throws IOException { + VectorizedRowBatch batch = new VectorizedRowBatch(1); + + LongColumnVector longColVector = new LongColumnVector(VectorizedRowBatch.DEFAULT_SIZE); + batch.cols[0] = longColVector; + System.arraycopy(test, 0, longColVector.vector, 0, test.length); + batch.size = test.length; + + validateKeySeriesLongArray(batch, test); + int saveBatchSize = batch.size; + RandomRowUtil.addRandomSelectToBatch(rand, batch); + if (batch.size > 0) { + validateKeySeriesLongArray(batch, test); + } + batch.selectedInUse = false; + batch.size = saveBatchSize; + RandomRowUtil.addRandomNullsToColumnVector(rand, batch.cols[0], batch.size); + validateKeySeriesLongArray(batch, test); + RandomRowUtil.addRandomSelectToBatch(rand, batch); + if (batch.size > 0) { + validateKeySeriesLongArray(batch, test); + } + } + + private void validateLongRepeating(Random rand, int count, int value) throws IOException { + VectorizedRowBatch batch = new VectorizedRowBatch(1); + + LongColumnVector longColVector = new LongColumnVector(VectorizedRowBatch.DEFAULT_SIZE); + batch.cols[0] = longColVector; + longColVector.isRepeating = true; + longColVector.vector[0] = value; + batch.size = count; + long[] test = new long[count]; + Arrays.fill(test, value); + + validateKeySeriesLongArray(batch, test); + RandomRowUtil.addRandomSelectToBatch(rand, batch); + if (batch.size > 0) { + validateKeySeriesLongArray(batch, test); + } + } + + public void testVectorKeySeriesLongFast() throws Throwable { + + Random rand = new Random(36777); + + long[] test = {1906L, -7838598833900584960L, 1165L, -7456869587112255488L, 2013L, 7333512171174223872L, -7571293705217687552L, -8523434203900674048L, 8099215208813903872L, 9040958359122640896L, -7356685674003021824L, 2072L, 2073L, 871L, -7551394356730339328L, -8172827216441573376L, -8082793390939193344L, 8854495099223375872L, -8358130693961195520L, 
9050032047355125760L, -7162299524557471744L, 809L, -8946656952763777024L, 1053L, 482L, -6968892545529896960L, 203L, 1614L, -8593419958317056000L, 8190967051000659968L, 808L, 1L, 412L, 8656571350884048896L, 8769199243315814400L, -8546758906409312256L, 8829545979081744384L, 7545689659010949120L, 618L, 8573305425181941760L, 94L, -7266719102957125632L, 2772L, 379L, 8302473563519950848L, -7802538500225777664L, -7273694358642851840L, 8987827141270880256L, 914L, 723L, -7104310188119834624L, 154L, -8494118409594650624L, 3781L, 1466L, 724L, -7270034223527993344L, 913L, 1704L, 1L, -7883252982752665600L, 412L, 4024L, 7226360892091416576L, -8244657976255889408L, 2862L, 1521L, 1L, 7386087924003676160L, 3907L, 7818464507324121088L, -8293833565967810560L, -7892780594910871552L, 1509L, 7592440105065308160L, -7600138468036386816L, 9064847977742032896L, -8379964450833367040L, 7217123582035116032L, -7879864376629567488L, 2878L, 2412L, 524L, 784L, -7046180371529351168L, 471L, 612L, 8368012468775608320L, -7547245548870025216L, 3841L, 8752150411997356032L, -8623965248051789824L, 7637152193832886272L, 9191943992860327936L, 2700L, 9180098147855769600L, 1775L, 797L, -7773957003968675840L, -8660149447361404928L, 8641221723991433216L, 392L, 1L, 8489735221193138176L, 7944741547145502720L, 6933731240564056064L, 9083704659251798016L, -9084940280061485056L, 8222714144797368320L, 8817665768680906752L, 1995L, 1561L, 2485L, 1826L, 845L, 8376440110255243264L, 9075404705968840704L, -8379109122834997248L, -6938706403992854528L, 961L, 1422L, 9149216169284091904L, 2752L, 2255L, -9080568167841226752L, 1046L, 7926898770090491904L, 7784489776013295616L, 6991316084916879360L, 1566L, 1671L, -8543982423727128576L, -8832750849949892608L, 6963217546192322560L, 236L, 7086206629592252416L, 9053187076403060736L, -8067243114610532352L, 1751L, 2502L, 294L, 7892281003266408448L, 8577096957495025664L, -8665764757143658496L, 2855L, 2811L, 8785153741735616512L, 1726L, 7186401810812059648L, -7603569103205916672L, 4018L, 3566L, 2725L, 1234L, 346L, 7961515985722605568L, 7274777328897802240L, -6933565857643814912L, -8330233444291084288L, 34L, 7080269176324218880L, 2941L, 9117063974299148288L, -6917607783359897600L, -8566940231897874432L, -8710298418608619520L, 1520L, 3728L, -8835408234247168000L, 7705445437881278464L, 6926925215281774592L, 835L, 1L, 3232L, -7840338174858199040L, 7748799008146366464L, 7410096605330227200L, 188L, 1L, -7709958788604936192L, -6920172215209426944L, -9109392978217484288L, 3608L, -8214462866994339840L, 2306L, -7759238919361888256L, -8922409715403112448L, 3664L, -9203942396257984512L, 8116738401948377088L, 1791L, -7419068456205385728L, 8795069490394882048L, 3043L, 3174L, 7625728883085025280L, -8585134536083660800L, 8558000156325707776L, -8572949572756774912L, 661L, 2393L, -7800879252150779904L, 7534549597202194432L, -7642381493746483200L, -7330413050756235264L, 7596563216912211968L, 3307L, 2971L, 2285L, 1880L, 4088L, 743L, -8317591428117274624L, 8854715632851345408L, 7768984605670604800L, 2900L, 7062605127422894080L, 7394967727502467072L, 1781L, 7238339720750948352L, 1638L, 1L, -8522878384019169280L, -8051587217208967168L, -7425160895830573056L, 7344029858387820544L, -8013397854633648128L, 8808467247666241536L, -8768744394742235136L, 9185458640237641728L, -7686220526274502656L, -8203075743525806080L, 3462L, 6964585306125008896L, 3418L, 3366L, -7867219225874571264L, 8367680396909404160L, 7524958388842078208L, 2897L, 8391785334471589888L, -8581979259158929408L, 587L, 130L, 1030L, 8362046808797306880L, 3691L, 
7454632396542074880L, 7125231541858205696L, 2580L, 2512L, 7061498706968428544L, -7255686273677328384L, 9048002942653710336L, 8868529429494071296L, 8815398225009967104L, 7128222874437238784L, 8371939471056470016L, -8335810316927213568L, -7144791190333546496L, 1L, 1L, -7572962089372991488L, 8850055384477401088L, 2626L, 3599L, 213L, 2232L, -8297230235506343936L, 3430L, 391L, -7395343938785738752L, 9038087402564657152L, -9013952631912325120L, 3446L, -8703026916864802816L, -7833618000492109824L, 1541L, 8759184090543857664L, -7042183597114081280L, -7147490721376591872L, 3725L, 7961909238130270208L, -8930307926221807616L, 2719L, -6988970700649168896L, -7155539549555105792L, 3625L, 8113585123802529792L, 9207927479837319168L, -8387347109404286976L, 1L, 8896237972875370496L, 8372408423196270592L, 922L, 7255302164215013376L, -8585966098173870080L, 8424515140664360960L, -6997233584896229376L, 8087737899452432384L, 1493L, 8779711700787298304L, 2533L, 1L, 8017403886247927808L, 1282L, 2177L, -8632237187473088512L, 8109381965028548608L, 1157L, 7378993334503694336L, 1L, 2560L, 4037L, -8562524688907485184L, 2325L, 6962726713896484864L, 8120593157178228736L, 6924820982050758656L, -7366430883634929664L, -7209060152494817280L, -8689606130068611072L, 3190L, 3725L, -8581765103969312768L, 1L, 3542L, 8553195689344991232L, 1789L, 8698055291501543424L, 296L, -9095689235523264512L, 7998687089080467456L, 8160569434550403072L, 489L, -9175038118837149696L, 8571268359622172672L, -7916510129632296960L, 8323460620425330688L, 346L, 3980L, -7707242953271500800L, 1811L, 2803L, 7370078518278397952L, 7497276415392407552L, 2323L, 8467976965865799680L, 691L, 1914L, 6982145326341423104L, -9203804401302323200L, 7823874904139849728L, 7534145866886782976L, 9085434340468473856L, 8579974641030365184L, 8536948829863198720L, 341L, -9102482277760983040L, 658L, 1L, 2843L, 7584007864107778048L, 590L, 8899122608190930944L, 3588L, 3609L, 3824L, 7690986322714066944L, 7765456790394871808L, -8649711322250362880L, 1948L, -9101953184875757568L, 2463L, 1813L, 7054271419461812224L, 7548958830580563968L, -9206329156028112896L, 2637L, -7661250850555633664L, 664L, 2487L, 8221561626658881536L, 8169878743136043008L, 6927260280037097472L, 342L, -7501803640821456896L, 2745L, 677L, 8435912708683087872L, 7412924364686458880L, 3563L, 1L, 7153922334283776000L, 8849475396952514560L, 2977L, -7910019233726242816L, 2835L, 2335L, 1L, 2515L, -7617860842651017216L, -7637755520917741568L, 2647L, 707L, 8856674723376668672L, 7857878068300898304L, -8887058200926093312L, 108L, 2762L, 3622L, 868L, 138L, 1786L, 9116137265342169088L, 7955126053367119872L, 491L, -7778829032042790912L, -7753051494275432448L, 8962097525980225536L, 8163948965373386752L, 1145L, -8438554249514491904L, 522L, 1785L, 1545L, 999L, 1941L, 1L, 7454442625055145984L, 3510L, 2373L, -8127494999848919040L, 1643L, -7819437864839495680L, -7822452149325094912L, 7411793502161182720L, 2274L, 8783241818558193664L, 8316336224427483136L, -7669169138124275712L, 2984L, -7772064021830574080L, 3397L, 1L, 8523972434954510336L, -7127548949860818944L, 8286706213485297664L, 3147L, -7536330682873937920L, -7115054815375073280L, -7319315187617587200L, 1099L, -8989473881707921408L, 2816L, -6986178228432322560L, -7759425383684849664L, -7893577088764174336L, 8091421389575282688L, -7409653086454030336L, 7348598907182800896L, -7362189611124563968L, 1L, 2465L, 350L, 2619L, 3722L, 898L, 782L, 1780L, 2186L, -6921654334727036928L, 4020L, 8325227661920133120L, -7036607470351654912L, 7304839835188609024L, 8792059919353348096L, 
-8856821118526734336L, 8720504651219001344L, 1055L, 1368L, 8736061027343859712L, 7919597361814577152L, 7381659098423926784L, 8731960288562044928L, -7594824008626372608L, -9178166810751909888L, 3083L, -8948335470186373120L, 2569L, 823L, 259L, 8461498293348065280L, -8961059046745669632L, -8607195685207408640L, -8754966081778565120L, -8418913260807217152L, -8877053610728161280L, -6935548339131138048L, -8219876839318716416L, 1132L, 1337L, 1341L, 976L, -7557017910095650816L, 1L, -8683802826440105984L, 1845L, 1965L, -8104684579106914304L, 1835L, 7345991518378442752L, 3212L, -7081500255163727872L, 1074L, 8372588378498777088L, -7593363318079610880L, -7451660755269853184L, 1983L, 8514851182589771776L, 1864L, 8463868417649524736L, 3094L, -8858063395050110976L, 1981L, -8140349174954893312L, -7041362811802148864L, 8972161729142095872L, 7989119273552158720L, 2469L, 1481L, -8566856504746352640L, 8272001752345690112L, -7094827141662539776L, 8396433451610652672L, -7679894005808693248L, 8613562211893919744L, 3407L, 7686992843032010752L, 1048L, 3507L, 7784169796350730240L, 8551446856960942080L, 3467L, 1458L, 213L, 735L, 9190466190353661952L, -8280276629934981120L, -7895991410072928256L, -9145593811310010368L, 8059284960252731392L, 367L, 7614435638888210432L, 9174894805640142848L, -8941201923743703040L, 1075L, 7492436934952574976L, -8714995808835444736L, 7782245855193874432L, 8525894870444638208L, -7661192563533062144L, 1L, 8995562121346260992L, 7626715182847090688L, 8146492373537660928L, 7682327310082531328L, 2968L, 7309156463509061632L, 1955L, 1L, 7022349041913978880L, 7045967493826387968L, 3006L, 65L, 8168742078705262592L, 7212016545671348224L, 8079573715140485120L, 3965L, 8555933456197828608L, 2903L, 7648729477297987584L, 1L, 8223732800007864320L, -7412431471807283200L, 2560L, 2988L, 1243L, 1837L, 7014537632150224896L, 3747L, 2682L, 8073733016154431488L, 2938L, 1312L, 7006803044329021440L, 7701723309715685376L, 7528074274555305984L, -7532751268425261056L, 8000440057238052864L, -7964801953178091520L, 2846L, 8723248113030782976L, 7440265908266827776L, 927L, -7063777488249085952L, 9194388393453060096L, 7720187583697502208L, 8557218322962644992L, 950L, 2189L, 1371L, 7370803940448305152L, -8914039133569400832L, 3663L, 2341L, -8877431933441327104L, 8171188598958407680L, 8525336514806317056L, 1608L, -7094189393339678720L, 1752L, 3084L, 3673L, 9169248521377374208L, -7866079955473989632L, -9004892183139811328L, 1892L, 6928080429732536320L, -7623047151287754752L, 2492L, -7695491171376291840L, -7797151404935618560L, 8208354137450766336L, -7395553021620731904L, -8453491903284994048L, -7140008543769042944L, 2724L, 3443L, -7512297136103800832L, 9136234417125007360L, 8192304692696383488L, 8199513544090730496L, 311L, -8488247955875618816L, 1L, 2540L, 586L, -7444070205513138176L, 1141L, -8076479329071955968L, 3103L, -7629401308029976576L, -7507424948896415744L, 2821L, 2017L, 1134L, 347L, -7246123871306244096L, 2020L, 1693L, 2020L, 8570983266408103936L, 2919L, 2283L, 7534042483076857856L, 1L, 8991442360387584000L, -7240213957902663680L, 3365L, 1899L, 7199539820886958080L, 7165364563962191872L, 8407869317250220032L, 1489L, 2400L, -7037375807670501376L, 7235109456886816768L, 8569030475428511744L, 2067L, 8332670681629106176L, 168L, 1L, -83}; + + validateLongArray(rand, test); + + validateLongRepeating(rand, 1, 5); + validateLongRepeating(rand, 20, 0); + validateLongRepeating(rand, 1024, 9383844); + } + + private void validateKeySeriesBytesArray(VectorizedRowBatch batch, byte[][] test) throws IOException { + // Lazy binary 
key serializer. + LazyBinarySerializeWrite keyLazyBinarySerializeWrite = + new LazyBinarySerializeWrite(1); + LazyBinaryDeserializeRead keyLazyBinarySerializeRead = + new LazyBinaryDeserializeRead(new TypeInfo[] {TypeInfoFactory.stringTypeInfo}); + ReadStringResults readStringResults = keyLazyBinarySerializeRead.createReadStringResults(); + + VectorKeySeriesBytesFast bytesKeySeriesFast = + new VectorKeySeriesBytesFast( + 0, keyLazyBinarySerializeWrite); + bytesKeySeriesFast.processBatch(batch); + int logicalIndex = 0; + for (int k = 0; k < bytesKeySeriesFast.getKeyCount(); k++) { + assertTrue(logicalIndex == bytesKeySeriesFast.getCurrentLogical()); + int duplicateCount = bytesKeySeriesFast.getCurrentDuplicateCount(); + + if (!bytesKeySeriesFast.getCurrentKeyAllNull()) { + + byte[] serializedBytes = bytesKeySeriesFast.getSerializedBytes(); + int serializedStart = bytesKeySeriesFast.getSerializedStart(); + int serializedLength = bytesKeySeriesFast.getSerializedLength(); + + keyLazyBinarySerializeRead.set(serializedBytes, serializedStart, serializedLength); + assertTrue(!keyLazyBinarySerializeRead.readCheckNull()); + + keyLazyBinarySerializeRead.readString(readStringResults); + + byte[] currentKey = readStringResults.bytes; + int start = readStringResults.start; + int length = readStringResults.length; + + for (int d = 0; d < duplicateCount; d++) { + int batchIndex = batch.selectedInUse ? batch.selected[logicalIndex + d] : logicalIndex + d; + assertTrue(batch.cols[0].noNulls || !batch.cols[0].isNull[batchIndex]); + byte[] testBytes = test[batchIndex]; + if (!StringExpr.equal(testBytes, 0, testBytes.length, currentKey, start, length)) { + assertTrue(false); + } + } + } + logicalIndex += duplicateCount; + boolean isNext = bytesKeySeriesFast.next(); + if (k + 1 < bytesKeySeriesFast.getKeyCount()) { + assertTrue(isNext); + } else { + assertTrue(!isNext); + } + } + assertEquals(logicalIndex, batch.size); + } + + private void validateBytesArray(Random rand, byte[][] test) throws IOException { + VectorizedRowBatch batch = new VectorizedRowBatch(1); + + BytesColumnVector bytesColVector = new BytesColumnVector(VectorizedRowBatch.DEFAULT_SIZE); + batch.cols[0] = bytesColVector; + for (int i = 0; i < test.length; i++) { + bytesColVector.start[i] = 0; + bytesColVector.length[i] = test[i].length; + bytesColVector.vector[i] = test[i]; + } + System.arraycopy(test, 0, bytesColVector.vector, 0, test.length); + batch.size = test.length; + + validateKeySeriesBytesArray(batch, test); + int saveBatchSize = batch.size; + RandomRowUtil.addRandomSelectToBatch(rand, batch); + if (batch.size > 0) { + validateKeySeriesBytesArray(batch, test); + } + batch.selectedInUse = false; + batch.size = saveBatchSize; + RandomRowUtil.addRandomNullsToColumnVector(rand, batch.cols[0], batch.size); + validateKeySeriesBytesArray(batch, test); + RandomRowUtil.addRandomSelectToBatch(rand, batch); + if (batch.size > 0) { + validateKeySeriesBytesArray(batch, test); + } + } + + private void validateBytesRepeating(Random rand, int count, byte[] value) throws IOException { + VectorizedRowBatch batch = new VectorizedRowBatch(1); + + BytesColumnVector bytesColVector = new BytesColumnVector(VectorizedRowBatch.DEFAULT_SIZE); + batch.cols[0] = bytesColVector; + bytesColVector.isRepeating = true; + bytesColVector.vector[0] = value; + bytesColVector.start[0] = 0; + bytesColVector.length[0] = value.length; + batch.size = count; + byte[][] test = new byte[count][]; + for (int i = 0; i < count; i++) { + test[i] = value; + } + + 
validateKeySeriesBytesArray(batch, test); + RandomRowUtil.addRandomSelectToBatch(rand, batch); + if (batch.size > 0) { + validateKeySeriesBytesArray(batch, test); + } + } + + public void testVectorKeySeriesBytesFast() throws Throwable { + Random rand = new Random(933); + + byte[][] test = new byte[200][]; + for (int i = 0; i < test.length; i++) { + if (i > 0 && rand.nextInt(10) == 2) { + test[i] = test[i - 1]; + continue; + } + test[i] = RandomTypeUtil.getRandString(rand, null, rand.nextInt(100)).getBytes(); + } + + validateBytesArray(rand, test); + + validateBytesRepeating(rand, 1, RandomTypeUtil.getRandString(rand, null, rand.nextInt(100)).getBytes()); + validateBytesRepeating(rand, 20, RandomTypeUtil.getRandString(rand, null, rand.nextInt(100)).getBytes()); + validateBytesRepeating(rand, 1024, RandomTypeUtil.getRandString(rand, null, rand.nextInt(100)).getBytes()); + } + + private void validateMulti(VectorizedRowBatch batch, + VectorKeySeriesMultiFast multiKeySeriesFast, + LazyBinaryDeserializeRead keyLazyBinarySerializeRead, + LazyBinarySerializeWrite keyLazyBinarySerializeWrite, + RandomRowObjectSource source, Object[][] randomRows, + int firstRandomRowIndex) throws IOException, HiveException { + + multiKeySeriesFast.processBatch(batch); + int logicalIndex = 0; + for (int k = 0; k < multiKeySeriesFast.getKeyCount(); k++) { + assertTrue(logicalIndex == multiKeySeriesFast.getCurrentLogical()); + + int batchIndex = batch.selectedInUse ? batch.selected[logicalIndex] : logicalIndex; + + Object[] row = randomRows[firstRandomRowIndex + batchIndex]; + Output output = RandomRowUtil.serializeRow(row, source, keyLazyBinarySerializeWrite); + byte[] testBytes = output.getData(); + int length = output.getLength(); + + int nullCount = 0; + for (int c = 0; c < row.length; c++) { + if (row[c] == null) { + nullCount++; + } + } + int duplicateCount = multiKeySeriesFast.getCurrentDuplicateCount(); + boolean keyAllNulls = multiKeySeriesFast.getCurrentKeyAllNull(); + if (keyAllNulls) { + assertEquals(nullCount, row.length); + } else { + boolean keyHasAnyNulls = multiKeySeriesFast.getCurrentKeyHasAnyNulls(); + if (keyHasAnyNulls) { + assertTrue(nullCount > 0); + } + + byte[] serializedBytes = multiKeySeriesFast.getSerializedBytes(); + int serializedStart = multiKeySeriesFast.getSerializedStart(); + int serializedLength = multiKeySeriesFast.getSerializedLength(); + + for (int d = 0; d < duplicateCount; d++) { + if (!StringExpr.equal(testBytes, 0, length, serializedBytes, serializedStart, serializedLength)) { + assertTrue(false); + } + } + } + logicalIndex += duplicateCount; + boolean isNext = multiKeySeriesFast.next(); + if (k + 1 < multiKeySeriesFast.getKeyCount()) { + assertTrue(isNext); + } else { + assertTrue(!isNext); + } + } + assertEquals(logicalIndex, batch.size); + } + + public void testVectorKeySeriesMultiFastOne(Random rand, boolean addRandomNulls, + boolean addRandomDuplicates, boolean addRandomSelectToBatch) throws Throwable { + + String[] emptyScratchTypeNames = new String[0]; + + RandomRowObjectSource source = new RandomRowObjectSource(); + source.init(rand); + + VectorizedRowBatchCtx batchContext = new VectorizedRowBatchCtx(); + batchContext.init(source.rowStructObjectInspector(), emptyScratchTypeNames); + VectorizedRowBatch batch = batchContext.createVectorizedRowBatch(); + + // junk the destination for the 1st pass + for (ColumnVector cv : batch.cols) { + Arrays.fill(cv.isNull, true); + } + + int fieldCount = source.typeNames().size(); + LazyBinaryDeserializeRead deserializeRead = new 
LazyBinaryDeserializeRead(source.primitiveTypeInfos()); + LazyBinarySerializeWrite serializeWrite = new LazyBinarySerializeWrite(fieldCount); + + // junk the destination for the 1st pass + for (ColumnVector cv : batch.cols) { + Arrays.fill(cv.isNull, true); + cv.noNulls = false; + } + + VectorAssignRowSameBatch vectorAssignRow = new VectorAssignRowSameBatch(); + vectorAssignRow.init(source.typeNames()); + vectorAssignRow.setOneBatch(batch); + + VectorKeySeriesMultiFast multiKeySeriesFast = + new VectorKeySeriesMultiFast( + serializeWrite); + int[] columnNums = new int[source.typeNames().size()]; + for (int i = 0; i < columnNums.length; i++) { + columnNums[i] = i; + } + multiKeySeriesFast.init(source.primitiveTypeInfos(), columnNums); + + Object[][] randomRows = source.randomRows(100000); + if (addRandomNulls) { + RandomRowUtil.addRandomNulls(rand, randomRows); + } + if (addRandomDuplicates) { + RandomRowUtil.addRandomDuplicates(rand, randomRows); + } + int firstRandomRowIndex = 0; + for (int i = 0; i < randomRows.length; i++) { + Object[] row = randomRows[i]; + + vectorAssignRow.assignRow(batch.size, row); + batch.size++; + if (batch.size == batch.DEFAULT_SIZE) { + if (addRandomSelectToBatch) { + RandomRowUtil.addRandomSelectToBatch(rand, batch); + } + validateMulti(batch, multiKeySeriesFast, deserializeRead, serializeWrite, + source, randomRows, firstRandomRowIndex); + firstRandomRowIndex = i + 1; + batch.reset(); + } + } + if (batch.size > 0) { + if (addRandomSelectToBatch) { + RandomRowUtil.addRandomSelectToBatch(rand, batch); + } + validateMulti(batch, multiKeySeriesFast, deserializeRead, serializeWrite, + source, randomRows, firstRandomRowIndex); + } + } + + public void testVectorKeySeriesMultiFast() throws Throwable { + Random rand = new Random(933); + for (int i = 0; i < 10; i++) { + testVectorKeySeriesMultiFastOne( + rand, /* addRandomNulls */ false, /* addRandomDuplicates */ false, /* addRandomSelectToBatch */ false); + } + for (int i = 0; i < 10; i++) { + testVectorKeySeriesMultiFastOne( + rand, /* addRandomNulls */ true, /* addRandomDuplicates */ false, /* addRandomSelectToBatch */ false); + } + for (int i = 0; i < 10; i++) { + testVectorKeySeriesMultiFastOne( + rand, /* addRandomNulls */ false, /* addRandomDuplicates */ true, /* addRandomSelectToBatch */ false); + } + for (int i = 0; i < 10; i++) { + testVectorKeySeriesMultiFastOne( + rand, /* addRandomNulls */ true, /* addRandomDuplicates */ true, /* addRandomSelectToBatch */ false); + } + + for (int i = 0; i < 10; i++) { + testVectorKeySeriesMultiFastOne( + rand, /* addRandomNulls */ false, /* addRandomDuplicates */ false, /* addRandomSelectToBatch */ true); + } + for (int i = 0; i < 10; i++) { + testVectorKeySeriesMultiFastOne( + rand, /* addRandomNulls */ true, /* addRandomDuplicates */ false, /* addRandomSelectToBatch */ true); + } + for (int i = 0; i < 10; i++) { + testVectorKeySeriesMultiFastOne( + rand, /* addRandomNulls */ false, /* addRandomDuplicates */ true, /* addRandomSelectToBatch */ true); + } + for (int i = 0; i < 10; i++) { + testVectorKeySeriesMultiFastOne( + rand, /* addRandomNulls */ true, /* addRandomDuplicates */ true, /* addRandomSelectToBatch */ true); + } + } +} \ No newline at end of file diff --git ql/src/test/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/CommonFastHashTable.java ql/src/test/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/CommonFastHashTable.java index c2375e0..64b0607 100644 --- ql/src/test/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/CommonFastHashTable.java +++ 
ql/src/test/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/CommonFastHashTable.java
@@ -23,7 +23,7 @@
 import java.util.List;
 import java.util.Random;
-import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashMapResult;
+import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashMapResult;
 import org.apache.hadoop.hive.serde2.WriteBuffers;
 import static org.junit.Assert.*;
@@ -39,6 +39,8 @@ protected static final int LARGE_CAPACITY = 8388608;
   protected static Random random;
+
+
   public static int generateLargeCount() {
     int count = 0;
     if (random.nextInt(100) != 0) {
@@ -75,7 +77,7 @@ public static int generateLargeCount() {
     }
     return count;
   }
-  public static void verifyHashMapResult(VectorMapJoinHashMapResult hashMapResult,
+  public static void verifyHashMapResult(MapJoinHashMapResult hashMapResult,
       RandomByteArrayStream randomByteArrayStream ) {
     List resultBytes = new ArrayList();
@@ -105,7 +107,7 @@ public static void verifyHashMapResult(VectorMapJoinHashMapResult hashMapResult,
     }
   }
-  public static void verifyHashMapResult(VectorMapJoinHashMapResult hashMapResult,
+  public static void verifyHashMapResult(MapJoinHashMapResult hashMapResult,
       byte[] valueBytes ) {
     assertTrue(hashMapResult.hasRows());
diff --git ql/src/test/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/TestKeyValueLong.java ql/src/test/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/TestKeyValueLong.java
new file mode 100644
index 0000000..2182e89
--- /dev/null
+++ ql/src/test/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/TestKeyValueLong.java
@@ -0,0 +1,42 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.exec.vector.mapjoin.fast;
+
+import java.io.IOException;
+import java.util.Arrays;
+
+import org.apache.hadoop.hive.serde2.ByteStream.Output;
+import org.apache.hadoop.hive.serde2.binarysortable.fast.BinarySortableSerializeWrite;
+import org.apache.hadoop.io.BytesWritable;
+
+public class TestKeyValueLong {
+
+  private BinarySortableSerializeWrite serializeWrite = new BinarySortableSerializeWrite(1);
+  private Output output = new Output();
+
+  public TestKeyValueLong() {
+  }
+
+  public BytesWritable getLongAsBytesWritable(long longValue) throws IOException {
+    serializeWrite.set(output);
+    serializeWrite.writeLong(longValue);
+    byte[] bytes = Arrays.copyOf(output.getData(), output.getLength());
+    return new BytesWritable(bytes);
+  }
+}
\ No newline at end of file
diff --git ql/src/test/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/TestLongKeyValueWriter.java ql/src/test/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/TestLongKeyValueWriter.java
new file mode 100644
index 0000000..273fa51
--- /dev/null
+++ ql/src/test/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/TestLongKeyValueWriter.java
@@ -0,0 +1,82 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.exec.vector.mapjoin.fast;
+
+import java.io.IOException;
+
+import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTable;
+import org.apache.hadoop.hive.serde2.ByteStream.RandomAccessOutput;
+import org.apache.hadoop.hive.serde2.lazybinary.LazyBinaryUtils;
+import org.apache.hadoop.hive.serde2.SerDeException;
+import org.apache.hadoop.io.Writable;
+import org.apache.hive.common.util.HashCodeUtil;
+
+public class TestLongKeyValueWriter implements MapJoinHashTable.KeyValuePutWriter {
+
+  private long key;
+  private byte[] value;
+
+  public TestLongKeyValueWriter() {
+  }
+
+  void setLongKeyValue(long key, byte[] value) {
+    this.key = key;
+    this.value = value;
+  }
+
+  @Override
+  public void writeKey(RandomAccessOutput dest) throws SerDeException {
+    LazyBinaryUtils.writeVLong(dest, key);
+  }
+
+  @Override
+  public void writeValue(RandomAccessOutput dest) throws SerDeException {
+    try {
+      dest.write(value);
+    } catch (Exception e) {
+      throw new SerDeException(e);
+    }
+  }
+
+  @Override
+  public byte updateStateByte(Byte previousValue) {
+    return 0;
+  }
+
+  @Override
+  public void setKeyValue(Writable key, Writable value) throws SerDeException,
+      IOException {
+    throw new RuntimeException("Not used");
+  }
+
+  @Override
+  public boolean hasHashCode() {
+    return true;
+  }
+
+  @Override
+  public int getKeyHashCode() throws SerDeException {
+    return HashCodeUtil.calculateLongHashCode(key);
+  }
+
+  @Override
+  public long getLongKey() {
+    throw new RuntimeException("Not used");
+  }
+}
\ No newline at end of file
diff --git ql/src/test/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/TestVectorMapJoinFastLongHashMap.java ql/src/test/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/TestVectorMapJoinFastLongHashMap.java
index a45275b..129ad1d 100644
--- ql/src/test/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/TestVectorMapJoinFastLongHashMap.java
+++ ql/src/test/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/TestVectorMapJoinFastLongHashMap.java
@@ -18,12 +18,17 @@
 package org.apache.hadoop.hive.ql.exec.vector.mapjoin.fast;
+import java.io.IOException;
 import java.util.Random;
-import org.apache.hadoop.hive.ql.exec.JoinUtil;
-import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashMapResult;
+import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashMapResult;
+import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTable;
+import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableManage.KeyValuePut;
+import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableResult;
 import org.apache.hadoop.hive.ql.exec.vector.mapjoin.fast.VectorMapJoinFastLongHashMap;
 import org.apache.hadoop.hive.ql.plan.VectorMapJoinDesc.HashTableKeyType;
+import org.apache.hadoop.io.BytesWritable;
+import org.apache.hive.common.util.HashCodeUtil;
 import org.junit.Test;
 import static org.junit.Assert.*;
@@ -33,100 +38,148 @@
   @Test
   public void testPutGetOne() throws Exception {
     random = new Random(47496);
+    TestKeyValueLong testKeyValueLong = new TestKeyValueLong();
-    VectorMapJoinFastLongHashMap map =
-        new VectorMapJoinFastLongHashMap(false, false, HashTableKeyType.LONG, CAPACITY, LOAD_FACTOR, WB_SIZE);
+    VectorMapJoinFastHashTableFactory factory =
+        new VectorMapJoinFastHashTableFactory(HashTableKeyType.LONG);
+    MapJoinHashMapResult hashMapResult = factory.createHashMapResult();
+    KeyValuePut kvWriter = factory.createKeyValuePut();
+
+
MapJoinHashTable map = + factory.createHashTable(CAPACITY, LOAD_FACTOR, WB_SIZE, 0); RandomLongStream randomLongKeyStream = new RandomLongStream(random); RandomByteArrayStream randomByteArrayValueStream = new RandomByteArrayStream(random); long key = randomLongKeyStream.next(); byte[] value = randomByteArrayValueStream.next(); - map.putRow(key, value); - verifyHashMapResult(map, key, randomByteArrayValueStream.get(0)); + kvWriter.setKeyValue(testKeyValueLong.getLongAsBytesWritable(key), new BytesWritable(value)); + map.put(kvWriter); + verifyHashMapResult(map, key, randomByteArrayValueStream.get(0), + hashMapResult); key = randomLongKeyStream.next(); value = randomByteArrayValueStream.next(); - map.putRow(key, value); - verifyHashMapResult(map, key, randomByteArrayValueStream.get(1)); + kvWriter.setKeyValue(testKeyValueLong.getLongAsBytesWritable(key), new BytesWritable(value)); + map.put(kvWriter); + verifyHashMapResult(map, key, randomByteArrayValueStream.get(1), + hashMapResult); } @Test public void testPutGetMultiple() throws Exception { random = new Random(2990); + TestKeyValueLong testKeyValueLong = new TestKeyValueLong(); + + VectorMapJoinFastHashTableFactory factory = + new VectorMapJoinFastHashTableFactory(HashTableKeyType.LONG); + MapJoinHashMapResult hashMapResult = factory.createHashMapResult(); + KeyValuePut kvWriter = factory.createKeyValuePut(); - VectorMapJoinFastLongHashMap map = new VectorMapJoinFastLongHashMap(false, false, HashTableKeyType.LONG, CAPACITY, LOAD_FACTOR, WB_SIZE); + MapJoinHashTable map = + factory.createHashTable(CAPACITY, LOAD_FACTOR, WB_SIZE, 0); RandomLongStream randomLongKeyStream = new RandomLongStream(random); RandomByteArrayStream randomByteArrayValueStream = new RandomByteArrayStream(random); long key = randomLongKeyStream.next(); byte[] value = randomByteArrayValueStream.next(); - map.putRow(key, value); - verifyHashMapResult(map, key, value); + kvWriter.setKeyValue(testKeyValueLong.getLongAsBytesWritable(key), new BytesWritable(value)); + map.put(kvWriter); + + verifyHashMapResult(map, key, value, hashMapResult); // Same key, multiple values. 
for (int i = 0; i < 3; ++i) { value = randomByteArrayValueStream.next(); - map.putRow(key, value); - verifyHashMapResult(map, key, randomByteArrayValueStream); + kvWriter.setKeyValue(testKeyValueLong.getLongAsBytesWritable(key), new BytesWritable(value)); + map.put(kvWriter); + verifyHashMapResult(map, key, randomByteArrayValueStream, + hashMapResult); } } @Test public void testGetNonExistent() throws Exception { random = new Random(16916); + TestKeyValueLong testKeyValueLong = new TestKeyValueLong(); + + VectorMapJoinFastHashTableFactory factory = + new VectorMapJoinFastHashTableFactory(HashTableKeyType.LONG); + MapJoinHashMapResult hashMapResult = factory.createHashMapResult(); + KeyValuePut kvWriter = factory.createKeyValuePut(); - VectorMapJoinFastLongHashMap map = new VectorMapJoinFastLongHashMap(false, false, HashTableKeyType.LONG, CAPACITY, LOAD_FACTOR, WB_SIZE); + MapJoinHashTable map = + factory.createHashTable(CAPACITY, LOAD_FACTOR, WB_SIZE, 0); RandomLongStream randomLongKeyStream = new RandomLongStream(random); RandomByteArrayStream randomByteArrayValueStream = new RandomByteArrayStream(random); long key = randomLongKeyStream.next(); byte[] value = randomByteArrayValueStream.next(); - map.putRow(key, value); + kvWriter.setKeyValue(testKeyValueLong.getLongAsBytesWritable(key), new BytesWritable(value)); + map.put(kvWriter); key += 1; - map.putRow(key, value); + kvWriter.setKeyValue(testKeyValueLong.getLongAsBytesWritable(key), new BytesWritable(value)); + map.put(kvWriter); key += 1; - VectorMapJoinHashMapResult hashMapResult = map.createHashMapResult(); - JoinUtil.JoinResult joinResult = map.lookup(key, hashMapResult); - assertTrue(joinResult == JoinUtil.JoinResult.NOMATCH); + int hashCode = HashCodeUtil.calculateLongHashCode(key); + map.hashMapLookup(key, hashCode, hashMapResult); + MapJoinHashTableResult.MapJoinResult lookupResult = hashMapResult.getMapJoinResult(); + assertTrue(lookupResult == MapJoinHashTableResult.MapJoinResult.NO_MATCH); assertTrue(!hashMapResult.hasRows()); } @Test public void testPutWithFullMap() throws Exception { random = new Random(26078); + TestKeyValueLong testKeyValueLong = new TestKeyValueLong(); + + VectorMapJoinFastHashTableFactory factory = + new VectorMapJoinFastHashTableFactory(HashTableKeyType.LONG); + MapJoinHashMapResult hashMapResult = factory.createHashMapResult(); + KeyValuePut kvWriter = factory.createKeyValuePut(); // Make sure the map does not expand; should be able to find space. - VectorMapJoinFastLongHashMap map = new VectorMapJoinFastLongHashMap(false, false, HashTableKeyType.LONG, CAPACITY, 1f, WB_SIZE); + MapJoinHashTable map = + factory.createHashTable(CAPACITY, 1f, WB_SIZE, 0); RandomLongStream randomLongKeyStream = new RandomLongStream(random); RandomByteArrayStream randomByteArrayValueStream = new RandomByteArrayStream(random); for (int i = 0; i < CAPACITY; ++i) { long key = randomLongKeyStream.next(); byte[] value = randomByteArrayValueStream.next(); - map.putRow(key, value); + kvWriter.setKeyValue(testKeyValueLong.getLongAsBytesWritable(key), new BytesWritable(value)); + map.put(kvWriter); } for (int i = 0; i < randomLongKeyStream.size(); ++i) { - verifyHashMapResult(map, randomLongKeyStream.get(i), randomByteArrayValueStream.get(i)); + verifyHashMapResult(map, randomLongKeyStream.get(i), randomByteArrayValueStream.get(i), + hashMapResult); } // assertEquals(CAPACITY, map.getCapacity()); // Get of non-existent key should terminate.. 
long anotherKey = randomLongKeyStream.next(); - VectorMapJoinHashMapResult hashMapResult = map.createHashMapResult(); - JoinUtil.JoinResult joinResult = map.lookup(anotherKey, hashMapResult); - assertTrue(joinResult == JoinUtil.JoinResult.NOMATCH); + int hashCode = HashCodeUtil.calculateLongHashCode(anotherKey); + map.hashMapLookup(anotherKey, hashCode, hashMapResult); + MapJoinHashTableResult.MapJoinResult lookupResult = hashMapResult.getMapJoinResult(); + assertTrue(lookupResult == MapJoinHashTableResult.MapJoinResult.NO_MATCH); } @Test public void testExpand() throws Exception { random = new Random(22470); + TestKeyValueLong testKeyValueLong = new TestKeyValueLong(); + + VectorMapJoinFastHashTableFactory factory = + new VectorMapJoinFastHashTableFactory(HashTableKeyType.LONG); + MapJoinHashMapResult hashMapResult = factory.createHashMapResult(); + KeyValuePut kvWriter = factory.createKeyValuePut(); // Start with capacity 1; make sure we expand on every put. - VectorMapJoinFastLongHashMap map = new VectorMapJoinFastLongHashMap(false, false, HashTableKeyType.LONG, 1, 0.0000001f, WB_SIZE); + MapJoinHashTable map = + factory.createHashTable(1, 0.0000001f, WB_SIZE, 0); RandomLongStream randomLongKeyStream = new RandomLongStream(random); RandomByteArrayStream randomByteArrayValueStream = new RandomByteArrayStream(random); @@ -134,9 +187,11 @@ public void testExpand() throws Exception { for (int i = 0; i < 18; ++i) { long key = randomLongKeyStream.next(); byte[] value = randomByteArrayValueStream.next(); - map.putRow(key, value); + kvWriter.setKeyValue(testKeyValueLong.getLongAsBytesWritable(key), new BytesWritable(value)); + map.put(kvWriter); for (int j = 0; j <= i; ++j) { - verifyHashMapResult(map, randomLongKeyStream.get(j), randomByteArrayValueStream.get(j)); + verifyHashMapResult(map, randomLongKeyStream.get(j), randomByteArrayValueStream.get(j), + hashMapResult); } } // assertEquals(1 << 18, map.getCapacity()); @@ -145,9 +200,16 @@ public void testExpand() throws Exception { @Test public void testLarge() throws Exception { random = new Random(40719); + TestKeyValueLong testKeyValueLong = new TestKeyValueLong(); + + VectorMapJoinFastHashTableFactory factory = + new VectorMapJoinFastHashTableFactory(HashTableKeyType.LONG); + MapJoinHashMapResult hashMapResult = factory.createHashMapResult(); + KeyValuePut kvWriter = factory.createKeyValuePut(); // Use a large capacity that doesn't require expansion, yet. 
- VectorMapJoinFastLongHashMap map = new VectorMapJoinFastLongHashMap(false, false, HashTableKeyType.LONG, LARGE_CAPACITY, LOAD_FACTOR, LARGE_WB_SIZE); + MapJoinHashTable map = + factory.createHashTable(LARGE_CAPACITY, LOAD_FACTOR, LARGE_WB_SIZE, 0); RandomLongStream randomLongKeyStream = new RandomLongStream(random); @@ -159,20 +221,30 @@ public void testLarge() throws Exception { long key = randomLongKeyStream.next(); for (int v = 0; v < count; v++) { byte[] value = randomByteArrayValueStreams[i].next(); - map.putRow(key, value); + kvWriter.setKeyValue(testKeyValueLong.getLongAsBytesWritable(key), new BytesWritable(value)); + map.put(kvWriter); } } for (int i = 0; i < largeSize; i++) { - verifyHashMapResult(map, randomLongKeyStream.get(i), randomByteArrayValueStreams[i]); + verifyHashMapResult(map, randomLongKeyStream.get(i), randomByteArrayValueStreams[i], + hashMapResult); } } @Test public void testLargeAndExpand() throws Exception { random = new Random(46809); + TestKeyValueLong testKeyValueLong = new TestKeyValueLong(); + + VectorMapJoinFastHashTableFactory factory = + new VectorMapJoinFastHashTableFactory(HashTableKeyType.LONG); + MapJoinHashMapResult hashMapResult = factory.createHashMapResult(); + KeyValuePut kvWriter = factory.createKeyValuePut(); // Use a large capacity that doesn't require expansion, yet. - VectorMapJoinFastLongHashMap map = new VectorMapJoinFastLongHashMap(false, false, HashTableKeyType.LONG, MODERATE_CAPACITY, LOAD_FACTOR, MODERATE_WB_SIZE); + MapJoinHashTable map = + factory.createHashTable( + MODERATE_CAPACITY, LOAD_FACTOR, MODERATE_WB_SIZE, 0); RandomLongStream randomLongKeyStream = new RandomLongStream(random); @@ -184,32 +256,40 @@ public void testLargeAndExpand() throws Exception { long key = randomLongKeyStream.next(); for (int v = 0; v < count; v++) { byte[] value = randomByteArrayValueStreams[i].next(); - map.putRow(key, value); + kvWriter.setKeyValue(testKeyValueLong.getLongAsBytesWritable(key), new BytesWritable(value)); + map.put(kvWriter); } } for (int i = 0; i < largeSize; i++) { - verifyHashMapResult(map, randomLongKeyStream.get(i), randomByteArrayValueStreams[i]); + verifyHashMapResult(map, randomLongKeyStream.get(i), randomByteArrayValueStreams[i], + hashMapResult); } } - private void verifyHashMapResult(VectorMapJoinFastLongHashMap map, long key, - RandomByteArrayStream randomByteArrayValueStream) { + private void verifyHashMapResult(MapJoinHashTable map, long key, + RandomByteArrayStream randomByteArrayValueStream, + MapJoinHashMapResult hashMapResult) + throws IOException { - VectorMapJoinHashMapResult hashMapResult = map.createHashMapResult(); - JoinUtil.JoinResult joinResult = map.lookup(key, hashMapResult); - if (joinResult != JoinUtil.JoinResult.MATCH) { + int hashCode = HashCodeUtil.calculateLongHashCode(key); + map.hashMapLookup(key, hashCode, hashMapResult); + MapJoinHashTableResult.MapJoinResult lookupResult = hashMapResult.getMapJoinResult(); + if (lookupResult != MapJoinHashTableResult.MapJoinResult.MATCH) { assertTrue(false); } CommonFastHashTable.verifyHashMapResult(hashMapResult, randomByteArrayValueStream); } - private void verifyHashMapResult(VectorMapJoinFastLongHashMap map, long key, - byte[] valueBytes) { + private void verifyHashMapResult(MapJoinHashTable map, + long key, byte[] valueBytes, + MapJoinHashMapResult hashMapResult) + throws IOException { - VectorMapJoinHashMapResult hashMapResult = map.createHashMapResult(); - JoinUtil.JoinResult joinResult = map.lookup(key, hashMapResult); - if (joinResult != 
JoinUtil.JoinResult.MATCH) { + int hashCode = HashCodeUtil.calculateLongHashCode(key); + map.hashMapLookup(key, hashCode, hashMapResult); + MapJoinHashTableResult.MapJoinResult lookupResult = hashMapResult.getMapJoinResult(); + if (lookupResult != MapJoinHashTableResult.MapJoinResult.MATCH) { assertTrue(false); } diff --git ql/src/test/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/TestVectorMapJoinFastMultiKeyHashMap.java ql/src/test/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/TestVectorMapJoinFastMultiKeyHashMap.java index 944bda6..d3155c8 100644 --- ql/src/test/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/TestVectorMapJoinFastMultiKeyHashMap.java +++ ql/src/test/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/TestVectorMapJoinFastMultiKeyHashMap.java @@ -18,12 +18,16 @@ package org.apache.hadoop.hive.ql.exec.vector.mapjoin.fast; +import java.io.IOException; import java.util.Random; -import org.apache.hadoop.hive.ql.exec.JoinUtil; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashMapResult; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.fast.VectorMapJoinFastMultiKeyHashMap; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashMapResult; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTable; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableManage.KeyValuePut; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableResult; import org.apache.hadoop.hive.ql.plan.VectorMapJoinDesc.HashTableKeyType; +import org.apache.hadoop.io.BytesWritable; +import org.apache.hive.common.util.HashCodeUtil; import org.junit.Test; import static org.junit.Assert.*; @@ -34,42 +38,61 @@ public void testPutGetOne() throws Exception { random = new Random(47496); + VectorMapJoinFastHashTableFactory factory = + new VectorMapJoinFastHashTableFactory(HashTableKeyType.MULTI_KEY); + MapJoinHashMapResult hashMapResult = factory.createHashMapResult(); + KeyValuePut kvWriter = factory.createKeyValuePut(); + VectorMapJoinFastMultiKeyHashMap map = - new VectorMapJoinFastMultiKeyHashMap(false, CAPACITY, LOAD_FACTOR, WB_SIZE); + (VectorMapJoinFastMultiKeyHashMap) factory.createHashTable(CAPACITY, LOAD_FACTOR, WB_SIZE, 0); RandomByteArrayStream randomByteArrayKeyStream = new RandomByteArrayStream(random); RandomByteArrayStream randomByteArrayValueStream = new RandomByteArrayStream(random); byte[] key = randomByteArrayKeyStream.next(); byte[] value = randomByteArrayValueStream.next(); - map.putRow(key, value); - verifyHashMapResult(map, key, randomByteArrayValueStream.get(0)); + kvWriter.setKeyValue(new BytesWritable(key), new BytesWritable(value)); + map.put(kvWriter); + verifyHashMapResult(map, key, randomByteArrayValueStream.get(0), + hashMapResult); key = randomByteArrayKeyStream.next(); value = randomByteArrayValueStream.next(); - map.putRow(key, value); - verifyHashMapResult(map, key, randomByteArrayValueStream.get(1)); + kvWriter.setKeyValue(new BytesWritable(key), new BytesWritable(value)); + map.put(kvWriter); + verifyHashMapResult(map, key, randomByteArrayValueStream.get(1), + hashMapResult); } @Test public void testPutGetMultiple() throws Exception { random = new Random(2990); - VectorMapJoinFastMultiKeyHashMap map = new VectorMapJoinFastMultiKeyHashMap(false, CAPACITY, LOAD_FACTOR, WB_SIZE); + VectorMapJoinFastHashTableFactory factory = + new VectorMapJoinFastHashTableFactory(HashTableKeyType.MULTI_KEY); + MapJoinHashMapResult hashMapResult = 
factory.createHashMapResult(); + KeyValuePut kvWriter = factory.createKeyValuePut(); + + VectorMapJoinFastMultiKeyHashMap map = + (VectorMapJoinFastMultiKeyHashMap) factory.createHashTable(CAPACITY, LOAD_FACTOR, WB_SIZE, 0); RandomByteArrayStream randomByteArrayKeyStream = new RandomByteArrayStream(random); RandomByteArrayStream randomByteArrayValueStream = new RandomByteArrayStream(random); byte[] key = randomByteArrayKeyStream.next(); byte[] value = randomByteArrayValueStream.next(); - map.putRow(key, value); - verifyHashMapResult(map, key, value); + kvWriter.setKeyValue(new BytesWritable(key), new BytesWritable(value)); + map.put(kvWriter); + verifyHashMapResult(map, key, value, + hashMapResult); // Same key, multiple values. for (int i = 0; i < 3; ++i) { value = randomByteArrayValueStream.next(); - map.putRow(key, value); - verifyHashMapResult(map, key, randomByteArrayValueStream); + kvWriter.setKeyValue(new BytesWritable(key), new BytesWritable(value)); + map.put(kvWriter); + verifyHashMapResult(map, key, randomByteArrayValueStream, + hashMapResult); } } @@ -77,22 +100,31 @@ public void testPutGetMultiple() throws Exception { public void testGetNonExistent() throws Exception { random = new Random(16916); - VectorMapJoinFastMultiKeyHashMap map = new VectorMapJoinFastMultiKeyHashMap(false, CAPACITY, LOAD_FACTOR, WB_SIZE); + VectorMapJoinFastHashTableFactory factory = + new VectorMapJoinFastHashTableFactory(HashTableKeyType.MULTI_KEY); + MapJoinHashMapResult hashMapResult = factory.createHashMapResult(); + KeyValuePut kvWriter = factory.createKeyValuePut(); + + VectorMapJoinFastMultiKeyHashMap map = + (VectorMapJoinFastMultiKeyHashMap) factory.createHashTable(CAPACITY, LOAD_FACTOR, WB_SIZE, 0); RandomByteArrayStream randomByteArrayKeyStream = new RandomByteArrayStream(random); RandomByteArrayStream randomByteArrayValueStream = new RandomByteArrayStream(random); byte[] key = randomByteArrayKeyStream.next(); byte[] value = randomByteArrayValueStream.next(); - map.putRow(key, value); + kvWriter.setKeyValue(new BytesWritable(key), new BytesWritable(value)); + map.put(kvWriter); key[0] = (byte) (key[0] + 1); - map.putRow(key, value); + kvWriter.setKeyValue(new BytesWritable(key), new BytesWritable(value)); + map.put(kvWriter); key[0] = (byte) (key[0] + 1); - VectorMapJoinHashMapResult hashMapResult = map.createHashMapResult(); - JoinUtil.JoinResult joinResult = map.lookup(key, 0, key.length, hashMapResult); - assertTrue(joinResult == JoinUtil.JoinResult.NOMATCH); + int hashCode = HashCodeUtil.murmurHash(key, 0, key.length); + map.hashMapLookup(key, 0, key.length, hashCode, hashMapResult); + MapJoinHashTableResult.MapJoinResult lookupResult = hashMapResult.getMapJoinResult(); + assertTrue(lookupResult == MapJoinHashTableResult.MapJoinResult.NO_MATCH); assertTrue(!hashMapResult.hasRows()); } @@ -100,33 +132,48 @@ public void testGetNonExistent() throws Exception { public void testPutWithFullMap() throws Exception { random = new Random(26078); + VectorMapJoinFastHashTableFactory factory = + new VectorMapJoinFastHashTableFactory(HashTableKeyType.MULTI_KEY); + MapJoinHashMapResult hashMapResult = factory.createHashMapResult(); + KeyValuePut kvWriter = factory.createKeyValuePut(); + // Make sure the map does not expand; should be able to find space. 
- VectorMapJoinFastMultiKeyHashMap map = new VectorMapJoinFastMultiKeyHashMap(false, CAPACITY, 1f, WB_SIZE); + VectorMapJoinFastMultiKeyHashMap map = + (VectorMapJoinFastMultiKeyHashMap) factory.createHashTable(CAPACITY, 1f, WB_SIZE, 0); RandomByteArrayStream randomByteArrayKeyStream = new RandomByteArrayStream(random); RandomByteArrayStream randomByteArrayValueStream = new RandomByteArrayStream(random); for (int i = 0; i < CAPACITY; ++i) { byte[] key = randomByteArrayKeyStream.next(); byte[] value = randomByteArrayValueStream.next(); - map.putRow(key, value); + kvWriter.setKeyValue(new BytesWritable(key), new BytesWritable(value)); + map.put(kvWriter); } for (int i = 0; i < randomByteArrayKeyStream.size(); ++i) { - verifyHashMapResult(map, randomByteArrayKeyStream.get(i), randomByteArrayValueStream.get(i)); + verifyHashMapResult(map, randomByteArrayKeyStream.get(i), randomByteArrayValueStream.get(i), + hashMapResult); } // assertEquals(CAPACITY, map.getCapacity()); // Get of non-existent key should terminate.. byte[] anotherKey = randomByteArrayKeyStream.next(); - VectorMapJoinHashMapResult hashMapResult = map.createHashMapResult(); - JoinUtil.JoinResult joinResult = map.lookup(anotherKey, 0, anotherKey.length, hashMapResult); - assertTrue(joinResult == JoinUtil.JoinResult.NOMATCH); + int hashCode = HashCodeUtil.murmurHash(anotherKey, 0, anotherKey.length); + map.hashMapLookup(anotherKey, 0, anotherKey.length, hashCode, hashMapResult); + MapJoinHashTableResult.MapJoinResult lookupResult = hashMapResult.getMapJoinResult(); + assertTrue(lookupResult == MapJoinHashTableResult.MapJoinResult.NO_MATCH); } @Test public void testExpand() throws Exception { random = new Random(22470); + VectorMapJoinFastHashTableFactory factory = + new VectorMapJoinFastHashTableFactory(HashTableKeyType.MULTI_KEY); + MapJoinHashMapResult hashMapResult = factory.createHashMapResult(); + KeyValuePut kvWriter = factory.createKeyValuePut(); + // Start with capacity 1; make sure we expand on every put. - VectorMapJoinFastMultiKeyHashMap map = new VectorMapJoinFastMultiKeyHashMap(false, 1, 0.0000001f, WB_SIZE); + VectorMapJoinFastMultiKeyHashMap map = + (VectorMapJoinFastMultiKeyHashMap) factory.createHashTable(1, 0.0000001f, WB_SIZE, 0); RandomByteArrayStream randomByteArrayKeyStream = new RandomByteArrayStream(random); RandomByteArrayStream randomByteArrayValueStream = new RandomByteArrayStream(random); @@ -134,9 +181,11 @@ public void testExpand() throws Exception { for (int i = 0; i < 18; ++i) { byte[] key = randomByteArrayKeyStream.next(); byte[] value = randomByteArrayValueStream.next(); - map.putRow(key, value); + kvWriter.setKeyValue(new BytesWritable(key), new BytesWritable(value)); + map.put(kvWriter); for (int j = 0; j <= i; ++j) { - verifyHashMapResult(map, randomByteArrayKeyStream.get(j), randomByteArrayValueStream.get(j)); + verifyHashMapResult(map, randomByteArrayKeyStream.get(j), randomByteArrayValueStream.get(j), + hashMapResult); } } // assertEquals(1 << 18, map.getCapacity()); @@ -146,8 +195,14 @@ public void testExpand() throws Exception { public void testLarge() throws Exception { random = new Random(5231); + VectorMapJoinFastHashTableFactory factory = + new VectorMapJoinFastHashTableFactory(HashTableKeyType.MULTI_KEY); + MapJoinHashMapResult hashMapResult = factory.createHashMapResult(); + KeyValuePut kvWriter = factory.createKeyValuePut(); + // Use a large capacity that doesn't require expansion, yet. 
- VectorMapJoinFastMultiKeyHashMap map = new VectorMapJoinFastMultiKeyHashMap(false, LARGE_CAPACITY, LOAD_FACTOR, LARGE_WB_SIZE); + VectorMapJoinFastMultiKeyHashMap map = + (VectorMapJoinFastMultiKeyHashMap) factory.createHashTable(LARGE_CAPACITY, LOAD_FACTOR, LARGE_WB_SIZE, 0); RandomByteArrayStream randomByteArrayKeyStream = new RandomByteArrayStream(random, 10); @@ -157,19 +212,22 @@ public void testLarge() throws Exception { randomByteArrayValueStreams[i] = new RandomByteArrayStream(random); int count = generateLargeCount(); byte[] key = randomByteArrayKeyStream.next(); - VectorMapJoinHashMapResult hashMapResult = map.createHashMapResult(); - JoinUtil.JoinResult joinResult = map.lookup(key, 0, key.length, hashMapResult); - if (joinResult == JoinUtil.JoinResult.MATCH) { + int hashCode = HashCodeUtil.murmurHash(key, 0, key.length); + map.hashMapLookup(key, 0, key.length, hashCode, hashMapResult); + MapJoinHashTableResult.MapJoinResult lookupResult = hashMapResult.getMapJoinResult(); + if (lookupResult == MapJoinHashTableResult.MapJoinResult.MATCH) { // A problem or need different random seed / longer key? assertTrue(false); } for (int v = 0; v < count; v++) { byte[] value = randomByteArrayValueStreams[i].next(); - map.putRow(key, value); + kvWriter.setKeyValue(new BytesWritable(key), new BytesWritable(value)); + map.put(kvWriter); } } for (int i = 0; i < largeSize; i++) { - verifyHashMapResult(map, randomByteArrayKeyStream.get(i), randomByteArrayValueStreams[i]); + verifyHashMapResult(map, randomByteArrayKeyStream.get(i), randomByteArrayValueStreams[i], + hashMapResult); } } @@ -177,8 +235,14 @@ public void testLarge() throws Exception { public void testLargeAndExpand() throws Exception { random = new Random(46809); + VectorMapJoinFastHashTableFactory factory = + new VectorMapJoinFastHashTableFactory(HashTableKeyType.MULTI_KEY); + MapJoinHashMapResult hashMapResult = factory.createHashMapResult(); + KeyValuePut kvWriter = factory.createKeyValuePut(); + // Use a large capacity that doesn't require expansion, yet. - VectorMapJoinFastMultiKeyHashMap map = new VectorMapJoinFastMultiKeyHashMap(false, MODERATE_CAPACITY, LOAD_FACTOR, MODERATE_WB_SIZE); + VectorMapJoinFastMultiKeyHashMap map = + (VectorMapJoinFastMultiKeyHashMap) factory.createHashTable(MODERATE_CAPACITY, LOAD_FACTOR, MODERATE_WB_SIZE, 0); RandomByteArrayStream randomByteArrayKeyStream = new RandomByteArrayStream(random, 10); @@ -188,40 +252,47 @@ public void testLargeAndExpand() throws Exception { randomByteArrayValueStreams[i] = new RandomByteArrayStream(random); int count = generateLargeCount(); byte[] key = randomByteArrayKeyStream.next(); - VectorMapJoinHashMapResult hashMapResult = map.createHashMapResult(); - JoinUtil.JoinResult joinResult = map.lookup(key, 0, key.length, hashMapResult); - if (joinResult == JoinUtil.JoinResult.MATCH) { + int hashCode = HashCodeUtil.murmurHash(key, 0, key.length); + map.hashMapLookup(key, 0, key.length, hashCode, hashMapResult); + MapJoinHashTableResult.MapJoinResult lookupResult = hashMapResult.getMapJoinResult(); + if (lookupResult == MapJoinHashTableResult.MapJoinResult.MATCH) { // A problem or need different random seed / longer key? 
assertTrue(false); } for (int v = 0; v < count; v++) { byte[] value = randomByteArrayValueStreams[i].next(); - map.putRow(key, value); + kvWriter.setKeyValue(new BytesWritable(key), new BytesWritable(value)); + map.put(kvWriter); } } for (int i = 0; i < largeSize; i++) { - verifyHashMapResult(map, randomByteArrayKeyStream.get(i), randomByteArrayValueStreams[i]); + verifyHashMapResult(map, randomByteArrayKeyStream.get(i), randomByteArrayValueStreams[i], + hashMapResult); } } - private void verifyHashMapResult(VectorMapJoinFastMultiKeyHashMap map, byte[] key, - RandomByteArrayStream randomByteArrayValueStream) { + private void verifyHashMapResult(MapJoinHashTable map, byte[] key, + RandomByteArrayStream randomByteArrayValueStream, + MapJoinHashMapResult hashMapResult) throws IOException { - VectorMapJoinHashMapResult hashMapResult = map.createHashMapResult(); - JoinUtil.JoinResult joinResult = map.lookup(key, 0, key.length, hashMapResult); - if (joinResult != JoinUtil.JoinResult.MATCH) { + int hashCode = HashCodeUtil.murmurHash(key, 0, key.length); + map.hashMapLookup(key, 0, key.length, hashCode, hashMapResult); + MapJoinHashTableResult.MapJoinResult lookupResult = hashMapResult.getMapJoinResult(); + if (lookupResult != MapJoinHashTableResult.MapJoinResult.MATCH) { assertTrue(false); } CommonFastHashTable.verifyHashMapResult(hashMapResult, randomByteArrayValueStream); } - private void verifyHashMapResult(VectorMapJoinFastMultiKeyHashMap map, byte[] key, - byte[] valueBytes) { + private void verifyHashMapResult(MapJoinHashTable map, byte[] key, + byte[] valueBytes, + MapJoinHashMapResult hashMapResult) throws IOException { - VectorMapJoinHashMapResult hashMapResult = map.createHashMapResult(); - JoinUtil.JoinResult joinResult = map.lookup(key, 0, key.length, hashMapResult); - if (joinResult != JoinUtil.JoinResult.MATCH) { + int hashCode = HashCodeUtil.murmurHash(key, 0, key.length); + map.hashMapLookup(key, 0, key.length, hashCode, hashMapResult); + MapJoinHashTableResult.MapJoinResult lookupResult = hashMapResult.getMapJoinResult(); + if (lookupResult != MapJoinHashTableResult.MapJoinResult.MATCH) { assertTrue(false); } diff --git ql/src/test/results/clientpositive/tez/vector_decimal_mapjoin_bug.q.out ql/src/test/results/clientpositive/tez/vector_decimal_mapjoin_bug.q.out new file mode 100644 index 0000000..3e2d890 --- /dev/null +++ ql/src/test/results/clientpositive/tez/vector_decimal_mapjoin_bug.q.out @@ -0,0 +1,379 @@ +PREHOOK: query: CREATE TABLE over1k(t tinyint, + si smallint, + i int, + b bigint, + f float, + d double, + bo boolean, + s string, + ts timestamp, + dec decimal(4,2), + bin binary) +ROW FORMAT DELIMITED FIELDS TERMINATED BY '|' +STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@over1k +POSTHOOK: query: CREATE TABLE over1k(t tinyint, + si smallint, + i int, + b bigint, + f float, + d double, + bo boolean, + s string, + ts timestamp, + dec decimal(4,2), + bin binary) +ROW FORMAT DELIMITED FIELDS TERMINATED BY '|' +STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@over1k +PREHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/over1k' OVERWRITE INTO TABLE over1k +PREHOOK: type: LOAD +#### A masked pattern was here #### +PREHOOK: Output: default@over1k +POSTHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/over1k' OVERWRITE INTO TABLE over1k +POSTHOOK: type: LOAD +#### A masked pattern was here #### +POSTHOOK: Output: default@over1k +PREHOOK: query: 
CREATE TABLE t1(dec decimal(4,2)) STORED AS ORC +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@t1 +POSTHOOK: query: CREATE TABLE t1(dec decimal(4,2)) STORED AS ORC +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@t1 +PREHOOK: query: INSERT INTO TABLE t1 select dec from over1k where dec >= 45.0 LIMIT 100 +PREHOOK: type: QUERY +PREHOOK: Input: default@over1k +PREHOOK: Output: default@t1 +POSTHOOK: query: INSERT INTO TABLE t1 select dec from over1k where dec >= 45.0 LIMIT 100 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@over1k +POSTHOOK: Output: default@t1 +POSTHOOK: Lineage: t1.dec SIMPLE [(over1k)over1k.FieldSchema(name:dec, type:decimal(4,2), comment:null), ] +PREHOOK: query: CREATE TABLE t2(dec decimal(4,0)) STORED AS ORC +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@t2 +POSTHOOK: query: CREATE TABLE t2(dec decimal(4,0)) STORED AS ORC +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@t2 +PREHOOK: query: INSERT INTO TABLE t2 select dec from over1k where dec >= 45.0 LIMIT 100 +PREHOOK: type: QUERY +PREHOOK: Input: default@over1k +PREHOOK: Output: default@t2 +POSTHOOK: query: INSERT INTO TABLE t2 select dec from over1k where dec >= 45.0 LIMIT 100 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@over1k +POSTHOOK: Output: default@t2 +POSTHOOK: Lineage: t2.dec EXPRESSION [(over1k)over1k.FieldSchema(name:dec, type:decimal(4,2), comment:null), ] +PREHOOK: query: select * from t1 +PREHOOK: type: QUERY +PREHOOK: Input: default@t1 +#### A masked pattern was here #### +POSTHOOK: query: select * from t1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1 +#### A masked pattern was here #### +45.00 +45.40 +45.69 +45.81 +45.94 +46.72 +46.91 +47.54 +48.17 +48.51 +48.73 +48.80 +48.81 +49.85 +50.26 +50.57 +51.36 +51.64 +51.91 +53.52 +54.01 +55.06 +55.16 +56.02 +56.33 +57.09 +57.63 +57.69 +58.05 +58.96 +59.56 +60.30 +60.88 +61.29 +61.96 +62.58 +63.06 +63.27 +63.90 +64.36 +64.39 +64.61 +64.88 +64.89 +65.77 +65.91 +66.89 +67.23 +67.54 +68.33 +68.98 +70.91 +71.59 +72.16 +72.82 +73.48 +73.61 +75.43 +75.85 +76.06 +76.21 +78.69 +79.00 +79.50 +80.12 +82.08 +82.21 +82.24 +82.99 +83.09 +84.56 +84.63 +85.44 +85.52 +85.91 +86.70 +87.99 +88.09 +90.16 +90.21 +90.69 +91.03 +91.42 +91.46 +92.06 +92.24 +92.47 +92.81 +92.95 +93.48 +93.79 +94.43 +95.26 +95.52 +96.97 +97.64 +97.87 +98.87 +99.18 +99.79 +PREHOOK: query: select * from t2 +PREHOOK: type: QUERY +PREHOOK: Input: default@t2 +#### A masked pattern was here #### +POSTHOOK: query: select * from t2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t2 +#### A masked pattern was here #### +100 +45 +45 +46 +46 +46 +47 +47 +48 +48 +49 +49 +49 +49 +50 +50 +51 +51 +52 +52 +54 +54 +55 +55 +56 +56 +57 +58 +58 +58 +59 +60 +60 +61 +61 +62 +63 +63 +63 +64 +64 +64 +65 +65 +65 +66 +66 +67 +67 +68 +68 +69 +71 +72 +72 +73 +73 +74 +75 +76 +76 +76 +79 +79 +80 +80 +82 +82 +82 +83 +83 +85 +85 +85 +86 +86 +87 +88 +88 +90 +90 +91 +91 +91 +91 +92 +92 +92 +93 +93 +93 +94 +94 +95 +96 +97 +98 +98 +99 +99 +PREHOOK: query: explain +select t1.dec, t2.dec from t1 join t2 on (t1.dec=t2.dec) +PREHOOK: type: QUERY +POSTHOOK: query: explain +select t1.dec, t2.dec from t1 join t2 on (t1.dec=t2.dec) +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez + Edges: + Map 1 <- Map 2 (BROADCAST_EDGE) +#### A masked pattern was here #### + Vertices: + 
Map 1 + Map Operator Tree: + TableScan + alias: t1 + Statistics: Num rows: 100 Data size: 11200 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: dec is not null (type: boolean) + Statistics: Num rows: 100 Data size: 11200 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: dec (type: decimal(4,2)) + outputColumnNames: _col0 + Statistics: Num rows: 100 Data size: 11200 Basic stats: COMPLETE Column stats: NONE + Map Join Operator + condition map: + Inner Join 0 to 1 + keys: + 0 _col0 (type: decimal(6,2)) + 1 _col0 (type: decimal(6,2)) + outputColumnNames: _col0, _col1 + input vertices: + 1 Map 2 + Statistics: Num rows: 110 Data size: 12320 Basic stats: COMPLETE Column stats: NONE + HybridGraceHashJoin: true + File Output Operator + compressed: false + Statistics: Num rows: 110 Data size: 12320 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Map 2 + Map Operator Tree: + TableScan + alias: t2 + Statistics: Num rows: 100 Data size: 11200 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: dec is not null (type: boolean) + Statistics: Num rows: 100 Data size: 11200 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: dec (type: decimal(4,0)) + outputColumnNames: _col0 + Statistics: Num rows: 100 Data size: 11200 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: decimal(6,2)) + sort order: + + Map-reduce partition columns: _col0 (type: decimal(6,2)) + Statistics: Num rows: 100 Data size: 11200 Basic stats: COMPLETE Column stats: NONE + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: -- SORT_QUERY_RESULTS + +select t1.dec, t2.dec from t1 join t2 on (t1.dec=t2.dec) +PREHOOK: type: QUERY +PREHOOK: Input: default@t1 +PREHOOK: Input: default@t2 +#### A masked pattern was here #### +POSTHOOK: query: -- SORT_QUERY_RESULTS + +select t1.dec, t2.dec from t1 join t2 on (t1.dec=t2.dec) +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1 +POSTHOOK: Input: default@t2 +#### A masked pattern was here #### +45.00 45 +45.00 45 +79.00 79 +79.00 79 diff --git serde/src/java/org/apache/hadoop/hive/serde2/WriteBuffers.java serde/src/java/org/apache/hadoop/hive/serde2/WriteBuffers.java index 5900428..7640af4 100644 --- serde/src/java/org/apache/hadoop/hive/serde2/WriteBuffers.java +++ serde/src/java/org/apache/hadoop/hive/serde2/WriteBuffers.java @@ -50,6 +50,11 @@ public void clear() { buffer = null; bufferIndex = offset = -1; } + public void set(Position pos) { + buffer = pos.buffer; + bufferIndex = pos.bufferIndex; + offset = pos.offset; + } } Position writePos = new Position(); // Position where we'd write @@ -538,6 +543,21 @@ public long readNByteLong(long offset, int bytes, Position readPos) { return v; } + public long readNByteLong(int bytes, Position readPos) { + long v = 0; + if (isAllInOneReadBuffer(bytes, readPos)) { + for (int i = 0; i < bytes; ++i) { + v = (v << 8) + (readPos.buffer[readPos.offset + i] & 0xff); + } + readPos.offset += bytes; + } else { + for (int i = 0; i < bytes; ++i) { + v = (v << 8) + (readNextByte(readPos) & 0xff); + } + } + return v; + } + public void writeFiveByteULong(long offset, long v) { int prevIndex = writePos.bufferIndex, prevOffset = writePos.offset; setWritePoint(offset); @@ -560,6 +580,23 @@ public void 
writeFiveByteULong(long offset, long v) { writePos.offset = prevOffset; } + public void writeFiveByteULong(long v) { + if (isAllInOneWriteBuffer(5)) { + writePos.buffer[writePos.offset] = (byte)(v >>> 32); + writePos.buffer[writePos.offset + 1] = (byte)(v >>> 24); + writePos.buffer[writePos.offset + 2] = (byte)(v >>> 16); + writePos.buffer[writePos.offset + 3] = (byte)(v >>> 8); + writePos.buffer[writePos.offset + 4] = (byte)(v); + writePos.offset += 5; + } else { + write((byte)(v >>> 32)); + write((byte)(v >>> 24)); + write((byte)(v >>> 16)); + write((byte)(v >>> 8)); + write((byte)(v)); + } + } + public int readInt(long offset) { return (int)readNByteLong(offset, 4); } diff --git serde/src/java/org/apache/hadoop/hive/serde2/binarysortable/fast/BinarySortableDeserializeRead.java serde/src/java/org/apache/hadoop/hive/serde2/binarysortable/fast/BinarySortableDeserializeRead.java index c6ff748..cdcdc6d 100644 --- serde/src/java/org/apache/hadoop/hive/serde2/binarysortable/fast/BinarySortableDeserializeRead.java +++ serde/src/java/org/apache/hadoop/hive/serde2/binarysortable/fast/BinarySortableDeserializeRead.java @@ -91,8 +91,8 @@ /* * Use this constructor when only ascending sort order is used. */ - public BinarySortableDeserializeRead(PrimitiveTypeInfo[] primitiveTypeInfos) { - this(primitiveTypeInfos, null); + public BinarySortableDeserializeRead(TypeInfo[] typeInfos) { + this(typeInfos, null); } public BinarySortableDeserializeRead(TypeInfo[] typeInfos, @@ -133,6 +133,14 @@ public void set(byte[] bytes, int offset, int length) { } /* + * Get context for current row being read for error reporting. + */ + @Override + public String getCurrentContext() { + throw new RuntimeException("Not implemented yet"); + } + + /* * Reads the NULL information for a field. * * @return Returns true when the field is NULL; reading is positioned to the next field. diff --git serde/src/java/org/apache/hadoop/hive/serde2/fast/DeserializeRead.java serde/src/java/org/apache/hadoop/hive/serde2/fast/DeserializeRead.java index c2b0cfc..5101e7d 100644 --- serde/src/java/org/apache/hadoop/hive/serde2/fast/DeserializeRead.java +++ serde/src/java/org/apache/hadoop/hive/serde2/fast/DeserializeRead.java @@ -65,6 +65,11 @@ void set(byte[] bytes, int offset, int length); /* + * Get context for current row being read for error reporting. + */ + String getCurrentContext(); + + /* * Reads the NULL information for a field. * * @return Return true when the field is NULL; reading is positioned to the next field. diff --git serde/src/java/org/apache/hadoop/hive/serde2/lazy/fast/LazySimpleDeserializeRead.java serde/src/java/org/apache/hadoop/hive/serde2/lazy/fast/LazySimpleDeserializeRead.java index f44a84b..ae380d0 100644 --- serde/src/java/org/apache/hadoop/hive/serde2/lazy/fast/LazySimpleDeserializeRead.java +++ serde/src/java/org/apache/hadoop/hive/serde2/lazy/fast/LazySimpleDeserializeRead.java @@ -155,6 +155,14 @@ public void set(byte[] bytes, int offset, int length) { } /* + * Get context for current row being read for error reporting. + */ + @Override + public String getCurrentContext() { + throw new RuntimeException("Not implemented yet"); + } + + /* * Reads the NULL information for a field. * * @return Returns true when the field is NULL; reading is positioned to the next field. 
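A brief aside on the DeserializeRead.getCurrentContext() contract introduced above: the sketch below is not part of this patch (the class name, helper name, and loop shape are assumptions) and only shows how a caller might attach the context string to deserialization errors, using nothing beyond the methods visible in this diff.

import java.io.IOException;
import org.apache.hadoop.hive.serde2.fast.DeserializeRead;

public class DeserializeReadContextSketch {
  // Hypothetical helper: reads every field of one serialized row and, on failure,
  // reports where the reader stopped via the new getCurrentContext() method.
  static void readRow(DeserializeRead deserializeRead, byte[] rowBytes, int fieldCount)
      throws IOException {
    deserializeRead.set(rowBytes, 0, rowBytes.length);
    for (int i = 0; i < fieldCount; i++) {
      try {
        if (!deserializeRead.readCheckNull()) {
          // the type-specific read call for field i would go here
        }
      } catch (IOException e) {
        // e.g. "field 3 of 10 fields, row start 0, row end 57, field start 41, field length 16"
        throw new IOException("Deserialize error (" + deserializeRead.getCurrentContext() + ")", e);
      }
    }
  }
}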
diff --git serde/src/java/org/apache/hadoop/hive/serde2/lazybinary/fast/LazyBinaryDeserializeRead.java serde/src/java/org/apache/hadoop/hive/serde2/lazybinary/fast/LazyBinaryDeserializeRead.java index c5f0730..87eee0c 100644 --- serde/src/java/org/apache/hadoop/hive/serde2/lazybinary/fast/LazyBinaryDeserializeRead.java +++ serde/src/java/org/apache/hadoop/hive/serde2/lazybinary/fast/LazyBinaryDeserializeRead.java @@ -67,6 +67,7 @@ private int offset; private int end; private int fieldCount; + private int saveFieldStart; private int fieldIndex; private byte nullByte; @@ -110,12 +111,38 @@ private LazyBinaryDeserializeRead() { public void set(byte[] bytes, int offset, int length) { this.bytes = bytes; this.offset = offset; + saveFieldStart = -1; start = offset; end = offset + length; fieldIndex = 0; } /* + * Get context for current row being read for error reporting. + */ + @Override + public String getCurrentContext() { + StringBuilder sb = new StringBuilder(); + sb.append("field "); + sb.append(fieldIndex - 1); + sb.append(" of "); + sb.append(fieldCount); + sb.append(" fields"); + + sb.append(", row start "); + sb.append(start); + sb.append(", row end "); + sb.append(end); + + sb.append(", field start "); + sb.append(saveFieldStart); + sb.append(", field length "); + sb.append(offset - saveFieldStart); + + return sb.toString(); + } + + /* * Reads the NULL information for a field. * * @return Returns true when the field is NULL; reading is positioned to the next field. @@ -123,6 +150,7 @@ public void set(byte[] bytes, int offset, int length) { */ @Override public boolean readCheckNull() throws IOException { + saveFieldStart = offset; if (fieldIndex >= fieldCount) { // Reading beyond the specified field count produces NULL. if (!readBeyondConfiguredFieldsWarned) { @@ -453,6 +481,9 @@ public void readString(ReadStringResults readStringResults) throws IOException { if (offset > end) { warnBeyondEof(); } + if (tempVInt.value < 0) { + throw new IOException("Bad string length " + tempVInt.value); + } int saveStart = offset; int length = tempVInt.value; offset += length; diff --git serde/src/java/org/apache/hadoop/hive/serde2/lazybinary/fast/LazyBinarySerializeWrite.java serde/src/java/org/apache/hadoop/hive/serde2/lazybinary/fast/LazyBinarySerializeWrite.java index 91ef12d..98d4d0b 100644 --- serde/src/java/org/apache/hadoop/hive/serde2/lazybinary/fast/LazyBinarySerializeWrite.java +++ serde/src/java/org/apache/hadoop/hive/serde2/lazybinary/fast/LazyBinarySerializeWrite.java @@ -45,7 +45,7 @@ * * This is an alternative way to serialize than what is provided by LazyBinarySerDe. 
*/ -public class LazyBinarySerializeWrite implements SerializeWrite { +public final class LazyBinarySerializeWrite implements SerializeWrite { public static final Logger LOG = LoggerFactory.getLogger(LazyBinarySerializeWrite.class.getName()); private Output output; diff --git storage-api/src/java/org/apache/hadoop/hive/common/type/RandomTypeUtil.java storage-api/src/java/org/apache/hadoop/hive/common/type/RandomTypeUtil.java index 3fb0cfd..4e51428 100644 --- storage-api/src/java/org/apache/hadoop/hive/common/type/RandomTypeUtil.java +++ storage-api/src/java/org/apache/hadoop/hive/common/type/RandomTypeUtil.java @@ -112,4 +112,15 @@ public static int randomNanos(Random rand, int decimalDigits) { public static int randomNanos(Random rand) { return randomNanos(rand, 9); } + + public static String getRandString(Random r, String characters, int length) { + if (characters == null) { + characters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"; + } + StringBuilder sb = new StringBuilder(); + for (int i = 0; i < length; i++) { + sb.append(characters.charAt(r.nextInt(characters.length()))); + } + return sb.toString(); + } } \ No newline at end of file diff --git storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/BytesColumnVector.java storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/BytesColumnVector.java index 99744cd..fc4ec33 100644 --- storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/BytesColumnVector.java +++ storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/BytesColumnVector.java @@ -170,6 +170,23 @@ public void setVal(int elementNum, byte[] sourceBuf) { } /** + * A variation on setVal that allocates room for the value but lets the caller do the copy. + * Afterward, the caller must immediately set the bytes referenced by vector[elementNum] for + * the correct length at the start index. + * @param elementNum index within column vector to set + * @param length length of source byte sequence + */ + public void allocateVal(int elementNum, int length) { + if ((nextFree + length) > buffer.length) { + increaseBufferSpace(length); + } + vector[elementNum] = buffer; + this.start[elementNum] = nextFree; + this.length[elementNum] = length; + nextFree += length; + } + + /** * Set a field to the concatenation of two string values. Result data is copied * into the internal buffer. *
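As a closing note on the BytesColumnVector.allocateVal() method added above, whose javadoc requires the caller to fill the reserved bytes itself: the sketch below is illustrative only (the wrapper class, helper name, and source-array arguments are assumptions) and simply shows the reserve-then-copy pattern that javadoc describes, using the class's existing public vector/start/length fields.

import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;

public class AllocateValSketch {
  // Hypothetical caller: reserve room in the column vector's shared buffer, then copy in place.
  static void copyInto(BytesColumnVector col, int elementNum, byte[] src, int srcOffset, int length) {
    col.allocateVal(elementNum, length);
    // allocateVal pointed vector[elementNum] at the shared buffer and set start/length;
    // the caller must fill exactly those bytes immediately, before any further writes.
    System.arraycopy(src, srcOffset, col.vector[elementNum], col.start[elementNum], length);
  }
}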