diff --git common/src/java/org/apache/hadoop/hive/conf/HiveConf.java common/src/java/org/apache/hadoop/hive/conf/HiveConf.java index 13cfdf1..fabfcdd 100644 --- common/src/java/org/apache/hadoop/hive/conf/HiveConf.java +++ common/src/java/org/apache/hadoop/hive/conf/HiveConf.java @@ -529,6 +529,8 @@ private static void populateLlapDaemonVarsSet(Set llapDaemonVarsSetLocal HIVE_IN_TEZ_TEST("hive.in.tez.test", false, "internal use only, true when in testing tez", true), + HIVE_MAPJOIN_TESTING_NO_HASH_TABLE_LOAD("hive.mapjoin.testing.no.hash.table.load", false, "internal use only, true when in testing map join", + true), LOCALMODEAUTO("hive.exec.mode.local.auto", false, "Let Hive determine whether to run in local mode automatically"), diff --git common/src/java/org/apache/hive/common/util/HashCodeUtil.java common/src/java/org/apache/hive/common/util/HashCodeUtil.java index fa30273..90136b8 100644 --- common/src/java/org/apache/hive/common/util/HashCodeUtil.java +++ common/src/java/org/apache/hive/common/util/HashCodeUtil.java @@ -69,6 +69,14 @@ public static int calculateBytesHashCode(byte[] keyBytes, int keyStart, int keyL } public static void calculateBytesArrayHashCodes(byte[][] bytesArrays, + int[] starts, int[] lengths, int[] hashCodes, final int count) { + + for (int i = 0; i < count; i++) { + hashCodes[i] = murmurHash(bytesArrays[i], starts[i], lengths[i]); + } + } + + public static void calculateBytesArrayHashCodes(byte[][] bytesArrays, int[] starts, int[] lengths, int[] valueSelected, int[] hashCodes, final int count) { for (int i = 0; i < count; i++) { diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/MapJoinOperator.java ql/src/java/org/apache/hadoop/hive/ql/exec/MapJoinOperator.java index 416606e..ced16c0 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/MapJoinOperator.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/MapJoinOperator.java @@ -34,12 +34,11 @@ import org.apache.hadoop.hive.ql.CompilationOpContext; import org.apache.hadoop.hive.ql.HashTableLoaderFactory; import org.apache.hadoop.hive.ql.exec.mr.ExecMapperContext; -import org.apache.hadoop.hive.ql.exec.persistence.BytesBytesMultiHashMap; +import org.apache.hadoop.hive.ql.exec.persistence.BytesBytesMultiHashMapFactory; import org.apache.hadoop.hive.ql.exec.persistence.HybridHashTableContainer; import org.apache.hadoop.hive.ql.exec.persistence.HybridHashTableContainer.HashPartition; import org.apache.hadoop.hive.ql.exec.persistence.KeyValueContainer; import org.apache.hadoop.hive.ql.exec.persistence.MapJoinBytesTableContainer; -import org.apache.hadoop.hive.ql.exec.persistence.MapJoinBytesTableContainer.KeyValueHelper; import org.apache.hadoop.hive.ql.exec.persistence.MapJoinKey; import org.apache.hadoop.hive.ql.exec.persistence.MapJoinObjectSerDeContext; import org.apache.hadoop.hive.ql.exec.persistence.MapJoinRowContainer; @@ -48,6 +47,9 @@ import org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainerSerDe; import org.apache.hadoop.hive.ql.exec.persistence.ObjectContainer; import org.apache.hadoop.hive.ql.exec.persistence.UnwrapRowContainer; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTable; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableManage.KeyValuePut; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableResult.MapJoinResult; import org.apache.hadoop.hive.ql.exec.spark.SparkUtilities; import org.apache.hadoop.hive.ql.io.HiveKey; import org.apache.hadoop.hive.ql.log.PerfLogger; @@ -71,6 +73,7 
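Illustrative note (not part of the patch): the HashCodeUtil hunk above adds a batch overload that murmur-hashes every key in one pass, without the valueSelected indirection of the pre-existing overload. A minimal usage sketch with hypothetical key data:

import java.nio.charset.StandardCharsets;
import org.apache.hive.common.util.HashCodeUtil;

public class BatchKeyHashSketch {
  public static void main(String[] args) {
    // Hypothetical serialized keys; in Hive these would come from a batch of join keys.
    byte[][] keys = {
        "key-1".getBytes(StandardCharsets.UTF_8),
        "key-22".getBytes(StandardCharsets.UTF_8)
    };
    int[] starts = { 0, 0 };
    int[] lengths = { keys[0].length, keys[1].length };
    int[] hashCodes = new int[keys.length];

    // New overload: fills hashCodes[i] = murmurHash(keys[i], starts[i], lengths[i]).
    HashCodeUtil.calculateBytesArrayHashCodes(keys, starts, lengths, hashCodes, keys.length);

    for (int h : hashCodes) {
      System.out.println(h);
    }
  }
}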
@@ import org.slf4j.LoggerFactory; import com.esotericsoftware.kryo.KryoException; +import com.google.common.annotations.VisibleForTesting; /** * Map side Join operator implementation. @@ -93,13 +96,15 @@ protected transient ReusableGetAdaptor[] hashMapRowGetters; private UnwrapRowContainer[] unwrapContainer; - private transient Configuration hconf; + protected transient Configuration hconf; private transient boolean hybridMapJoinLeftover; // whether there's spilled data to be processed protected transient MapJoinBytesTableContainer[] spilledMapJoinTables; // used to hold restored // spilled small tables protected HybridHashTableContainer firstSmallTable; // The first small table; // Only this table has spilled big table rows + protected transient boolean isTestingNoHashTableLoad; + /** Kryo ctor. */ protected MapJoinOperator() { super(); @@ -161,6 +166,12 @@ protected void initializeOp(Configuration hconf) throws HiveException { generateMapMetaData(); + isTestingNoHashTableLoad = HiveConf.getBoolVar(hconf, + HiveConf.ConfVars.HIVE_MAPJOIN_TESTING_NO_HASH_TABLE_LOAD); + if (isTestingNoHashTableLoad) { + return; + } + final ExecMapperContext mapContext = getExecContext(); final MapredContext mrContext = MapredContext.get(); @@ -197,6 +208,9 @@ protected void initializeOp(Configuration hconf) throws HiveException { @SuppressWarnings("unchecked") @Override protected void completeInitializationOp(Object[] os) throws HiveException { + if (isTestingNoHashTableLoad) { + return; + } if (os.length != 0) { Pair pair = (Pair) os[0]; @@ -235,6 +249,14 @@ protected void completeInitializationOp(Object[] os) throws HiveException { } } + @VisibleForTesting + public void setTestMapJoinTableContainer(int posSmallTable, + MapJoinTableContainer testMapJoinTableContainer, + MapJoinTableContainerSerDe mapJoinTableContainerSerDe) { + mapJoinTables[posSmallTable] = testMapJoinTableContainer; + mapJoinTableSerdes[posSmallTable] = mapJoinTableContainerSerDe; + } + @Override protected List getValueObjectInspectors( byte alias, List[] aliasToObjectInspectors) { @@ -345,8 +367,8 @@ public void cleanUpInputFileChangedOp() throws HiveException { loadHashTable(getExecContext(), MapredContext.get()); } - protected JoinUtil.JoinResult setMapJoinKey( - ReusableGetAdaptor dest, Object row, byte alias) throws HiveException { + protected MapJoinResult setMapJoinKey( + ReusableGetAdaptor dest, Object row, byte alias) throws HiveException, IOException { return dest.setFromRow(row, joinKeys[alias], joinKeysObjectInspectors[alias]); } @@ -394,7 +416,7 @@ public void process(Object row, int tag) throws HiveException { boolean bigTableRowSpilled = false; for (byte pos = 0; pos < order.length; pos++) { if (pos != alias) { - JoinUtil.JoinResult joinResult; + MapJoinResult joinResult; ReusableGetAdaptor adaptor; if (firstSetKey == null) { adaptor = firstSetKey = hashMapRowGetters[pos]; @@ -405,7 +427,7 @@ public void process(Object row, int tag) throws HiveException { joinResult = adaptor.setFromOther(firstSetKey); } MapJoinRowContainer rowContainer = adaptor.getCurrentRows(); - if (joinResult != JoinUtil.JoinResult.MATCH) { + if (joinResult != MapJoinResult.MATCH) { assert (rowContainer == null || !rowContainer.hasRows()) : "Expecting an empty result set for no match"; } @@ -419,7 +441,7 @@ public void process(Object row, int tag) throws HiveException { // For Hybrid Grace Hash Join, during the 1st round processing, // we only keep the LEFT side if the row is not spilled if (!conf.isHybridHashJoin() || hybridMapJoinLeftover || - 
(joinResult != JoinUtil.JoinResult.SPILL && !bigTableRowSpilled)) { + (joinResult != MapJoinResult.SPILL && !bigTableRowSpilled)) { joinNeeded = true; storage[pos] = dummyObjVectors[pos]; } else { @@ -437,7 +459,7 @@ public void process(Object row, int tag) throws HiveException { // When the JoinResult is SPILL, it means the corresponding small table row may have been // spilled to disk (at least the partition that holds this row is on disk). So we need to // postpone the join processing for this pair by also spilling this big table row. - if (joinResult == JoinUtil.JoinResult.SPILL && + if (joinResult == MapJoinResult.SPILL && !bigTableRowSpilled) { // For n-way join, only spill big table rows once spillBigTableRow(mapJoinTables[pos], row); bigTableRowSpilled = true; @@ -523,8 +545,8 @@ public void closeOp(boolean abort) throws HiveException { if (!hashPartitions[i].isHashMapOnDisk()) { hybridHtContainer.setTotalInMemRowCount( hybridHtContainer.getTotalInMemRowCount() - - hashPartitions[i].getHashMapFromMemory().getNumValues()); - hashPartitions[i].getHashMapFromMemory().clear(); + hashPartitions[i].getHashTableFromMemory().getNumValues()); + hashPartitions[i].getHashTableFromMemory().clear(); } } assert hybridHtContainer.getTotalInMemRowCount() == 0; @@ -638,10 +660,11 @@ protected void reloadHashTable(byte pos, int partitionId) // positive number here } LOG.info("Going to restore hashmap..."); - BytesBytesMultiHashMap restoredHashMap = partition.getHashMapFromDisk(rowCount); + MapJoinHashTable restoredHashMap = partition.getHashMapFromDisk(rowCount); rowCount += restoredHashMap.getNumValues(); LOG.info("Hybrid Grace Hash Join: Deserializing spilled hash partition..."); LOG.info("Hybrid Grace Hash Join: Number of rows in hashmap: " + rowCount); + restoredHashMap.debugDumpTable(); // If based on the new key count, keyCount is smaller than a threshold, // then just load the entire restored hashmap into memory. @@ -651,20 +674,22 @@ protected void reloadHashTable(byte pos, int partitionId) " will be greater than memory limit. Recursive spilling is currently not supported"); } - KeyValueHelper writeHelper = container.getWriteHelper(); + KeyValuePut writeHelper = container.getKeyValuePutHelper(); while (kvContainer.hasNext()) { ObjectPair pair = kvContainer.next(); Writable key = pair.getFirst(); Writable val = pair.getSecond(); writeHelper.setKeyValue(key, val); - restoredHashMap.put(writeHelper, -1); + restoredHashMap.put(writeHelper); } container.setTotalInMemRowCount(container.getTotalInMemRowCount() + restoredHashMap.getNumValues()); kvContainer.clear(); - spilledMapJoinTables[pos] = new MapJoinBytesTableContainer(restoredHashMap); + // Use the BytesBytesMultiHashMap hash table. 
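Illustrative note (not part of the patch): the MapJoinOperator changes above add a HIVE_MAPJOIN_TESTING_NO_HASH_TABLE_LOAD flag that makes initializeOp() return before any hash table loading, plus a @VisibleForTesting setter for injecting a prebuilt small table. A hedged sketch of how a test might wire the two together; the operator, container, and serde instances are assumed to be built elsewhere by the test:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.ql.exec.MapJoinOperator;
import org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainer;
import org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainerSerDe;

public class MapJoinTestWiringSketch {
  // Hypothetical helper: skip the normal hash table load and hand the operator a test table.
  static void injectSmallTable(Configuration conf, MapJoinOperator op, int posSmallTable,
      MapJoinTableContainer container, MapJoinTableContainerSerDe serde) {
    // With this flag set, initializeOp()/completeInitializationOp() return early.
    HiveConf.setBoolVar(conf, HiveConf.ConfVars.HIVE_MAPJOIN_TESTING_NO_HASH_TABLE_LOAD, true);
    // Install the test-built container for the given small-table position.
    op.setTestMapJoinTableContainer(posSmallTable, container, serde);
  }
}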
+ spilledMapJoinTables[pos] = new MapJoinBytesTableContainer( + new BytesBytesMultiHashMapFactory(), restoredHashMap); spilledMapJoinTables[pos].setInternalValueOi(container.getInternalValueOi()); spilledMapJoinTables[pos].setSortableSortOrders(container.getSortableSortOrders()); spilledMapJoinTables[pos].setNullMarkers(container.getNullMarkers()); diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/Operator.java ql/src/java/org/apache/hadoop/hive/ql/exec/Operator.java index 20f9d64..98732c5 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/Operator.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/Operator.java @@ -81,7 +81,7 @@ protected String operatorId; protected final AtomicBoolean abortOp; private transient ExecMapperContext execContext; - private transient boolean rootInitializeCalled = false; + protected transient boolean rootInitializeCalled = false; protected transient long runTimeNumRows; protected int indexForTezUnion = -1; private transient Configuration hconf; diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/BytesBytesMultiHashMap.java ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/BytesBytesMultiHashMap.java index 6b89e98..648e7de 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/BytesBytesMultiHashMap.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/BytesBytesMultiHashMap.java @@ -18,6 +18,7 @@ package org.apache.hadoop.hive.ql.exec.persistence; +import java.io.IOException; import java.util.ArrayList; import java.util.List; import java.util.Map; @@ -27,9 +28,15 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.hive.ql.debug.Utils; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashMapResult; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashMultiSetResult; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashSetResult; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTable; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableResultImpl; import org.apache.hadoop.hive.serde2.ByteStream.RandomAccessOutput; import org.apache.hadoop.hive.serde2.SerDeException; import org.apache.hadoop.hive.serde2.WriteBuffers; +import org.apache.hive.common.util.HashCodeUtil; import com.google.common.annotations.VisibleForTesting; @@ -45,7 +52,7 @@ * Initially inspired by HPPC LongLongOpenHashMap; however, the code is almost completely reworked * and there's very little in common left save for quadratic probing (and that with some changes). */ -public final class BytesBytesMultiHashMap { +public final class BytesBytesMultiHashMap implements MapJoinHashTable { public static final Logger LOG = LoggerFactory.getLogger(BytesBytesMultiHashMap.class); /* @@ -193,8 +200,11 @@ public BytesBytesMultiHashMap(int initialCapacity, * The result of looking up a key in the multi-hash map. * * This object can read through the 0, 1, or more values found for the key. + * + * It implements the standard map join hash map result interface. + * */ - public static class Result { + public static class Result extends MapJoinHashTableResultImpl implements MapJoinHashMapResult { // Whether there are more than 0 rows. private boolean hasRows; @@ -221,6 +231,9 @@ public BytesBytesMultiHashMap(int initialCapacity, // A reference to the current row. private WriteBuffers.ByteSegmentRef byteSegmentRef; + // The associated alias filter value. 
+ private byte aliasFilter; + public Result() { hasRows = false; byteSegmentRef = new WriteBuffers.ByteSegmentRef(); @@ -262,13 +275,16 @@ public boolean isSingleRow() { * The offset of just after the key length in the list record. Or, 0 when single row. */ public void set(BytesBytesMultiHashMap hashMap, long firstOffset, boolean hasList, - long offsetAfterListRecordKeyLen) { + long offsetAfterListRecordKeyLen, byte aliasFilter) { + + this.mapJoinResult = MapJoinResult.MATCH; this.hashMap = hashMap; this.firstOffset = firstOffset; this.hasList = hasList; this.offsetAfterListRecordKeyLen = offsetAfterListRecordKeyLen; + this.aliasFilter = aliasFilter; // Position at first row. readIndex = 0; @@ -389,22 +405,31 @@ public void forget() { readIndex = 0; nextTailOffset = -1; } - } - /** The source of keys and values to put into hashtable; avoids byte copying. */ - public static interface KvSource { - /** Write key into output. */ - public void writeKey(RandomAccessOutput dest) throws SerDeException; + @Override + public int cappedCount() { + return 0; + } + + @Override + public boolean isCappedCountAvailable() { + return false; + } + + @Override + public boolean isAliasFilterAvailable() { + return true; + } - /** Write value into output. */ - public void writeValue(RandomAccessOutput dest) throws SerDeException; + @Override + public byte aliasFilter() { + return aliasFilter; + } - /** - * Provide updated value for state byte for a key. - * @param previousValue Previous value; null if this is the first call per key. - * @return The updated value. - */ - public byte updateStateByte(Byte previousValue); + @Override + public String getDetailedHashMapResultPositionString() { + return "none"; + } } /** @@ -412,11 +437,15 @@ public void forget() { * @param kv Keyvalue writer. Each method will be called at most once. */ private static final byte[] FOUR_ZEROES = new byte[] { 0, 0, 0, 0 }; - public void put(KvSource kv, int keyHashCode) throws SerDeException { + + @Override + public void put(KeyValuePut keyValuePut) throws SerDeException { if (resizeThreshold <= keysAssigned) { expandAndRehash(); } + KeyValuePutWriter keyValuePutWriter = (KeyValuePutWriter) keyValuePut; + // Reserve 4 bytes for the hash (don't just reserve, there may be junk there) writeBuffers.write(FOUR_ZEROES); @@ -424,9 +453,11 @@ public void put(KvSource kv, int keyHashCode) throws SerDeException { // become part of the record; otherwise, we will just write over it later. long keyOffset = writeBuffers.getWritePoint(); - kv.writeKey(writeBuffers); + keyValuePutWriter.writeKey(writeBuffers); int keyLength = (int)(writeBuffers.getWritePoint() - keyOffset); - int hashCode = (keyHashCode == -1) ? writeBuffers.unsafeHashCode(keyOffset, keyLength) : keyHashCode; + int hashCode = (keyValuePut.hasHashCode()) ? + keyValuePut.getKeyHashCode() : + writeBuffers.unsafeHashCode(keyOffset, keyLength); int slot = findKeySlotToWrite(keyOffset, keyLength, hashCode); // LOG.info("Write hash code is " + Integer.toBinaryString(hashCode) + " - " + slot); @@ -434,18 +465,18 @@ public void put(KvSource kv, int keyHashCode) throws SerDeException { long ref = refs[slot]; if (ref == 0) { // This is a new key, keep writing the first record. 
- long tailOffset = writeFirstValueRecord(kv, keyOffset, keyLength, hashCode); - byte stateByte = kv.updateStateByte(null); + long tailOffset = writeFirstValueRecord(keyValuePutWriter, keyOffset, keyLength, hashCode); + byte stateByte = keyValuePutWriter.updateStateByte(null); refs[slot] = Ref.makeFirstRef(tailOffset, stateByte, hashCode, startingHashBitCount); ++keysAssigned; } else { // This is not a new key; we'll overwrite the key and hash bytes - not needed anymore. writeBuffers.setWritePoint(keyOffset - 4); long lrPtrOffset = createOrGetListRecord(ref); - long tailOffset = writeValueAndLength(kv); + long tailOffset = writeValueAndLength(keyValuePutWriter); addRecordToList(lrPtrOffset, tailOffset); byte oldStateByte = Ref.getStateByte(ref); - byte stateByte = kv.updateStateByte(oldStateByte); + byte stateByte = keyValuePutWriter.updateStateByte(oldStateByte); if (oldStateByte != stateByte) { ref = Ref.setStateByte(ref, stateByte); } @@ -463,18 +494,22 @@ public void put(KvSource kv, int keyHashCode) throws SerDeException { * @param key Key buffer. * @param offset the offset to the key in the buffer * @param hashMapResult The object to fill in that can read the values. - * @return The state byte. */ - public byte getValueResult(byte[] key, int offset, int length, Result hashMapResult) { + @Override + public void hashMapLookup(byte[] keyBytes, int keyStart, int keyLength, + int hashCode, MapJoinHashMapResult hashMapResult) { hashMapResult.forget(); - WriteBuffers.Position readPos = hashMapResult.getReadPos(); + Result internalHashMapResult = (Result) hashMapResult; + + WriteBuffers.Position readPos = internalHashMapResult.getReadPos(); // First, find first record for the key. - long ref = findKeyRefToRead(key, offset, length, readPos); + long ref = findKeyRefToRead(keyBytes, keyStart, keyLength, readPos); if (ref == 0) { - return 0; + hashMapResult.setNoMatch(); + return; } boolean hasList = Ref.hasList(ref); @@ -482,9 +517,8 @@ public byte getValueResult(byte[] key, int offset, int length, Result hashMapRes // This relies on findKeyRefToRead doing key equality check and leaving read ptr where needed. long offsetAfterListRecordKeyLen = hasList ? writeBuffers.getReadPoint(readPos) : 0; - hashMapResult.set(this, Ref.getOffset(ref), hasList, offsetAfterListRecordKeyLen); - - return Ref.getStateByte(ref); + internalHashMapResult.set(this, Ref.getOffset(ref), hasList, offsetAfterListRecordKeyLen, + Ref.getStateByte(ref)); } /** @@ -500,6 +534,7 @@ public void populateValue(WriteBuffers.ByteSegmentRef valueRef) { * Number of keys in the hashmap * @return number of keys */ + @Override public int size() { return keysAssigned; } @@ -509,6 +544,7 @@ public int size() { * This is equal to or bigger than number of keys, since some values may share the same key * @return number of values */ + @Override public int getNumValues() { return numValues; } @@ -519,6 +555,7 @@ public int getNumValues() { * Others include instance fields: 100 * @return number of bytes */ + @Override public long memorySize() { return writeBuffers.size() + refs.length * 8 + 100; } @@ -535,6 +572,7 @@ public void clear() { this.numValues = 0; } + @Override public void expandAndRehashToTarget(int estimateNewRowCount) { int oldRefsCount = refs.length; int newRefsCount = oldRefsCount + estimateNewRowCount; @@ -811,9 +849,9 @@ private void addRecordToList(long lrPtrOffset, long tailOffset) { * @return The offset of the new record. 
*/ private long writeFirstValueRecord( - KvSource kv, long keyOffset, int keyLength, int hashCode) throws SerDeException { + KeyValuePutWriter keyValuePutWriter, long keyOffset, int keyLength, int hashCode) throws SerDeException { long valueOffset = writeBuffers.getWritePoint(); - kv.writeValue(writeBuffers); + keyValuePutWriter.writeValue(writeBuffers); long tailOffset = writeBuffers.getWritePoint(); int valueLength = (int)(tailOffset - valueOffset); // LOG.info("Writing value at " + valueOffset + " length " + valueLength); @@ -840,9 +878,9 @@ private long writeFirstValueRecord( * @param kv Key-value writer. * @return The offset of the new record. */ - private long writeValueAndLength(KvSource kv) throws SerDeException { + private long writeValueAndLength(KeyValuePutWriter keyValuePutWriter) throws SerDeException { long valueOffset = writeBuffers.getWritePoint(); - kv.writeValue(writeBuffers); + keyValuePutWriter.writeValue(writeBuffers); long tailOffset = writeBuffers.getWritePoint(); writeBuffers.writeVLong(tailOffset - valueOffset); // LOG.info("Writing value at " + valueOffset + " length " + (tailOffset - valueOffset)); @@ -850,6 +888,7 @@ private long writeValueAndLength(KvSource kv) throws SerDeException { } /** Writes the debug dump of the table into logs. Not thread-safe. */ + @Override public void debugDumpTable() { StringBuilder dump = new StringBuilder(keysAssigned + " keys\n"); TreeMap byteIntervals = new TreeMap(); @@ -878,7 +917,8 @@ public void debugDumpTable() { dump.append(Utils.toStringBinary(key, 0, key.length)).append(" ref [").append(dumpRef(ref)) .append("]: "); Result hashMapResult = new Result(); - getValueResult(key, 0, key.length, hashMapResult); + int hashCode = HashCodeUtil.calculateBytesHashCode(key, 0, key.length); + hashMapLookup(key, 0, key.length, hashCode, hashMapResult); List results = new ArrayList(); WriteBuffers.ByteSegmentRef byteSegmentRef = hashMapResult.first(); while (byteSegmentRef != null) { @@ -975,6 +1015,7 @@ private static String dumpRef(long ref) { + " h=" + Long.toBinaryString(Ref.getHashBits(ref)); } + @Override public void debugDumpMetrics() { LOG.info("Map metrics: keys allocated " + this.refs.length +", keys assigned " + keysAssigned + ", write conflict " + metricPutConflict + ", write max dist " + largestNumberOfSteps @@ -1001,4 +1042,51 @@ private void debugDumpKeyProbe(long keyOffset, int keyLength, int hashCode, int } LOG.info(sb.toString()); } + + // New methods for Native Vector Map Join not implemented here. 
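Illustrative note (not part of the patch): with BytesBytesMultiHashMap now implementing MapJoinHashTable, a probe goes through hashMapLookup(...) with an explicit hash code instead of getValueResult(...), mirroring the pattern debugDumpTable uses above. A minimal lookup sketch against an already-populated map (population requires a KeyValuePut writer and is omitted):

import org.apache.hadoop.hive.ql.exec.persistence.BytesBytesMultiHashMap;
import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableResult.MapJoinResult;
import org.apache.hadoop.hive.serde2.WriteBuffers;
import org.apache.hive.common.util.HashCodeUtil;

public class HashMapLookupSketch {
  // Counts the value records stored under serializedKey; returns 0 on NO_MATCH.
  static int countValues(BytesBytesMultiHashMap map, byte[] serializedKey) {
    BytesBytesMultiHashMap.Result result = new BytesBytesMultiHashMap.Result();
    int hashCode = HashCodeUtil.calculateBytesHashCode(serializedKey, 0, serializedKey.length);
    map.hashMapLookup(serializedKey, 0, serializedKey.length, hashCode, result);
    if (result.getMapJoinResult() != MapJoinResult.MATCH) {
      return 0;
    }
    int count = 0;
    // Walk the one-or-more value records found for the key.
    WriteBuffers.ByteSegmentRef ref = result.first();
    while (ref != null) {
      count++;
      ref = result.next();
    }
    return count;
  }
}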
+ @Override + public boolean useMinMax() { + return false; + } + + @Override + public long min() { + throw new RuntimeException("Not supported"); + } + + @Override + public long max() { + throw new RuntimeException("Not supported"); + } + + @Override + public void hashMapLookup(long key, int hashCode, + MapJoinHashMapResult hashMapResult) throws IOException { + throw new RuntimeException("Not supported"); + } + + @Override + public void hashMultiSetContains(byte[] keyBytes, int keyStart, + int keyLength, int hashCode, MapJoinHashMultiSetResult hashMultiSetResult) + throws IOException { + throw new RuntimeException("Not supported"); + } + + @Override + public void hashMultiSetContains(long key, int hashCode, + MapJoinHashMultiSetResult hashMultiSetResult) throws IOException { + throw new RuntimeException("Not supported"); + } + + @Override + public void hashSetContains(byte[] keyBytes, int keyStart, int keyLength, + int hashCode, MapJoinHashSetResult hashSetResult) throws IOException { + throw new RuntimeException("Not supported"); + } + + @Override + public void hashSetContains(long key, int hashCode, + MapJoinHashSetResult hashSetResult) throws IOException { + throw new RuntimeException("Not supported"); + } } diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/BytesBytesMultiHashMapFactory.java ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/BytesBytesMultiHashMapFactory.java new file mode 100644 index 0000000..1cc1c97 --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/BytesBytesMultiHashMapFactory.java @@ -0,0 +1,94 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.persistence; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashMapResult; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashMultiSetResult; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashSetResult; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTable; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableFactory; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableManage.KeyValuePut; + +/* + * Factory for creating BytesBytesMultiHashMap. 
+ */ +public class BytesBytesMultiHashMapFactory implements MapJoinHashTableFactory { + + public static final Log LOG = LogFactory.getLog(BytesBytesMultiHashMapFactory.class); + + @Override + public MapJoinHashTable createHashTable(int initialCapacity, float loadFactor, + int writeBuffersSize, long memUsage) { + return new BytesBytesMultiHashMap(initialCapacity, loadFactor, writeBuffersSize, memUsage); + } + + /* + * @return A new hash map result implementation specific object. + * + * The object can be used to access the values when there is a match, or + * access spill information when the partition with the key is currently spilled. + */ + @Override + public MapJoinHashMapResult createHashMapResult() { + return (MapJoinHashMapResult) new BytesBytesMultiHashMap.Result(); + } + + /* + * @return A new hash multi-set result implementation specific object. + * + * The object can be used to access the *count* of values when the key is contained in the + * multi-set, or access spill information when the partition with the key is currently spilled. + */ + @Override + public MapJoinHashMultiSetResult createHashMultiSetResult() { + throw new RuntimeException("Not supported"); + } + + /* + * @return A new hash set result implementation specific object. + * + * The object can be used to access access spill information when the partition with the key + * is currently spilled. + */ + @Override + public MapJoinHashSetResult createHashSetResult() { + throw new RuntimeException("Not supported"); + } + + @Override + public boolean keyValuePutHelperIsExternal() { + // MapJoinBytesTableContainer will implement a KeyValuePutWriter for BytesBytesMultiHashMap + // to use. + return false; + } + + @Override + public KeyValuePut createKeyValuePut() { + // Not supplied when keyValuePutHelperIsExternal is false. + return null; + } + + @Override + public boolean useMinMax() { + // Min/Max not supported for BytesBytesMultiHashMap. 
+ return false; + } +} diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/HashMapWrapper.java ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/HashMapWrapper.java index a3bccc6..67a9f0e 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/HashMapWrapper.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/HashMapWrapper.java @@ -32,6 +32,9 @@ import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.ql.exec.ExprNodeEvaluator; import org.apache.hadoop.hive.ql.exec.JoinUtil; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableFactory; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableFind; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableResult.MapJoinResult; import org.apache.hadoop.hive.ql.exec.vector.VectorHashKeyWrapper; import org.apache.hadoop.hive.ql.exec.vector.VectorHashKeyWrapperBatch; import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpressionWriter; @@ -155,7 +158,7 @@ public GetAdaptor(MapJoinKey key) { } @Override - public JoinUtil.JoinResult setFromVector(VectorHashKeyWrapper kw, + public MapJoinResult setFromVector(VectorHashKeyWrapper kw, VectorExpressionWriter[] keyOutputWriters, VectorHashKeyWrapperBatch keyWrapperBatch) throws HiveException { if (currentKey == null) { @@ -172,15 +175,15 @@ public GetAdaptor(MapJoinKey key) { isFirstKey = false; this.currentValue = mHash.get(key); if (this.currentValue == null) { - return JoinUtil.JoinResult.NOMATCH; + return MapJoinResult.NO_MATCH; } else { - return JoinUtil.JoinResult.MATCH; + return MapJoinResult.MATCH; } } @Override - public JoinUtil.JoinResult setFromRow(Object row, List fields, + public MapJoinResult setFromRow(Object row, List fields, List ois) throws HiveException { if (currentKey == null) { currentKey = new Object[fields.size()]; @@ -192,25 +195,25 @@ public GetAdaptor(MapJoinKey key) { isFirstKey = false; this.currentValue = mHash.get(key); if (this.currentValue == null) { - return JoinUtil.JoinResult.NOMATCH; + return MapJoinResult.NO_MATCH; } else { - return JoinUtil.JoinResult.MATCH; + return MapJoinResult.MATCH; } } @Override - public JoinUtil.JoinResult setFromOther(ReusableGetAdaptor other) { + public MapJoinResult setFromOther(ReusableGetAdaptor other) { assert other instanceof GetAdaptor; GetAdaptor other2 = (GetAdaptor)other; this.key = other2.key; this.isFirstKey = other2.isFirstKey; this.currentValue = mHash.get(key); if (this.currentValue == null) { - return JoinUtil.JoinResult.NOMATCH; + return MapJoinResult.NO_MATCH; } else { - return JoinUtil.JoinResult.MATCH; + return MapJoinResult.MATCH; } } @@ -250,10 +253,19 @@ public boolean hasSpill() { return false; } - @Override public void setSerde(MapJoinObjectSerDeContext keyCtx, MapJoinObjectSerDeContext valCtx) throws SerDeException { this.keyContext = keyCtx; this.valueContext = valCtx; } + + @Override + public MapJoinHashTableFind getMapJoinHashTableFind() { + throw new RuntimeException("Not supported"); + } + + @Override + public MapJoinHashTableFactory getMapJoinHashTableFactory() { + throw new RuntimeException("Not supported"); + } } diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/HybridHashTableContainer.java ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/HybridHashTableContainer.java index 573dc08..12dbb82 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/HybridHashTableContainer.java +++ 
ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/HybridHashTableContainer.java @@ -35,10 +35,15 @@ import org.apache.hadoop.hive.common.FileUtils; import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.ql.exec.ExprNodeEvaluator; -import org.apache.hadoop.hive.ql.exec.JoinUtil; -import org.apache.hadoop.hive.ql.exec.JoinUtil.JoinResult; import org.apache.hadoop.hive.ql.exec.SerializationUtilities; -import org.apache.hadoop.hive.ql.exec.persistence.MapJoinBytesTableContainer.KeyValueHelper; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashMapResult; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashMultiSetResult; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashSetResult; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTable; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableFactory; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableFind; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableManage.KeyValuePut; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableResult.MapJoinResult; import org.apache.hadoop.hive.ql.exec.vector.VectorHashKeyWrapper; import org.apache.hadoop.hive.ql.exec.vector.VectorHashKeyWrapperBatch; import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpressionWriter; @@ -74,11 +79,23 @@ * * Partitions that can fit in memory will be processed first, and then every spilled partition will * be restored and processed one by one. + * + * It implements the standard map join hash map find interface. + * */ -public class HybridHashTableContainer - implements MapJoinTableContainer, MapJoinTableContainerDirectAccess { +public final class HybridHashTableContainer + implements MapJoinTableContainer, MapJoinHashTableFind { private static final Logger LOG = LoggerFactory.getLogger(HybridHashTableContainer.class); + private final MapJoinHashTableFactory mapJoinHashTableFactory; + // Factory for creating hash tables. + + private final boolean useMinMax; + // Whether we should maintain min/max for + // small table for optimizing lookup. + private long longMin; + private long longMax; + private final HashPartition[] hashPartitions; // an array of partitions holding the triplets private int totalInMemRowCount = 0; // total number of small table rows in memory private long memoryThreshold; // the max memory limit that can be allocated @@ -98,8 +115,8 @@ private boolean[] sortableSortOrders; private byte[] nullMarkers; private byte[] notNullMarkers; - private MapJoinBytesTableContainer.KeyValueHelper writeHelper; - private final MapJoinBytesTableContainer.DirectKeyValueWriter directWriteHelper; + private KeyValuePut keyValuePutHelper; + /* * this is not a real bloom filter, but is a cheap version of the 1-memory * access bloom filters @@ -123,7 +140,10 @@ * The triplet: hashmap (either in memory or on disk), small table container, big table container */ public static class HashPartition { - BytesBytesMultiHashMap hashMap; // In memory hashMap + MapJoinHashTableFactory mapJoinHashTableFactory; + // Hash table factor to use. + MapJoinHashTable hashTable; // In memory hashTable + Class hashTableClass; // Class of hashTable so we can bring it back from disk. 
KeyValueContainer sidefileKVContainer; // Stores small table key/value pairs ObjectContainer matchfileObjContainer; // Stores big table rows VectorMapJoinRowBytesContainer matchfileRowBytesContainer; @@ -131,7 +151,7 @@ Path hashMapLocalPath; // Local file system path for spilled hashMap boolean hashMapOnDisk; // Status of hashMap. true: on disk, false: in memory boolean hashMapSpilledOnCreation; // When there's no enough memory, cannot create hashMap - int initialCapacity; // Used to create an empty BytesBytesMultiHashMap + int initialCapacity; // Used to create an empty MapJoinHashTable float loadFactor; // Same as above int wbSize; // Same as above int rowsOnDisk; // How many rows saved to the on-disk hashmap (if on disk) @@ -141,11 +161,13 @@ * In that case, we don't create the hashmap, but pretend the hashmap is directly "spilled". */ public HashPartition(int initialCapacity, float loadFactor, int wbSize, long maxProbeSize, - boolean createHashMap, String spillLocalDirs) { + boolean createHashMap, String spillLocalDirs, + MapJoinHashTableFactory mapJoinHashTableFactory) { + this.mapJoinHashTableFactory = mapJoinHashTableFactory; if (createHashMap) { // Probe space should be at least equal to the size of our designated wbSize maxProbeSize = Math.max(maxProbeSize, wbSize); - hashMap = new BytesBytesMultiHashMap(initialCapacity, loadFactor, wbSize, maxProbeSize); + hashTable = mapJoinHashTableFactory.createHashTable(initialCapacity, loadFactor, wbSize, maxProbeSize); } else { hashMapSpilledOnCreation = true; hashMapOnDisk = true; @@ -157,24 +179,24 @@ public HashPartition(int initialCapacity, float loadFactor, int wbSize, long max } /* Get the in memory hashmap */ - public BytesBytesMultiHashMap getHashMapFromMemory() { - return hashMap; + public MapJoinHashTable getHashTableFromMemory() { + return hashTable; } /* Restore the hashmap from disk by deserializing it. * Currently Kryo is used for this purpose. */ - public BytesBytesMultiHashMap getHashMapFromDisk(int rowCount) + public MapJoinHashTable getHashMapFromDisk(int rowCount) throws IOException, ClassNotFoundException { if (hashMapSpilledOnCreation) { - return new BytesBytesMultiHashMap(rowCount, loadFactor, wbSize, -1); + return mapJoinHashTableFactory.createHashTable(rowCount, loadFactor, wbSize, -1); } else { InputStream inputStream = Files.newInputStream(hashMapLocalPath); com.esotericsoftware.kryo.io.Input input = new com.esotericsoftware.kryo.io.Input(inputStream); Kryo kryo = SerializationUtilities.borrowKryo(); - BytesBytesMultiHashMap restoredHashMap = null; + MapJoinHashTable restoredHashMap = null; try { - restoredHashMap = kryo.readObject(input, BytesBytesMultiHashMap.class); + restoredHashMap = (MapJoinHashTable) kryo.readObject(input, hashTableClass); } finally { SerializationUtilities.releaseKryo(kryo); } @@ -224,9 +246,9 @@ public boolean isHashMapOnDisk() { } public void clear() { - if (hashMap != null) { - hashMap.clear(); - hashMap = null; + if (hashTable != null) { + hashTable.clear(); + hashTable = null; } if (hashMapLocalPath != null) { @@ -261,13 +283,27 @@ public int size() { return rowsOnDisk + (sidefileKVContainer != null ? 
sidefileKVContainer.size() : 0); } else { // All rows should be in the in-memory hashmap - return hashMap.size(); + return hashTable.size(); } } + + public String toString() { + StringBuilder sb = new StringBuilder(); + sb.append("hashMapOnDisk "); + sb.append(hashMapOnDisk); + sb.append(", hashMapSpilledOnCreation "); + sb.append(hashMapSpilledOnCreation); + sb.append(", wbSize "); + sb.append(wbSize); + sb.append(", rowsOnDisk "); + sb.append(rowsOnDisk); + return sb.toString(); + } } public HybridHashTableContainer(Configuration hconf, long keyCount, long memoryAvailable, - long estimatedTableSize, HybridHashTableConf nwayConf) + long estimatedTableSize, HybridHashTableConf nwayConf, + MapJoinHashTableFactory mapJoinHashTableFactory) throws SerDeException, IOException { this(HiveConf.getFloatVar(hconf, HiveConf.ConfVars.HIVEHASHTABLEKEYCOUNTADJUSTMENT), HiveConf.getIntVar(hconf, HiveConf.ConfVars.HIVEHASHTABLETHRESHOLD), @@ -279,15 +315,25 @@ public HybridHashTableContainer(Configuration hconf, long keyCount, long memoryA HiveConf.getFloatVar(hconf, HiveConf.ConfVars.HIVEMAPJOINOPTIMIZEDTABLEPROBEPERCENT), HiveConf.getBoolVar(hconf, HiveConf.ConfVars.HIVEHYBRIDGRACEHASHJOINBLOOMFILTER), estimatedTableSize, keyCount, memoryAvailable, nwayConf, - HiveUtils.getLocalDirList(hconf)); + HiveUtils.getLocalDirList(hconf), + mapJoinHashTableFactory); } private HybridHashTableContainer(float keyCountAdj, int threshold, float loadFactor, int memCheckFreq, int minWbSize, int maxWbSize, int minNumParts, float probePercent, boolean useBloomFilter, long estimatedTableSize, long keyCount, long memoryAvailable, - HybridHashTableConf nwayConf, String spillLocalDirs) + HybridHashTableConf nwayConf, String spillLocalDirs, MapJoinHashTableFactory mapJoinHashTableFactory) throws SerDeException, IOException { - directWriteHelper = new MapJoinBytesTableContainer.DirectKeyValueWriter(); + + this.mapJoinHashTableFactory = mapJoinHashTableFactory; + useMinMax = mapJoinHashTableFactory.useMinMax(); + if (useMinMax) { + // We have a single long key and the hash table supports min/max. + longMin = Long.MAX_VALUE; + longMax = Long.MIN_VALUE; + } else { + longMin = longMax = 0; + } int newKeyCount = HashMapWrapper.calculateTableSize( keyCountAdj, threshold, loadFactor, keyCount); @@ -369,23 +415,23 @@ private HybridHashTableContainer(float keyCountAdj, int threshold, float loadFac nwayConf.getLoadedContainerList().size() == 0) { // n-way join, first (biggest) small table if (i == 0) { // We unconditionally create a hashmap for the first hash partition hashPartitions[i] = new HashPartition(initialCapacity, loadFactor, writeBufferSize, - maxCapacity, true, spillLocalDirs); - LOG.info("Each new partition will require memory: " + hashPartitions[0].hashMap.memorySize()); + maxCapacity, true, spillLocalDirs, mapJoinHashTableFactory); + LOG.info("Each new partition will require memory: " + hashPartitions[0].hashTable.memorySize()); } else { // To check whether we have enough memory to allocate for another hash partition, // we need to get the size of the first hash partition to get an idea. 
hashPartitions[i] = new HashPartition(initialCapacity, loadFactor, writeBufferSize, - maxCapacity, memoryUsed + hashPartitions[0].hashMap.memorySize() < memoryThreshold, - spillLocalDirs); + maxCapacity, memoryUsed + hashPartitions[0].hashTable.memorySize() < memoryThreshold, + spillLocalDirs, mapJoinHashTableFactory); } } else { // n-way join, all later small tables // For all later small tables, follow the same pattern of the previously loaded tables. if (this.nwayConf.doSpillOnCreation(i)) { hashPartitions[i] = new HashPartition(initialCapacity, loadFactor, writeBufferSize, - maxCapacity, false, spillLocalDirs); + maxCapacity, false, spillLocalDirs, mapJoinHashTableFactory); } else { hashPartitions[i] = new HashPartition(initialCapacity, loadFactor, writeBufferSize, - maxCapacity, true, spillLocalDirs); + maxCapacity, true, spillLocalDirs, mapJoinHashTableFactory); } } @@ -398,7 +444,7 @@ private HybridHashTableContainer(float keyCountAdj, int threshold, float loadFac } LOG.info("Hash partition " + i + " is spilled on creation."); } else { - memoryUsed += hashPartitions[i].hashMap.memorySize(); + memoryUsed += hashPartitions[i].hashTable.memorySize(); LOG.info("Hash partition " + i + " is created in memory. Total memory usage so far: " + memoryUsed); } } @@ -436,8 +482,8 @@ private double calcFPP(int keyCount) { return p; } - public MapJoinBytesTableContainer.KeyValueHelper getWriteHelper() { - return writeHelper; + public KeyValuePut getKeyValuePutHelper() { + return keyValuePutHelper; } public HashPartition[] getHashPartitions() { @@ -455,8 +501,8 @@ public long getMemoryThreshold() { private long refreshMemoryUsed() { long memUsed = bloom1 != null ? bloom1.sizeInBytes() : 0; for (HashPartition hp : hashPartitions) { - if (hp.hashMap != null) { - memUsed += hp.hashMap.memorySize(); + if (hp.hashTable != null) { + memUsed += hp.hashTable.memorySize(); } else { // also include the still-in-memory sidefile, before it has been truely spilled if (hp.sidefileKVContainer != null) { @@ -492,22 +538,30 @@ public LazyBinaryStructObjectInspector getInternalValueOi() { @Override public MapJoinKey putRow(Writable currentKey, Writable currentValue) throws SerDeException, HiveException, IOException { - writeHelper.setKeyValue(currentKey, currentValue); - return internalPutRow(writeHelper, currentKey, currentValue); + keyValuePutHelper.setKeyValue(currentKey, currentValue); + return internalPutRow(keyValuePutHelper, currentKey, currentValue); } - private MapJoinKey internalPutRow(KeyValueHelper keyValueHelper, + private MapJoinKey internalPutRow(KeyValuePut keyValuePut, Writable currentKey, Writable currentValue) throws SerDeException, IOException { boolean putToSidefile = false; // by default we put row into partition in memory // Next, put row into corresponding hash partition - int keyHash = keyValueHelper.getHashFromKey(); - int partitionId = keyHash & (hashPartitions.length - 1); + keyValuePut.setKeyValue(currentKey, currentValue); + int hashCode = keyValuePut.getKeyHashCode(); + + int partitionId = hashCode & (hashPartitions.length - 1); HashPartition hashPartition = hashPartitions[partitionId]; if (bloom1 != null) { - bloom1.addLong(keyHash); + bloom1.addLong(hashCode); + } + if (useMinMax) { + // We have a single long key and the hash table supports min/max. 
+ long longValue = keyValuePut.getLongKey(); + longMin = Math.min(longValue, longMin); + longMax = Math.max(longValue, longMax); } if (isOnDisk(partitionId) || isHashMapSpilledOnCreation(partitionId)) { // destination on disk @@ -549,7 +603,7 @@ private MapJoinKey internalPutRow(KeyValueHelper keyValueHelper, KeyValueContainer kvContainer = hashPartition.getSidefileKVContainer(); kvContainer.add((HiveKey) currentKey, (BytesWritable) currentValue); } else { - hashPartition.hashMap.put(keyValueHelper, keyHash); // Pass along hashcode to avoid recalculation + hashPartition.hashTable.put(keyValuePut); totalInMemRowCount++; } @@ -568,7 +622,7 @@ public boolean isOnDisk(int partitionId) { /** * Check if the hash table of a specified partition has been "spilled" to disk when it was created. * In fact, in other words, check if a hashmap does exist or not. - * @param partitionId hashMap ID + * @param partitionId hashTable ID * @return true if it was not created at all, false if there is a hash table existing there */ public boolean isHashMapSpilledOnCreation(int partitionId) { @@ -609,7 +663,7 @@ private int biggestPartition() { if (isOnDisk(i)) { continue; } else { - size = hashPartitions[i].hashMap.getNumValues(); + size = hashPartitions[i].hashTable.getNumValues(); } if (size > maxSize) { maxSize = size; @@ -637,7 +691,7 @@ private int biggestPartition() { */ public long spillPartition(int partitionId) throws IOException { HashPartition partition = hashPartitions[partitionId]; - int inMemRowCount = partition.hashMap.getNumValues(); + int inMemRowCount = partition.hashTable.getNumValues(); if (inMemRowCount == 0) { LOG.warn("Trying to spill an empty hash partition! It may be due to " + "hive.auto.convert.join.noconditionaltask.size being set too low."); @@ -652,7 +706,8 @@ public long spillPartition(int partitionId) throws IOException { Kryo kryo = SerializationUtilities.borrowKryo(); try { LOG.info("Trying to spill hash partition " + partitionId + " ..."); - kryo.writeObject(output, partition.hashMap); // use Kryo to serialize hashmap + kryo.writeObject(output, partition.hashTable); // use Kryo to serialize hashmap + partition.hashTableClass = partition.hashTable.getClass(); // remember so we can defrost it. 
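Illustrative note (not part of the patch): internalPutRow above, and the lookup paths below, route each key with hashCode & (hashPartitions.length - 1); that mask is only equivalent to a modulo when the partition count is a power of two, and unlike %, it never yields a negative id for negative hash codes. A tiny worked check with an assumed partition count:

public class PartitionMaskSketch {
  public static void main(String[] args) {
    int numPartitions = 16;        // assumed power-of-two partition count
    int hashCode = -1766110109;    // arbitrary (negative) murmur hash value
    int partitionId = hashCode & (numPartitions - 1);
    System.out.println(partitionId); // always lands in [0, 15]
  }
}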
output.close(); outputStream.close(); } finally { @@ -663,17 +718,17 @@ public long spillPartition(int partitionId) throws IOException { partition.hashMapOnDisk = true; LOG.info("Spilling hash partition " + partitionId + " (Rows: " + inMemRowCount + - ", Mem size: " + partition.hashMap.memorySize() + "): " + file); + ", Mem size: " + partition.hashTable.memorySize() + "): " + file); LOG.info("Memory usage before spilling: " + memoryUsed); - long memFreed = partition.hashMap.memorySize(); + long memFreed = partition.hashTable.memorySize(); memoryUsed -= memFreed; LOG.info("Memory usage after spilling: " + memoryUsed); partition.rowsOnDisk = inMemRowCount; totalInMemRowCount -= inMemRowCount; - partition.hashMap.clear(); - partition.hashMap = null; + partition.hashTable.clear(); + partition.hashTable = null; return memFreed; } @@ -774,25 +829,16 @@ public ReusableGetAdaptor createGetter(MapJoinKey keyTypeFromLoader) { public void seal() { for (HashPartition hp : hashPartitions) { // Only seal those partitions that haven't been spilled and cleared, - // because once a hashMap is cleared, it will become unusable - if (hp.hashMap != null && hp.hashMap.size() != 0) { - hp.hashMap.seal(); + // because once a hashTable is cleared, it will become unusable + if (hp.hashTable != null && hp.hashTable.size() != 0) { + hp.hashTable.seal(); } } } - - // Direct access interfaces. - - @Override - public void put(Writable currentKey, Writable currentValue) throws SerDeException, IOException { - directWriteHelper.setKeyValue(currentKey, currentValue); - internalPutRow(directWriteHelper, currentKey, currentValue); - } - /** Implementation of ReusableGetAdaptor that has Output for key serialization; row * container is also created once and reused for every row. */ - private class GetAdaptor implements ReusableGetAdaptor, ReusableGetAdaptorDirectAccess { + private class GetAdaptor implements ReusableGetAdaptor { private Object[] currentKey; private boolean[] nulls; @@ -807,9 +853,9 @@ public GetAdaptor() { } @Override - public JoinUtil.JoinResult setFromVector(VectorHashKeyWrapper kw, + public MapJoinResult setFromVector(VectorHashKeyWrapper kw, VectorExpressionWriter[] keyOutputWriters, VectorHashKeyWrapperBatch keyWrapperBatch) - throws HiveException { + throws HiveException, IOException { if (nulls == null) { nulls = new boolean[keyOutputWriters.length]; currentKey = new Object[keyOutputWriters.length]; @@ -830,8 +876,8 @@ public GetAdaptor() { } @Override - public JoinUtil.JoinResult setFromRow(Object row, List fields, - List ois) throws HiveException { + public MapJoinResult setFromRow(Object row, List fields, + List ois) throws HiveException, IOException { if (nulls == null) { nulls = new boolean[fields.size()]; currentKey = new Object[fields.size()]; @@ -846,7 +892,7 @@ public GetAdaptor() { } @Override - public JoinUtil.JoinResult setFromOther(ReusableGetAdaptor other) throws HiveException { + public MapJoinResult setFromOther(ReusableGetAdaptor other) throws HiveException, IOException { assert other instanceof GetAdaptor; GetAdaptor other2 = (GetAdaptor)other; nulls = other2.nulls; @@ -874,26 +920,14 @@ public MapJoinRowContainer getCurrentRows() { public Object[] getCurrentKey() { return currentKey; } - - // Direct access interfaces. 
- - @Override - public JoinUtil.JoinResult setDirect(byte[] bytes, int offset, int length, - BytesBytesMultiHashMap.Result hashMapResult) { - return currentValue.setDirect(bytes, offset, length, hashMapResult); - } - - @Override - public int directSpillPartitionId() { - return currentValue.directSpillPartitionId(); - } } /** Row container that gets and deserializes the rows on demand from bytes provided. */ private class ReusableRowContainer implements MapJoinRowContainer, AbstractRowContainer.RowIterator> { private byte aliasFilter; - private final BytesBytesMultiHashMap.Result hashMapResult; + + private final MapJoinHashMapResult hashMapResult; /** * Sometimes, when container is empty in multi-table mapjoin, we need to add a dummy row. @@ -906,7 +940,7 @@ public int directSpillPartitionId() { private final boolean needsComplexObjectFixup; private final ArrayList complexObjectArrayBuffer; - private int partitionId; // Current hashMap in use + private int partitionId; // Current hashTable in use public ReusableRowContainer() { if (internalValueOi != null) { @@ -926,29 +960,31 @@ public ReusableRowContainer() { complexObjectArrayBuffer = null; } uselessIndirection = new ByteArrayRef(); - hashMapResult = new BytesBytesMultiHashMap.Result(); + + // Use the factory to create a hash map result. + hashMapResult = mapJoinHashTableFactory.createHashMapResult(); clearRows(); } /* Determine if there is a match between big table row and the corresponding hashtable * Three states can be returned: * MATCH: a match is found - * NOMATCH: no match is found from the specified partition + * NO_MATCH: no match is found from the specified partition * SPILL: the specified partition has been spilled to disk and is not available; * the evaluation for this big table row will be postponed. 
*/ - public JoinUtil.JoinResult setFromOutput(Output output) throws HiveException { + public MapJoinResult setFromOutput(Output output) throws HiveException, IOException { int keyHash = HashCodeUtil.murmurHash(output.getData(), 0, output.getLength()); if (bloom1 != null && !bloom1.testLong(keyHash)) { /* * if the keyHash is missing in the bloom filter, then the value cannot - * exist in any of the spilled partition - return NOMATCH + * exist in any of the spilled partition - return NO_MATCH */ dummyRow = null; aliasFilter = (byte) 0xff; hashMapResult.forget(); - return JoinResult.NOMATCH; + return MapJoinResult.NO_MATCH; } partitionId = keyHash & (hashPartitions.length - 1); @@ -956,19 +992,25 @@ public ReusableRowContainer() { // If the target hash table is on disk, spill this row to disk as well to be processed later if (isOnDisk(partitionId)) { toSpillPartitionId = partitionId; - hashMapResult.forget(); - return JoinUtil.JoinResult.SPILL; + hashMapResult.setSpill(partitionId); + return MapJoinResult.SPILL; } else { - aliasFilter = hashPartitions[partitionId].hashMap.getValueResult(output.getData(), 0, - output.getLength(), hashMapResult); + hashPartitions[partitionId].hashTable.hashMapLookup(output.getData(), 0, output.getLength(), + keyHash, hashMapResult); + MapJoinResult mapJoinResult = hashMapResult.getMapJoinResult(); dummyRow = null; - if (hashMapResult.hasRows()) { - return JoinUtil.JoinResult.MATCH; - } else { + switch (mapJoinResult) { + case MATCH: + aliasFilter = hashMapResult.aliasFilter(); + break; + case NO_MATCH: aliasFilter = (byte) 0xff; - return JoinUtil.JoinResult.NOMATCH; + break; + default: + throw new RuntimeException("Unexpected map join result " + mapJoinResult.name()); } + return mapJoinResult; } } @@ -1085,45 +1127,175 @@ public void addRow(Object[] value) { public void write(MapJoinObjectSerDeContext valueContext, ObjectOutputStream out) { throw new RuntimeException(this.getClass().getCanonicalName() + " cannot be serialized"); } + } - // Direct access. 
+ @Override + public MapJoinHashTableFind getMapJoinHashTableFind() { + return (MapJoinHashTableFind) this; + } - public JoinUtil.JoinResult setDirect(byte[] bytes, int offset, int length, - BytesBytesMultiHashMap.Result hashMapResult) { + @Override + public MapJoinHashTableFactory getMapJoinHashTableFactory() { + return mapJoinHashTableFactory; + } - int keyHash = HashCodeUtil.murmurHash(bytes, offset, length); - partitionId = keyHash & (hashPartitions.length - 1); + @Override + public boolean useMinMax() { + return useMinMax; + } - if (bloom1 != null && !bloom1.testLong(keyHash)) { - /* - * if the keyHash is missing in the bloom filter, then the value cannot exist in any of the - * spilled partition - return NOMATCH - */ - dummyRow = null; - aliasFilter = (byte) 0xff; - hashMapResult.forget(); - return JoinResult.NOMATCH; - } + @Override + public long min() { + return longMin; + } - // If the target hash table is on disk, spill this row to disk as well to be processed later - if (isOnDisk(partitionId)) { - return JoinUtil.JoinResult.SPILL; - } - else { - aliasFilter = hashPartitions[partitionId].hashMap.getValueResult(bytes, offset, length, - hashMapResult); - dummyRow = null; - if (hashMapResult.hasRows()) { - return JoinUtil.JoinResult.MATCH; - } else { - aliasFilter = (byte) 0xff; - return JoinUtil.JoinResult.NOMATCH; - } - } + @Override + public long max() { + return longMax; + } + + @Override + public void hashMapLookup(byte[] keyBytes, int keyStart, int keyLength, int hashCode, + MapJoinHashMapResult hashMapResult) throws IOException { + + if (bloom1 != null && !bloom1.testLong(hashCode)) { + /* + * if the keyHash is missing in the bloom filter, then the value cannot exist in any of the + * spilled partition - return NOMATCH + */ + hashMapResult.setNoMatch(); + return; + } + + int partitionId = hashCode & (hashPartitions.length - 1); + + // If the target hash table is on disk, spill this row to disk as well to be processed later + if (isOnDisk(partitionId)) { + hashMapResult.setSpill(partitionId); + } else { + hashPartitions[partitionId].hashTable.hashMapLookup(keyBytes, keyStart, keyLength, + hashCode, hashMapResult); + } + } + + @Override + public void hashMapLookup(long key, int hashCode, MapJoinHashMapResult hashMapResult) + throws IOException { + + if (bloom1 != null && !bloom1.testLong(hashCode)) { + /* + * if the keyHash is missing in the bloom filter, then the value cannot exist in any of the + * spilled partition - set no match. + */ + hashMapResult.setNoMatch(); + return; + } + + int partitionId = hashCode & (hashPartitions.length - 1); + + // If the target hash table is on disk, spill this row to disk as well to be processed later + if (isOnDisk(partitionId)) { + hashMapResult.setSpill(partitionId); + } else { + hashPartitions[partitionId].hashTable.hashMapLookup(key, + hashCode, hashMapResult); + } + } + + @Override + public void hashMultiSetContains(byte[] keyBytes, int keyStart, int keyLength, int hashCode, + MapJoinHashMultiSetResult hashMultiSetResult) + throws IOException { + + if (bloom1 != null && !bloom1.testLong(hashCode)) { + /* + * if the keyHash is missing in the bloom filter, then the value cannot exist in any of the + * spilled partition - set no match. 
+ */ + hashMultiSetResult.setNoMatch(); + return; + } + + int partitionId = hashCode & (hashPartitions.length - 1); + + // If the target hash table is on disk, spill this row to disk as well to be processed later + if (isOnDisk(partitionId)) { + hashMultiSetResult.setSpill(partitionId); + } else { + hashPartitions[partitionId].hashTable.hashMultiSetContains(keyBytes, keyStart, keyLength, + hashCode, hashMultiSetResult); } + } + + @Override + public void hashMultiSetContains(long key, int hashCode, + MapJoinHashMultiSetResult hashMultiSetResult) throws IOException { + + if (bloom1 != null && !bloom1.testLong(hashCode)) { + /* + * if the keyHash is missing in the bloom filter, then the value cannot exist in any of the + * spilled partition - set no match. + */ + hashMultiSetResult.setNoMatch(); + return; + } + + int partitionId = hashCode & (hashPartitions.length - 1); + + // If the target hash table is on disk, spill this row to disk as well to be processed later + if (isOnDisk(partitionId)) { + hashMultiSetResult.setSpill(partitionId); + } else { + hashPartitions[partitionId].hashTable.hashMultiSetContains(key, + hashCode, hashMultiSetResult); + } + } + + @Override + public void hashSetContains(byte[] keyBytes, int keyStart, int keyLength, int hashCode, + MapJoinHashSetResult hashSetResult) throws IOException { + + if (bloom1 != null && !bloom1.testLong(hashCode)) { + /* + * if the keyHash is missing in the bloom filter, then the value cannot exist in any of the + * spilled partition - set no match. + */ + hashSetResult.setNoMatch(); + return; + } + + int partitionId = hashCode & (hashPartitions.length - 1); - public int directSpillPartitionId() { - return partitionId; + // If the target hash table is on disk, spill this row to disk as well to be processed later + if (isOnDisk(partitionId)) { + hashSetResult.setSpill(partitionId); + } else { + hashPartitions[partitionId].hashTable.hashSetContains(keyBytes, keyStart, keyLength, + hashCode, hashSetResult); + } + } + + @Override + public void hashSetContains(long key, int hashCode, MapJoinHashSetResult hashSetResult) + throws IOException { + + if (bloom1 != null && !bloom1.testLong(hashCode)) { + /* + * if the keyHash is missing in the bloom filter, then the value cannot exist in any of the + * spilled partition - set no match. 
+ */ + hashSetResult.setNoMatch(); + return; + } + + int partitionId = hashCode & (hashPartitions.length - 1); + + // If the target hash table is on disk, spill this row to disk as well to be processed later + if (isOnDisk(partitionId)) { + hashSetResult.setSpill(partitionId); + } else { + hashPartitions[partitionId].hashTable.hashSetContains(key, + hashCode, hashSetResult); } } @@ -1131,8 +1303,8 @@ public int directSpillPartitionId() { public void dumpMetrics() { for (int i = 0; i < hashPartitions.length; i++) { HashPartition hp = hashPartitions[i]; - if (hp.hashMap != null) { - hp.hashMap.debugDumpMetrics(); + if (hp.hashTable != null) { + hp.hashTable.debugDumpMetrics(); } } } @@ -1166,22 +1338,25 @@ public int size() { @Override public void setSerde(MapJoinObjectSerDeContext keyCtx, MapJoinObjectSerDeContext valCtx) throws SerDeException { - SerDe keySerde = keyCtx.getSerDe(), valSerde = valCtx.getSerDe(); - - if (writeHelper == null) { - LOG.info("Initializing container with " + keySerde.getClass().getName() + " and " - + valSerde.getClass().getName()); - - // We assume this hashtable is loaded only when tez is enabled - LazyBinaryStructObjectInspector valSoi = - (LazyBinaryStructObjectInspector) valSerde.getObjectInspector(); - writeHelper = new MapJoinBytesTableContainer.LazyBinaryKvWriter(keySerde, valSoi, - valCtx.hasFilterTag()); - if (internalValueOi == null) { - internalValueOi = valSoi; - } - if (sortableSortOrders == null) { - sortableSortOrders = ((BinarySortableSerDe) keySerde).getSortOrders(); + if (mapJoinHashTableFactory.keyValuePutHelperIsExternal()) { + keyValuePutHelper = mapJoinHashTableFactory.createKeyValuePut(); + } else { + SerDe keySerde = keyCtx.getSerDe(), valSerde = valCtx.getSerDe(); + if (keyValuePutHelper == null) { + LOG.info("Initializing container with " + keySerde.getClass().getName() + " and " + + valSerde.getClass().getName()); + + // We assume this hashtable is loaded only when tez is enabled + LazyBinaryStructObjectInspector valSoi = + (LazyBinaryStructObjectInspector) valSerde.getObjectInspector(); + keyValuePutHelper = new MapJoinBytesTableContainer.LazyBinaryKvWriter(keySerde, valSoi, + valCtx.hasFilterTag()); + if (internalValueOi == null) { + internalValueOi = valSoi; + } + if (sortableSortOrders == null) { + sortableSortOrders = ((BinarySortableSerDe) keySerde).getSortOrders(); + } } if (nullMarkers == null) { nullMarkers = ((BinarySortableSerDe) keySerde).getNullMarkers(); diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/MapJoinBytesTableContainer.java ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/MapJoinBytesTableContainer.java index a8aa71a..c10b77b 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/MapJoinBytesTableContainer.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/MapJoinBytesTableContainer.java @@ -19,6 +19,7 @@ package org.apache.hadoop.hive.ql.exec.persistence; +import java.io.IOException; import java.io.ObjectOutputStream; import java.util.ArrayList; import java.util.Arrays; @@ -28,7 +29,13 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.ql.exec.ExprNodeEvaluator; -import org.apache.hadoop.hive.ql.exec.JoinUtil; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashMapResult; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTable; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableManage; +import 
org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableFactory; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableManage.KeyValuePut; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableManage.KeyValuePutWriter; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableResult.MapJoinResult; import org.apache.hadoop.hive.ql.exec.vector.VectorHashKeyWrapper; import org.apache.hadoop.hive.ql.exec.vector.VectorHashKeyWrapperBatch; import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpressionWriter; @@ -65,14 +72,16 @@ /** * Table container that serializes keys and values using LazyBinarySerDe into - * BytesBytesMultiHashMap, with very low memory overhead. However, + * a MapJoinHashTableFind, with very low memory overhead. However, * there may be some perf overhead when retrieving rows. */ -public class MapJoinBytesTableContainer - implements MapJoinTableContainer, MapJoinTableContainerDirectAccess { +public class MapJoinBytesTableContainer implements MapJoinTableContainer { private static final Logger LOG = LoggerFactory.getLogger(MapJoinTableContainer.class); - private final BytesBytesMultiHashMap hashMap; + private final MapJoinHashTableFactory mapJoinHashTableFactory; + + private final MapJoinHashTable hashTable; + /** The OI used to deserialize values. We never deserialize keys. */ private LazyBinaryStructObjectInspector internalValueOi; /** @@ -84,31 +93,35 @@ private boolean[] sortableSortOrders; private byte[] nullMarkers; private byte[] notNullMarkers; - private KeyValueHelper writeHelper; - private DirectKeyValueWriter directWriteHelper; + + private KeyValuePut keyValuePutHelper; private final List EMPTY_LIST = new ArrayList(0); public MapJoinBytesTableContainer(Configuration hconf, - MapJoinObjectSerDeContext valCtx, long keyCount, long memUsage) throws SerDeException { + MapJoinObjectSerDeContext valCtx, long keyCount, long memUsage, + MapJoinHashTableFactory mapJoinHashTableFactory) throws SerDeException { this(HiveConf.getFloatVar(hconf, HiveConf.ConfVars.HIVEHASHTABLEKEYCOUNTADJUSTMENT), HiveConf.getIntVar(hconf, HiveConf.ConfVars.HIVEHASHTABLETHRESHOLD), HiveConf.getFloatVar(hconf, HiveConf.ConfVars.HIVEHASHTABLELOADFACTOR), HiveConf.getIntVar(hconf, HiveConf.ConfVars.HIVEHASHTABLEWBSIZE), - valCtx, keyCount, memUsage); + valCtx, keyCount, memUsage, mapJoinHashTableFactory); } private MapJoinBytesTableContainer(float keyCountAdj, int threshold, float loadFactor, - int wbSize, MapJoinObjectSerDeContext valCtx, long keyCount, long memUsage) + int wbSize, MapJoinObjectSerDeContext valCtx, long keyCount, long memUsage, + MapJoinHashTableFactory mapJoinHashTableFactory) throws SerDeException { int newThreshold = HashMapWrapper.calculateTableSize( keyCountAdj, threshold, loadFactor, keyCount); - hashMap = new BytesBytesMultiHashMap(newThreshold, loadFactor, wbSize, memUsage); - directWriteHelper = new DirectKeyValueWriter(); + this.mapJoinHashTableFactory = mapJoinHashTableFactory; + hashTable = mapJoinHashTableFactory.createHashTable(newThreshold, loadFactor, wbSize, memUsage); } - public MapJoinBytesTableContainer(BytesBytesMultiHashMap hashMap) { - this.hashMap = hashMap; + public MapJoinBytesTableContainer(MapJoinHashTableFactory mapJoinHashTableFactory, + MapJoinHashTable hashTable) { + this.mapJoinHashTableFactory = mapJoinHashTableFactory; + this.hashTable = hashTable; } private LazyBinaryStructObjectInspector createInternalOi( @@ -147,13 +160,8 @@ public 
void setNotNullMarkers(byte[] notNullMarkers) { this.notNullMarkers = notNullMarkers; } - public static interface KeyValueHelper extends BytesBytesMultiHashMap.KvSource { - void setKeyValue(Writable key, Writable val) throws SerDeException; - /** Get hash value from the key. */ - int getHashFromKey() throws SerDeException; - } + private static class KeyValueWriter implements KeyValuePutWriter { - private static class KeyValueWriter implements KeyValueHelper { private final SerDe keySerDe, valSerDe; private final StructObjectInspector keySoi, valSoi; private final List keyOis, valOis; @@ -213,12 +221,22 @@ public byte updateStateByte(Byte previousValue) { } @Override - public int getHashFromKey() throws SerDeException { + public boolean hasHashCode() { + return false; + } + + @Override + public int getKeyHashCode() throws SerDeException { + throw new UnsupportedOperationException("Not supported for MapJoinBytesTableContainer"); + } + + @Override + public long getLongKey() { throw new UnsupportedOperationException("Not supported for MapJoinBytesTableContainer"); } } - static class LazyBinaryKvWriter implements KeyValueHelper { + static class LazyBinaryKvWriter implements KeyValuePutWriter { private final LazyBinaryStruct.SingleFieldGetter filterGetter; private Writable key, value; private final SerDe keySerDe; @@ -251,7 +269,12 @@ public void writeKey(RandomAccessOutput dest) throws SerDeException { } @Override - public int getHashFromKey() throws SerDeException { + public boolean hasHashCode() { + return true; + } + + @Override + public int getKeyHashCode() throws SerDeException { if (!(key instanceof BinaryComparable)) { throw new SerDeException("Unexpected type " + key.getClass().getCanonicalName()); } @@ -319,80 +342,48 @@ public byte updateStateByte(Byte previousValue) { aliasFilter &= filterGetter.getShort(); return aliasFilter; } - } - - /* - * An implementation of KvSource that can handle key and value as BytesWritable objects. - */ - protected static class DirectKeyValueWriter implements KeyValueHelper { - - private BytesWritable key; - private BytesWritable val; - - @Override - public void setKeyValue(Writable key, Writable val) throws SerDeException { - this.key = (BytesWritable) key; - this.val = (BytesWritable) val; - } - - @Override - public void writeKey(RandomAccessOutput dest) throws SerDeException { - byte[] keyBytes = key.getBytes(); - int keyLength = key.getLength(); - dest.write(keyBytes, 0, keyLength); - } - - @Override - public void writeValue(RandomAccessOutput dest) throws SerDeException { - byte[] valueBytes = val.getBytes(); - int valueLength = val.getLength(); - dest.write(valueBytes, 0 , valueLength); - } @Override - public byte updateStateByte(Byte previousValue) { - // Not used by the direct access client -- native vector map join. 
- throw new UnsupportedOperationException("Updating the state by not supported"); - } - - @Override - public int getHashFromKey() throws SerDeException { - byte[] keyBytes = key.getBytes(); - int keyLength = key.getLength(); - return HashCodeUtil.murmurHash(keyBytes, 0, keyLength); + public long getLongKey() { + throw new UnsupportedOperationException("Not supported for MapJoinBytesTableContainer"); } } @Override public void setSerde(MapJoinObjectSerDeContext keyContext, MapJoinObjectSerDeContext valueContext) throws SerDeException { - SerDe keySerde = keyContext.getSerDe(), valSerde = valueContext.getSerDe(); - if (writeHelper == null) { - LOG.info("Initializing container with " + keySerde.getClass().getName() + " and " - + valSerde.getClass().getName()); - if (keySerde instanceof BinarySortableSerDe && valSerde instanceof LazyBinarySerDe) { - LazyBinaryStructObjectInspector valSoi = - (LazyBinaryStructObjectInspector) valSerde.getObjectInspector(); - writeHelper = new LazyBinaryKvWriter(keySerde, valSoi, valueContext.hasFilterTag()); - internalValueOi = valSoi; - sortableSortOrders = ((BinarySortableSerDe) keySerde).getSortOrders(); - nullMarkers = ((BinarySortableSerDe) keySerde).getNullMarkers(); - notNullMarkers = ((BinarySortableSerDe) keySerde).getNotNullMarkers(); - } else { - writeHelper = new KeyValueWriter(keySerde, valSerde, valueContext.hasFilterTag()); - internalValueOi = createInternalOi(valueContext); - sortableSortOrders = null; - nullMarkers = null; - notNullMarkers = null; + if (mapJoinHashTableFactory.keyValuePutHelperIsExternal()) { + keyValuePutHelper = mapJoinHashTableFactory.createKeyValuePut(); + } else { + SerDe keySerde = keyContext.getSerDe(), valSerde = valueContext.getSerDe(); + if (keyValuePutHelper == null) { + LOG.info("Initializing container with " + keySerde.getClass().getName() + " and " + + valSerde.getClass().getName()); + if (keySerde instanceof BinarySortableSerDe && valSerde instanceof LazyBinarySerDe) { + LazyBinaryStructObjectInspector valSoi = + (LazyBinaryStructObjectInspector) valSerde.getObjectInspector(); + keyValuePutHelper = new LazyBinaryKvWriter(keySerde, valSoi, valueContext.hasFilterTag()); + internalValueOi = valSoi; + sortableSortOrders = ((BinarySortableSerDe) keySerde).getSortOrders(); + nullMarkers = ((BinarySortableSerDe) keySerde).getNullMarkers(); + notNullMarkers = ((BinarySortableSerDe) keySerde).getNotNullMarkers(); + } else { + keyValuePutHelper = new KeyValueWriter(keySerde, valSerde, valueContext.hasFilterTag()); + internalValueOi = createInternalOi(valueContext); + sortableSortOrders = null; + nullMarkers = null; + notNullMarkers = null; + } } } } @SuppressWarnings("deprecation") @Override - public MapJoinKey putRow(Writable currentKey, Writable currentValue) throws SerDeException { - writeHelper.setKeyValue(currentKey, currentValue); - hashMap.put(writeHelper, -1); + public MapJoinKey putRow(Writable currentKey, Writable currentValue) + throws SerDeException, IOException { + keyValuePutHelper.setKeyValue(currentKey, currentValue); + hashTable.put(keyValuePutHelper); return null; // there's no key to return } @@ -416,15 +407,7 @@ public ReusableGetAdaptor createGetter(MapJoinKey keyTypeFromLoader) { @Override public void seal() { - hashMap.seal(); - } - - // Direct access interfaces. 
- - @Override - public void put(Writable currentKey, Writable currentValue) throws SerDeException { - directWriteHelper.setKeyValue(currentKey, currentValue); - hashMap.put(directWriteHelper, -1); + hashTable.seal(); } public static boolean hasComplexObjects(LazyBinaryStructObjectInspector lazyBinaryStructObjectInspector) { @@ -463,7 +446,7 @@ public static boolean hasComplexObjects(LazyBinaryStructObjectInspector lazyBina /** Implementation of ReusableGetAdaptor that has Output for key serialization; row * container is also created once and reused for every row. */ - private class GetAdaptor implements ReusableGetAdaptor, ReusableGetAdaptorDirectAccess { + private class GetAdaptor implements ReusableGetAdaptor { private Object[] currentKey; private boolean[] nulls; @@ -478,9 +461,9 @@ public GetAdaptor() { } @Override - public JoinUtil.JoinResult setFromVector(VectorHashKeyWrapper kw, + public MapJoinResult setFromVector(VectorHashKeyWrapper kw, VectorExpressionWriter[] keyOutputWriters, VectorHashKeyWrapperBatch keyWrapperBatch) - throws HiveException { + throws HiveException, IOException { if (nulls == null) { nulls = new boolean[keyOutputWriters.length]; currentKey = new Object[keyOutputWriters.length]; @@ -501,8 +484,8 @@ public GetAdaptor() { } @Override - public JoinUtil.JoinResult setFromRow(Object row, List fields, - List ois) throws HiveException { + public MapJoinResult setFromRow(Object row, List fields, + List ois) throws HiveException, IOException { if (nulls == null) { nulls = new boolean[fields.size()]; currentKey = new Object[fields.size()]; @@ -517,7 +500,7 @@ public GetAdaptor() { } @Override - public JoinUtil.JoinResult setFromOther(ReusableGetAdaptor other) { + public MapJoinResult setFromOther(ReusableGetAdaptor other) throws IOException { assert other instanceof GetAdaptor; GetAdaptor other2 = (GetAdaptor)other; nulls = other2.nulls; @@ -546,18 +529,6 @@ public MapJoinRowContainer getCurrentRows() { return currentKey; } - // Direct access interfaces. - - @Override - public JoinUtil.JoinResult setDirect(byte[] bytes, int offset, int length, - BytesBytesMultiHashMap.Result hashMapResult) { - return currentValue.setDirect(bytes, offset, length, hashMapResult); - } - - @Override - public int directSpillPartitionId() { - throw new UnsupportedOperationException("Getting the spill hash partition not supported"); - } } /** Row container that gets and deserializes the rows on demand from bytes provided. */ @@ -566,7 +537,7 @@ public int directSpillPartitionId() { private byte aliasFilter; /** Hash table wrapper specific to the container. */ - private final BytesBytesMultiHashMap.Result hashMapResult; + private final MapJoinHashMapResult hashMapResult; /** * Sometimes, when container is empty in multi-table mapjoin, we need to add a dummy row. 
@@ -597,21 +568,28 @@ public ReusableRowContainer() { complexObjectArrayBuffer = null; } uselessIndirection = new ByteArrayRef(); - hashMapResult = new BytesBytesMultiHashMap.Result(); + hashMapResult = mapJoinHashTableFactory.createHashMapResult(); clearRows(); } - public JoinUtil.JoinResult setFromOutput(Output output) { + public MapJoinResult setFromOutput(Output output) throws IOException { + + int keyHash = HashCodeUtil.murmurHash(output.getData(), 0, output.getLength()); - aliasFilter = hashMap.getValueResult( - output.getData(), 0, output.getLength(), hashMapResult); + hashTable.hashMapLookup(output.getData(), 0, output.getLength(), keyHash, hashMapResult); + MapJoinResult mapJoinResult = hashMapResult.getMapJoinResult(); dummyRow = null; - if (hashMapResult.hasRows()) { - return JoinUtil.JoinResult.MATCH; - } else { + switch (mapJoinResult) { + case MATCH: + aliasFilter = hashMapResult.aliasFilter(); + break; + case NO_MATCH: aliasFilter = (byte) 0xff; - return JoinUtil.JoinResult.NOMATCH; + break; + default: + throw new RuntimeException("Unexpected map join result " + mapJoinResult.name()); } + return mapJoinResult; } @@ -728,20 +706,6 @@ public void addRow(Object[] value) { public void write(MapJoinObjectSerDeContext valueContext, ObjectOutputStream out) { throw new RuntimeException(this.getClass().getCanonicalName() + " cannot be serialized"); } - - // Direct access. - - public JoinUtil.JoinResult setDirect(byte[] bytes, int offset, int length, - BytesBytesMultiHashMap.Result hashMapResult) { - aliasFilter = hashMap.getValueResult(bytes, offset, length, hashMapResult); - dummyRow = null; - if (hashMapResult.hasRows()) { - return JoinUtil.JoinResult.MATCH; - } else { - aliasFilter = (byte) 0xff; - return JoinUtil.JoinResult.NOMATCH; - } - } } public static boolean isSupportedKey(ObjectInspector keyOi) { @@ -756,7 +720,7 @@ public static boolean isSupportedKey(ObjectInspector keyOi) { @Override public void dumpMetrics() { - hashMap.debugDumpMetrics(); + hashTable.debugDumpMetrics(); } @Override @@ -766,6 +730,16 @@ public boolean hasSpill() { @Override public int size() { - return hashMap.size(); + return hashTable.size(); + } + + @Override + public MapJoinHashTable getMapJoinHashTableFind() { + return hashTable; + } + + @Override + public MapJoinHashTableFactory getMapJoinHashTableFactory() { + return mapJoinHashTableFactory; } } diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/MapJoinKey.java ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/MapJoinKey.java index 9f27f56..cbe83be 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/MapJoinKey.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/MapJoinKey.java @@ -93,9 +93,7 @@ public static boolean isSupportedField(ObjectInspector foi) { return true; } - public static boolean isSupportedField(String typeName) { - TypeInfo typeInfo = TypeInfoUtils.getTypeInfoFromTypeString(typeName); - + public static boolean isSupportedField(TypeInfo typeInfo) { if (typeInfo.getCategory() != Category.PRIMITIVE) return false; // not supported PrimitiveTypeInfo primitiveTypeInfo = (PrimitiveTypeInfo) typeInfo; PrimitiveCategory pc = primitiveTypeInfo.getPrimitiveCategory(); @@ -103,6 +101,11 @@ public static boolean isSupportedField(String typeName) { return true; } + public static boolean isSupportedField(String typeName) { + TypeInfo typeInfo = TypeInfoUtils.getTypeInfoFromTypeString(typeName); + return isSupportedField(typeInfo); + } + public static MapJoinKey readFromVector(Output output, 
MapJoinKey key, Object[] keyObject, List keyOIs, boolean mayReuseKey) throws HiveException { diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/MapJoinTableContainer.java ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/MapJoinTableContainer.java index 6d71fef..66ba9b7 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/MapJoinTableContainer.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/MapJoinTableContainer.java @@ -22,7 +22,9 @@ import java.util.List; import org.apache.hadoop.hive.ql.exec.ExprNodeEvaluator; -import org.apache.hadoop.hive.ql.exec.JoinUtil; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableFactory; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableFind; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableResult.MapJoinResult; import org.apache.hadoop.hive.ql.exec.vector.VectorHashKeyWrapper; import org.apache.hadoop.hive.ql.exec.vector.VectorHashKeyWrapperBatch; import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpressionWriter; @@ -42,21 +44,21 @@ * Changes current rows to which adaptor is referring to the rows corresponding to * the key represented by a VHKW object, and writers and batch used to interpret it. */ - JoinUtil.JoinResult setFromVector(VectorHashKeyWrapper kw, VectorExpressionWriter[] keyOutputWriters, - VectorHashKeyWrapperBatch keyWrapperBatch) throws HiveException; + MapJoinResult setFromVector(VectorHashKeyWrapper kw, VectorExpressionWriter[] keyOutputWriters, + VectorHashKeyWrapperBatch keyWrapperBatch) throws HiveException, IOException; /** * Changes current rows to which adaptor is referring to the rows corresponding to * the key represented by a row object, and fields and ois used to interpret it. */ - JoinUtil.JoinResult setFromRow(Object row, List fields, List ois) - throws HiveException; + MapJoinResult setFromRow(Object row, List fields, List ois) + throws HiveException, IOException; /** * Changes current rows to which adaptor is referring to the rows corresponding to * the key that another adaptor has already deserialized via setFromVector/setFromRow. */ - JoinUtil.JoinResult setFromOther(ReusableGetAdaptor other) throws HiveException; + MapJoinResult setFromOther(ReusableGetAdaptor other) throws HiveException, IOException; /** * Checks whether the current key has any nulls. @@ -93,6 +95,14 @@ MapJoinKey putRow(Writable currentKey, Writable currentValue) */ ReusableGetAdaptor createGetter(MapJoinKey keyTypeFromLoader); + /** + * Provides "managed" access to the map join hash table. For use by the native vector map join + * implementation. + */ + MapJoinHashTableFind getMapJoinHashTableFind(); + + MapJoinHashTableFactory getMapJoinHashTableFactory(); + /** Clears the contents of the table. 
*/ void clear(); diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/MapJoinTableContainerSerDe.java ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/MapJoinTableContainerSerDe.java index eb48dd7..afc4bab 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/MapJoinTableContainerSerDe.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/MapJoinTableContainerSerDe.java @@ -32,7 +32,6 @@ import org.apache.hadoop.fs.Path; import org.apache.hadoop.hive.common.JavaUtils; import org.apache.hadoop.hive.conf.HiveConf; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.fast.VectorMapJoinFastTableContainer; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.plan.MapJoinDesc; import org.apache.hadoop.hive.serde2.SerDe; @@ -144,7 +143,8 @@ public MapJoinTableContainer load( Map metaData = (Map) in.readObject(); if (tableContainer == null) { tableContainer = useOptimizedContainer ? - new MapJoinBytesTableContainer(hconf, valueContext, -1, 0) : + new MapJoinBytesTableContainer(hconf, valueContext, -1, 0, + new BytesBytesMultiHashMapFactory()) : create(name, metaData); } tableContainer.setSerde(keyContext, valueContext); @@ -202,76 +202,6 @@ private void loadOptimized(MapJoinBytesTableContainer container, ObjectInputStre } } - /** - * Loads the small table into a VectorMapJoinFastTableContainer. Only used on Spark path. - * @param mapJoinDesc The descriptor for the map join - * @param fs FileSystem of the folder. - * @param folder The folder to load table container. - * @param hconf The hive configuration - * @return Loaded table. - */ - @SuppressWarnings("unchecked") - public MapJoinTableContainer loadFastContainer(MapJoinDesc mapJoinDesc, - FileSystem fs, Path folder, Configuration hconf) throws HiveException { - try { - VectorMapJoinFastTableContainer tableContainer = - new VectorMapJoinFastTableContainer(mapJoinDesc, hconf, -1); - tableContainer.setSerde(keyContext, valueContext); - - if (fs.exists(folder)) { - if (!fs.isDirectory(folder)) { - throw new HiveException("Error, not a directory: " + folder); - } - - FileStatus[] fileStatuses = fs.listStatus(folder); - if (fileStatuses != null && fileStatuses.length > 0) { - SerDe keySerDe = keyContext.getSerDe(); - SerDe valueSerDe = valueContext.getSerDe(); - Writable key = keySerDe.getSerializedClass().newInstance(); - Writable value = valueSerDe.getSerializedClass().newInstance(); - - for (FileStatus fileStatus : fileStatuses) { - Path filePath = fileStatus.getPath(); - if (ShimLoader.getHadoopShims().isDirectory(fileStatus)) { - throw new HiveException("Error, not a file: " + filePath); - } - InputStream is = null; - ObjectInputStream in = null; - try { - is = fs.open(filePath, 4096); - in = new ObjectInputStream(is); - // skip the name and metadata - in.readUTF(); - in.readObject(); - int numKeys = in.readInt(); - for (int keyIndex = 0; keyIndex < numKeys; keyIndex++) { - key.readFields(in); - long numRows = in.readLong(); - for (long rowIndex = 0L; rowIndex < numRows; rowIndex++) { - value.readFields(in); - tableContainer.putRow(key, value); - } - } - } finally { - if (in != null) { - in.close(); - } else if (is != null) { - is.close(); - } - } - } - } - } - - tableContainer.seal(); - return tableContainer; - } catch (IOException e) { - throw new HiveException("IO error while trying to create table container", e); - } catch (Exception e) { - throw new HiveException("Error while trying to create table container", e); - } - } - public void persist(ObjectOutputStream 
out, MapJoinPersistableTableContainer tableContainer) throws HiveException { int numKeys = tableContainer.size(); diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/ReusableGetAdaptorDirectAccess.java ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/ReusableGetAdaptorDirectAccess.java deleted file mode 100644 index 0685d84..0000000 --- ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/ReusableGetAdaptorDirectAccess.java +++ /dev/null @@ -1,30 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.hive.ql.exec.persistence; - - -import org.apache.hadoop.hive.ql.exec.JoinUtil.JoinResult; - -public interface ReusableGetAdaptorDirectAccess { - - JoinResult setDirect(byte[] bytes, int offset, int length, - BytesBytesMultiHashMap.Result hashMapResult); - - int directSpillPartitionId(); -} \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/mapjoinhashtable/MapJoinHashMapResult.java ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/mapjoinhashtable/MapJoinHashMapResult.java new file mode 100644 index 0000000..75e1d4c --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/mapjoinhashtable/MapJoinHashMapResult.java @@ -0,0 +1,73 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable; + +import org.apache.hadoop.hive.serde2.WriteBuffers.ByteSegmentRef; + +/* + * Interface for a hash map result. For reading the values, one-by-one. + */ +public interface MapJoinHashMapResult extends MapJoinHashTableResult { + + /** + * @return Whether there are any rows (i.e. true for match). + */ + boolean hasRows(); + + /** + * @return Whether there is 1 value row. + */ + boolean isSingleRow(); + + /** + * @return Whether there is a capped count available from cappedCount. + */ + boolean isCappedCountAvailable(); + + /** + * @return The count of values, up to a arbitrary cap limit. 
When available, the capped + * count can be used to make decisions on how to optimally generate join results. + */ + int cappedCount(); + + /** + * @return A reference to the first value, or null if there are no values. + */ + ByteSegmentRef first(); + + /** + * @return The next value, or null if there are no more values to be read. + */ + ByteSegmentRef next(); + + /** + * @return Whether there is alias filter available from aliasFilter. + */ + boolean isAliasFilterAvailable(); + + /** + * @return Alias filter byte. + */ + byte aliasFilter(); + + /** + * Get detailed HashMap result position information to help diagnose exceptions. + */ + public abstract String getDetailedHashMapResultPositionString(); +} diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/mapjoinhashtable/MapJoinHashMultiSetResult.java ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/mapjoinhashtable/MapJoinHashMultiSetResult.java new file mode 100644 index 0000000..30db8e2 --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/mapjoinhashtable/MapJoinHashMultiSetResult.java @@ -0,0 +1,35 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable; + +/* + * Abstract class for a hash multi-set result additional methods. + */ +public interface MapJoinHashMultiSetResult extends MapJoinHashTableResult { + + /* + * @return The multi-set count for the lookup key. + */ + public long count(); + + /* + * Mark the result as matched with count multi-set entries. + */ + public void setMatch(long count); +} diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/mapjoinhashtable/MapJoinHashMultiSetResultImpl.java ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/mapjoinhashtable/MapJoinHashMultiSetResultImpl.java new file mode 100644 index 0000000..b568163 --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/mapjoinhashtable/MapJoinHashMultiSetResultImpl.java @@ -0,0 +1,45 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable; + +/* + * Abstract class for a hash multi-set result. + */ +public class MapJoinHashMultiSetResultImpl extends MapJoinHashTableResultImpl + implements MapJoinHashMultiSetResult { + + protected long count; + + public MapJoinHashMultiSetResultImpl() { + super(); + count = 0; + } + + /* + * @return The multi-set count for the lookup key. + */ + public long count() { + return count; + } + + public void setMatch(long count) { + this.count = count; + mapJoinResult = MapJoinResult.MATCH; + } +} diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/mapjoinhashtable/MapJoinHashSetResult.java ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/mapjoinhashtable/MapJoinHashSetResult.java new file mode 100644 index 0000000..5849fff --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/mapjoinhashtable/MapJoinHashSetResult.java @@ -0,0 +1,31 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable; + +/* + * Interface for a hash set result additional methods. + */ +public interface MapJoinHashSetResult extends MapJoinHashTableResult { + + /* + * Mark the result as matched for a set entry. + */ + void setMatch(); + +} diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/mapjoinhashtable/MapJoinHashSetResultImpl.java ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/mapjoinhashtable/MapJoinHashSetResultImpl.java new file mode 100644 index 0000000..18037b6 --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/mapjoinhashtable/MapJoinHashSetResultImpl.java @@ -0,0 +1,35 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable; + +/* + * Abstract class for a hash set result. 
+ */
+public class MapJoinHashSetResultImpl extends MapJoinHashTableResultImpl
+    implements MapJoinHashSetResult {
+
+  public MapJoinHashSetResultImpl() {
+    super();
+  }
+
+  public void setMatch() {
+    mapJoinResult = MapJoinResult.MATCH;
+  }
+
+}
diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/mapjoinhashtable/MapJoinHashTable.java ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/mapjoinhashtable/MapJoinHashTable.java
new file mode 100644
index 0000000..0d6cae9
--- /dev/null
+++ ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/mapjoinhashtable/MapJoinHashTable.java
@@ -0,0 +1,27 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable;
+
+/*
+ * Root interface for a map join hash table (which could be a hash map, hash multi-set,
+ * or hash set).
+ */
+public interface MapJoinHashTable extends MapJoinHashTableManage, MapJoinHashTableFind {
+
+}
diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/mapjoinhashtable/MapJoinHashTableFactory.java ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/mapjoinhashtable/MapJoinHashTableFactory.java
new file mode 100644
index 0000000..f179d92
--- /dev/null
+++ ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/mapjoinhashtable/MapJoinHashTableFactory.java
@@ -0,0 +1,77 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable;
+
+import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableManage.KeyValuePut;
+
+/*
+ * Factory for a map join hash table (which could be a hash map, hash multi-set,
+ * or hash set) and its implementation-specific result objects.
+ */
+public interface MapJoinHashTableFactory {
+
+  /**
+   * @return true when the hash table implementation supplies its own key/value put helper.
+   */
+  public boolean keyValuePutHelperIsExternal();
+
+  /**
+   * @return true if min/max optimization could be used.
+   */
+  public boolean useMinMax();
+
+  /**
+   * @return A key/value put helper, for hash table implementations that manage their own put object.
+   */
+  KeyValuePut createKeyValuePut();
+
+  /**
+   * @param initialCapacity
+   * @param loadFactor
+   * @param writeBuffersSize
+   * @param memUsage
+   * @return A new hash table.
+   */
+  MapJoinHashTable createHashTable(int initialCapacity, float loadFactor,
+      int writeBuffersSize, long memUsage);
+
+  /*
+   * @return A new hash map result implementation specific object.
+   *
+   * The object can be used to access the values when there is a match, or
+   * access spill information when the partition with the key is currently spilled.
+   */
+  MapJoinHashMapResult createHashMapResult();
+
+  /*
+   * @return A new hash multi-set result implementation specific object.
+   *
+   * The object can be used to access the *count* of values when the key is contained in the
+   * multi-set, or access spill information when the partition with the key is currently spilled.
+   */
+  MapJoinHashMultiSetResult createHashMultiSetResult();
+
+  /*
+   * @return A new hash set result implementation specific object.
+   *
+   * The object can be used to access spill information when the partition with the key
+   * is currently spilled.
+   */
+  MapJoinHashSetResult createHashSetResult();
+}
diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/mapjoinhashtable/MapJoinHashTableFind.java ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/mapjoinhashtable/MapJoinHashTableFind.java
new file mode 100644
index 0000000..936d99a
--- /dev/null
+++ ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/mapjoinhashtable/MapJoinHashTableFind.java
@@ -0,0 +1,157 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable;
+
+import java.io.IOException;
+
+/*
+ * Find (lookup) interface for a map join hash table (which could be a hash map, hash multi-set,
+ * or hash set).
+ */
+public interface MapJoinHashTableFind {
+
+  //---------------------------- COMMON LONG METHODS (Begin)----------------------------------------
+
+  boolean useMinMax();
+
+  long min();
+  long max();
+
+  //----------------------------- COMMON LONG METHODS (End)-----------------------------------------
+
+  //-------------------------------- HASH MAP (Begin)-----------------------------------------------
+
+  /*
+   * Lookup a byte array key in the hash map.
+   *
+   * @param keyBytes
+   *          A byte array containing the key within a range.
+   * @param keyStart
+   *          The offset of the beginning of the key.
+   * @param keyLength
+   *          The length of the key.
+   * @param hashCode
+   *          The key hash code.
+   * @param hashMapResult
+   *          The object to receive small table value(s) information on a MATCH.
+   *          Or, for SPILL, it has information on where to spill the big table row.
+   *          Examine mapJoinResult() for lookup result.
+   */
+  void hashMapLookup(byte[] keyBytes, int keyStart, int keyLength, int hashCode,
+      MapJoinHashMapResult hashMapResult) throws IOException;
+
+  /*
+   * Lookup a long key in the hash map.
+   *
+   * @param key
+   *          The long key.
+   * @param hashCode
+   *          The key hash code.
+   * @param hashMapResult
+   *          The object to receive small table value(s) information on a MATCH.
+   *          Or, for SPILL, it has information on where to spill the big table row.
+   *          Examine mapJoinResult() for lookup result.
+   */
+  void hashMapLookup(long key, int hashCode,
+      MapJoinHashMapResult hashMapResult) throws IOException;
+
+
+  //-------------------------------- HASH MAP (End) ------------------------------------------------
+
+  //---------------------------- HASH MULTI-SET (Begin) -------------------------------------------
+
+  /*
+   * Lookup a byte array key in the hash multi-set.
+   *
+   * @param keyBytes
+   *          A byte array containing the key within a range.
+   * @param keyStart
+   *          The offset of the beginning of the key.
+   * @param keyLength
+   *          The length of the key.
+   * @param hashCode
+   *          The key hash code.
+   * @param hashMultiSetResult
+   *          The object to receive small table value(s) information on a MATCH.
+   *          Or, for SPILL, it has information on where to spill the big table row.
+   *          Examine mapJoinResult() for lookup result.
+   */
+  void hashMultiSetContains(byte[] keyBytes, int keyStart, int keyLength, int hashCode,
+      MapJoinHashMultiSetResult hashMultiSetResult)
+      throws IOException;
+
+  /*
+   * Lookup a long key in the hash multi-set.
+   *
+   * @param key
+   *          The long key.
+   * @param hashCode
+   *          The key hash code.
+   * @param hashMultiSetResult
+   *          The object to receive small table value(s) information on a MATCH.
+   *          Or, for SPILL, it has information on where to spill the big table row.
+   *          Examine mapJoinResult() for lookup result.
+   */
+  void hashMultiSetContains(long key, int hashCode,
+      MapJoinHashMultiSetResult hashMultiSetResult)
+      throws IOException;
+
+
+  //----------------------------- HASH MULTI-SET (End) --------------------------------------------
+
+  //------------------------------- HASH SET (Begin) ----------------------------------------------
+
+  /*
+   * Lookup a byte array key in the hash set.
+   *
+   * @param keyBytes
+   *          A byte array containing the key within a range.
+   * @param keyStart
+   *          The offset of the beginning of the key.
+   * @param keyLength
+   *          The length of the key.
+   * @param hashCode
+   *          The key hash code.
+   * @param hashSetResult
+   *          The object to receive small table value(s) information on a MATCH.
+   *          Or, for SPILL, it has information on where to spill the big table row.
+   *          Examine mapJoinResult() for lookup result.
+   */
+  void hashSetContains(byte[] keyBytes, int keyStart, int keyLength, int hashCode,
+      MapJoinHashSetResult hashSetResult)
+      throws IOException;
+
+  /*
+   * Lookup a long key in the hash set.
+   *
+   * @param key
+   *          The long key.
+   * @param hashCode
+   *          The key hash code.
+   * @param hashSetResult
+   *          The object to receive small table value(s) information on a MATCH.
+   *          Or, for SPILL, it has information on where to spill the big table row.
+   *          Examine mapJoinResult() for lookup result.
+   */
+  void hashSetContains(long key, int hashCode, MapJoinHashSetResult hashSetResult)
+      throws IOException;
+
+  //--------------------------------- HASH SET (End) ----------------------------------------------
+
+}
diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/mapjoinhashtable/MapJoinHashTableManage.java ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/mapjoinhashtable/MapJoinHashTableManage.java
new file mode 100644
index 0000000..3a650dd
--- /dev/null
+++ ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/mapjoinhashtable/MapJoinHashTableManage.java
@@ -0,0 +1,104 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable;
+
+import java.io.IOException;
+
+import org.apache.hadoop.hive.serde2.SerDeException;
+import org.apache.hadoop.hive.serde2.ByteStream.RandomAccessOutput;
+import org.apache.hadoop.io.Writable;
+
+/*
+ * Management (load) interface for a map join hash table (which could be a hash map,
+ * hash multi-set, or hash set).
+ */
+public interface MapJoinHashTableManage {
+
+  /*
+   * The interface for an object that helps adding a new key/value to the hash table.
+   *
+   * Hash code calculation is pulled up to support the HybridHashTableContainer, which needs
+   * the hash code earlier so it can partition the hash tables, and also for bloom filters.
+   *
+   * To support single long key min/max, we have a method that can extract the long value out.
+   */
+  public static interface KeyValuePut {
+
+    void setKeyValue(Writable key, Writable value) throws SerDeException, IOException;
+
+    boolean hasHashCode();
+
+    int getKeyHashCode() throws SerDeException;
+
+    long getLongKey();
+  }
+
+  /**
+   * The source of keys and values to put into the hash table; avoids byte copying.
+   * Supports BytesBytesMultiHashMap.
+   */
+  public static interface KeyValuePutWriter extends KeyValuePut {
+    /** Write key into output. */
+    void writeKey(RandomAccessOutput dest) throws SerDeException;
+
+    /** Write value into output. */
+    void writeValue(RandomAccessOutput dest) throws SerDeException;
+
+    /**
+     * Provide the updated state byte value for a key.
+     * @param previousValue Previous value; null if this is the first call per key.
+     * @return The updated value.
+ */ + byte updateStateByte(Byte previousValue); + } + + public void put(KeyValuePut keyValuePutHelper) throws SerDeException; + + + /** + * Number of keys in the hashmap + * @return number of keys + */ + int size(); + + /** + * Number of values in the hashmap + * This is equal to or bigger than number of keys, since some values may share the same key + * @return number of values + */ + int getNumValues(); + + /** + * Number of bytes used by the hashmap + * There are two main components that take most memory: writeBuffers and refs + * Others include instance fields: 100 + * @return number of bytes + */ + long memorySize(); + + void seal(); + + void clear(); + + void debugDumpMetrics(); + + void debugDumpTable(); + + void expandAndRehashToTarget(int estimateNewRowCount); +} diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/mapjoinhashtable/MapJoinHashTableResult.java ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/mapjoinhashtable/MapJoinHashTableResult.java new file mode 100644 index 0000000..591b0da --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/mapjoinhashtable/MapJoinHashTableResult.java @@ -0,0 +1,60 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable; + +/* + * Root interface for a hash table result. + */ +public interface MapJoinHashTableResult { + + /** + * Represents the hash map lookup result between two tables + */ + public enum MapJoinResult { + NONE, + MATCH, + NO_MATCH, + SPILL + } + + /** + * Forget about the most recent hash table lookup or contains call. + */ + void forget(); + + /** + * Set the map join result. + */ + void setNoMatch(); + + /** + * @return The map join result. + */ + MapJoinResult getMapJoinResult(); + + /** + * Set the spill partition id. + */ + void setSpill(int spillPartitionId); + + /** + * @return The Hybrid Grace spill partition id. + */ + int getSpillPartitionId(); +} diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/mapjoinhashtable/MapJoinHashTableResultImpl.java ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/mapjoinhashtable/MapJoinHashTableResultImpl.java new file mode 100644 index 0000000..0c43655 --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/mapjoinhashtable/MapJoinHashTableResultImpl.java @@ -0,0 +1,87 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable; + +/* + * Abstract implementation class for a hash table result. + */ +public abstract class MapJoinHashTableResultImpl implements MapJoinHashTableResult { + + protected MapJoinResult mapJoinResult; + + private int spillPartitionId; + + public MapJoinHashTableResultImpl() { + mapJoinResult = MapJoinResult.NONE; + spillPartitionId = -1; + } + + /** + * Forget about the most recent hash table lookup or contains call. + */ + @Override + public void forget() { + mapJoinResult = MapJoinResult.NONE; + spillPartitionId = -1; + } + + /** + * Set the map join result. + */ + @Override + public void setNoMatch() { + this.mapJoinResult = MapJoinResult.NO_MATCH; + } + + /** + * @return The map join result. + */ + @Override + public MapJoinResult getMapJoinResult() { + return mapJoinResult; + } + + /** + * Set the spill partition id. + */ + @Override + public void setSpill(int spillPartitionId) { + this.mapJoinResult = MapJoinResult.SPILL; + this.spillPartitionId = spillPartitionId; + } + + /** + * @return The Hybrid Grace spill partition id. + */ + @Override + public int getSpillPartitionId() { + return spillPartitionId; + } + + @Override + public String toString() { + StringBuilder sb = new StringBuilder(); + sb.append(mapJoinResult.name()); + if (mapJoinResult == MapJoinResult.SPILL) { + sb.append(", spillPartitionId "); + sb.append(spillPartitionId); + } + return sb.toString(); + } +} diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/spark/HashTableLoader.java ql/src/java/org/apache/hadoop/hive/ql/exec/spark/HashTableLoader.java index 1634f42..7d3a08c 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/spark/HashTableLoader.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/spark/HashTableLoader.java @@ -64,8 +64,6 @@ private MapJoinOperator joinOp; private MapJoinDesc desc; - private boolean useFastContainer = false; - @Override public void init(ExecMapperContext context, MapredContext mrContext, Configuration hconf, MapJoinOperator joinOp) { @@ -73,12 +71,6 @@ public void init(ExecMapperContext context, MapredContext mrContext, Configurati this.hconf = hconf; this.joinOp = joinOp; this.desc = joinOp.getConf(); - if (desc.getVectorMode() && HiveConf.getBoolVar( - hconf, HiveConf.ConfVars.HIVE_VECTORIZATION_MAPJOIN_NATIVE_FAST_HASHTABLE_ENABLED)) { - VectorMapJoinDesc vectorDesc = desc.getVectorDesc(); - useFastContainer = vectorDesc != null && vectorDesc.hashTableImplementationType() == - VectorMapJoinDesc.HashTableImplementationType.FAST; - } } @Override @@ -108,7 +100,7 @@ public void load(MapJoinTableContainer[] mapJoinTables, FileSystem fs = FileSystem.get(baseDir.toUri(), hconf); BucketMapJoinContext mapJoinCtx = localWork.getBucketMapjoinContext(); boolean firstContainer = true; - boolean useOptimizedContainer = !useFastContainer && HiveConf.getBoolVar( + boolean useOptimizedContainer = HiveConf.getBoolVar( hconf, HiveConf.ConfVars.HIVEMAPJOINUSEOPTIMIZEDTABLE); for (int pos = 0; pos < mapJoinTables.length; pos++) { if (pos == desc.getPosBigTable() || mapJoinTables[pos] != null) { @@ -156,17 
+148,14 @@ private MapJoinTableContainer load(FileSystem fs, Path path, MapJoinTableContainerSerDe mapJoinTableSerde) throws HiveException { LOG.info("\tLoad back all hashtable files from tmp folder uri:" + path); if (!SparkUtilities.isDedicatedCluster(hconf)) { - return useFastContainer ? mapJoinTableSerde.loadFastContainer(desc, fs, path, hconf) : - mapJoinTableSerde.load(fs, path, hconf); + return mapJoinTableSerde.load(fs, path, hconf); } MapJoinTableContainer mapJoinTable = SmallTableCache.get(path); if (mapJoinTable == null) { synchronized (path.toString().intern()) { mapJoinTable = SmallTableCache.get(path); if (mapJoinTable == null) { - mapJoinTable = useFastContainer ? - mapJoinTableSerde.loadFastContainer(desc, fs, path, hconf) : - mapJoinTableSerde.load(fs, path, hconf); + mapJoinTable = mapJoinTableSerde.load(fs, path, hconf); SmallTableCache.cache(path, mapJoinTable); } } diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/tez/HashTableLoader.java ql/src/java/org/apache/hadoop/hive/ql/exec/tez/HashTableLoader.java index 7b13e90..4ab94e5 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/tez/HashTableLoader.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/tez/HashTableLoader.java @@ -31,6 +31,7 @@ import org.apache.hadoop.hive.ql.exec.MapJoinOperator; import org.apache.hadoop.hive.ql.exec.MapredContext; import org.apache.hadoop.hive.ql.exec.mr.ExecMapperContext; +import org.apache.hadoop.hive.ql.exec.persistence.BytesBytesMultiHashMapFactory; import org.apache.hadoop.hive.ql.exec.persistence.HashMapWrapper; import org.apache.hadoop.hive.ql.exec.persistence.HybridHashTableConf; import org.apache.hadoop.hive.ql.exec.persistence.HybridHashTableContainer; @@ -38,9 +39,13 @@ import org.apache.hadoop.hive.ql.exec.persistence.MapJoinObjectSerDeContext; import org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainer; import org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainerSerDe; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableFactory; +import org.apache.hadoop.hive.ql.exec.vector.mapjoin.fast.VectorMapJoinFastHashTableFactory; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.plan.ExprNodeDesc; import org.apache.hadoop.hive.ql.plan.MapJoinDesc; +import org.apache.hadoop.hive.ql.plan.VectorMapJoinDesc; +import org.apache.hadoop.hive.ql.plan.VectorMapJoinDesc.HashTableImplementationType; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category; import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector; @@ -63,12 +68,47 @@ private MapJoinDesc desc; private TezContext tezContext; + private VectorMapJoinDesc vectorDesc; + private MapJoinHashTableFactory mapJoinHashTableFactory; + + public static boolean weWillUseHybridHashTableContainer(Configuration hconf, MapJoinDesc desc) { + boolean isCrossProduct = false; + List joinExprs = desc.getKeys().values().iterator().next(); + if (joinExprs.size() == 0) { + isCrossProduct = true; + } + boolean useOptimizedTables = HiveConf.getBoolVar( + hconf, HiveConf.ConfVars.HIVEMAPJOINUSEOPTIMIZEDTABLE); + boolean useHybridGraceHashJoin = desc.isHybridHashJoin(); + + return useOptimizedTables && useHybridGraceHashJoin && !isCrossProduct; + } + @Override public void init(ExecMapperContext context, MapredContext mrContext, Configuration hconf, MapJoinOperator joinOp) { this.tezContext = (TezContext) mrContext; this.hconf = hconf; this.desc = 
joinOp.getConf(); + this.vectorDesc = this.desc.getVectorDesc(); + HashTableImplementationType hashTableImplementationType; + if (this.vectorDesc == null) { + hashTableImplementationType = HashTableImplementationType.NONE; + } else { + hashTableImplementationType = vectorDesc.hashTableImplementationType(); + } + switch (hashTableImplementationType) { + case NONE: + // Non-native vector map join uses the regular BytesBytesMultiHashMap. + mapJoinHashTableFactory = new BytesBytesMultiHashMapFactory(); + break; + case FAST: + mapJoinHashTableFactory = new VectorMapJoinFastHashTableFactory(this.desc); + break; + default: + throw new RuntimeException("Unknown vector map join hash table implementation type " + + hashTableImplementationType.name()); + } } @Override @@ -168,6 +208,7 @@ public void load(MapJoinTableContainer[] mapJoinTables, MapJoinObjectSerDeContext keyCtx = mapJoinTableSerdes[pos].getKeyContext(), valCtx = mapJoinTableSerdes[pos].getValueContext(); if (useOptimizedTables) { + // Some keys are not supported by regular hive key handling. ObjectInspector keyOi = keyCtx.getSerDe().getObjectInspector(); if (!MapJoinBytesTableContainer.isSupportedKey(keyOi)) { if (isFirstKey) { @@ -196,10 +237,11 @@ public void load(MapJoinTableContainer[] mapJoinTables, MapJoinTableContainer tableContainer; if (useOptimizedTables) { if (!useHybridGraceHashJoin || isCrossProduct) { - tableContainer = new MapJoinBytesTableContainer(hconf, valCtx, keyCount, 0); + tableContainer = new MapJoinBytesTableContainer(hconf, valCtx, keyCount, 0, + mapJoinHashTableFactory); } else { tableContainer = new HybridHashTableContainer(hconf, keyCount, memory, - desc.getParentDataSizes().get(pos), nwayConf); + desc.getParentDataSizes().get(pos), nwayConf, mapJoinHashTableFactory); } } else { tableContainer = new HashMapWrapper(hconf, keyCount); diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorBatchDebug.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorBatchDebug.java new file mode 100644 index 0000000..155c9b8 --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorBatchDebug.java @@ -0,0 +1,105 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.hive.ql.exec.vector; + +import java.sql.Timestamp; + +import org.apache.hadoop.hive.common.type.HiveIntervalDayTime; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class VectorBatchDebug { + private static final Logger LOG = LoggerFactory.getLogger(VectorBatchDebug.class); + + public static String displayBytes(byte[] bytes, int start, int length) { + StringBuilder sb = new StringBuilder(); + for (int i = start; i < start + length; i++) { + char ch = (char) bytes[i]; + if (ch < ' ' || ch > '~') { + sb.append(String.format("\\%03d", bytes[i] & 0xff)); + } else { + sb.append(ch); + } + } + return sb.toString(); + } + + public static void debugDisplayOneRow(VectorizedRowBatch batch, int index, String prefix) { + StringBuilder sb = new StringBuilder(); + sb.append(prefix + " row " + index + " "); + for (int p = 0; p < batch.projectionSize; p++) { + int column = batch.projectedColumns[p]; + if (p == column) { + sb.append("(col " + p + ") "); + } else { + sb.append("(proj col " + p + " col " + column + ") "); + } + ColumnVector colVector = batch.cols[column]; + if (colVector == null) { + sb.append("(null ColumnVector)"); + } else { + boolean isRepeating = colVector.isRepeating; + if (isRepeating) { + sb.append("(repeating)"); + } + index = (isRepeating ? 0 : index); + if (colVector.noNulls || !colVector.isNull[index]) { + if (colVector instanceof LongColumnVector) { + sb.append(((LongColumnVector) colVector).vector[index]); + } else if (colVector instanceof DoubleColumnVector) { + sb.append(((DoubleColumnVector) colVector).vector[index]); + } else if (colVector instanceof BytesColumnVector) { + BytesColumnVector bytesColumnVector = (BytesColumnVector) colVector; + byte[] bytes = bytesColumnVector.vector[index]; + int start = bytesColumnVector.start[index]; + int length = bytesColumnVector.length[index]; + if (bytes == null) { + sb.append("(Unexpected null bytes with start " + start + " length " + length + ")"); + } else { + sb.append("bytes: '" + displayBytes(bytes, start, length) + "'"); + } + } else if (colVector instanceof DecimalColumnVector) { + sb.append(((DecimalColumnVector) colVector).vector[index].toString()); + } else if (colVector instanceof TimestampColumnVector) { + Timestamp timestamp = new Timestamp(0); + ((TimestampColumnVector) colVector).timestampUpdate(timestamp, index); + sb.append(timestamp.toString()); + } else if (colVector instanceof IntervalDayTimeColumnVector) { + HiveIntervalDayTime intervalDayTime = ((IntervalDayTimeColumnVector) colVector).asScratchIntervalDayTime(index); + sb.append(intervalDayTime.toString()); + } else { + sb.append("Unknown"); + } + } else { + sb.append("NULL"); + } + } + sb.append(" "); + } + System.err.println(sb.toString()); + // LOG.info(sb.toString()); + } + + public static void debugDisplayBatch(VectorizedRowBatch batch, String prefix) { + for (int i = 0; i < batch.size; i++) { + int index = (batch.selectedInUse ? 
batch.selected[i] : i); + debugDisplayOneRow(batch, index, prefix); + } + } +} \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorColumnMapping.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorColumnMapping.java index c4b95c3..c890674 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorColumnMapping.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorColumnMapping.java @@ -20,6 +20,8 @@ import java.util.Arrays; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; + /** * This class collects column information for copying a row from one VectorizedRowBatch to * same/another batch. @@ -30,7 +32,7 @@ protected int[] sourceColumns; protected int[] outputColumns; - protected String[] typeNames; + protected TypeInfo[] typeInfos; protected VectorColumnOrderedMap vectorColumnMapping; @@ -38,7 +40,7 @@ public VectorColumnMapping(String name) { this.vectorColumnMapping = new VectorColumnOrderedMap(name); } - public abstract void add(int sourceColumn, int outputColumn, String typeName); + public abstract void add(int sourceColumn, int outputColumn, TypeInfo typeInfo); public abstract void finalize(); @@ -54,8 +56,8 @@ public int getCount() { return outputColumns; } - public String[] getTypeNames() { - return typeNames; + public TypeInfo[] getTypeInfos() { + return typeInfos; } @Override @@ -65,7 +67,7 @@ public String toString() { sb.append(", "); sb.append("output columns: " + Arrays.toString(outputColumns)); sb.append(", "); - sb.append("type names: " + Arrays.toString(typeNames)); + sb.append("type infos: " + Arrays.toString(typeInfos)); return sb.toString(); } } diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorColumnOrderedMap.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorColumnOrderedMap.java index 0e6014b..97d55f5 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorColumnOrderedMap.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorColumnOrderedMap.java @@ -23,8 +23,10 @@ import java.util.TreeMap; import org.apache.commons.lang.ArrayUtils; + import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; /** * This class collects column information for mapping vector columns, including the hive type name. 
@@ -43,17 +45,17 @@ private class Value { int valueColumn; - String typeName; + TypeInfo typeInfo; - Value(int valueColumn, String typeName) { + Value(int valueColumn, TypeInfo typeInfo) { this.valueColumn = valueColumn; - this.typeName = typeName; + this.typeInfo = typeInfo; } public String toString() { StringBuilder sb = new StringBuilder(); sb.append("(value column: " + valueColumn); - sb.append(", type name: " + typeName + ")"); + sb.append(", type info: " + typeInfo.toString() + ")"); return sb.toString(); } } @@ -62,12 +64,12 @@ public String toString() { private final int[] orderedColumns; private final int[] valueColumns; - private final String[] typeNames; + private final TypeInfo[] typeInfos; - Mapping(int[] orderedColumns, int[] valueColumns, String[] typeNames) { + Mapping(int[] orderedColumns, int[] valueColumns, TypeInfo[] typeInfos) { this.orderedColumns = orderedColumns; this.valueColumns = valueColumns; - this.typeNames = typeNames; + this.typeInfos = typeInfos; } public int getCount() { @@ -82,8 +84,8 @@ public int getCount() { return valueColumns; } - public String[] getTypeNames() { - return typeNames; + public TypeInfo[] getTypeInfos() { + return typeInfos; } } @@ -92,14 +94,14 @@ public VectorColumnOrderedMap(String name) { orderedTreeMap = new TreeMap(); } - public void add(int orderedColumn, int valueColumn, String typeName) { + public void add(int orderedColumn, int valueColumn, TypeInfo typeInfo) { if (orderedTreeMap.containsKey(orderedColumn)) { throw new RuntimeException( name + " duplicate column " + orderedColumn + " in ordered column map " + orderedTreeMap.toString() + - " when adding value column " + valueColumn + ", type " + typeName); + " when adding value column " + valueColumn + ", type info " + typeInfo.toString()); } - orderedTreeMap.put(orderedColumn, new Value(valueColumn, typeName)); + orderedTreeMap.put(orderedColumn, new Value(valueColumn, typeInfo)); } public boolean orderedColumnsContain(int orderedColumn) { @@ -109,17 +111,16 @@ public boolean orderedColumnsContain(int orderedColumn) { public Mapping getMapping() { ArrayList orderedColumns = new ArrayList(); ArrayList valueColumns = new ArrayList(); - ArrayList typeNames = new ArrayList(); + ArrayList typeInfos = new ArrayList(); for (Map.Entry entry : orderedTreeMap.entrySet()) { orderedColumns.add(entry.getKey()); Value value = entry.getValue(); valueColumns.add(value.valueColumn); - typeNames.add(value.typeName); + typeInfos.add(value.typeInfo); } return new Mapping( ArrayUtils.toPrimitive(orderedColumns.toArray(new Integer[0])), ArrayUtils.toPrimitive(valueColumns.toArray(new Integer[0])), - typeNames.toArray(new String[0])); - + typeInfos.toArray(new TypeInfo[0])); } } diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorColumnOutputMapping.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorColumnOutputMapping.java index f35aff7..4ceff6b 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorColumnOutputMapping.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorColumnOutputMapping.java @@ -19,6 +19,7 @@ package org.apache.hadoop.hive.ql.exec.vector; import org.apache.hadoop.hive.ql.exec.vector.VectorColumnOrderedMap.Mapping; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; /** * This class collects column information for copying a row from one VectorizedRowBatch to @@ -35,9 +36,9 @@ public VectorColumnOutputMapping(String name) { } @Override - public void
add(int sourceColumn, int outputColumn, TypeInfo typeInfo) { // Order on outputColumn. - vectorColumnMapping.add(outputColumn, sourceColumn, typeName); + vectorColumnMapping.add(outputColumn, sourceColumn, typeInfo); } public boolean containsOutputColumn(int outputColumn) { @@ -51,7 +52,7 @@ public void finalize() { // Ordered columns are the output columns. sourceColumns = mapping.getValueColumns(); outputColumns = mapping.getOrderedColumns(); - typeNames = mapping.getTypeNames(); + typeInfos = mapping.getTypeInfos(); // Not needed anymore. vectorColumnMapping = null; diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorColumnSourceMapping.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorColumnSourceMapping.java index 4f5ba9a..061e396 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorColumnSourceMapping.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorColumnSourceMapping.java @@ -19,6 +19,7 @@ package org.apache.hadoop.hive.ql.exec.vector; import org.apache.hadoop.hive.ql.exec.vector.VectorColumnOrderedMap.Mapping; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; /** * This class collects column information for copying a row from one VectorizedRowBatch to @@ -35,9 +36,9 @@ public VectorColumnSourceMapping(String name) { } @Override - public void add(int sourceColumn, int outputColumn, String typeName) { + public void add(int sourceColumn, int outputColumn, TypeInfo typeInfo) { // Order on sourceColumn. - vectorColumnMapping.add(sourceColumn, outputColumn, typeName); + vectorColumnMapping.add(sourceColumn, outputColumn, typeInfo); } @Override @@ -47,7 +48,7 @@ public void finalize() { // Ordered columns are the source columns. sourceColumns = mapping.getOrderedColumns(); outputColumns = mapping.getValueColumns(); - typeNames = mapping.getTypeNames(); + typeInfos = mapping.getTypeInfos(); // Not needed anymore. 
vectorColumnMapping = null; diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorCopyRow.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorCopyRow.java index c8e0284..911aeb0 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorCopyRow.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorCopyRow.java @@ -262,8 +262,7 @@ public void init(VectorColumnMapping columnMapping) throws HiveException { for (int i = 0; i < count; i++) { int inputColumn = columnMapping.getInputColumns()[i]; int outputColumn = columnMapping.getOutputColumns()[i]; - String typeName = columnMapping.getTypeNames()[i].toLowerCase(); - TypeInfo typeInfo = TypeInfoUtils.getTypeInfoFromTypeString(typeName); + TypeInfo typeInfo = columnMapping.getTypeInfos()[i]; Type columnVectorType = VectorizationContext.getColumnVectorTypeFromTypeInfo(typeInfo); CopyRow copyRowByValue = null; diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorDeserializeRow.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorDeserializeRow.java index f66916b..a3a5059 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorDeserializeRow.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorDeserializeRow.java @@ -18,7 +18,6 @@ package org.apache.hadoop.hive.ql.exec.vector; -import java.io.EOFException; import java.io.IOException; import java.util.List; diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorExtractRow.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorExtractRow.java index e6dc9ec..74b3230 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorExtractRow.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorExtractRow.java @@ -167,6 +167,20 @@ public void init(List typeNames) throws HiveException { } } + /* + * Initialize using data type infos. + * No projection -- the column range 0 .. 
types.size()-1 + */ + public void init(TypeInfo[] typeInfos) throws HiveException { + + final int count = typeInfos.length; + allocateArrays(count); + + for (int i = 0; i < count; i++) { + initEntry(i, i, typeInfos[i]); + } + } + public int getCount() { return projectionColumnNums.length; } diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorGroupByOperator.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorGroupByOperator.java index 2605203..4112b1b 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorGroupByOperator.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorGroupByOperator.java @@ -350,7 +350,7 @@ public void processBatch(VectorizedRowBatch batch) throws HiveException { //Validate that some progress is being made if (!(numEntriesHashTable < preFlushEntriesCount)) { - if (LOG.isDebugEnabled()) { + if (isLogDebugEnabled) { LOG.debug(String.format("Flush did not progress: %d entries before, %d entries after", preFlushEntriesCount, numEntriesHashTable)); @@ -429,7 +429,7 @@ private void computeMemoryLimits() { maxHashTblMemory = (int)(maxMemory * memoryThreshold); - if (LOG.isDebugEnabled()) { + if (isLogDebugEnabled) { LOG.debug(String.format("maxMemory:%dMb (%d * %f) fixSize:%d (key:%d agg:%d)", maxHashTblMemory/1024/1024, maxMemory/1024/1024, @@ -452,7 +452,7 @@ private void flush(boolean all) throws HiveException { (int)(numEntriesHashTable * this.percentEntriesToFlush); int entriesFlushed = 0; - if (LOG.isDebugEnabled()) { + if (isLogDebugEnabled) { LOG.debug(String.format( "Flush %d %s entries:%d fixed:%d variable:%d (used:%dMb max:%dMb) gcCanary:%s", entriesToFlush, all ? "(all)" : "", @@ -485,7 +485,7 @@ private void flush(boolean all) throws HiveException { numEntriesHashTable = 0; } - if (all && LOG.isDebugEnabled()) { + if (all && isLogDebugEnabled) { LOG.debug(String.format("GC canary caused %d flushes", gcCanaryFlushes)); } } @@ -537,7 +537,7 @@ private void updateAvgVariableSize(VectorizedRowBatch batch) { private void checkHashModeEfficiency() throws HiveException { if (lastModeCheckRowCount > numRowsCompareHashAggr) { lastModeCheckRowCount = 0; - if (LOG.isDebugEnabled()) { + if (isLogDebugEnabled) { LOG.debug(String.format("checkHashModeEfficiency: HT:%d RC:%d MIN:%d", numEntriesHashTable, sumBatchSize, (long)(sumBatchSize * minReductionHashAggr))); } diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorMapJoinOperator.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorMapJoinOperator.java index 0cb6c8a..1babe74 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorMapJoinOperator.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorMapJoinOperator.java @@ -18,6 +18,7 @@ package org.apache.hadoop.hive.ql.exec.vector; +import java.io.IOException; import java.util.ArrayList; import java.util.List; import java.util.Map; @@ -28,6 +29,7 @@ import org.apache.hadoop.hive.ql.exec.JoinUtil; import org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainer; import org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainer.ReusableGetAdaptor; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableResult.MapJoinResult; import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpression; import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpressionWriter; import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpressionWriterFactory; @@ -187,8 +189,8 @@ protected Object _evaluate(Object row, int version) throws 
HiveException { } @Override - protected JoinUtil.JoinResult setMapJoinKey(ReusableGetAdaptor dest, Object row, byte alias) - throws HiveException { + protected MapJoinResult setMapJoinKey(ReusableGetAdaptor dest, Object row, byte alias) + throws HiveException, IOException { return dest.setFromVector(keyValues[batchIndex], keyOutputWriters, keyWrapperBatch); } diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorSerializeRow.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorSerializeRow.java index 6af3d99..febbeb3 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorSerializeRow.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorSerializeRow.java @@ -33,6 +33,8 @@ import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils; +import com.google.common.base.Preconditions; + /** * This class serializes columns from a row in a VectorizedRowBatch into a serialization format. * @@ -58,569 +60,153 @@ public VectorSerializeRow(T serializeWrite) { private VectorSerializeRow() { } - private abstract class Writer { - protected int columnIndex; - - Writer(int columnIndex) { - this.columnIndex = columnIndex; - } - - abstract boolean apply(VectorizedRowBatch batch, int batchIndex) throws IOException; - } - - private abstract class AbstractLongWriter extends Writer { - - AbstractLongWriter(int columnIndex) { - super(columnIndex); - } - } - - private class BooleanWriter extends AbstractLongWriter { - - BooleanWriter(int columnIndex) { - super(columnIndex); - } - - @Override - boolean apply(VectorizedRowBatch batch, int batchIndex) throws IOException { - LongColumnVector colVector = (LongColumnVector) batch.cols[columnIndex]; - - if (colVector.isRepeating) { - if (colVector.noNulls || !colVector.isNull[0]) { - serializeWrite.writeBoolean(colVector.vector[0] != 0); - return true; - } else { - serializeWrite.writeNull(); - return false; - } - } else { - if (colVector.noNulls || !colVector.isNull[batchIndex]) { - serializeWrite.writeBoolean(colVector.vector[batchIndex] != 0); - return true; - } else { - serializeWrite.writeNull(); - return false; - } - } - } - } - - private class ByteWriter extends AbstractLongWriter { - - ByteWriter(int columnIndex) { - super(columnIndex); - } - - @Override - boolean apply(VectorizedRowBatch batch, int batchIndex) throws IOException { - LongColumnVector colVector = (LongColumnVector) batch.cols[columnIndex]; - - if (colVector.isRepeating) { - if (colVector.noNulls || !colVector.isNull[0]) { - serializeWrite.writeByte((byte) colVector.vector[0]); - return true; - } else { - serializeWrite.writeNull(); - return false; - } - } else { - if (colVector.noNulls || !colVector.isNull[batchIndex]) { - serializeWrite.writeByte((byte) colVector.vector[batchIndex]); - return true; - } else { - serializeWrite.writeNull(); - return false; - } - } - } - } - - private class ShortWriter extends AbstractLongWriter { - - ShortWriter(int columnIndex) { - super(columnIndex); - } - - @Override - boolean apply(VectorizedRowBatch batch, int batchIndex) throws IOException { - LongColumnVector colVector = (LongColumnVector) batch.cols[columnIndex]; - - if (colVector.isRepeating) { - if (colVector.noNulls || !colVector.isNull[0]) { - serializeWrite.writeShort((short) colVector.vector[0]); - return true; - } else { - serializeWrite.writeNull(); - return false; - } - } else { - if (colVector.noNulls || !colVector.isNull[batchIndex]) { - serializeWrite.writeShort((short) 
colVector.vector[batchIndex]); - return true; - } else { - serializeWrite.writeNull(); - return false; - } - } - } - } - - private class IntWriter extends AbstractLongWriter { - - IntWriter(int columnIndex) { - super(columnIndex); - } - - @Override - boolean apply(VectorizedRowBatch batch, int batchIndex) throws IOException { - LongColumnVector colVector = (LongColumnVector) batch.cols[columnIndex]; - - if (colVector.isRepeating) { - if (colVector.noNulls || !colVector.isNull[0]) { - serializeWrite.writeInt((int) colVector.vector[0]); - return true; - } else { - serializeWrite.writeNull(); - return false; - } - } else { - if (colVector.noNulls || !colVector.isNull[batchIndex]) { - serializeWrite.writeInt((int) colVector.vector[batchIndex]); - return true; - } else { - serializeWrite.writeNull(); - return false; - } - } - } - } - - private class LongWriter extends AbstractLongWriter { - - LongWriter(int columnIndex) { - super(columnIndex); - } - - @Override - boolean apply(VectorizedRowBatch batch, int batchIndex) throws IOException { - LongColumnVector colVector = (LongColumnVector) batch.cols[columnIndex]; - - if (colVector.isRepeating) { - if (colVector.noNulls || !colVector.isNull[0]) { - serializeWrite.writeLong(colVector.vector[0]); - return true; - } else { - serializeWrite.writeNull(); - return false; - } - } else { - if (colVector.noNulls || !colVector.isNull[batchIndex]) { - serializeWrite.writeLong(colVector.vector[batchIndex]); - return true; - } else { - serializeWrite.writeNull(); - return false; - } - } - } - } - - private class DateWriter extends AbstractLongWriter { - - DateWriter(int columnIndex) { - super(columnIndex); - } - - @Override - boolean apply(VectorizedRowBatch batch, int batchIndex) throws IOException { - LongColumnVector colVector = (LongColumnVector) batch.cols[columnIndex]; - - if (colVector.isRepeating) { - if (colVector.noNulls || !colVector.isNull[0]) { - serializeWrite.writeDate((int) colVector.vector[0]); - return true; - } else { - serializeWrite.writeNull(); - return false; - } - } else { - if (colVector.noNulls || !colVector.isNull[batchIndex]) { - serializeWrite.writeDate((int) colVector.vector[batchIndex]); - return true; - } else { - serializeWrite.writeNull(); - return false; - } - } - } - } - - private class TimestampWriter extends Writer { - - Timestamp scratchTimestamp; - - TimestampWriter(int columnIndex) { - super(columnIndex); - scratchTimestamp = new Timestamp(0); - } - - @Override - boolean apply(VectorizedRowBatch batch, int batchIndex) throws IOException { - TimestampColumnVector colVector = (TimestampColumnVector) batch.cols[columnIndex]; - - if (colVector.isRepeating) { - if (colVector.noNulls || !colVector.isNull[0]) { - colVector.timestampUpdate(scratchTimestamp, 0); - serializeWrite.writeTimestamp(scratchTimestamp); - return true; - } else { - serializeWrite.writeNull(); - return false; - } - } else { - if (colVector.noNulls || !colVector.isNull[batchIndex]) { - colVector.timestampUpdate(scratchTimestamp, batchIndex); - serializeWrite.writeTimestamp(scratchTimestamp); - return true; - } else { - serializeWrite.writeNull(); - return false; - } - } - } - } - - private class IntervalYearMonthWriter extends AbstractLongWriter { + Category[] sourceCategories; + // The data type category of each column being serialized. - IntervalYearMonthWriter(int columnIndex) { - super(columnIndex); - } + PrimitiveCategory[] sourcePrimitiveCategories; + //The data type primitive category of each column being serialized. 
- @Override - boolean apply(VectorizedRowBatch batch, int batchIndex) throws IOException { - LongColumnVector colVector = (LongColumnVector) batch.cols[columnIndex]; + int[] columnNums; + // The column numbers to serialize. - if (colVector.isRepeating) { - if (colVector.noNulls || !colVector.isNull[0]) { - serializeWrite.writeHiveIntervalYearMonth((int) colVector.vector[0]); - return true; - } else { - serializeWrite.writeNull(); - return false; - } + private boolean serialize(VectorizedRowBatch batch, final int index, int batchIndex) throws IOException { + final int columnIndex = columnNums[index]; + ColumnVector colVector = (ColumnVector) batch.cols[columnIndex]; + if (colVector.isRepeating) { + if (colVector.noNulls || !colVector.isNull[0]) { + batchIndex = 0; + // Fall below... } else { - if (colVector.noNulls || !colVector.isNull[batchIndex]) { - serializeWrite.writeHiveIntervalYearMonth((int) colVector.vector[batchIndex]); - return true; - } else { - serializeWrite.writeNull(); - return false; - } + serializeWrite.writeNull(); + return false; } - } - } - - private class IntervalDayTimeWriter extends Writer { - - private HiveIntervalDayTime hiveIntervalDayTime; - - IntervalDayTimeWriter(int columnIndex) { - super(columnIndex); - hiveIntervalDayTime = new HiveIntervalDayTime(); - } - - @Override - boolean apply(VectorizedRowBatch batch, int batchIndex) throws IOException { - IntervalDayTimeColumnVector colVector = (IntervalDayTimeColumnVector) batch.cols[columnIndex]; - - if (colVector.isRepeating) { - if (colVector.noNulls || !colVector.isNull[0]) { - hiveIntervalDayTime.set(colVector.asScratchIntervalDayTime(0)); - serializeWrite.writeHiveIntervalDayTime(hiveIntervalDayTime); - return true; - } else { - serializeWrite.writeNull(); - return false; - } + } else { + if (colVector.noNulls || !colVector.isNull[batchIndex]) { + // Fall below... 
} else { - if (colVector.noNulls || !colVector.isNull[batchIndex]) { - hiveIntervalDayTime.set(colVector.asScratchIntervalDayTime(batchIndex)); - serializeWrite.writeHiveIntervalDayTime(hiveIntervalDayTime); - return true; - } else { - serializeWrite.writeNull(); - return false; - } + serializeWrite.writeNull(); + return false; } } - } - - private abstract class AbstractDoubleWriter extends Writer { - - AbstractDoubleWriter(int columnIndex) { - super(columnIndex); - } - } - - private class FloatWriter extends AbstractDoubleWriter { - - FloatWriter(int columnIndex) { - super(columnIndex); - } - - @Override - boolean apply(VectorizedRowBatch batch, int batchIndex) throws IOException { - DoubleColumnVector colVector = (DoubleColumnVector) batch.cols[columnIndex]; - - if (colVector.isRepeating) { - if (colVector.noNulls || !colVector.isNull[0]) { - serializeWrite.writeFloat((float) colVector.vector[0]); - return true; - } else { - serializeWrite.writeNull(); - return false; - } - } else { - if (colVector.noNulls || !colVector.isNull[batchIndex]) { - serializeWrite.writeFloat((float) colVector.vector[batchIndex]); - return true; - } else { - serializeWrite.writeNull(); - return false; - } - } - } - } - - private class DoubleWriter extends AbstractDoubleWriter { - - DoubleWriter(int columnIndex) { - super(columnIndex); - } - - @Override - boolean apply(VectorizedRowBatch batch, int batchIndex) throws IOException { - DoubleColumnVector colVector = (DoubleColumnVector) batch.cols[columnIndex]; - - if (colVector.isRepeating) { - if (colVector.noNulls || !colVector.isNull[0]) { - serializeWrite.writeDouble(colVector.vector[0]); - return true; - } else { - serializeWrite.writeNull(); - return false; - } - } else { - if (colVector.noNulls || !colVector.isNull[batchIndex]) { - serializeWrite.writeDouble(colVector.vector[batchIndex]); - return true; - } else { - serializeWrite.writeNull(); - return false; - } - } - } - } - - private class StringWriter extends Writer { - - StringWriter(int columnIndex) { - super(columnIndex); - } - - @Override - boolean apply(VectorizedRowBatch batch, int batchIndex) throws IOException { - BytesColumnVector colVector = (BytesColumnVector) batch.cols[columnIndex]; - - if (colVector.isRepeating) { - if (colVector.noNulls || !colVector.isNull[0]) { - serializeWrite.writeString(colVector.vector[0], colVector.start[0], colVector.length[0]); - return true; - } else { - serializeWrite.writeNull(); - return false; - } - } else { - if (colVector.noNulls || !colVector.isNull[batchIndex]) { - serializeWrite.writeString(colVector.vector[batchIndex], - colVector.start[batchIndex], colVector.length[batchIndex]); - return true; - } else { - serializeWrite.writeNull(); - return false; + switch (sourceCategories[index]) { + case PRIMITIVE: + switch (sourcePrimitiveCategories[index]) { + case BOOLEAN: + serializeWrite.writeBoolean(((LongColumnVector) colVector).vector[batchIndex] != 0); + break; + case BYTE: + serializeWrite.writeByte((byte) ((LongColumnVector) colVector).vector[batchIndex]); + break; + case SHORT: + serializeWrite.writeShort((short) ((LongColumnVector) colVector).vector[batchIndex]); + break; + case INT: + serializeWrite.writeInt((int) ((LongColumnVector) colVector).vector[batchIndex]); + break; + case LONG: + serializeWrite.writeLong(((LongColumnVector) colVector).vector[batchIndex]); + break; + case FLOAT: + serializeWrite.writeFloat((float) ((DoubleColumnVector) colVector).vector[batchIndex]); + break; + case DOUBLE: + serializeWrite.writeDouble(((DoubleColumnVector) 
colVector).vector[batchIndex]); + break; + case STRING: + case VARCHAR: + case CHAR: + { + BytesColumnVector bytesColVector = ((BytesColumnVector) colVector); + serializeWrite.writeString(bytesColVector.vector[batchIndex], bytesColVector.start[batchIndex], bytesColVector.length[batchIndex]); } - } - } - } - - private class BinaryWriter extends Writer { - - BinaryWriter(int columnIndex) { - super(columnIndex); - } - - @Override - boolean apply(VectorizedRowBatch batch, int batchIndex) throws IOException { - BytesColumnVector colVector = (BytesColumnVector) batch.cols[columnIndex]; - - if (colVector.isRepeating) { - if (colVector.noNulls || !colVector.isNull[0]) { - serializeWrite.writeBinary(colVector.vector[0], colVector.start[0], colVector.length[0]); - return true; - } else { - serializeWrite.writeNull(); - return false; + break; + case DATE: + serializeWrite.writeDate((int) ((LongColumnVector) colVector).vector[batchIndex]); + break; + case TIMESTAMP: + serializeWrite.writeTimestamp(((TimestampColumnVector) colVector).asScratchTimestamp(batchIndex)); + break; + case BINARY: + { + BytesColumnVector bytesColVector = ((BytesColumnVector) colVector); + serializeWrite.writeBinary(bytesColVector.vector[batchIndex], bytesColVector.start[batchIndex], bytesColVector.length[batchIndex]); } - } else { - if (colVector.noNulls || !colVector.isNull[batchIndex]) { - serializeWrite.writeBinary(colVector.vector[batchIndex], - colVector.start[batchIndex], colVector.length[batchIndex]); - return true; - } else { - serializeWrite.writeNull(); - return false; + break; + case DECIMAL: + { + DecimalColumnVector decimalColVector = ((DecimalColumnVector) colVector); + serializeWrite.writeHiveDecimal(decimalColVector.vector[batchIndex].getHiveDecimal(), decimalColVector.scale); } + break; + case INTERVAL_YEAR_MONTH: + serializeWrite.writeHiveIntervalYearMonth((int) ((LongColumnVector) colVector).vector[batchIndex]); + break; + case INTERVAL_DAY_TIME: + serializeWrite.writeHiveIntervalDayTime(((IntervalDayTimeColumnVector) colVector).asScratchIntervalDayTime(batchIndex)); + break; + default: + throw new RuntimeException("Unsupported primitive category " + sourcePrimitiveCategories[index]); } + break; + default: + throw new RuntimeException("Unsupported category " + sourceCategories[index]); } + return true; } - private class HiveDecimalWriter extends Writer { - protected HiveDecimalWritable[] vector; - - HiveDecimalWriter(int columnIndex) { - super(columnIndex); - } - - @Override - boolean apply(VectorizedRowBatch batch, int batchIndex) throws IOException { - DecimalColumnVector colVector = (DecimalColumnVector) batch.cols[columnIndex]; - - if (colVector.isRepeating) { - if (colVector.noNulls || !colVector.isNull[0]) { - serializeWrite.writeHiveDecimal(colVector.vector[0].getHiveDecimal(), colVector.scale); - return true; - } else { - serializeWrite.writeNull(); - return false; - } - } else { - if (colVector.noNulls || !colVector.isNull[batchIndex]) { - serializeWrite.writeHiveDecimal(colVector.vector[batchIndex].getHiveDecimal(), colVector.scale); - return true; - } else { - serializeWrite.writeNull(); - return false; - } - } + private void addColumn(int index, TypeInfo typeInfo, int columnNum) { + Category category = typeInfo.getCategory(); + sourceCategories[index] = typeInfo.getCategory(); + if (category == Category.PRIMITIVE) { + PrimitiveTypeInfo primitiveTypeInfo = (PrimitiveTypeInfo) typeInfo; + sourcePrimitiveCategories[index] = primitiveTypeInfo.getPrimitiveCategory(); } + columnNums[index] = columnNum;
} - private Writer[] writers; - - private Writer createWriter(TypeInfo typeInfo, int columnIndex) throws HiveException { - Writer writer; - Category category = typeInfo.getCategory(); - switch (category) { - case PRIMITIVE: - { - PrimitiveTypeInfo primitiveTypeInfo = (PrimitiveTypeInfo) typeInfo; - PrimitiveCategory primitiveCategory = primitiveTypeInfo.getPrimitiveCategory(); - switch (primitiveCategory) { - // case VOID: - // UNDONE: - // break; - case BOOLEAN: - writer = new BooleanWriter(columnIndex); - break; - case BYTE: - writer = new ByteWriter(columnIndex); - break; - case SHORT: - writer = new ShortWriter(columnIndex); - break; - case INT: - writer = new IntWriter(columnIndex); - break; - case LONG: - writer = new LongWriter(columnIndex); - break; - case DATE: - writer = new DateWriter(columnIndex); - break; - case TIMESTAMP: - writer = new TimestampWriter(columnIndex); - break; - case FLOAT: - writer = new FloatWriter(columnIndex); - break; - case DOUBLE: - writer = new DoubleWriter(columnIndex); - break; - case STRING: - case CHAR: - case VARCHAR: - // We store CHAR and VARCHAR without pads, so use STRING writer class. - writer = new StringWriter(columnIndex); - break; - case BINARY: - writer = new BinaryWriter(columnIndex); - break; - case DECIMAL: - writer = new HiveDecimalWriter(columnIndex); - break; - case INTERVAL_YEAR_MONTH: - writer = new IntervalYearMonthWriter(columnIndex); - break; - case INTERVAL_DAY_TIME: - writer = new IntervalDayTimeWriter(columnIndex); - break; - default: - throw new HiveException("Unexpected primitive type category " + primitiveCategory); - } - } - break; - default: - throw new HiveException("Unexpected type category " + category); - } - return writer; + private void allocArrays(final int size) { + sourceCategories = new Category[size]; + sourcePrimitiveCategories = new PrimitiveCategory[size]; + columnNums = new int[size]; } public void init(List typeNames, int[] columnMap) throws HiveException { - writers = new Writer[typeNames.size()]; - for (int i = 0; i < typeNames.size(); i++) { + final int size = typeNames.size(); + Preconditions.checkState(size > 0); + allocArrays(size); + for (int i = 0; i < size; i++) { String typeName = typeNames.get(i); TypeInfo typeInfo = TypeInfoUtils.getTypeInfoFromTypeString(typeName); - int columnIndex = columnMap[i]; - Writer writer = createWriter(typeInfo, columnIndex); - writers[i] = writer; + addColumn(i, typeInfo, columnMap[i]); } } public void init(List typeNames) throws HiveException { - writers = new Writer[typeNames.size()]; - for (int i = 0; i < typeNames.size(); i++) { + final int size = typeNames.size(); + Preconditions.checkState(size > 0); + allocArrays(size); + for (int i = 0; i < size; i++) { String typeName = typeNames.get(i); TypeInfo typeInfo = TypeInfoUtils.getTypeInfoFromTypeString(typeName); - Writer writer = createWriter(typeInfo, i); - writers[i] = writer; + addColumn(i, typeInfo, i); } } public void init(TypeInfo[] typeInfos, int[] columnMap) throws HiveException { - writers = new Writer[typeInfos.length]; - for (int i = 0; i < typeInfos.length; i++) { - int columnIndex = columnMap[i]; - Writer writer = createWriter(typeInfos[i], columnIndex); - writers[i] = writer; + final int size = typeInfos.length; + Preconditions.checkState(size > 0); + allocArrays(size); + for (int i = 0; i < size; i++) { + addColumn(i, typeInfos[i], columnMap[i]); } } public int getCount() { - return writers.length; + return sourceCategories.length; } public void setOutput(Output output) { @@ -632,21 +218,20 @@ 
public void setOutputAppend(Output output) { } private boolean hasAnyNulls; - private boolean isAllNulls; /* * Note that when serializing a row, the logical mapping using selected in use has already * been performed. batchIndex is the actual index of the row. + * + * Afterwards, the getHasAnyNulls function can be used to see if any of the columns were null. */ public void serializeWrite(VectorizedRowBatch batch, int batchIndex) throws IOException { + final int size = sourceCategories.length; hasAnyNulls = false; - isAllNulls = true; - for (Writer writer : writers) { - if (!writer.apply(batch, batchIndex)) { + for (int i = 0; i < size; i++) { + if (!serialize(batch, i, batchIndex)) { hasAnyNulls = true; - } else { - isAllNulls = false; } } } @@ -655,7 +240,28 @@ public boolean getHasAnyNulls() { return hasAnyNulls; } - public boolean getIsAllNulls() { - return isAllNulls; + private boolean nullSafeNullsRemain; + + /* + * A version of serializeWrite that supports SQL nullsafe semantics. + * + * If a column is null and its entry in nullsafe is true, the null is ignored. + * + * Afterwards, check getNullSafeNullsRemain. + */ + public void serializeWriteNullSafe(VectorizedRowBatch batch, int batchIndex, + boolean[] nullsafe) throws IOException { + + final int size = sourceCategories.length; + nullSafeNullsRemain = false; + for (int i = 0; i < size; i++) { + if (!serialize(batch, i, batchIndex) && !nullsafe[i]) { + nullSafeNullsRemain = true; + } + } + } + + public boolean getNullSafeNullsRemain() { + return nullSafeNullsRemain; } } \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java index f088941..ca1a590 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java @@ -489,7 +489,7 @@ private VectorExpression getColumnVectorExpression(ExprNodeColumnDesc } break; case PROJECTION: - expr = new IdentityExpression(columnNum, exprDesc.getTypeString()); + expr = new IdentityExpression(columnNum, exprDesc.getColumn(), exprDesc.getTypeString()); break; } return expr; @@ -1103,20 +1103,23 @@ private VectorExpression getIdentityExpression(List childExprList) throws HiveException { ExprNodeDesc childExpr = childExprList.get(0); int inputCol; + String name; String colType; VectorExpression v1 = null; if (childExpr instanceof ExprNodeGenericFuncDesc) { + name = ((ExprNodeGenericFuncDesc) childExpr).getName(); v1 = getVectorExpression(childExpr); inputCol = v1.getOutputColumn(); colType = v1.getOutputType(); } else if (childExpr instanceof ExprNodeColumnDesc) { + name = ((ExprNodeColumnDesc) childExpr).getColumn(); ExprNodeColumnDesc colDesc = (ExprNodeColumnDesc) childExpr; inputCol = getInputColumnIndex(colDesc.getColumn()); colType = colDesc.getTypeString(); } else { throw new HiveException("Expression not supported: "+childExpr); } - VectorExpression expr = new IdentityExpression(inputCol, colType); + VectorExpression expr = new IdentityExpression(inputCol, name, colType); if (v1 != null) { expr.setChildExpressions(new VectorExpression [] {v1}); } diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizedBatchUtil.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizedBatchUtil.java index 990e896..f448323 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizedBatchUtil.java +++
ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizedBatchUtil.java @@ -580,7 +580,7 @@ public static StandardStructObjectInspector convertToStandardStructObjectInspect } static ColumnVector cloneColumnVector(ColumnVector source - ) throws HiveException{ + ) throws HiveException { if (source instanceof LongColumnVector) { return new LongColumnVector(((LongColumnVector) source).vector.length); } else if (source instanceof DoubleColumnVector) { @@ -625,6 +625,17 @@ static ColumnVector cloneColumnVector(ColumnVector source " is not supported!"); } + public static PrimitiveTypeInfo[] primitiveTypeInfosFromTypeInfos( + TypeInfo[] typeInfos) throws HiveException { + + PrimitiveTypeInfo[] result = new PrimitiveTypeInfo[typeInfos.length]; + + for(int i = 0; i < typeInfos.length; i++) { + result[i] = (PrimitiveTypeInfo) typeInfos[i]; + } + return result; + } + /** * Make a new (scratch) batch, which is exactly "like" the batch provided, except that it's empty * @param batch the batch to imitate diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/IdentityExpression.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/IdentityExpression.java index 402d0f8..df31c28 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/IdentityExpression.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/IdentityExpression.java @@ -28,13 +28,15 @@ private static final long serialVersionUID = 1L; private int colNum = -1; + private String name = null; private String type = null; public IdentityExpression() { } - public IdentityExpression(int colNum, String type) { + public IdentityExpression(int colNum, String name, String type) { this.colNum = colNum; + this.name = name; this.type = type; } @@ -68,6 +70,10 @@ public int getColNum() { return getOutputColumn(); } + public String getName() { + return name; + } + public String getType() { return getOutputType(); } diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/VectorKeySeries.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/VectorKeySeries.java index ac6c4b8..a864853 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/VectorKeySeries.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/VectorKeySeries.java @@ -23,28 +23,43 @@ import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; /** - * An abstraction of keys within a VectorizedRowBatch. - * - * A key is one or more columns. + * An abstraction of columns representing a key within a VectorizedRowBatch. * * When there is a sequential "run" of equal keys, they are collapsed and represented by a * duplicate count. * - * The batch of keys (with sequential duplicates collapsed) is called a series. + * The (sequential duplicates collapsed) keys in the batch is called a series. * - * A key can be all null, or a key with no or some nulls. + * A key can be either ALL NULL or have at least one column value and 0 or more column NULLs. * * All keys have a duplicate count. * - * A key with no or some nulls has: + * A key with at least one column value and 0 or more NULLs: * 1) A hash code. - * 2) Key values and other value(s) defined by other interfaces. + * 2) Column values defined by other interfaces. * - * The key series is logically indexed. That is, if batch.selectedInUse is true, the indices + * The key series is logically indexed. 
That is, when batch.selectedInUse is true, the indices * will be logical and need to be mapped through batch.selected to get the physical batch - * indices. Otherwise, the indices are physical batch indices. + * indices. Otherwise, when !batch.selectedInUse, the indices are physical batch indices. + * + * NOTE: This interface doesn't support the series count so that the VectorKeySeriesCombo class + * can combine multiple VectorKeySeries into a combined key without having to compute the series + * count. + * */ -public interface VectorKeySeries { +public abstract class VectorKeySeries { + + public int currentLogical; + public boolean currentKeyIsNull; + public int currentDuplicateCount; + public int currentHashCode; + + public VectorKeySeries() { + currentLogical = 0; + currentKeyIsNull = false; + currentDuplicateCount = 0; + currentHashCode = 0; + } /** * Process a non-empty batch of rows and compute a key series. @@ -54,45 +69,24 @@ * @param batch * @throws IOException */ - void processBatch(VectorizedRowBatch batch) throws IOException; - - /** - * Position to the beginning of the key series. - */ - void positionToFirst(); - - /** - * @return the current logical index of the first row of the current key. - * The next duplicate count rows have the same key. - */ - int getCurrentLogical(); - - /** - * @return true when the current key is all nulls. - */ - boolean getCurrentIsAllNull(); - - /** - * @return the number of duplicate keys of the current key. - */ - int getCurrentDuplicateCount(); - - - /** - * @return true when there is at least one null in the current key. - * Only valid when getCurrentIsAllNull is false. Otherwise, undefined. - */ - boolean getCurrentHasAnyNulls(); - - /** - * @return the hash code of the current key. - * Only valid when getCurrentIsAllNull is false. Otherwise, undefined. - */ - int getCurrentHashCode(); + abstract void processBatch(VectorizedRowBatch batch) throws IOException; /** * Move to the next key. * @return true when there is another key. Otherwise, the key series is complete. */ - boolean next(); + abstract boolean next(); + + public static String displayBytes(byte[] bytes, int start, int length) { + StringBuilder sb = new StringBuilder(); + for (int i = start; i < start + length; i++) { + char ch = (char) bytes[i]; + if (ch < ' ' || ch > '~') { + sb.append(String.format("\\%03d", bytes[i] & 0xff)); + } else { + sb.append(ch); + } + } + return sb.toString(); + } } \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/VectorKeySeriesBytes.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/VectorKeySeriesBytes.java new file mode 100644 index 0000000..d83520b --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/VectorKeySeriesBytes.java @@ -0,0 +1,83 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.vector.keyseries; + +import java.io.IOException; +import java.util.Arrays; + +import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; +import org.apache.hadoop.hive.ql.exec.vector.expressions.StringExpr; +import org.apache.hive.common.util.HashCodeUtil; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.google.common.base.Preconditions; + +/** + * A key series of a single column of byte array keys. + * + */ +public final class VectorKeySeriesBytes extends VectorKeySeriesBytesBase { + + private static final Logger LOG = LoggerFactory.getLogger(VectorKeySeriesBytes.class.getName()); + + private byte[][] keyBytesArrays; + private int[] keyStarts; + private int[] keyLengths; + + public byte[] currentBytes; + public int currentStart; + public int currentLength; + + public VectorKeySeriesBytes(int columnNum) { + super(columnNum); + keyBytesArrays = new byte[VectorizedRowBatch.DEFAULT_SIZE][]; + keyStarts = new int[VectorizedRowBatch.DEFAULT_SIZE]; + keyLengths = new int[VectorizedRowBatch.DEFAULT_SIZE]; + } + + @Override + public void processBatch(VectorizedRowBatch batch) throws IOException { + super.processBatch(batch); + + if (nonNullKeyCount > 0) { + HashCodeUtil.calculateBytesArrayHashCodes(keyBytesArrays, + keyStarts, keyLengths, hashCodes, nonNullKeyCount); + } + + // Do the position after we compute the checksums. + positionToFirst(); + } + + @Override + public void saveBytesKey(int nonNullKeyPosition, byte[] keyBytes, int keyByteStart, + int keyByteLength) { + keyBytesArrays[nonNullKeyPosition] = keyBytes; + keyStarts[nonNullKeyPosition] = keyByteStart; + keyLengths[nonNullKeyPosition] = keyByteLength; + } + + @Override + public void setNextNonEmptyKey(int nonNullKeyPosition) { + currentBytes = keyBytesArrays[nonNullKeyPosition]; + currentStart = keyStarts[nonNullKeyPosition]; + currentLength = keyLengths[nonNullKeyPosition]; + } +} \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/VectorKeySeriesBytesBase.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/VectorKeySeriesBytesBase.java new file mode 100644 index 0000000..1e5ed07 --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/VectorKeySeriesBytesBase.java @@ -0,0 +1,259 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.hive.ql.exec.vector.keyseries; + +import java.io.IOException; +import java.util.Arrays; + +import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; +import org.apache.hadoop.hive.ql.exec.vector.expressions.StringExpr; +import com.google.common.base.Preconditions; + +/** + * An abstract class implementing key series of a single column of byte array keys where the keys + * get serialized. + */ +public abstract class VectorKeySeriesBytesBase extends VectorKeySeriesSingle { + + private final int columnNum; + + public VectorKeySeriesBytesBase(int columnNum) { + super(); + this.columnNum = columnNum; + } + + @Override + public void processBatch(VectorizedRowBatch batch) throws IOException { + + currentBatchSize = batch.size; + Preconditions.checkState(currentBatchSize > 0); + + BytesColumnVector bytesColVector = (BytesColumnVector) batch.cols[columnNum]; + + final byte[][] vector = bytesColVector.vector; + final int[] start = bytesColVector.start; + final int[] length = bytesColVector.length; + + if (bytesColVector.isRepeating){ + duplicateCounts[0] = currentBatchSize; + if (bytesColVector.noNulls || !bytesColVector.isNull[0]) { + keyIsNull[0] = false; + saveBytesKey(0, vector[0], start[0], length[0]); + nonNullKeyCount = 1; + } else { + keyIsNull[0] = true; + nonNullKeyCount = 0; + } + keyCount = 1; + Preconditions.checkState(keyCount <= currentBatchSize); + } else { + keyCount = 0; + nonNullKeyCount = 0; + if (batch.selectedInUse) { + int[] selected = batch.selected; + if (bytesColVector.noNulls) { + + duplicateCounts[0] = 1; + int index; + index = selected[0]; + byte[] prevKeyBytes = vector[index]; + int prevKeyStart = start[index]; + int prevKeyLength = length[index]; + saveBytesKey(0, prevKeyBytes, prevKeyStart, prevKeyLength); + + int currentKeyStart; + int currentKeyLength; + byte[] currentKeyBytes; + for (int logical = 1; logical < currentBatchSize; logical++) { + index = selected[logical]; + currentKeyBytes = vector[index]; + currentKeyStart = start[index]; + currentKeyLength = length[index]; + if (StringExpr.equal(prevKeyBytes, prevKeyStart, prevKeyLength, + currentKeyBytes, currentKeyStart, currentKeyLength)) { + duplicateCounts[keyCount]++; + } else { + duplicateCounts[++keyCount] = 1; + saveBytesKey(keyCount, currentKeyBytes, currentKeyStart, currentKeyLength); + prevKeyBytes = currentKeyBytes; + prevKeyStart = currentKeyStart; + prevKeyLength = currentKeyLength; + } + } + Arrays.fill(keyIsNull, 0, ++keyCount, false); + nonNullKeyCount = keyCount; + Preconditions.checkState(keyCount <= currentBatchSize); + } else { + boolean[] isNull = bytesColVector.isNull; + + boolean prevKeyIsNull; + byte[] prevKeyBytes = null; + int prevKeyStart = 0; + int prevKeyLength = 0; + duplicateCounts[0] = 1; + int index = selected[0]; + if (isNull[index]) { + keyIsNull[0] = true; + prevKeyIsNull = true; + } else { + keyIsNull[0] = false; + prevKeyIsNull = false; + prevKeyBytes = vector[index]; + prevKeyStart = start[index]; + prevKeyLength = length[index]; + saveBytesKey(0, prevKeyBytes, prevKeyStart, prevKeyLength); + nonNullKeyCount = 1; + } + + int currentKeyStart; + int currentKeyLength; + byte[] currentKeyBytes; + for (int logical = 1; logical < currentBatchSize; logical++) { + index = selected[logical]; + if (isNull[index]) { + if (prevKeyIsNull) { + duplicateCounts[keyCount]++; + } else { + duplicateCounts[++keyCount] = 1; + keyIsNull[keyCount] = true; + prevKeyIsNull = true; + } + } else { 
+ currentKeyBytes = vector[index]; + currentKeyStart = start[index]; + currentKeyLength = length[index]; + if (!prevKeyIsNull && + StringExpr.equal(prevKeyBytes, prevKeyStart, prevKeyLength, + currentKeyBytes, currentKeyStart, currentKeyLength)) { + duplicateCounts[keyCount]++; + } else { + duplicateCounts[++keyCount] = 1; + keyIsNull[keyCount] = false; + saveBytesKey(nonNullKeyCount++, currentKeyBytes, currentKeyStart, currentKeyLength); + prevKeyIsNull = false; + prevKeyBytes = currentKeyBytes; + prevKeyStart = currentKeyStart; + prevKeyLength = currentKeyLength; + } + } + } + keyCount++; + Preconditions.checkState(keyCount <= currentBatchSize); + } + } else { + + // NOT selectedInUse + + if (bytesColVector.noNulls) { + + duplicateCounts[0] = 1; + byte[] prevKeyBytes = vector[0]; + int prevKeyStart = start[0]; + int prevKeyLength = length[0]; + saveBytesKey(0, prevKeyBytes, prevKeyStart, prevKeyLength); + + int currentKeyStart; + int currentKeyLength; + byte[] currentKeyBytes; + for (int index = 1; index < currentBatchSize; index++) { + currentKeyBytes = vector[index]; + currentKeyStart = start[index]; + currentKeyLength = length[index]; + if (StringExpr.equal(prevKeyBytes, prevKeyStart, prevKeyLength, + currentKeyBytes, currentKeyStart, currentKeyLength)) { + duplicateCounts[keyCount]++; + } else { + duplicateCounts[++keyCount] = 1; + saveBytesKey(keyCount, currentKeyBytes, currentKeyStart, currentKeyLength); + prevKeyBytes = currentKeyBytes; + prevKeyStart = currentKeyStart; + prevKeyLength = currentKeyLength; + } + } + Arrays.fill(keyIsNull, 0, ++keyCount, false); + nonNullKeyCount = keyCount; + Preconditions.checkState(keyCount <= currentBatchSize); + } else { + boolean[] isNull = bytesColVector.isNull; + + boolean prevKeyIsNull; + byte[] prevKeyBytes = null; + int prevKeyStart = 0; + int prevKeyLength = 0; + duplicateCounts[0] = 1; + if (isNull[0]) { + keyIsNull[0] = true; + prevKeyIsNull = true; + } else { + keyIsNull[0] = false; + prevKeyIsNull = false; + prevKeyBytes = vector[0]; + prevKeyStart = start[0]; + prevKeyLength = length[0]; + saveBytesKey(0, prevKeyBytes, prevKeyStart, prevKeyLength); + nonNullKeyCount = 1; + } + + byte[] currentKeyBytes; + int currentKeyStart; + int currentKeyLength; + for (int index = 1; index < currentBatchSize; index++) { + if (isNull[index]) { + if (prevKeyIsNull) { + duplicateCounts[keyCount]++; + } else { + duplicateCounts[++keyCount] = 1; + keyIsNull[keyCount] = true; + prevKeyIsNull = true; + } + } else { + currentKeyBytes = vector[index]; + currentKeyStart = start[index]; + currentKeyLength = length[index]; + if (!prevKeyIsNull && + StringExpr.equal(prevKeyBytes, prevKeyStart, prevKeyLength, + currentKeyBytes, currentKeyStart, currentKeyLength)) { + duplicateCounts[keyCount]++; + } else { + duplicateCounts[++keyCount] = 1; + keyIsNull[keyCount] = false; + saveBytesKey(nonNullKeyCount++, currentKeyBytes, currentKeyStart, currentKeyLength); + prevKeyIsNull = false; + prevKeyBytes = currentKeyBytes; + prevKeyStart = currentKeyStart; + prevKeyLength = currentKeyLength; + } + } + } + keyCount++; + Preconditions.checkState(keyCount <= currentBatchSize); + } + } + } + + Preconditions.checkState(validate()); + } + + /* + * Serialize a bytes key. 
+ */ + protected abstract void saveBytesKey(int nonNullKeyPosition, byte[] keyBytes, int keyByteStart, + int keyByteLength) throws IOException; +} diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/VectorKeySeriesBytesSerialized.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/VectorKeySeriesBytesSerialized.java deleted file mode 100644 index 81a8455..0000000 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/VectorKeySeriesBytesSerialized.java +++ /dev/null @@ -1,271 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.hive.ql.exec.vector.keyseries; - -import java.io.IOException; -import java.util.Arrays; - -import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; -import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; -import org.apache.hadoop.hive.ql.exec.vector.expressions.StringExpr; -import org.apache.hadoop.hive.serde2.fast.SerializeWrite; -import com.google.common.base.Preconditions; - -/** - * A key series of a single column of byte array keys where the keys get serialized. - */ -public class VectorKeySeriesBytesSerialized - extends VectorKeySeriesSerializedImpl implements VectorKeySeriesSerialized { - - private final int columnNum; - - private int outputStartPosition; - - public VectorKeySeriesBytesSerialized(int columnNum, T serializeWrite) { - super(serializeWrite); - this.columnNum = columnNum; - } - - @Override - public void processBatch(VectorizedRowBatch batch) throws IOException { - - currentBatchSize = batch.size; - Preconditions.checkState(currentBatchSize > 0); - - BytesColumnVector bytesColVector = (BytesColumnVector) batch.cols[columnNum]; - - byte[][] vectorBytesArrays = bytesColVector.vector; - int[] vectorStarts = bytesColVector.start; - int[] vectorLengths = bytesColVector.length; - - // The serialize routine uses this to build serializedKeyLengths. 
- outputStartPosition = 0; - output.reset(); - - if (bytesColVector.isRepeating){ - duplicateCounts[0] = currentBatchSize; - if (bytesColVector.noNulls || !bytesColVector.isNull[0]) { - seriesIsAllNull[0] = false; - serialize(0, vectorBytesArrays[0], vectorStarts[0], vectorLengths[0]); - nonNullKeyCount = 1; - } else { - seriesIsAllNull[0] = true; - nonNullKeyCount = 0; - } - seriesCount = 1; - Preconditions.checkState(seriesCount <= currentBatchSize); - } else { - seriesCount = 0; - nonNullKeyCount = 0; - if (batch.selectedInUse) { - int[] selected = batch.selected; - if (bytesColVector.noNulls) { - - duplicateCounts[0] = 1; - int index; - index = selected[0]; - byte[] prevKeyBytes = vectorBytesArrays[index]; - int prevKeyStart = vectorStarts[index]; - int prevKeyLength = vectorLengths[index]; - serialize(0, prevKeyBytes, prevKeyStart, prevKeyLength); - - int currentKeyStart; - int currentKeyLength; - byte[] currentKeyBytes; - for (int logical = 1; logical < currentBatchSize; logical++) { - index = selected[logical]; - currentKeyBytes = vectorBytesArrays[index]; - currentKeyStart = vectorStarts[index]; - currentKeyLength = vectorLengths[index]; - if (StringExpr.equal(prevKeyBytes, prevKeyStart, prevKeyLength, - currentKeyBytes, currentKeyStart, currentKeyLength)) { - duplicateCounts[seriesCount]++; - } else { - duplicateCounts[++seriesCount] = 1; - serialize(seriesCount, currentKeyBytes, currentKeyStart, currentKeyLength); - prevKeyBytes = currentKeyBytes; - prevKeyStart = currentKeyStart; - prevKeyLength = currentKeyLength; - } - } - Arrays.fill(seriesIsAllNull, 0, ++seriesCount, false); - nonNullKeyCount = seriesCount; - Preconditions.checkState(seriesCount <= currentBatchSize); - } else { - boolean[] isNull = bytesColVector.isNull; - - boolean prevKeyIsNull; - byte[] prevKeyBytes = null; - int prevKeyStart = 0; - int prevKeyLength = 0; - duplicateCounts[0] = 1; - int index = selected[0]; - if (isNull[index]) { - seriesIsAllNull[0] = true; - prevKeyIsNull = true; - } else { - seriesIsAllNull[0] = false; - prevKeyIsNull = false; - prevKeyBytes = vectorBytesArrays[index]; - prevKeyStart = vectorStarts[index]; - prevKeyLength = vectorLengths[index]; - serialize(0, prevKeyBytes, prevKeyStart, prevKeyLength); - nonNullKeyCount = 1; - } - - int currentKeyStart; - int currentKeyLength; - byte[] currentKeyBytes; - for (int logical = 1; logical < currentBatchSize; logical++) { - index = selected[logical]; - if (isNull[index]) { - if (prevKeyIsNull) { - duplicateCounts[seriesCount]++; - } else { - duplicateCounts[++seriesCount] = 1; - seriesIsAllNull[seriesCount] = true; - prevKeyIsNull = true; - } - } else { - currentKeyBytes = vectorBytesArrays[index]; - currentKeyStart = vectorStarts[index]; - currentKeyLength = vectorLengths[index]; - if (!prevKeyIsNull && - StringExpr.equal(prevKeyBytes, prevKeyStart, prevKeyLength, - currentKeyBytes, currentKeyStart, currentKeyLength)) { - duplicateCounts[seriesCount]++; - } else { - duplicateCounts[++seriesCount] = 1; - seriesIsAllNull[seriesCount] = false; - serialize(nonNullKeyCount++, currentKeyBytes, currentKeyStart, currentKeyLength); - prevKeyIsNull = false; - prevKeyBytes = currentKeyBytes; - prevKeyStart = currentKeyStart; - prevKeyLength = currentKeyLength; - } - } - } - seriesCount++; - Preconditions.checkState(seriesCount <= currentBatchSize); - } - } else { - - // NOT selectedInUse - - if (bytesColVector.noNulls) { - - duplicateCounts[0] = 1; - byte[] prevKeyBytes = vectorBytesArrays[0]; - int prevKeyStart = vectorStarts[0]; - int prevKeyLength 
= vectorLengths[0]; - serialize(0, prevKeyBytes, prevKeyStart, prevKeyLength); - - int currentKeyStart; - int currentKeyLength; - byte[] currentKeyBytes; - for (int index = 1; index < currentBatchSize; index++) { - currentKeyBytes = vectorBytesArrays[index]; - currentKeyStart = vectorStarts[index]; - currentKeyLength = vectorLengths[index]; - if (StringExpr.equal(prevKeyBytes, prevKeyStart, prevKeyLength, - currentKeyBytes, currentKeyStart, currentKeyLength)) { - duplicateCounts[seriesCount]++; - } else { - duplicateCounts[++seriesCount] = 1; - serialize(seriesCount, currentKeyBytes, currentKeyStart, currentKeyLength); - prevKeyBytes = currentKeyBytes; - prevKeyStart = currentKeyStart; - prevKeyLength = currentKeyLength; - } - } - Arrays.fill(seriesIsAllNull, 0, ++seriesCount, false); - nonNullKeyCount = seriesCount; - Preconditions.checkState(seriesCount <= currentBatchSize); - } else { - boolean[] isNull = bytesColVector.isNull; - - boolean prevKeyIsNull; - byte[] prevKeyBytes = null; - int prevKeyStart = 0; - int prevKeyLength = 0; - duplicateCounts[0] = 1; - if (isNull[0]) { - seriesIsAllNull[0] = true; - prevKeyIsNull = true; - } else { - seriesIsAllNull[0] = false; - prevKeyIsNull = false; - prevKeyBytes = vectorBytesArrays[0]; - prevKeyStart = vectorStarts[0]; - prevKeyLength = vectorLengths[0]; - serialize(0, prevKeyBytes, prevKeyStart, prevKeyLength); - nonNullKeyCount = 1; - } - - byte[] currentKeyBytes; - int currentKeyStart; - int currentKeyLength; - for (int index = 1; index < currentBatchSize; index++) { - if (isNull[index]) { - if (prevKeyIsNull) { - duplicateCounts[seriesCount]++; - } else { - duplicateCounts[++seriesCount] = 1; - seriesIsAllNull[seriesCount] = true; - prevKeyIsNull = true; - } - } else { - currentKeyBytes = vectorBytesArrays[index]; - currentKeyStart = vectorStarts[index]; - currentKeyLength = vectorLengths[index]; - if (!prevKeyIsNull && - StringExpr.equal(prevKeyBytes, prevKeyStart, prevKeyLength, - currentKeyBytes, currentKeyStart, currentKeyLength)) { - duplicateCounts[seriesCount]++; - } else { - duplicateCounts[++seriesCount] = 1; - seriesIsAllNull[seriesCount] = false; - serialize(nonNullKeyCount++, currentKeyBytes, currentKeyStart, currentKeyLength); - prevKeyIsNull = false; - prevKeyBytes = currentKeyBytes; - prevKeyStart = currentKeyStart; - prevKeyLength = currentKeyLength; - } - } - } - seriesCount++; - Preconditions.checkState(seriesCount <= currentBatchSize); - } - } - } - - // Finally. - computeSerializedHashCodes(); - positionToFirst(); - Preconditions.checkState(validate()); - } - - private void serialize(int pos, byte[] bytes, int start, int length) throws IOException { - serializeWrite.setAppend(output); - serializeWrite.writeString(bytes, start, length); - int outputNewPosition = output.getLength(); - serializedKeyLengths[pos] = outputNewPosition - outputStartPosition; - outputStartPosition = outputNewPosition; - } -} diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/VectorKeySeriesCombo.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/VectorKeySeriesCombo.java new file mode 100644 index 0000000..6d2a3ca --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/VectorKeySeriesCombo.java @@ -0,0 +1,132 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.vector.keyseries; + +import java.io.IOException; + +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; +import com.google.common.base.Preconditions; + +/** + * A key series over multiple columns, implemented as an array of VectorKeySeriesSingle. + */ +public class VectorKeySeriesCombo extends VectorKeySeries { + + protected VectorKeySeriesSingle[] keySeriesArray; + + protected int currentBatchSize; + + private long allNullMask; + + protected long currentNullMask; + + public VectorKeySeriesCombo() { + super(); + } + + public void init(VectorKeySeriesSingle[] keySeriesArray) { + Preconditions.checkState(keySeriesArray.length > 0); + this.keySeriesArray = keySeriesArray; + allNullMask = (1L << keySeriesArray.length) - 1; + } + + public boolean getCurrentKeyHasAnyNulls() { + return (currentNullMask != 0); + } + + public long getCurrentNullMask() { + return currentNullMask; + } + + @Override + public void processBatch(VectorizedRowBatch batch) throws IOException { + + currentBatchSize = batch.size; + Preconditions.checkState(currentBatchSize > 0); + + for (int i = 0; i < keySeriesArray.length; i++) { + keySeriesArray[i].processBatch(batch); + } + + positionToFirst(); + } + + public void positionToFirst() { + currentLogical = 0; + + // Prime the pump with the first key. + keySeriesArray[0].positionToFirst(); + if (keySeriesArray[0].currentKeyIsNull) { + currentNullMask = (1L << 0); + } else { + currentNullMask = 0; + } + currentDuplicateCount = keySeriesArray[0].currentDuplicateCount; + + // Determine the minimum duplicate count across all the column series. + for (int i = 1; i < keySeriesArray.length; i++) { + VectorKeySeriesSingle key = keySeriesArray[i]; + key.positionToFirst(); + if (key.currentKeyIsNull) { + currentNullMask |= (1L << i); + } + currentDuplicateCount = Math.min(currentDuplicateCount, key.currentDuplicateCount); + } + currentKeyIsNull = (currentNullMask == allNullMask); + Preconditions.checkState(currentDuplicateCount > 0); + Preconditions.checkState(currentDuplicateCount <= currentBatchSize); + } + + public boolean next() { + + currentLogical += currentDuplicateCount; + if (currentLogical >= currentBatchSize) { + return false; + } + + int prevDuplicateCount = currentDuplicateCount; + + // Advance past the series we just used. + keySeriesArray[0].advance(prevDuplicateCount); + + /* + * Calculate the next series. + */ + + // Prime the pump with the first key. + if (keySeriesArray[0].currentKeyIsNull) { + currentNullMask = (1L << 0); + } else { + currentNullMask = 0; + } + currentDuplicateCount = keySeriesArray[0].currentDuplicateCount; + + // Determine the minimum duplicate count across all the column series.
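+ // The null mask records which column series are currently positioned on a NULL key; + // the combined key is considered NULL only when every column series is NULL (allNullMask).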
+ for (int i = 1; i < keySeriesArray.length; i++) { + VectorKeySeriesSingle key = keySeriesArray[i]; + key.advance(prevDuplicateCount); + if (key.currentKeyIsNull) { + currentNullMask |= (1L << i); + } + currentDuplicateCount = Math.min(currentDuplicateCount, key.currentDuplicateCount); + } + currentKeyIsNull = (currentNullMask == allNullMask); + return true; + } +} \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/VectorKeySeriesDouble.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/VectorKeySeriesDouble.java new file mode 100644 index 0000000..b5b7e21 --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/VectorKeySeriesDouble.java @@ -0,0 +1,75 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.vector.keyseries; + +import java.io.IOException; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; +import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; +import org.apache.hive.common.util.HashCodeUtil; + +/** + * A key series of a single column of double keys. + */ +public final class VectorKeySeriesDouble extends VectorKeySeriesDoubleBase { + + private final double[] doubleKeys; + private final long[] doubleAsLongKeys; + + private double currentDoubleKey; + private long currentDoubleAsLongKey; + + public VectorKeySeriesDouble(int columnNum, PrimitiveTypeInfo primitiveTypeInfo) { + super(columnNum, primitiveTypeInfo); + doubleKeys = new double[VectorizedRowBatch.DEFAULT_SIZE]; + doubleAsLongKeys = new long[VectorizedRowBatch.DEFAULT_SIZE]; + } + + public double getCurrentDoubleKey() { + return currentDoubleKey; + } + + public long getCurrentDoubleAsLongKey() { + return currentDoubleAsLongKey; + } + + @Override + public void processBatch(VectorizedRowBatch batch) throws IOException { + super.processBatch(batch); + + if (nonNullKeyCount > 0) { + HashCodeUtil.calculateLongArrayHashCodes(doubleAsLongKeys, hashCodes, nonNullKeyCount); + } + + // Do the position after we compute the checksums. 
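+ // positionToFirst() primes the first key's value, duplicate count, and hash code for consumers.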
+ positionToFirst(); + } + + @Override + protected void saveDoubleKey(int nonNullKeyPosition, double key) + throws IOException { + doubleKeys[nonNullKeyPosition] = key; + doubleAsLongKeys[nonNullKeyPosition] = Double.doubleToLongBits(key); + } + + @Override + public void setNextNonEmptyKey(int nonNullKeyPosition) { + currentDoubleKey = doubleKeys[nonNullKeyPosition]; + currentDoubleAsLongKey = doubleAsLongKeys[nonNullKeyPosition]; + } +} \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/VectorKeySeriesDoubleBase.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/VectorKeySeriesDoubleBase.java new file mode 100644 index 0000000..1463680 --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/VectorKeySeriesDoubleBase.java @@ -0,0 +1,212 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.vector.keyseries; + +import java.io.IOException; +import java.util.Arrays; + +import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; +import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory; +import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; +import com.google.common.base.Preconditions; + +/** + * An abstract class implementing key series of a single column of double keys where the keys + * get serialized. 
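+ * Adjacent equal keys are collapsed into a single series entry with a duplicate count; + * runs of NULL keys form their own series entries.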
+ */ +public abstract class VectorKeySeriesDoubleBase extends VectorKeySeriesSingle { + + protected final int columnNum; + protected final PrimitiveCategory primitiveCategory; + + public VectorKeySeriesDoubleBase(int columnNum, PrimitiveTypeInfo primitiveTypeInfo) { + super(); + this.columnNum = columnNum; + primitiveCategory = primitiveTypeInfo.getPrimitiveCategory(); + } + + @Override + public void processBatch(VectorizedRowBatch batch) throws IOException { + + currentBatchSize = batch.size; + Preconditions.checkState(currentBatchSize > 0); + + DoubleColumnVector doubleColVector = (DoubleColumnVector) batch.cols[columnNum]; + + double[] vector = doubleColVector.vector; + + if (doubleColVector.isRepeating){ + duplicateCounts[0] = currentBatchSize; + if (doubleColVector.noNulls || !doubleColVector.isNull[0]) { + keyIsNull[0] = false; + saveDoubleKey(0, vector[0]); + nonNullKeyCount = 1; + } else { + keyIsNull[0] = true; + nonNullKeyCount = 0; + } + keyCount = 1; + Preconditions.checkState(keyCount <= currentBatchSize); + } else { + keyCount = 0; + nonNullKeyCount = 0; + if (batch.selectedInUse) { + int[] selected = batch.selected; + if (doubleColVector.noNulls) { + + duplicateCounts[0] = 1; + double prevKey = vector[selected[0]]; + saveDoubleKey(0, prevKey); + + double currentKey; + for (int logical = 1; logical < currentBatchSize; logical++) { + currentKey = vector[selected[logical]]; + if (prevKey == currentKey) { + duplicateCounts[keyCount]++; + } else { + duplicateCounts[++keyCount] = 1; + saveDoubleKey(keyCount, currentKey); + prevKey = currentKey; + } + } + Arrays.fill(keyIsNull, 0, ++keyCount, false); + nonNullKeyCount = keyCount; + Preconditions.checkState(keyCount <= currentBatchSize); + } else { + boolean[] isNull = doubleColVector.isNull; + + boolean prevKeyIsNull; + double prevKey = 0; + duplicateCounts[0] = 1; + int index = selected[0]; + if (isNull[index]) { + keyIsNull[0] = true; + prevKeyIsNull = true; + } else { + keyIsNull[0] = false; + prevKeyIsNull = false; + prevKey = vector[index]; + saveDoubleKey(0, prevKey); + nonNullKeyCount = 1; + } + + double currentKey; + for (int logical = 1; logical < currentBatchSize; logical++) { + index = selected[logical]; + if (isNull[index]) { + if (prevKeyIsNull) { + duplicateCounts[keyCount]++; + } else { + duplicateCounts[++keyCount] = 1; + keyIsNull[keyCount] = true; + prevKeyIsNull = true; + } + } else { + currentKey = vector[index]; + if (!prevKeyIsNull && prevKey == currentKey) { + duplicateCounts[keyCount]++; + } else { + duplicateCounts[++keyCount] = 1; + keyIsNull[keyCount] = false; + saveDoubleKey(nonNullKeyCount++, currentKey); + prevKeyIsNull = false; + prevKey = currentKey; + } + } + } + keyCount++; + Preconditions.checkState(keyCount <= currentBatchSize); + } + } else { + + // NOT selectedInUse + + if (doubleColVector.noNulls) { + + duplicateCounts[0] = 1; + double prevKey = vector[0]; + saveDoubleKey(0, prevKey); + double currentKey; + for (int index = 1; index < currentBatchSize; index++) { + currentKey = vector[index]; + if (prevKey == currentKey) { + duplicateCounts[keyCount]++; + } else { + duplicateCounts[++keyCount] = 1; + saveDoubleKey(keyCount, currentKey); + prevKey = currentKey; + } + } + Arrays.fill(keyIsNull, 0, ++keyCount, false); + nonNullKeyCount = keyCount; + Preconditions.checkState(keyCount <= currentBatchSize); + } else { + boolean[] isNull = doubleColVector.isNull; + + boolean prevKeyIsNull; + double prevKey = 0; + duplicateCounts[0] = 1; + if (isNull[0]) { + keyIsNull[0] = true; + prevKeyIsNull = 
true; + } else { + keyIsNull[0] = false; + prevKeyIsNull = false; + prevKey = vector[0]; + saveDoubleKey(nonNullKeyCount++, prevKey); + } + + for (int index = 1; index < currentBatchSize; index++) { + if (isNull[index]) { + if (prevKeyIsNull) { + duplicateCounts[keyCount]++; + } else { + duplicateCounts[++keyCount] = 1; + keyIsNull[keyCount] = true; + prevKeyIsNull = true; + } + } else { + double currentKey = vector[index]; + if (!prevKeyIsNull && prevKey == currentKey) { + duplicateCounts[keyCount]++; + } else { + duplicateCounts[++keyCount] = 1; + keyIsNull[keyCount] = false; + saveDoubleKey(nonNullKeyCount++, currentKey); + prevKeyIsNull = false; + prevKey = currentKey; + } + } + } + keyCount++; + Preconditions.checkState(keyCount <= currentBatchSize); + } + } + } + + Preconditions.checkState(validate()); + } + + /* + * Serialize a double key. + */ + protected abstract void saveDoubleKey(int nonNullKeyPosition, double key) + throws IOException; +} \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/VectorKeySeriesImpl.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/VectorKeySeriesImpl.java deleted file mode 100644 index 55e923e..0000000 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/VectorKeySeriesImpl.java +++ /dev/null @@ -1,68 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.hive.ql.exec.vector.keyseries; - -/** - * A base implementation of VectorKeySeries. - * - */ -public abstract class VectorKeySeriesImpl implements VectorKeySeries { - - protected int currentLogical; - protected boolean currentIsAllNull; - protected boolean currentHasAnyNulls; - protected int currentDuplicateCount; - protected int currentHashCode; - - VectorKeySeriesImpl() { - currentLogical = 0; - currentIsAllNull = false; - - // Set to true by default. Only actively set in the multiple key case to support Outer Join. 
- currentHasAnyNulls = true; - - currentDuplicateCount = 0; - currentHashCode = 0; - } - - @Override - public int getCurrentLogical() { - return currentLogical; - } - - @Override - public boolean getCurrentIsAllNull() { - return currentIsAllNull; - } - - @Override - public boolean getCurrentHasAnyNulls() { - return currentHasAnyNulls; - } - - @Override - public int getCurrentDuplicateCount() { - return currentDuplicateCount; - } - - @Override - public int getCurrentHashCode() { - return currentHashCode; - } -} \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/VectorKeySeriesLong.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/VectorKeySeriesLong.java new file mode 100644 index 0000000..7dbe6ce --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/VectorKeySeriesLong.java @@ -0,0 +1,66 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.vector.keyseries; + +import java.io.IOException; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; +import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; +import org.apache.hive.common.util.HashCodeUtil; + +/** + * A key series of a single column of long keys. + */ +public final class VectorKeySeriesLong extends VectorKeySeriesLongBase { + + private final long[] longKeys; + + private long currentKey; + + public VectorKeySeriesLong(int columnNum, PrimitiveTypeInfo primitiveTypeInfo) { + super(columnNum, primitiveTypeInfo); + longKeys = new long[VectorizedRowBatch.DEFAULT_SIZE]; + } + + public long getCurrentKey() { + return currentKey; + } + + @Override + public void processBatch(VectorizedRowBatch batch) throws IOException { + super.processBatch(batch); + + if (nonNullKeyCount > 0) { + HashCodeUtil.calculateLongArrayHashCodes(longKeys, hashCodes, nonNullKeyCount); + } + + // Do the position after we compute the checksums. + positionToFirst(); + } + + @Override + protected void saveLongKey(int nonNullKeyPosition, long key) + throws IOException { + longKeys[nonNullKeyPosition] = key; + } + + @Override + public void setNextNonEmptyKey(int nonNullKeyPosition) { + currentKey = longKeys[nonNullKeyPosition]; + } +} \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/VectorKeySeriesLongBase.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/VectorKeySeriesLongBase.java new file mode 100644 index 0000000..bee3706 --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/VectorKeySeriesLongBase.java @@ -0,0 +1,212 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.vector.keyseries; + +import java.io.IOException; +import java.util.Arrays; + +import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; +import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory; +import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; +import com.google.common.base.Preconditions; + +/** + * An abstract class implementing key series of a single column of long keys where the keys + * get serialized. + */ +public abstract class VectorKeySeriesLongBase extends VectorKeySeriesSingle { + + protected final int columnNum; + protected final PrimitiveCategory primitiveCategory; + + public VectorKeySeriesLongBase(int columnNum, PrimitiveTypeInfo primitiveTypeInfo) { + super(); + this.columnNum = columnNum; + primitiveCategory = primitiveTypeInfo.getPrimitiveCategory(); + } + + @Override + public void processBatch(VectorizedRowBatch batch) throws IOException { + + currentBatchSize = batch.size; + Preconditions.checkState(currentBatchSize > 0); + + LongColumnVector longColVector = (LongColumnVector) batch.cols[columnNum]; + + long[] vector = longColVector.vector; + + if (longColVector.isRepeating){ + duplicateCounts[0] = currentBatchSize; + if (longColVector.noNulls || !longColVector.isNull[0]) { + keyIsNull[0] = false; + saveLongKey(0, vector[0]); + nonNullKeyCount = 1; + } else { + keyIsNull[0] = true; + nonNullKeyCount = 0; + } + keyCount = 1; + Preconditions.checkState(keyCount <= currentBatchSize); + } else { + keyCount = 0; + nonNullKeyCount = 0; + if (batch.selectedInUse) { + int[] selected = batch.selected; + if (longColVector.noNulls) { + + duplicateCounts[0] = 1; + long prevKey = vector[selected[0]]; + saveLongKey(0, prevKey); + + long currentKey; + for (int logical = 1; logical < currentBatchSize; logical++) { + currentKey = vector[selected[logical]]; + if (prevKey == currentKey) { + duplicateCounts[keyCount]++; + } else { + duplicateCounts[++keyCount] = 1; + saveLongKey(keyCount, currentKey); + prevKey = currentKey; + } + } + Arrays.fill(keyIsNull, 0, ++keyCount, false); + nonNullKeyCount = keyCount; + Preconditions.checkState(keyCount <= currentBatchSize); + } else { + boolean[] isNull = longColVector.isNull; + + boolean prevKeyIsNull; + long prevKey = 0; + duplicateCounts[0] = 1; + int index = selected[0]; + if (isNull[index]) { + keyIsNull[0] = true; + prevKeyIsNull = true; + } else { + keyIsNull[0] = false; + prevKeyIsNull = false; + prevKey = vector[index]; + saveLongKey(0, prevKey); + nonNullKeyCount = 1; + } + + long currentKey; + for (int logical = 1; logical < currentBatchSize; logical++) { + index = selected[logical]; + if (isNull[index]) { + if (prevKeyIsNull) { + duplicateCounts[keyCount]++; + } else { + 
duplicateCounts[++keyCount] = 1; + keyIsNull[keyCount] = true; + prevKeyIsNull = true; + } + } else { + currentKey = vector[index]; + if (!prevKeyIsNull && prevKey == currentKey) { + duplicateCounts[keyCount]++; + } else { + duplicateCounts[++keyCount] = 1; + keyIsNull[keyCount] = false; + saveLongKey(nonNullKeyCount++, currentKey); + prevKeyIsNull = false; + prevKey = currentKey; + } + } + } + keyCount++; + Preconditions.checkState(keyCount <= currentBatchSize); + } + } else { + + // NOT selectedInUse + + if (longColVector.noNulls) { + + duplicateCounts[0] = 1; + long prevKey = vector[0]; + saveLongKey(0, prevKey); + long currentKey; + for (int index = 1; index < currentBatchSize; index++) { + currentKey = vector[index]; + if (prevKey == currentKey) { + duplicateCounts[keyCount]++; + } else { + duplicateCounts[++keyCount] = 1; + saveLongKey(keyCount, currentKey); + prevKey = currentKey; + } + } + Arrays.fill(keyIsNull, 0, ++keyCount, false); + nonNullKeyCount = keyCount; + Preconditions.checkState(keyCount <= currentBatchSize); + } else { + boolean[] isNull = longColVector.isNull; + + boolean prevKeyIsNull; + long prevKey = 0; + duplicateCounts[0] = 1; + if (isNull[0]) { + keyIsNull[0] = true; + prevKeyIsNull = true; + } else { + keyIsNull[0] = false; + prevKeyIsNull = false; + prevKey = vector[0]; + saveLongKey(nonNullKeyCount++, prevKey); + } + + for (int index = 1; index < currentBatchSize; index++) { + if (isNull[index]) { + if (prevKeyIsNull) { + duplicateCounts[keyCount]++; + } else { + duplicateCounts[++keyCount] = 1; + keyIsNull[keyCount] = true; + prevKeyIsNull = true; + } + } else { + long currentKey = vector[index]; + if (!prevKeyIsNull && prevKey == currentKey) { + duplicateCounts[keyCount]++; + } else { + duplicateCounts[++keyCount] = 1; + keyIsNull[keyCount] = false; + saveLongKey(nonNullKeyCount++, currentKey); + prevKeyIsNull = false; + prevKey = currentKey; + } + } + } + keyCount++; + Preconditions.checkState(keyCount <= currentBatchSize); + } + } + } + + Preconditions.checkState(validate()); + } + + /* + * Serialize a long key. + */ + protected abstract void saveLongKey(int nonNullKeyPosition, long key) + throws IOException; +} \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/VectorKeySeriesLongSerialized.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/VectorKeySeriesLongSerialized.java deleted file mode 100644 index a0134fd..0000000 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/VectorKeySeriesLongSerialized.java +++ /dev/null @@ -1,249 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.hadoop.hive.ql.exec.vector.keyseries; - -import java.io.IOException; -import java.util.Arrays; - -import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector; -import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; -import org.apache.hadoop.hive.serde2.fast.SerializeWrite; -import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory; -import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; - -import com.google.common.base.Preconditions; - -/** - * A key series of a single column of long keys where the keys get serialized. - */ -public class VectorKeySeriesLongSerialized - extends VectorKeySeriesSerializedImpl implements VectorKeySeriesSerialized { - - private final int columnNum; - private PrimitiveCategory primitiveCategory; - - private int currentKeyStart; - - public VectorKeySeriesLongSerialized(int columnNum, PrimitiveTypeInfo primitiveTypeInfo, - T serializeWrite) { - super(serializeWrite); - this.columnNum = columnNum; - primitiveCategory = primitiveTypeInfo.getPrimitiveCategory(); - } - - @Override - public void processBatch(VectorizedRowBatch batch) throws IOException { - - currentBatchSize = batch.size; - Preconditions.checkState(currentBatchSize > 0); - - LongColumnVector longColVector = (LongColumnVector) batch.cols[columnNum]; - - long[] vector = longColVector.vector; - - // The serialize routine uses this to build serializedKeyLengths. - currentKeyStart = 0; - output.reset(); - - if (longColVector.isRepeating){ - duplicateCounts[0] = currentBatchSize; - if (longColVector.noNulls || !longColVector.isNull[0]) { - seriesIsAllNull[0] = false; - serialize(0, vector[0]); - nonNullKeyCount = 1; - } else { - seriesIsAllNull[0] = true; - nonNullKeyCount = 0; - } - seriesCount = 1; - Preconditions.checkState(seriesCount <= currentBatchSize); - } else { - seriesCount = 0; - nonNullKeyCount = 0; - if (batch.selectedInUse) { - int[] selected = batch.selected; - if (longColVector.noNulls) { - - duplicateCounts[0] = 1; - long prevKey = vector[selected[0]]; - serialize(0, prevKey); - - long currentKey; - for (int logical = 1; logical < currentBatchSize; logical++) { - currentKey = vector[selected[logical]]; - if (prevKey == currentKey) { - duplicateCounts[seriesCount]++; - } else { - duplicateCounts[++seriesCount] = 1; - serialize(seriesCount, currentKey); - prevKey = currentKey; - } - } - Arrays.fill(seriesIsAllNull, 0, ++seriesCount, false); - nonNullKeyCount = seriesCount; - Preconditions.checkState(seriesCount <= currentBatchSize); - } else { - boolean[] isNull = longColVector.isNull; - - boolean prevKeyIsNull; - long prevKey = 0; - duplicateCounts[0] = 1; - int index = selected[0]; - if (isNull[index]) { - seriesIsAllNull[0] = true; - prevKeyIsNull = true; - nonNullKeyCount = 0; - } else { - seriesIsAllNull[0] = false; - prevKeyIsNull = false; - prevKey = vector[index]; - serialize(0, prevKey); - nonNullKeyCount = 1; - } - - long currentKey; - for (int logical = 1; logical < currentBatchSize; logical++) { - index = selected[logical]; - if (isNull[index]) { - if (prevKeyIsNull) { - duplicateCounts[seriesCount]++; - } else { - duplicateCounts[++seriesCount] = 1; - seriesIsAllNull[seriesCount] = true; - prevKeyIsNull = true; - } - } else { - currentKey = vector[index]; - if (!prevKeyIsNull && prevKey == currentKey) { - duplicateCounts[seriesCount]++; - } else { - duplicateCounts[++seriesCount] = 1; - seriesIsAllNull[seriesCount] = false; - serialize(nonNullKeyCount++, currentKey); - prevKeyIsNull = false; - 
prevKey = currentKey; - } - } - } - seriesCount++; - Preconditions.checkState(seriesCount <= currentBatchSize); - } - } else { - - // NOT selectedInUse - - if (longColVector.noNulls) { - - duplicateCounts[0] = 1; - long prevKey = vector[0]; - serialize(0, prevKey); - long currentKey; - for (int index = 1; index < currentBatchSize; index++) { - currentKey = vector[index]; - if (prevKey == currentKey) { - duplicateCounts[seriesCount]++; - } else { - duplicateCounts[++seriesCount] = 1; - serialize(seriesCount, currentKey); - prevKey = currentKey; - } - } - Arrays.fill(seriesIsAllNull, 0, ++seriesCount, false); - nonNullKeyCount = seriesCount; - Preconditions.checkState(seriesCount <= currentBatchSize); - } else { - boolean[] isNull = longColVector.isNull; - - boolean prevKeyIsNull; - long prevKey = 0; - duplicateCounts[0] = 1; - if (isNull[0]) { - seriesIsAllNull[0] = true; - prevKeyIsNull = true; - nonNullKeyCount = 0; - } else { - seriesIsAllNull[0] = false; - prevKeyIsNull = false; - prevKey = vector[0]; - serialize(0, prevKey); - nonNullKeyCount = 1; - } - - for (int index = 1; index < currentBatchSize; index++) { - if (isNull[index]) { - if (prevKeyIsNull) { - duplicateCounts[seriesCount]++; - } else { - duplicateCounts[++seriesCount] = 1; - seriesIsAllNull[seriesCount] = true; - prevKeyIsNull = true; - } - } else { - long currentKey = vector[index]; - if (!prevKeyIsNull && prevKey == currentKey) { - duplicateCounts[seriesCount]++; - } else { - duplicateCounts[++seriesCount] = 1; - seriesIsAllNull[seriesCount] = false; - serialize(nonNullKeyCount++, currentKey); - prevKeyIsNull = false; - prevKey = currentKey; - } - } - } - seriesCount++; - Preconditions.checkState(seriesCount <= currentBatchSize); - } - } - } - - // Finally. - computeSerializedHashCodes(); - positionToFirst(); - Preconditions.checkState(validate()); - } - - private void serialize(int pos, long value) throws IOException { - serializeWrite.setAppend(output); - - // UNDONE: Add support for DATE, TIMESTAMP, INTERVAL_YEAR_MONTH, INTERVAL_DAY_TIME... - switch (primitiveCategory) { - case BOOLEAN: - serializeWrite.writeBoolean(value != 0); - break; - case BYTE: - serializeWrite.writeByte((byte) value); - break; - case SHORT: - serializeWrite.writeShort((short) value); - break; - case INT: - serializeWrite.writeInt((int) value); - break; - case LONG: - serializeWrite.writeLong(value); - break; - default: - throw new RuntimeException("Unexpected primitive category " + primitiveCategory.name()); - } - int outputNewPosition = output.getLength(); - serializedKeyLengths[pos] = outputNewPosition - currentKeyStart; - currentKeyStart = outputNewPosition; - } -} diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/VectorKeySeriesMultiBase.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/VectorKeySeriesMultiBase.java new file mode 100644 index 0000000..dff16ab --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/VectorKeySeriesMultiBase.java @@ -0,0 +1,147 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.vector.keyseries; + +import java.io.IOException; + +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.google.common.base.Preconditions; + +/** + * An abstract class implementing key series of multiple column keys where the keys + * get serialized. + * + * It can be one key that requires custom serialization (e.g. HiveDecimal). + */ +public abstract class VectorKeySeriesMultiBase extends VectorKeySeriesSingle { + + private static final Logger LOG = LoggerFactory.getLogger(VectorKeySeriesMultiBase.class.getName()); + + public VectorKeySeriesMultiBase() { + super(); + } + + @Override + public void processBatch(VectorizedRowBatch batch) throws IOException { + + currentBatchSize = batch.size; + Preconditions.checkState(currentBatchSize > 0); + Preconditions.checkState(currentBatchSize <= VectorizedRowBatch.DEFAULT_SIZE); + + keyCount = 0; + boolean prevKeyIsNull; + duplicateCounts[0] = 1; + if (batch.selectedInUse) { + int[] selected = batch.selected; + int index = selected[0]; + writeMultiKey(batch, index, 0); + if (isNull()) { + keyIsNull[0] = prevKeyIsNull = true; + nonNullKeyCount = 0; + } else { + keyIsNull[0] = prevKeyIsNull = false; + nonNullKeyCount = 1; + } + + for (int logical = 1; logical < currentBatchSize; logical++) { + index = selected[logical]; + writeMultiKey(batch, index, nonNullKeyCount); + if (isNull()) { + if (prevKeyIsNull) { + duplicateCounts[keyCount]++; + } else { + duplicateCounts[++keyCount] = 1; + keyIsNull[keyCount] = prevKeyIsNull = true; + } + } else { + if (!prevKeyIsNull && equalsPrevKey(nonNullKeyCount)) { + forgetKey(); + duplicateCounts[keyCount]++; + } else { + duplicateCounts[++keyCount] = 1; + keyIsNull[keyCount] = prevKeyIsNull = false; + nonNullKeyCount++; + } + } + } + keyCount++; + Preconditions.checkState(keyCount <= currentBatchSize); + } else { + writeMultiKey(batch, 0, 0); + if (isNull()) { + keyIsNull[0] = prevKeyIsNull = true; + nonNullKeyCount = 0; + } else { + keyIsNull[0] = prevKeyIsNull = false; + nonNullKeyCount = 1; + } + + for (int index = 1; index < currentBatchSize; index++) { + writeMultiKey(batch, index, nonNullKeyCount); + if (isNull()) { + if (prevKeyIsNull) { + duplicateCounts[keyCount]++; + } else { + duplicateCounts[++keyCount] = 1; + keyIsNull[keyCount] = prevKeyIsNull = true; + } + } else { + if (!prevKeyIsNull && equalsPrevKey(nonNullKeyCount)) { + forgetKey(); + duplicateCounts[keyCount]++; + } else { + duplicateCounts[++keyCount] = 1; + keyIsNull[keyCount] = prevKeyIsNull = false; + nonNullKeyCount++; + } + } + } + keyCount++; + Preconditions.checkState(keyCount <= currentBatchSize); + } + + Preconditions.checkState(validate()); + } + + /* + * Serialize a multiple column key. + */ + protected abstract void writeMultiKey(VectorizedRowBatch batch, int index, int nonNullKeyCount) + throws IOException; + + /* + * After calling writeMultiKey, this method returns whether ANY the key columns is null. 
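+ * (processBatch collapses such keys into NULL series entries that carry no key data.)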
+ */ + protected abstract boolean isNull(); + + /* + * Return whether the previously serialized multi-column key is equal to the last serialized + * multi-column key. + */ + protected abstract boolean equalsPrevKey(int nonNullKeyCount); + + /* + * Forget the last serialized multi-column key (called when equalsPrevKey found that it + * matches the previous key). + */ + protected abstract void forgetKey(); +} diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/VectorKeySeriesMultiSerialized.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/VectorKeySeriesMultiSerialized.java deleted file mode 100644 index 2cc3531..0000000 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/VectorKeySeriesMultiSerialized.java +++ /dev/null @@ -1,187 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.hive.ql.exec.vector.keyseries; - -import java.io.IOException; - -import org.apache.hadoop.hive.ql.exec.vector.VectorSerializeRow; -import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; -import org.apache.hadoop.hive.ql.exec.vector.expressions.StringExpr; -import org.apache.hadoop.hive.ql.metadata.HiveException; -import org.apache.hadoop.hive.serde2.fast.SerializeWrite; -import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.google.common.base.Preconditions; - -/** - * A key series of a multiple columns of keys where the keys get serialized. - * (Or, it can be 1 column).
- */ -public class VectorKeySeriesMultiSerialized - extends VectorKeySeriesSerializedImpl implements VectorKeySeriesSerialized { - - private static final Logger LOG = LoggerFactory.getLogger( - VectorKeySeriesMultiSerialized.class.getName()); - - private VectorSerializeRow keySerializeRow; - - private boolean[] hasAnyNulls; - - public VectorKeySeriesMultiSerialized(T serializeWrite) { - super(serializeWrite); - } - - public void init(TypeInfo[] typeInfos, int[] columnNums) throws HiveException { - keySerializeRow = new VectorSerializeRow(serializeWrite); - keySerializeRow.init(typeInfos, columnNums); - hasAnyNulls = new boolean[VectorizedRowBatch.DEFAULT_SIZE]; - } - - @Override - public void processBatch(VectorizedRowBatch batch) throws IOException { - - currentBatchSize = batch.size; - Preconditions.checkState(currentBatchSize > 0); - - // LOG.info("VectorKeySeriesMultiSerialized processBatch size " + currentBatchSize + " numCols " + batch.numCols + " selectedInUse " + batch.selectedInUse); - - int prevKeyStart = 0; - int prevKeyLength; - int currentKeyStart = 0; - output.reset(); - - seriesCount = 0; - boolean prevKeyIsNull; - duplicateCounts[0] = 1; - if (batch.selectedInUse) { - int[] selected = batch.selected; - int index = selected[0]; - keySerializeRow.setOutputAppend(output); - keySerializeRow.serializeWrite(batch, index); - if (keySerializeRow.getIsAllNulls()) { - seriesIsAllNull[0] = prevKeyIsNull = true; - prevKeyLength = 0; - output.setWritePosition(0); - nonNullKeyCount = 0; - } else { - seriesIsAllNull[0] = prevKeyIsNull = false; - serializedKeyLengths[0] = currentKeyStart = prevKeyLength = output.getLength(); - hasAnyNulls[0] = keySerializeRow.getHasAnyNulls(); - nonNullKeyCount = 1; - } - - int keyLength; - for (int logical = 1; logical < currentBatchSize; logical++) { - index = selected[logical]; - keySerializeRow.setOutputAppend(output); - keySerializeRow.serializeWrite(batch, index); - if (keySerializeRow.getIsAllNulls()) { - if (prevKeyIsNull) { - duplicateCounts[seriesCount]++; - } else { - duplicateCounts[++seriesCount] = 1; - seriesIsAllNull[seriesCount] = prevKeyIsNull = true; - } - output.setWritePosition(currentKeyStart); - } else { - keyLength = output.getLength() - currentKeyStart; - if (!prevKeyIsNull && - StringExpr.equal( - output.getData(), prevKeyStart, prevKeyLength, - output.getData(), currentKeyStart, keyLength)) { - duplicateCounts[seriesCount]++; - output.setWritePosition(currentKeyStart); - } else { - duplicateCounts[++seriesCount] = 1; - seriesIsAllNull[seriesCount] = prevKeyIsNull = false; - prevKeyStart = currentKeyStart; - serializedKeyLengths[nonNullKeyCount] = prevKeyLength = keyLength; - currentKeyStart += keyLength; - hasAnyNulls[nonNullKeyCount] = keySerializeRow.getHasAnyNulls(); - nonNullKeyCount++; - } - } - } - seriesCount++; - Preconditions.checkState(seriesCount <= currentBatchSize); - } else { - keySerializeRow.setOutputAppend(output); - keySerializeRow.serializeWrite(batch, 0); - if (keySerializeRow.getIsAllNulls()) { - seriesIsAllNull[0] = prevKeyIsNull = true; - prevKeyLength = 0; - output.setWritePosition(0); - nonNullKeyCount = 0; - } else { - seriesIsAllNull[0] = prevKeyIsNull = false; - serializedKeyLengths[0] = currentKeyStart = prevKeyLength = output.getLength(); - hasAnyNulls[0] = keySerializeRow.getHasAnyNulls(); - nonNullKeyCount = 1; - } - - int keyLength; - for (int index = 1; index < currentBatchSize; index++) { - keySerializeRow.setOutputAppend(output); - keySerializeRow.serializeWrite(batch, index); - if 
(keySerializeRow.getIsAllNulls()) { - if (prevKeyIsNull) { - duplicateCounts[seriesCount]++; - } else { - duplicateCounts[++seriesCount] = 1; - seriesIsAllNull[seriesCount] = prevKeyIsNull = true; - } - output.setWritePosition(currentKeyStart); - } else { - keyLength = output.getLength() - currentKeyStart; - if (!prevKeyIsNull && - StringExpr.equal( - output.getData(), prevKeyStart, prevKeyLength, - output.getData(), currentKeyStart, keyLength)) { - duplicateCounts[seriesCount]++; - output.setWritePosition(currentKeyStart); - } else { - duplicateCounts[++seriesCount] = 1; - seriesIsAllNull[seriesCount] = prevKeyIsNull = false; - prevKeyStart = currentKeyStart; - serializedKeyLengths[nonNullKeyCount] = prevKeyLength = keyLength; - currentKeyStart += keyLength; - hasAnyNulls[nonNullKeyCount] = keySerializeRow.getHasAnyNulls(); - nonNullKeyCount++; - } - } - } - seriesCount++; - Preconditions.checkState(seriesCount <= currentBatchSize); - } - - // Finally. - computeSerializedHashCodes(); - positionToFirst(); - Preconditions.checkState(validate()); - } - - @Override - public void setNextNonNullKey(int nonNullKeyPosition) { - super.setNextNonNullKey(nonNullKeyPosition); - - currentHasAnyNulls = hasAnyNulls[nonNullKeyPosition]; - } -} diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/VectorKeySeriesSerialized.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/VectorKeySeriesSerialized.java deleted file mode 100644 index 1dfb3df..0000000 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/VectorKeySeriesSerialized.java +++ /dev/null @@ -1,35 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.hive.ql.exec.vector.keyseries; - -/** - * An abstract adding serialization to key series. - * - * A key with no or some nulls has serialized bytes, offset, and length. - */ -public interface VectorKeySeriesSerialized extends VectorKeySeries { - - /** - * @return the serialized bytes (including optional key tag), start offset of the key in the - * bytes, and key byte length. - */ - byte[] getSerializedBytes(); - int getSerializedStart(); - int getSerializedLength(); -} \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/VectorKeySeriesSerializedImpl.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/VectorKeySeriesSerializedImpl.java deleted file mode 100644 index 1fbafa7..0000000 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/VectorKeySeriesSerializedImpl.java +++ /dev/null @@ -1,130 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. 
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.hive.ql.exec.vector.keyseries; - -import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; -import org.apache.hadoop.hive.serde2.ByteStream.Output; -import org.apache.hadoop.hive.serde2.fast.SerializeWrite; -import org.apache.hive.common.util.HashCodeUtil; - -import com.google.common.base.Preconditions; - -/** - * Implementation of base serialization interface. - * - */ -public abstract class VectorKeySeriesSerializedImpl - extends VectorKeySeriesSingleImpl implements VectorKeySeriesSerialized { - - protected T serializeWrite; - - protected int bufferOffset; - - // The serialized (non-NULL) series keys. These 3 members represent the value. - public int serializedStart; - public int serializedLength; - public byte[] serializedBytes; - - protected final Output output; - - protected final int[] serializedKeyLengths; - - public VectorKeySeriesSerializedImpl(T serializeWrite) { - super(); - this.serializeWrite = serializeWrite; - output = new Output(); - serializedKeyLengths = new int[VectorizedRowBatch.DEFAULT_SIZE]; - } - - public boolean validate() { - super.validate(); - - int nullCount = 0; - for (int i = 0; i < seriesCount; i++) { - if (seriesIsAllNull[i]) { - nullCount++; - } - } - Preconditions.checkState(nullCount + nonNullKeyCount == seriesCount); - - int lengthSum = 0; - int keyLength; - for (int i = 0; i < nonNullKeyCount; i++) { - keyLength = serializedKeyLengths[i]; - Preconditions.checkState(keyLength > 0); - lengthSum += keyLength; - Preconditions.checkState(lengthSum <= output.getLength()); - } - return true; - } - - @Override - public byte[] getSerializedBytes() { - return serializedBytes; - } - - @Override - public int getSerializedStart() { - return serializedStart; - } - - @Override - public int getSerializedLength() { - return serializedLength; - } - - /** - * Batch compute the hash codes for all the serialized keys. - * - * NOTE: MAJOR MAJOR ASSUMPTION: - * We assume that HashCodeUtil.murmurHash produces the same result - * as MurmurHash.hash with seed = 0 (the method used by ReduceSinkOperator for - * UNIFORM distribution). - */ - protected void computeSerializedHashCodes() { - int offset = 0; - int keyLength; - byte[] bytes = output.getData(); - for (int i = 0; i < nonNullKeyCount; i++) { - keyLength = serializedKeyLengths[i]; - hashCodes[i] = HashCodeUtil.murmurHash(bytes, offset, keyLength); - offset += keyLength; - } - } - - @Override - public void positionToFirst() { - - // Reset this before calling positionToFirst. - bufferOffset = 0; - - super.positionToFirst(); - - // This is constant for whole series. 
- serializedBytes = output.getData(); - } - - @Override - public void setNextNonNullKey(int nonNullKeyPosition) { - serializedStart = bufferOffset; - serializedLength = serializedKeyLengths[nonNullKeyPosition]; - Preconditions.checkState(serializedStart + serializedLength <= output.getData().length); - bufferOffset += serializedLength; - } -} \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/VectorKeySeriesSingle.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/VectorKeySeriesSingle.java new file mode 100644 index 0000000..6c7e8d1 --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/VectorKeySeriesSingle.java @@ -0,0 +1,166 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.vector.keyseries; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; +import com.google.common.base.Preconditions; + +/** + * Implementation of when a one key series or a serialized key series is being presented. + * + */ +public abstract class VectorKeySeriesSingle extends VectorKeySeries { + + private static final Log LOG = LogFactory.getLog(VectorKeySeriesSingle.class.getName()); + + protected int currentBatchSize; + + // The number of key series in the batch. With sequential duplicates collapsed and including + // ALL NULL keys. + public int keyCount; + + // The current position in the key series. + protected int keyPosition; + + // The number of duplicates for each series key. + protected final int[] duplicateCounts; + + // Whether a series key is NULL. + protected final boolean[] keyIsNull; + + // The number of non ALL NULL keys. They have associated hash codes and key data. + protected int nonNullKeyCount; + + // The current non-NULL key position. + protected int nonNullKeyPosition; + + // The hash code for each non-NULL key. 
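A minimal self-contained sketch (editor's illustration, not part of the patch) of the contract these fields describe: adjacent equal keys are collapsed into one series entry with a duplicate count, ALL NULL entries carry no hash code, and only non-NULL entries consume a slot in the hash code array. The class and variable names below are illustrative stand-ins, not the Hive classes.

public class KeySeriesWalkDemo {
  public static void main(String[] args) {
    // 6 rows whose keys are A, A, NULL, B, B, B -> 3 collapsed series entries.
    int[] duplicateCounts = {2, 1, 3};
    boolean[] keyIsNull = {false, true, false};
    int[] hashCodes = {0x11, 0x22};   // one slot per non-NULL entry only

    int logical = 0;                  // row index where the current run starts
    int nonNullKeyPosition = 0;
    for (int keyPosition = 0; keyPosition < duplicateCounts.length; keyPosition++) {
      int runEnd = logical + duplicateCounts[keyPosition] - 1;
      if (keyIsNull[keyPosition]) {
        System.out.println("rows " + logical + ".." + runEnd + ": ALL NULL key, nothing to probe");
      } else {
        System.out.println("rows " + logical + ".." + runEnd
            + ": one probe with hashCode=0x" + Integer.toHexString(hashCodes[nonNullKeyPosition++]));
      }
      logical += duplicateCounts[keyPosition];
    }
  }
}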
+ protected final int[] hashCodes; + + protected VectorKeySeriesSingle() { + super(); + + keyCount = 0; + keyPosition = 0; + + duplicateCounts = new int[VectorizedRowBatch.DEFAULT_SIZE]; + keyIsNull = new boolean[VectorizedRowBatch.DEFAULT_SIZE]; + + nonNullKeyCount = 0; + nonNullKeyPosition = -1; + + hashCodes = new int[VectorizedRowBatch.DEFAULT_SIZE]; + } + + public boolean validate() { + Preconditions.checkState(keyCount > 0); + Preconditions.checkState(keyCount <= currentBatchSize); + Preconditions.checkState(nonNullKeyCount >= 0); + Preconditions.checkState(nonNullKeyCount <= keyCount); + + validateDuplicateCount(); + return true; + } + + private void validateDuplicateCount() { + int sum = 0; + int duplicateCount; + for (int i = 0; i < keyCount; i++) { + duplicateCount = duplicateCounts[i]; + Preconditions.checkState(duplicateCount > 0); + Preconditions.checkState(duplicateCount <= currentBatchSize); + sum += duplicateCount; + } + Preconditions.checkState(sum == currentBatchSize); + } + + public void positionToFirst() { + keyPosition = 0; + + currentLogical = 0; + currentDuplicateCount = duplicateCounts[0]; + currentKeyIsNull = keyIsNull[0]; + + if (!currentKeyIsNull) { + nonNullKeyPosition = 0; + currentHashCode = hashCodes[0]; + setNextNonEmptyKey(0); + } else { + nonNullKeyPosition = -1; + } + Preconditions.checkState(currentDuplicateCount > 0); + } + + // Consumes whole key. + public boolean next() { + + currentLogical += currentDuplicateCount; + if (currentLogical >= currentBatchSize) { + return false; + } + + Preconditions.checkState(keyPosition + 1 < keyCount); + + keyPosition++; + currentDuplicateCount = duplicateCounts[keyPosition]; + currentKeyIsNull = keyIsNull[keyPosition]; + + if (!currentKeyIsNull) { + Preconditions.checkState(nonNullKeyPosition + 1 < nonNullKeyCount); + nonNullKeyPosition++; + currentHashCode = hashCodes[nonNullKeyPosition]; + setNextNonEmptyKey(nonNullKeyPosition); + } + Preconditions.checkState(currentDuplicateCount > 0); + return true; + } + + // For use by VectorKeySeriesMulti so that the minimum equal key can be advanced. + /** + * Advance the current key by a duplicate key count. + * If there are more than duplicateCount keys left in the current key, then + * we remain in the current key. + * @param duplicateCount + */ + public void advance(int duplicateCount) { + + currentLogical += currentDuplicateCount; + + currentDuplicateCount -= duplicateCount; + if (currentDuplicateCount == 0) { + keyPosition++; + currentKeyIsNull = keyIsNull[keyPosition]; + currentDuplicateCount = duplicateCounts[keyPosition]; + + if (!currentKeyIsNull) { + nonNullKeyPosition++; + currentHashCode = hashCodes[nonNullKeyPosition]; + setNextNonEmptyKey(nonNullKeyPosition); + } + } + } + + /** + * Position the current non-empty key from the specified position. + * @param nonAllNullKeyPosition + */ + public abstract void setNextNonEmptyKey(int nonAllNullKeyPosition); +} \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/VectorKeySeriesSingleImpl.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/VectorKeySeriesSingleImpl.java deleted file mode 100644 index bf0a25b..0000000 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/VectorKeySeriesSingleImpl.java +++ /dev/null @@ -1,158 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. 
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.hive.ql.exec.vector.keyseries; - -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; -import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; -import com.google.common.base.Preconditions; - -/** - * Implementation of when a one key series or a serialized key series is being presented. - * - */ -public abstract class VectorKeySeriesSingleImpl extends VectorKeySeriesImpl - implements VectorKeySeries { - - private static final Log LOG = LogFactory.getLog(VectorKeySeriesSingleImpl.class.getName()); - - protected int currentBatchSize; - - // The number of keys (with sequential duplicates collapsed, both NULL and non-NULL) in the batch. - protected int seriesCount; - - // The current position in the key series. - protected int seriesPosition; - - // The number of duplicates for each series key (NULL or non-NULL). - protected final int[] duplicateCounts; - - // Whether a series key is NULL. - protected final boolean[] seriesIsAllNull; - - // The number of non-NULL keys. They have associated hash codes and key data. - protected int nonNullKeyCount; - - // The current non-NULL key position. - protected int nonNullKeyPosition; - - // The hash code for each non-NULL key. - protected final int[] hashCodes; - - VectorKeySeriesSingleImpl() { - super(); - - seriesCount = 0; - seriesPosition = 0; - - duplicateCounts = new int[VectorizedRowBatch.DEFAULT_SIZE]; - seriesIsAllNull = new boolean[VectorizedRowBatch.DEFAULT_SIZE]; - - nonNullKeyCount = 0; - nonNullKeyPosition = -1; - - hashCodes = new int[VectorizedRowBatch.DEFAULT_SIZE]; - } - - public boolean validate() { - Preconditions.checkState(seriesCount > 0); - Preconditions.checkState(seriesCount <= currentBatchSize); - Preconditions.checkState(nonNullKeyCount >= 0); - Preconditions.checkState(nonNullKeyCount <= seriesCount); - - validateDuplicateCount(); - return true; - } - - private void validateDuplicateCount() { - int sum = 0; - int duplicateCount; - for (int i = 0; i < seriesCount; i++) { - duplicateCount = duplicateCounts[i]; - Preconditions.checkState(duplicateCount > 0); - Preconditions.checkState(duplicateCount <= currentBatchSize); - sum += duplicateCount; - } - Preconditions.checkState(sum == currentBatchSize); - } - - @Override - public void positionToFirst() { - seriesPosition = 0; - - currentLogical = 0; - currentDuplicateCount = duplicateCounts[0]; - currentIsAllNull = seriesIsAllNull[0]; - - if (!currentIsAllNull) { - nonNullKeyPosition = 0; - currentHashCode = hashCodes[0]; - setNextNonNullKey(0); - } else { - nonNullKeyPosition = -1; - } - Preconditions.checkState(currentDuplicateCount > 0); - } - - // Consumes whole key. 
- @Override - public boolean next() { - - currentLogical += currentDuplicateCount; - if (currentLogical >= currentBatchSize) { - return false; - } - - Preconditions.checkState(seriesPosition + 1 < seriesCount); - - seriesPosition++; - currentDuplicateCount = duplicateCounts[seriesPosition]; - currentIsAllNull = seriesIsAllNull[seriesPosition]; - - if (!currentIsAllNull) { - Preconditions.checkState(nonNullKeyPosition + 1 < nonNullKeyCount); - nonNullKeyPosition++; - currentHashCode = hashCodes[nonNullKeyPosition]; - setNextNonNullKey(nonNullKeyPosition); - } - Preconditions.checkState(currentDuplicateCount > 0); - return true; - } - - // For use by VectorKeySeriesMulti so that the minimum equal key can be advanced. - public void advance(int duplicateCount) { - - currentLogical += currentDuplicateCount; - - currentDuplicateCount -= duplicateCount; - if (currentDuplicateCount == 0) { - seriesPosition++; - currentIsAllNull = seriesIsAllNull[seriesPosition]; - currentDuplicateCount = duplicateCounts[seriesPosition]; - - if (!currentIsAllNull) { - nonNullKeyPosition++; - currentHashCode = hashCodes[nonNullKeyPosition]; - setNextNonNullKey(nonNullKeyPosition); - } - } - } - - protected abstract void setNextNonNullKey(int nonNullKeyPosition); -} \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/fast/VectorKeySeriesBytesFast.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/fast/VectorKeySeriesBytesFast.java new file mode 100644 index 0000000..7815ebb --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/fast/VectorKeySeriesBytesFast.java @@ -0,0 +1,105 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.vector.keyseries.fast; + +import java.io.IOException; + +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; +import org.apache.hadoop.hive.ql.exec.vector.keyseries.VectorKeySeriesBytesBase; +import org.apache.hadoop.hive.serde2.ByteStream.Output; +import org.apache.hadoop.hive.serde2.fast.SerializeWrite; +import com.google.common.base.Preconditions; + +/** + * A key series of a single column of byte array keys where the keys get serialized using + * fast SerializeWrite style serialization. + */ +public final class VectorKeySeriesBytesFast + extends VectorKeySeriesBytesBase { + + private T serializeWrite; + + // The serialized (non-NULL) series keys. These 3 members represent the value. 
+ public byte[] serializedBytes; + public int serializedStart; + public int serializedLength; + + private final Output output; + + private int outputOffset; + + private final int[] serializedKeyLengths; + + public VectorKeySeriesBytesFast(int columnNum, T serializeWrite) { + super(columnNum); + this.serializeWrite = serializeWrite; + output = new Output(); + serializedKeyLengths = new int[VectorizedRowBatch.DEFAULT_SIZE]; + } + + @Override + public void processBatch(VectorizedRowBatch batch) throws IOException { + outputOffset = 0; + output.reset(); + + super.processBatch(batch); + + Preconditions.checkState( + VectorKeySeriesFastUtil.validate(keyCount, nonNullKeyCount, keyIsNull, + serializedKeyLengths, output.getLength())); + + if (nonNullKeyCount > 0) { + // Compute hash codes of fast serialized keys. + VectorKeySeriesFastUtil.computeSerializedHashCodes(output, serializedKeyLengths, + nonNullKeyCount, hashCodes); + } + + // Do the posiiton after we compute the checksums. + positionToFirst(); + } + + @Override + public void saveBytesKey(int nonNullKeyPosition, byte[] keyBytes, int keyByteStart, + int keyByteLength) throws IOException { + serializeWrite.setAppend(output); + serializeWrite.writeString(keyBytes, keyByteStart, keyByteLength); + int outputNewOffset = output.getLength(); + serializedKeyLengths[nonNullKeyPosition] = outputNewOffset - outputOffset; + outputOffset = outputNewOffset; + } + + @Override + public void positionToFirst() { + + // Reset this before calling positionToFirst. + outputOffset = 0; + + super.positionToFirst(); + + // This is constant for whole series. + serializedBytes = output.getData(); + } + + public void setNextNonEmptyKey(int nonNullKeyPosition) { + serializedStart = outputOffset; + serializedLength = serializedKeyLengths[nonNullKeyPosition]; + Preconditions.checkState(serializedStart + serializedLength <= output.getData().length); + outputOffset += serializedLength; + } +} diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/fast/VectorKeySeriesFastUtil.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/fast/VectorKeySeriesFastUtil.java new file mode 100644 index 0000000..43589f8 --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/fast/VectorKeySeriesFastUtil.java @@ -0,0 +1,94 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.vector.keyseries.fast; + +import org.apache.hadoop.hive.serde2.ByteStream.Output; +import org.apache.hive.common.util.HashCodeUtil; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Implementation of base fast SerializeWrite style serialization interface. 
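The processBatch()/saveBytesKey()/setNextNonEmptyKey() flow above keeps every serialized key in one contiguous Output buffer and records only per-key lengths; start offsets are recovered later by accumulating those lengths. A self-contained sketch of that bookkeeping (editor's illustration using plain JDK types, not the patch's Output/SerializeWrite classes):

import java.io.ByteArrayOutputStream;
import java.nio.charset.StandardCharsets;

public class SerializedKeyBufferDemo {
  public static void main(String[] args) throws Exception {
    ByteArrayOutputStream output = new ByteArrayOutputStream();
    int[] serializedKeyLengths = new int[3];
    String[] keys = {"apple", "banana", "fig"};

    int previousLength = 0;
    for (int i = 0; i < keys.length; i++) {
      output.write(keys[i].getBytes(StandardCharsets.UTF_8));   // stands in for serializeWrite
      serializedKeyLengths[i] = output.size() - previousLength; // record only the length
      previousLength = output.size();
    }

    // Iteration recovers each key's (start, length) by accumulation, as setNextNonEmptyKey() does.
    byte[] serializedBytes = output.toByteArray();
    int offset = 0;
    for (int i = 0; i < keys.length; i++) {
      System.out.println("key " + i + " start=" + offset + " length=" + serializedKeyLengths[i]
          + " -> " + new String(serializedBytes, offset, serializedKeyLengths[i], StandardCharsets.UTF_8));
      offset += serializedKeyLengths[i];
    }
  }
}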
+ * + */ +public class VectorKeySeriesFastUtil { + + private static final String CLASS_NAME = VectorKeySeriesFastUtil.class.getName(); + private static final Logger LOG = LoggerFactory.getLogger(CLASS_NAME); + + public static boolean validate(int keyCount, int nonNullKeyCount, boolean[] keyAllNulls, + int[] serializedKeyLengths, int outputUsedLength) { + + int nullCount = 0; + for (int i = 0; i < keyCount; i++) { + if (keyAllNulls[i]) { + nullCount++; + } + } + if (nullCount + nonNullKeyCount != keyCount) { + return false; + } + + if (!validateKeyLengthSum(nonNullKeyCount, serializedKeyLengths, outputUsedLength)) { + return false; + } + + return true; + } + + public static boolean validateKeyLengthSum(int nonNullKeyCount, + int[] serializedKeyLengths, int outputUsedLength) { + int lengthSum = 0; + int keyLength; + for (int i = 0; i < nonNullKeyCount; i++) { + keyLength = serializedKeyLengths[i]; + if (keyLength <= 0) { + return false; + } + lengthSum += keyLength; + if (lengthSum > outputUsedLength) { + return false; + } + } + if (lengthSum != outputUsedLength) { + return false; + } + return true; + } + + /** + * Batch compute the hash codes for all the serialized keys. + * + * NOTE: MAJOR MAJOR ASSUMPTION: + * We assume that HashCodeUtil.murmurHash produces the same result + * as MurmurHash.hash with seed = 0 (the method used by ReduceSinkOperator for + * UNIFORM distribution). + */ + protected static void computeSerializedHashCodes(Output output, int[] serializedKeyLengths, + int nonNullKeyCount, int[] hashCodes) { + int offset = 0; + int keyLength; + byte[] bytes = output.getData(); + for (int i = 0; i < nonNullKeyCount; i++) { + keyLength = serializedKeyLengths[i]; + hashCodes[i] = HashCodeUtil.murmurHash(bytes, offset, keyLength); + offset += keyLength; + } + } +} \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/fast/VectorKeySeriesLongFast.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/fast/VectorKeySeriesLongFast.java new file mode 100644 index 0000000..b78c17f --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/fast/VectorKeySeriesLongFast.java @@ -0,0 +1,130 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
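computeSerializedHashCodes() above leans on the stated assumption that HashCodeUtil.murmurHash agrees with MurmurHash at seed 0, the hashing ReduceSinkOperator uses for UNIFORM key distribution. A small verification sketch (editor's illustration; it assumes hadoop-common's org.apache.hadoop.util.hash.MurmurHash with hash(byte[], int, int) is available, which this patch does not itself guarantee):

import java.nio.charset.StandardCharsets;
import org.apache.hadoop.util.hash.MurmurHash;
import org.apache.hive.common.util.HashCodeUtil;

public class MurmurHashCompatibilityCheck {
  public static void main(String[] args) {
    byte[] key = "example serialized key".getBytes(StandardCharsets.UTF_8);
    int hiveHash = HashCodeUtil.murmurHash(key, 0, key.length);
    int hadoopHash = MurmurHash.getInstance().hash(key, key.length, 0);
    // Per the assumption documented above, these two values must be equal for
    // every key; otherwise the vectorized key series and the shuffle-side key
    // distribution would disagree.
    System.out.println("hiveHash=" + hiveHash + " hadoopHash=" + hadoopHash
        + " match=" + (hiveHash == hadoopHash));
  }
}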
+ */ + +package org.apache.hadoop.hive.ql.exec.vector.keyseries.fast; + +import java.io.IOException; +import java.util.Arrays; + +import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; +import org.apache.hadoop.hive.ql.exec.vector.keyseries.VectorKeySeriesBytesBase; +import org.apache.hadoop.hive.ql.exec.vector.keyseries.VectorKeySeriesLongBase; +import org.apache.hadoop.hive.serde2.ByteStream.Output; +import org.apache.hadoop.hive.serde2.fast.SerializeWrite; +import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory; +import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; + +import com.google.common.base.Preconditions; + +/** + * A key series of a single column of long keys where the keys get serialized using + * fast SerializeWrite style serialization + */ +public final class VectorKeySeriesLongFast + extends VectorKeySeriesLongBase { + + private T serializeWrite; + + // The serialized (non-NULL) series keys. These 3 members represent the value. + public byte[] serializedBytes; + public int serializedStart; + public int serializedLength; + + private final Output output; + + private int outputOffset; + + private final int[] serializedKeyLengths; + + public VectorKeySeriesLongFast(int columnNum, PrimitiveTypeInfo primitiveTypeInfo, + T serializeWrite) { + super(columnNum, primitiveTypeInfo); + this.serializeWrite = serializeWrite; + output = new Output(); + serializedKeyLengths = new int[VectorizedRowBatch.DEFAULT_SIZE]; + } + + @Override + public void processBatch(VectorizedRowBatch batch) throws IOException { + outputOffset = 0; + output.reset(); + + super.processBatch(batch); + + Preconditions.checkState( + VectorKeySeriesFastUtil.validate(keyCount, nonNullKeyCount, keyIsNull, + serializedKeyLengths, output.getLength())); + + if (nonNullKeyCount > 0) { + // Compute hash codes of fast serialized keys. + VectorKeySeriesFastUtil.computeSerializedHashCodes(output, serializedKeyLengths, + nonNullKeyCount, hashCodes); + } + + // Do the posiiton after we compute the checksums. + positionToFirst(); + } + + protected void saveLongKey(int nonNullKeyPosition, long key) throws IOException { + serializeWrite.setAppend(output); + + switch (primitiveCategory) { + case BOOLEAN: + serializeWrite.writeBoolean(key != 0); + break; + case BYTE: + serializeWrite.writeByte((byte) key); + break; + case SHORT: + serializeWrite.writeShort((short) key); + break; + case INT: + serializeWrite.writeInt((int) key); + break; + case LONG: + serializeWrite.writeLong(key); + break; + default: + throw new RuntimeException("Unexpected primitive category " + primitiveCategory.name()); + } + int outputNewPosition = output.getLength(); + serializedKeyLengths[nonNullKeyPosition] = outputNewPosition - outputOffset; + outputOffset = outputNewPosition; + } + + @Override + public void positionToFirst() { + + // Reset this before calling positionToFirst. + outputOffset = 0; + + super.positionToFirst(); + + // This is constant for whole series. 
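saveLongKey() above can serve every integer-family key through a single long value because vectorized execution stores BOOLEAN, BYTE, SHORT, INT, and LONG columns in a LongColumnVector; the switch only narrows the long back to the declared type before handing it to the SerializeWrite implementation. A self-contained sketch of that narrowing (editor's illustration; the enum and class names are not the Hive types):

public class LongKeyNarrowingDemo {
  enum Category { BOOLEAN, BYTE, SHORT, INT, LONG }

  static Object narrow(Category category, long key) {
    switch (category) {
      case BOOLEAN: return key != 0;
      case BYTE:    return (byte) key;
      case SHORT:   return (short) key;
      case INT:     return (int) key;
      case LONG:    return key;
      default:      throw new IllegalArgumentException(category.name());
    }
  }

  public static void main(String[] args) {
    System.out.println(narrow(Category.BOOLEAN, 1L));    // true
    // Wraps to 4464, as the cast in saveLongKey would; in practice a SHORT
    // column vector never holds an out-of-range value.
    System.out.println(narrow(Category.SHORT, 70000L));
  }
}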
+ serializedBytes = output.getData(); + } + + @Override + public void setNextNonEmptyKey(int nonNullKeyPosition) { + serializedStart = outputOffset; + serializedLength = serializedKeyLengths[nonNullKeyPosition]; + Preconditions.checkState(serializedStart + serializedLength <= output.getData().length); + outputOffset += serializedLength; + } +} diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/fast/VectorKeySeriesMultiFast.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/fast/VectorKeySeriesMultiFast.java new file mode 100644 index 0000000..7da25ad --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/fast/VectorKeySeriesMultiFast.java @@ -0,0 +1,65 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.vector.keyseries.fast; + +import java.io.IOException; + +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; +import org.apache.hadoop.hive.serde2.fast.SerializeWrite; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.google.common.base.Preconditions; + +/** + * A key series of a multiple columns of keys where the keys get serialized using + * fast SerializeWrite style serialization. + * + * NOTE: it can be 1 column when that column isn't one supported by the other type specializations. 
+ */ +public final class VectorKeySeriesMultiFast + extends VectorKeySeriesMultiFastBase { + + private static final String CLASS_NAME = VectorKeySeriesMultiFast.class.getName(); + private static final Logger LOG = LoggerFactory.getLogger(CLASS_NAME); + + public VectorKeySeriesMultiFast(T serializeWrite) { + super(serializeWrite); + } + + @Override + protected void writeMultiKey(VectorizedRowBatch batch, int index, int nonNullKeyCount) + throws IOException { + currentKeyOffset = output.getLength(); + keySerializeRow.setOutputAppend(output); + keySerializeRow.serializeWrite(batch, index); + if (keySerializeRow.getHasAnyNulls()) { + output.setWritePosition(currentKeyOffset); + return; + } + int writeLength = output.getLength() - currentKeyOffset; + Preconditions.checkState(writeLength > 0); + serializedKeyLengths[nonNullKeyCount] = writeLength; + } + + @Override + protected boolean isNull() { + return keySerializeRow.getHasAnyNulls(); + } +} diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/fast/VectorKeySeriesMultiFastBase.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/fast/VectorKeySeriesMultiFastBase.java new file mode 100644 index 0000000..c396baa --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/fast/VectorKeySeriesMultiFastBase.java @@ -0,0 +1,184 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.vector.keyseries.fast; + +import java.io.IOException; +import java.util.Arrays; + +import org.apache.hadoop.hive.ql.exec.vector.VectorSerializeRow; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; +import org.apache.hadoop.hive.ql.exec.vector.expressions.StringExpr; +import org.apache.hadoop.hive.ql.exec.vector.keyseries.VectorKeySeriesMultiBase; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.serde2.ByteStream.Output; +import org.apache.hadoop.hive.serde2.fast.SerializeWrite; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.google.common.base.Preconditions; + +/** + * A key series of a multiple columns of keys where the keys get serialized using + * fast SerializeWrite style serialization. + * + * NOTE: it can be 1 column when that column isn't one supported by the other type specializations. + */ +public abstract class VectorKeySeriesMultiFastBase + extends VectorKeySeriesMultiBase { + + private static final String CLASS_NAME = VectorKeySeriesMultiFast.class.getName(); + private static final Logger LOG = LoggerFactory.getLogger(CLASS_NAME); + + private final T serializeWrite; + + protected VectorSerializeRow keySerializeRow; + + // The serialized (non-NULL) series keys. 
These 3 members represent the value. + public int serializedStart; + public int serializedLength; + public byte[] serializedBytes; + + protected final Output output; + + protected int currentKeyOffset; + + protected final int[] serializedKeyLengths; + + public VectorKeySeriesMultiFastBase(T serializeWrite) { + this.serializeWrite = serializeWrite; + output = new Output(); + serializedKeyLengths = new int[VectorizedRowBatch.DEFAULT_SIZE]; + } + + public void init(TypeInfo[] typeInfos, int[] columnNums) throws HiveException { + keySerializeRow = new VectorSerializeRow(serializeWrite); + keySerializeRow.init(typeInfos, columnNums); + } + + @Override + public void processBatch(VectorizedRowBatch batch) throws IOException { + currentKeyOffset = 0; + output.reset(); + + super.processBatch(batch); + + if (!VectorKeySeriesFastUtil.validate(keyCount, nonNullKeyCount, keyIsNull, + serializedKeyLengths, output.getLength())) { + throw new RuntimeException("validate error batch.size " + batch.size + + " keyCount " + keyCount + + " nonNullKeyCount " + nonNullKeyCount + + " keyIsNull " + Arrays.toString(Arrays.copyOf(keyIsNull, keyCount)) + + " serializedKeyLengths " + Arrays.toString(Arrays.copyOf(serializedKeyLengths, keyCount)) + + " output.getLength() " + output.getLength()); + } + Preconditions.checkState( + VectorKeySeriesFastUtil.validate(keyCount, nonNullKeyCount, keyIsNull, + serializedKeyLengths, output.getLength())); + + if (nonNullKeyCount > 0) { + // Compute hash codes of fast serialized keys. + VectorKeySeriesFastUtil.computeSerializedHashCodes(output, serializedKeyLengths, + nonNullKeyCount, hashCodes); + } + + // Do the position after we compute the checksums. + positionToFirst(); + } + + @Override + protected boolean equalsPrevKey(int nonNullKeyCount) { + int prevKeyLength = serializedKeyLengths[nonNullKeyCount - 1]; + int keyLength = serializedKeyLengths[nonNullKeyCount]; + byte[] bytes = output.getData(); + boolean result = + StringExpr.equal(bytes, currentKeyOffset - prevKeyLength, prevKeyLength, + bytes, currentKeyOffset, keyLength); + return result; + } + + @Override + protected void forgetKey() { + output.setWritePosition(currentKeyOffset); + } + + @Override + public void positionToFirst() { + + // Reset this before calling positionToFirst. + currentKeyOffset = 0; + + super.positionToFirst(); + + // This is constant for whole series. 
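equalsPrevKey() and forgetKey() above implement duplicate collapsing directly on the serialized bytes: each candidate key is serialized at the end of the buffer, compared byte-for-byte against the previous key, and the write position is rolled back when they match. A self-contained sketch of that idea (editor's illustration using a hand-rolled comparison instead of StringExpr.equal):

public class AdjacentDuplicateCollapseDemo {

  static boolean equalRanges(byte[] bytes, int startA, int lenA, int startB, int lenB) {
    if (lenA != lenB) {
      return false;
    }
    for (int i = 0; i < lenA; i++) {
      if (bytes[startA + i] != bytes[startB + i]) {
        return false;
      }
    }
    return true;
  }

  public static void main(String[] args) {
    // Buffer already holds the previous key "ab" at [0,2); the current key "ab"
    // was just serialized at [2,4).
    byte[] buffer = {'a', 'b', 'a', 'b'};
    int prevStart = 0, prevLength = 2;
    int currentStart = 2, currentLength = 2;

    if (equalRanges(buffer, prevStart, prevLength, currentStart, currentLength)) {
      // "forgetKey": discard the current serialization and bump the previous
      // entry's duplicate count instead.
      int writePosition = currentStart;
      System.out.println("duplicate collapsed, writePosition rolled back to " + writePosition);
    }
  }
}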
+ serializedBytes = output.getData(); + } + + @Override + public void setNextNonEmptyKey(int nonNullKeyPosition) { + serializedStart = currentKeyOffset; + serializedLength = serializedKeyLengths[nonNullKeyPosition]; + Preconditions.checkState(serializedStart + serializedLength <= output.getData().length); + currentKeyOffset += serializedLength; + } + + @Override + public String toString() { + StringBuilder sb = new StringBuilder(); + sb.append(CLASS_NAME); + sb.append(" "); + sb.append("keyCount "); + sb.append(keyCount); + sb.append(" nonNullKeyCount "); + sb.append(nonNullKeyCount); + sb.append("\n"); + int logical = 0; + int nonNullIndex = 0; + int duplicateCount; + boolean isAllNull; + int keyOffset = 0; + int keyLength; + int hashCode; + byte[] bytes = output.getData(); + for (int i = 0; i < keyCount; i++) { + sb.append(logical); + sb.append(" "); + duplicateCount = duplicateCounts[i]; + isAllNull = keyIsNull[i]; + if (isAllNull) { + sb.append("NULL "); + } else { + keyLength = serializedKeyLengths[nonNullIndex]; + sb.append(displayBytes(bytes, keyOffset, keyLength)); + keyOffset += keyLength; + hashCode = hashCodes[nonNullIndex]; + sb.append(" hashCode "); + sb.append(hashCode); + nonNullIndex++; + } + if (duplicateCount > 1) { + sb.append(" repeat "); + sb.append(duplicateCount); + } + sb.append("\n"); + logical += duplicateCount; + } + return sb.toString(); + } +} \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/fast/VectorKeySeriesMultiFastNullSafe.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/fast/VectorKeySeriesMultiFastNullSafe.java new file mode 100644 index 0000000..8d819ca --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/keyseries/fast/VectorKeySeriesMultiFastNullSafe.java @@ -0,0 +1,71 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.vector.keyseries.fast; + +import java.io.IOException; + +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; +import org.apache.hadoop.hive.serde2.fast.SerializeWrite; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.google.common.base.Preconditions; + +/** + * A key series of a multiple columns of keys where the keys get serialized using + * fast SerializeWrite style serialization. + * + * NOTE: it can be 1 column when that column isn't one supported by the other type specializations. 
+ */ +public final class VectorKeySeriesMultiFastNullSafe + extends VectorKeySeriesMultiFastBase { + + private static final String CLASS_NAME = VectorKeySeriesMultiFastNullSafe.class.getName(); + private static final Logger LOG = LoggerFactory.getLogger(CLASS_NAME); + + private final boolean[] nullsafe; + + public VectorKeySeriesMultiFastNullSafe(T serializeWrite, boolean[] nullsafe) { + super(serializeWrite); + this.nullsafe = nullsafe; + } + + @Override + protected void writeMultiKey(VectorizedRowBatch batch, int index, int nonNullKeyCount) + throws IOException { + currentKeyOffset = output.getLength(); + keySerializeRow.setOutputAppend(output); + + // Call different serialize method designed for nullsafe. + keySerializeRow.serializeWriteNullSafe(batch, index, nullsafe); + if (keySerializeRow.getNullSafeNullsRemain()) { + output.setWritePosition(currentKeyOffset); + return; + } + + int writeLength = output.getLength() - currentKeyOffset; + Preconditions.checkState(writeLength > 0); + serializedKeyLengths[nonNullKeyCount] = writeLength; + } + + @Override + protected boolean isNull() { + return keySerializeRow.getNullSafeNullsRemain(); + } +} diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinCommonOperator.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinCommonOperator.java index 24668f9..a194277 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinCommonOperator.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinCommonOperator.java @@ -22,6 +22,7 @@ import java.util.Arrays; import java.util.List; import java.util.Map; + import org.apache.commons.lang.ArrayUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -32,6 +33,10 @@ import org.apache.hadoop.hive.ql.exec.HashTableLoader; import org.apache.hadoop.hive.ql.exec.MapJoinOperator; import org.apache.hadoop.hive.ql.exec.Utilities; +import org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainer; +import org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainerSerDe; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableFactory; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableFind; import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; import org.apache.hadoop.hive.ql.exec.vector.VectorColumnMapping; import org.apache.hadoop.hive.ql.exec.vector.VectorColumnOutputMapping; @@ -43,19 +48,14 @@ import org.apache.hadoop.hive.ql.exec.vector.VectorizationContextRegion; import org.apache.hadoop.hive.ql.exec.vector.VectorizedBatchUtil; import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; -import org.apache.hadoop.hive.ql.exec.vector.expressions.IdentityExpression; import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpression; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.optimized.VectorMapJoinOptimizedCreateHashTable; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashTable; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinTableContainer; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.fast.VectorMapJoinFastHashTableLoader; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.plan.BaseWork; import org.apache.hadoop.hive.ql.plan.ExprNodeDesc; import org.apache.hadoop.hive.ql.plan.MapJoinDesc; import org.apache.hadoop.hive.ql.plan.OperatorDesc; import org.apache.hadoop.hive.ql.plan.VectorMapJoinDesc; 
-import org.apache.hadoop.hive.ql.plan.VectorMapJoinDesc.HashTableImplementationType; +import org.apache.hadoop.hive.ql.plan.VectorMapJoinInfo; import org.apache.hadoop.hive.ql.plan.api.OperatorType; import org.apache.hadoop.hive.serde2.lazybinary.fast.LazyBinaryDeserializeRead; import org.apache.hadoop.hive.serde2.objectinspector.StructField; @@ -63,6 +63,9 @@ import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils; +import com.google.common.annotations.VisibleForTesting; +import com.google.common.base.Preconditions; + /** * This class is common operator class for native vectorized map join. * @@ -72,7 +75,43 @@ */ public abstract class VectorMapJoinCommonOperator extends MapJoinOperator implements VectorizationContextRegion { private static final long serialVersionUID = 1L; - private static final Logger LOG = LoggerFactory.getLogger(VectorMapJoinCommonOperator.class.getName()); + + //------------------------------------------------------------------------------------------------ + + private static final String CLASS_NAME = VectorMapJoinCommonOperator.class.getName(); +private static final Logger LOG = LoggerFactory.getLogger(CLASS_NAME); + + protected abstract String getLoggingPrefix(); + + // For debug tracing: information about the map or reduce task, operator, operator class, etc. + protected transient String loggingPrefix; + + protected String getLoggingPrefix(String className) { + if (loggingPrefix == null) { + initLoggingPrefix(className); + } + return loggingPrefix; + } + + protected void initLoggingPrefix(String className) { + if (hconf == null) { + // Constructor time... + loggingPrefix = className; + } else { + // Determine the name of our map or reduce task for debug tracing. + BaseWork work = Utilities.getMapWork(hconf); + if (work == null) { + work = Utilities.getReduceWork(hconf); + } + loggingPrefix = className + " " + work.getName() + " " + getOperatorId(); + } + } + + //------------------------------------------------------------------------------------------------ + + protected VectorMapJoinDesc vectorDesc; + + protected VectorMapJoinInfo vectorMapJoinInfo; // Whether this operator is an outer join. protected boolean isOuterJoin; @@ -88,10 +127,10 @@ // a mixture of input big table columns and new scratch columns. protected VectorizationContext vOutContext; - // The output column projection of the vectorized row batch. And, the type names of the output + // The output column projection of the vectorized row batch. And, the type infos of the output // columns. protected int[] outputProjection; - protected String[] outputTypeNames; + protected TypeInfo[] outputTypeInfos; // These are the vectorized batch expressions for filtering, key expressions, and value // expressions. @@ -101,15 +140,17 @@ // This is map of which vectorized row batch columns are the big table key columns. Since // we may have key expressions that produce new scratch columns, we need a mapping. - // And, we have their type names. + // And, we have their type infos. protected int[] bigTableKeyColumnMap; - protected ArrayList bigTableKeyTypeNames; + protected String[] bigTableKeyColumnNames; + protected TypeInfo[] bigTableKeyTypeInfos; // Similarly, this is map of which vectorized row batch columns are the big table value columns. // Since we may have value expressions that produce new scratch columns, we need a mapping. - // And, we have their type names. + // And, we have their type infos. 
protected int[] bigTableValueColumnMap; - protected ArrayList bigTableValueTypeNames; + protected String[] bigTableValueColumnNames; + protected TypeInfo[] bigTableValueTypeInfos; // This is a mapping of which big table columns (input and key/value expressions) will be // part of the big table portion of the join output result. @@ -124,6 +165,8 @@ // to output batch scratch columns for the small table portion. protected VectorColumnSourceMapping smallTableMapping; + protected VectorColumnSourceMapping projectionMapping; + // These are the output columns for the small table and the outer small table keys. protected int[] smallTableOutputVectorColumns; protected int[] bigTableOuterKeyOutputVectorColumns; @@ -137,9 +180,6 @@ // transient. //--------------------------------------------------------------------------- - // For debug tracing: the name of the map or reduce task. - protected transient String taskName; - // The threshold where we should use a repeating vectorized row batch optimization for // generating join output results. protected transient boolean useOverflowRepeatedThreshold; @@ -175,7 +215,12 @@ protected transient boolean needHashTableSetup; // The small table hash table for the native vectorized map join operator. - protected transient VectorMapJoinHashTable vectorMapJoinHashTable; + protected transient MapJoinHashTableFind vectorMapJoinHashTableFind; + + // The map join hash table factory for create hash table results. + protected transient MapJoinHashTableFactory vectormapJoinHashTableFactory; + + protected transient long totalNumSmallTableKeys; /** Kryo ctor. */ protected VectorMapJoinCommonOperator() { @@ -192,6 +237,9 @@ public VectorMapJoinCommonOperator(CompilationOpContext ctx, MapJoinDesc desc = (MapJoinDesc) conf; this.conf = desc; + vectorDesc = desc.getVectorDesc(); + vectorMapJoinInfo = vectorDesc.getVectorMapJoinInfo(); + Preconditions.checkState(vectorMapJoinInfo != null); this.vContext = vContext; @@ -210,214 +258,28 @@ public VectorMapJoinCommonOperator(CompilationOpContext ctx, bigTableFilterExpressions = vContext.getVectorExpressions(filterExpressions.get(posBigTable), VectorExpressionDescriptor.Mode.FILTER); - List keyDesc = desc.getKeys().get(posBigTable); - bigTableKeyExpressions = vContext.getVectorExpressions(keyDesc); - - // Since a key expression can be a calculation and the key will go into a scratch column, - // we need the mapping and type information. - bigTableKeyColumnMap = new int[bigTableKeyExpressions.length]; - bigTableKeyTypeNames = new ArrayList(); - boolean onlyColumns = true; - for (int i = 0; i < bigTableKeyColumnMap.length; i++) { - VectorExpression ve = bigTableKeyExpressions[i]; - if (!IdentityExpression.isColumnOnly(ve)) { - onlyColumns = false; - } - bigTableKeyTypeNames.add(keyDesc.get(i).getTypeString()); - bigTableKeyColumnMap[i] = ve.getOutputColumn(); - } - if (onlyColumns) { - bigTableKeyExpressions = null; - } - - List bigTableExprs = desc.getExprs().get(posBigTable); - bigTableValueExpressions = vContext.getVectorExpressions(bigTableExprs); - - /* - * Similarly, we need a mapping since a value expression can be a calculation and the value - * will go into a scratch column. 
- */ - bigTableValueColumnMap = new int[bigTableValueExpressions.length]; - bigTableValueTypeNames = new ArrayList(); - onlyColumns = true; - for (int i = 0; i < bigTableValueColumnMap.length; i++) { - VectorExpression ve = bigTableValueExpressions[i]; - if (!IdentityExpression.isColumnOnly(ve)) { - onlyColumns = false; - } - bigTableValueTypeNames.add(bigTableExprs.get(i).getTypeString()); - bigTableValueColumnMap[i] = ve.getOutputColumn(); - } - if (onlyColumns) { - bigTableValueExpressions = null; - } - - determineCommonInfo(isOuterJoin); - } - - protected void determineCommonInfo(boolean isOuter) throws HiveException { - - bigTableRetainedMapping = new VectorColumnOutputMapping("Big Table Retained Mapping"); - - bigTableOuterKeyMapping = new VectorColumnOutputMapping("Big Table Outer Key Mapping"); - - // The order of the fields in the LazyBinary small table value must be used, so - // we use the source ordering flavor for the mapping. - smallTableMapping = new VectorColumnSourceMapping("Small Table Mapping"); - - // We use a mapping object here so we can build the projection in any order and - // get the ordered by 0 to n-1 output columns at the end. - // - // Also, to avoid copying a big table key into the small table result area for inner joins, - // we reference it with the projection so there can be duplicate output columns - // in the projection. - VectorColumnSourceMapping projectionMapping = new VectorColumnSourceMapping("Projection Mapping"); - - /* - * Gather up big and small table output result information from the MapJoinDesc. - */ - List bigTableRetainList = conf.getRetainList().get(posBigTable); - int bigTableRetainSize = bigTableRetainList.size(); - - int[] smallTableIndices; - int smallTableIndicesSize; - List smallTableExprs = conf.getExprs().get(posSingleVectorMapJoinSmallTable); - if (conf.getValueIndices() != null && conf.getValueIndices().get(posSingleVectorMapJoinSmallTable) != null) { - smallTableIndices = conf.getValueIndices().get(posSingleVectorMapJoinSmallTable); - smallTableIndicesSize = smallTableIndices.length; - } else { - smallTableIndices = null; - smallTableIndicesSize = 0; - } + bigTableKeyColumnMap = vectorMapJoinInfo.getBigTableKeyColumnMap(); + bigTableKeyColumnNames = vectorMapJoinInfo.getBigTableKeyColumnNames(); + bigTableKeyTypeInfos = vectorMapJoinInfo.getBigTableKeyTypeInfos(); + bigTableKeyExpressions = vectorMapJoinInfo.getBigTableKeyExpressions(); - List smallTableRetainList = conf.getRetainList().get(posSingleVectorMapJoinSmallTable); - int smallTableRetainSize = smallTableRetainList.size(); + bigTableValueColumnMap = vectorMapJoinInfo.getBigTableValueColumnMap(); + bigTableValueColumnNames = vectorMapJoinInfo.getBigTableValueColumnNames(); + bigTableValueTypeInfos = vectorMapJoinInfo.getBigTableValueTypeInfos(); + bigTableValueExpressions = vectorMapJoinInfo.getBigTableValueExpressions(); - int smallTableResultSize = 0; - if (smallTableIndicesSize > 0) { - smallTableResultSize = smallTableIndicesSize; - } else if (smallTableRetainSize > 0) { - smallTableResultSize = smallTableRetainSize; - } - - /* - * Determine the big table retained mapping first so we can optimize out (with - * projection) copying inner join big table keys in the subsequent small table results section. - */ - int nextOutputColumn = (order[0] == posBigTable ? 
0 : smallTableResultSize); - for (int i = 0; i < bigTableRetainSize; i++) { + bigTableRetainedMapping = vectorMapJoinInfo.getBigTableRetainedMapping(); - // Since bigTableValueExpressions may do a calculation and produce a scratch column, we - // need to map to the right batch column. + bigTableOuterKeyMapping = vectorMapJoinInfo.getBigTableOuterKeyMapping(); - int retainColumn = bigTableRetainList.get(i); - int batchColumnIndex = bigTableValueColumnMap[retainColumn]; - String typeName = bigTableValueTypeNames.get(i); + smallTableMapping = vectorMapJoinInfo.getSmallTableMapping(); - // With this map we project the big table batch to make it look like an output batch. - projectionMapping.add(nextOutputColumn, batchColumnIndex, typeName); - - // Collect columns we copy from the big table batch to the overflow batch. - if (!bigTableRetainedMapping.containsOutputColumn(batchColumnIndex)) { - // Tolerate repeated use of a big table column. - bigTableRetainedMapping.add(batchColumnIndex, batchColumnIndex, typeName); - } - - nextOutputColumn++; - } - - /* - * Now determine the small table results. - */ - int firstSmallTableOutputColumn; - firstSmallTableOutputColumn = (order[0] == posBigTable ? bigTableRetainSize : 0); - int smallTableOutputCount = 0; - nextOutputColumn = firstSmallTableOutputColumn; + projectionMapping = vectorMapJoinInfo.getProjectionMapping(); - // Small table indices has more information (i.e. keys) than retain, so use it if it exists... - if (smallTableIndicesSize > 0) { - smallTableOutputCount = smallTableIndicesSize; - - for (int i = 0; i < smallTableIndicesSize; i++) { - if (smallTableIndices[i] >= 0) { - - // Zero and above numbers indicate a big table key is needed for - // small table result "area". - - int keyIndex = smallTableIndices[i]; - - // Since bigTableKeyExpressions may do a calculation and produce a scratch column, we - // need to map the right column. - int batchKeyColumn = bigTableKeyColumnMap[keyIndex]; - String typeName = bigTableKeyTypeNames.get(keyIndex); - - if (!isOuter) { - - // Optimize inner join keys of small table results. - - // Project the big table key into the small table result "area". - projectionMapping.add(nextOutputColumn, batchKeyColumn, typeName); - - if (!bigTableRetainedMapping.containsOutputColumn(batchKeyColumn)) { - // If necessary, copy the big table key into the overflow batch's small table - // result "area". - bigTableRetainedMapping.add(batchKeyColumn, batchKeyColumn, typeName); - } - } else { - - // For outer joins, since the small table key can be null when there is no match, - // we must have a physical (scratch) column for those keys. We cannot use the - // projection optimization used by inner joins above. - - int scratchColumn = vOutContext.allocateScratchColumn(typeName); - projectionMapping.add(nextOutputColumn, scratchColumn, typeName); - - bigTableRetainedMapping.add(batchKeyColumn, scratchColumn, typeName); - - bigTableOuterKeyMapping.add(batchKeyColumn, scratchColumn, typeName); - } - } else { - - // Negative numbers indicate a column to be (deserialize) read from the small table's - // LazyBinary value row. - int smallTableValueIndex = -smallTableIndices[i] - 1; - - String typeName = smallTableExprs.get(i).getTypeString(); - - // Make a new big table scratch column for the small table value. 
- int scratchColumn = vOutContext.allocateScratchColumn(typeName); - projectionMapping.add(nextOutputColumn, scratchColumn, typeName); - - smallTableMapping.add(smallTableValueIndex, scratchColumn, typeName); - } - nextOutputColumn++; - } - } else if (smallTableRetainSize > 0) { - smallTableOutputCount = smallTableRetainSize; - - // Only small table values appear in join output result. - - for (int i = 0; i < smallTableRetainSize; i++) { - int smallTableValueIndex = smallTableRetainList.get(i); - - // Make a new big table scratch column for the small table value. - String typeName = smallTableExprs.get(i).getTypeString(); - int scratchColumn = vOutContext.allocateScratchColumn(typeName); - - projectionMapping.add(nextOutputColumn, scratchColumn, typeName); - - smallTableMapping.add(smallTableValueIndex, scratchColumn, typeName); - nextOutputColumn++; - } - } - - // Convert dynamic arrays and maps to simple arrays. - - bigTableRetainedMapping.finalize(); - - bigTableOuterKeyMapping.finalize(); + determineCommonInfo(isOuterJoin); + } - smallTableMapping.finalize(); + protected void determineCommonInfo(boolean isOuter) throws HiveException { bigTableOuterKeyOutputVectorColumns = bigTableOuterKeyMapping.getOutputColumns(); smallTableOutputVectorColumns = smallTableMapping.getOutputColumns(); @@ -429,46 +291,37 @@ protected void determineCommonInfo(boolean isOuter) throws HiveException { smallTableByteColumnVectorColumns = getByteColumnVectorColumns(smallTableMapping); - projectionMapping.finalize(); - - // Verify we added an entry for each output. - assert projectionMapping.isSourceSequenceGood(); - outputProjection = projectionMapping.getOutputColumns(); - outputTypeNames = projectionMapping.getTypeNames(); + outputTypeInfos = projectionMapping.getTypeInfos(); if (isLogDebugEnabled) { int[] orderDisplayable = new int[order.length]; for (int i = 0; i < order.length; i++) { orderDisplayable[i] = (int) order[i]; } - LOG.debug(taskName + ", " + getOperatorId() + " VectorMapJoinCommonOperator constructor order " + Arrays.toString(orderDisplayable)); - LOG.debug(taskName + ", " + getOperatorId() + " VectorMapJoinCommonOperator constructor posBigTable " + (int) posBigTable); - LOG.debug(taskName + ", " + getOperatorId() + " VectorMapJoinCommonOperator constructor posSingleVectorMapJoinSmallTable " + (int) posSingleVectorMapJoinSmallTable); - - LOG.debug(taskName + ", " + getOperatorId() + " VectorMapJoinCommonOperator constructor bigTableKeyColumnMap " + Arrays.toString(bigTableKeyColumnMap)); - LOG.debug(taskName + ", " + getOperatorId() + " VectorMapJoinCommonOperator constructor bigTableKeyTypeNames " + bigTableKeyTypeNames); - - LOG.debug(taskName + ", " + getOperatorId() + " VectorMapJoinCommonOperator constructor bigTableValueColumnMap " + Arrays.toString(bigTableValueColumnMap)); - LOG.debug(taskName + ", " + getOperatorId() + " VectorMapJoinCommonOperator constructor bigTableValueTypeNames " + bigTableValueTypeNames); + LOG.debug(getLoggingPrefix() + " VectorMapJoinCommonOperator constructor order " + Arrays.toString(orderDisplayable)); + LOG.debug(getLoggingPrefix() + " VectorMapJoinCommonOperator constructor posBigTable " + (int) posBigTable); + LOG.debug(getLoggingPrefix() + " VectorMapJoinCommonOperator constructor posSingleVectorMapJoinSmallTable " + (int) posSingleVectorMapJoinSmallTable); - LOG.debug(taskName + ", " + getOperatorId() + " VectorMapJoinCommonOperator constructor smallTableIndices " + Arrays.toString(smallTableIndices)); - LOG.debug(taskName + ", " + getOperatorId() + " 
VectorMapJoinCommonOperator constructor smallTableRetainList " + smallTableRetainList); + LOG.debug(getLoggingPrefix() + " VectorMapJoinCommonOperator constructor bigTableKeyColumnMap " + Arrays.toString(bigTableKeyColumnMap)); + LOG.debug(getLoggingPrefix() + " VectorMapJoinCommonOperator constructor bigTableKeyColumnNames " + Arrays.toString(bigTableKeyColumnNames)); + LOG.debug(getLoggingPrefix() + " VectorMapJoinCommonOperator constructor bigTableKeyTypeInfos " + Arrays.toString(bigTableKeyTypeInfos)); - LOG.debug(taskName + ", " + getOperatorId() + " VectorMapJoinCommonOperator constructor firstSmallTableOutputColumn " + firstSmallTableOutputColumn); - LOG.debug(taskName + ", " + getOperatorId() + " VectorMapJoinCommonOperator constructor smallTableOutputCount " + smallTableOutputCount); + LOG.debug(getLoggingPrefix() + " VectorMapJoinCommonOperator constructor bigTableValueColumnMap " + Arrays.toString(bigTableValueColumnMap)); + LOG.debug(getLoggingPrefix() + " VectorMapJoinCommonOperator constructor bigTableValueColumnNames " + Arrays.toString(bigTableValueColumnNames)); + LOG.debug(getLoggingPrefix() + " VectorMapJoinCommonOperator constructor bigTableValueTypeNames " + Arrays.toString(bigTableValueTypeInfos)); - LOG.debug(taskName + ", " + getOperatorId() + " VectorMapJoinCommonOperator constructor bigTableRetainedMapping " + bigTableRetainedMapping.toString()); + LOG.debug(getLoggingPrefix() + " VectorMapJoinCommonOperator constructor bigTableRetainedMapping " + bigTableRetainedMapping.toString()); - LOG.debug(taskName + ", " + getOperatorId() + " VectorMapJoinCommonOperator constructor bigTableOuterKeyMapping " + bigTableOuterKeyMapping.toString()); + LOG.debug(getLoggingPrefix() + " VectorMapJoinCommonOperator constructor bigTableOuterKeyMapping " + bigTableOuterKeyMapping.toString()); - LOG.debug(taskName + ", " + getOperatorId() + " VectorMapJoinCommonOperator constructor smallTableMapping " + smallTableMapping.toString()); + LOG.debug(getLoggingPrefix() + " VectorMapJoinCommonOperator constructor smallTableMapping " + smallTableMapping.toString()); - LOG.debug(taskName + ", " + getOperatorId() + " VectorMapJoinCommonOperator constructor bigTableByteColumnVectorColumns " + Arrays.toString(bigTableByteColumnVectorColumns)); - LOG.debug(taskName + ", " + getOperatorId() + " VectorMapJoinCommonOperator constructor smallTableByteColumnVectorColumns " + Arrays.toString(smallTableByteColumnVectorColumns)); + LOG.debug(getLoggingPrefix() + " VectorMapJoinCommonOperator constructor bigTableByteColumnVectorColumns " + Arrays.toString(bigTableByteColumnVectorColumns)); + LOG.debug(getLoggingPrefix() + " VectorMapJoinCommonOperator constructor smallTableByteColumnVectorColumns " + Arrays.toString(smallTableByteColumnVectorColumns)); - LOG.debug(taskName + ", " + getOperatorId() + " VectorMapJoinCommonOperator constructor outputProjection " + Arrays.toString(outputProjection)); - LOG.debug(taskName + ", " + getOperatorId() + " VectorMapJoinCommonOperator constructor outputTypeNames " + Arrays.toString(outputTypeNames)); + LOG.debug(getLoggingPrefix() + " VectorMapJoinCommonOperator constructor outputProjection " + Arrays.toString(outputProjection)); + LOG.debug(getLoggingPrefix() + " VectorMapJoinCommonOperator constructor outputTypeInfos " + Arrays.toString(outputTypeInfos)); } setupVOutContext(conf.getOutputColumnNames()); @@ -482,10 +335,10 @@ protected void determineCommonInfo(boolean isOuter) throws HiveException { ArrayList list = new ArrayList(); int count = mapping.getCount(); 
int[] outputColumns = mapping.getOutputColumns(); - String[] typeNames = mapping.getTypeNames(); + TypeInfo[] typeInfos = mapping.getTypeInfos(); for (int i = 0; i < count; i++) { int outputColumn = outputColumns[i]; - String typeName = typeNames[i]; + String typeName = typeInfos[i].getTypeName(); if (VectorizationContext.isStringFamily(typeName)) { list.add(outputColumn); } @@ -500,10 +353,10 @@ protected void determineCommonInfo(boolean isOuter) throws HiveException { */ protected void setupVOutContext(List outputColumnNames) { if (isLogDebugEnabled) { - LOG.debug(taskName + ", " + getOperatorId() + " VectorMapJoinCommonOperator constructor outputColumnNames " + outputColumnNames); + LOG.debug(getLoggingPrefix() + " VectorMapJoinCommonOperator constructor outputColumnNames " + outputColumnNames); } if (outputColumnNames.size() != outputProjection.length) { - throw new RuntimeException("Output column names " + outputColumnNames + " length and output projection " + Arrays.toString(outputProjection) + " / " + Arrays.toString(outputTypeNames) + " length mismatch"); + throw new RuntimeException("Output column names " + outputColumnNames + " length and output projection " + Arrays.toString(outputProjection) + " / " + Arrays.toString(outputTypeInfos) + " length mismatch"); } vOutContext.resetProjectionColumns(); for (int i = 0; i < outputColumnNames.size(); ++i) { @@ -512,49 +365,15 @@ protected void setupVOutContext(List outputColumnNames) { vOutContext.addProjectionColumn(columnName, outputColumn); if (isLogDebugEnabled) { - LOG.debug(taskName + ", " + getOperatorId() + " VectorMapJoinCommonOperator constructor addProjectionColumn " + i + " columnName " + columnName + " outputColumn " + outputColumn); + LOG.debug(getLoggingPrefix() + " VectorMapJoinCommonOperator constructor addProjectionColumn " + i + " columnName " + columnName + " outputColumn " + outputColumn); } } } - /** - * This override lets us substitute our own fast vectorized hash table loader. - */ - @Override - protected HashTableLoader getHashTableLoader(Configuration hconf) { - VectorMapJoinDesc vectorDesc = conf.getVectorDesc(); - HashTableImplementationType hashTableImplementationType = vectorDesc.hashTableImplementationType(); - HashTableLoader hashTableLoader; - switch (vectorDesc.hashTableImplementationType()) { - case OPTIMIZED: - // Use the Tez hash table loader. - hashTableLoader = HashTableLoaderFactory.getLoader(hconf); - break; - case FAST: - // Use our specialized hash table loader. - hashTableLoader = HiveConf.getVar( - hconf, HiveConf.ConfVars.HIVE_EXECUTION_ENGINE).equals("spark") ? - HashTableLoaderFactory.getLoader(hconf) : new VectorMapJoinFastHashTableLoader(); - break; - default: - throw new RuntimeException("Unknown vector map join hash table implementation type " + hashTableImplementationType.name()); - } - return hashTableLoader; - } - @Override protected void initializeOp(Configuration hconf) throws HiveException { super.initializeOp(hconf); - if (isLogDebugEnabled) { - // Determine the name of our map or reduce task for debug tracing. - BaseWork work = Utilities.getMapWork(hconf); - if (work == null) { - work = Utilities.getReduceWork(hconf); - } - taskName = work.getName(); - } - /* * Get configuration parameters. 
*/ @@ -570,8 +389,7 @@ protected void initializeOp(Configuration hconf) throws HiveException { smallTableVectorDeserializeRow = new VectorDeserializeRow( new LazyBinaryDeserializeRead( - VectorizedBatchUtil.typeInfosFromTypeNames( - smallTableMapping.getTypeNames()))); + smallTableMapping.getTypeInfos())); smallTableVectorDeserializeRow.init(smallTableMapping.getOutputColumns()); } @@ -595,13 +413,13 @@ protected void initializeOp(Configuration hconf) throws HiveException { if (isLogDebugEnabled) { int[] currentScratchColumns = vOutContext.currentScratchColumns(); - LOG.debug(taskName + ", " + getOperatorId() + " VectorMapJoinCommonOperator initializeOp currentScratchColumns " + Arrays.toString(currentScratchColumns)); + LOG.debug(getLoggingPrefix() + " VectorMapJoinCommonOperator initializeOp currentScratchColumns " + Arrays.toString(currentScratchColumns)); StructObjectInspector structOutputObjectInspector = (StructObjectInspector) outputObjInspector; List fields = structOutputObjectInspector.getAllStructFieldRefs(); int i = 0; for (StructField field : fields) { - LOG.debug("VectorMapJoinInnerBigOnlyCommonOperator initializeOp " + i + " field " + field.getFieldName() + " type " + field.getFieldObjectInspector().getTypeName()); + LOG.debug(getLoggingPrefix() + " VectorMapJoinCommonOperator initializeOp " + i + " field " + field.getFieldName() + " type " + field.getFieldObjectInspector().getTypeName()); i++; } } @@ -612,31 +430,32 @@ protected void completeInitializationOp(Object[] os) throws HiveException { // setup mapJoinTables and serdes super.completeInitializationOp(os); - VectorMapJoinDesc vectorDesc = conf.getVectorDesc(); - HashTableImplementationType hashTableImplementationType = vectorDesc.hashTableImplementationType(); - switch (vectorDesc.hashTableImplementationType()) { - case OPTIMIZED: - { - // Create our vector map join optimized hash table variation *above* the - // map join table container. - vectorMapJoinHashTable = VectorMapJoinOptimizedCreateHashTable.createHashTable(conf, - mapJoinTables[posSingleVectorMapJoinSmallTable]); - } - break; - - case FAST: - { - // Get our vector map join fast hash table variation from the - // vector map join table container. - VectorMapJoinTableContainer vectorMapJoinTableContainer = - (VectorMapJoinTableContainer) mapJoinTables[posSingleVectorMapJoinSmallTable]; - vectorMapJoinHashTable = vectorMapJoinTableContainer.vectorMapJoinHashTable(); - } - break; - default: - throw new RuntimeException("Unknown vector map join hash table implementation type " + hashTableImplementationType.name()); + if (isTestingNoHashTableLoad) { + return; } - LOG.info("Using " + vectorMapJoinHashTable.getClass().getSimpleName() + " from " + this.getClass().getSimpleName()); + MapJoinTableContainer mapJoinTableContainer = + mapJoinTables[posSingleVectorMapJoinSmallTable]; + + setUpHashTable(mapJoinTableContainer); + } + + @VisibleForTesting + @Override + public void setTestMapJoinTableContainer(int posSmallTable, + MapJoinTableContainer testMapJoinTableContainer, + MapJoinTableContainerSerDe mapJoinTableContainerSerDe) { + setUpHashTable(testMapJoinTableContainer); + } + + private void setUpHashTable(MapJoinTableContainer mapJoinTableContainer) { + + // The hash table for the specialized operator. + vectorMapJoinHashTableFind = mapJoinTableContainer.getMapJoinHashTableFind(); + + // The factory so we can create result objects. 
+ vectormapJoinHashTableFactory = mapJoinTableContainer.getMapJoinHashTableFactory(); + + totalNumSmallTableKeys = mapJoinTableContainer.size(); } /* @@ -654,7 +473,7 @@ protected VectorizedRowBatch setupOverflowBatch() throws HiveException { // First, just allocate just the projection columns we will be using. for (int i = 0; i < outputProjection.length; i++) { int outputColumn = outputProjection[i]; - String typeName = outputTypeNames[i]; + String typeName = outputTypeInfos[i].getTypeName(); allocateOverflowBatchColumnVector(overflowBatch, outputColumn, typeName); } @@ -686,7 +505,7 @@ private void allocateOverflowBatchColumnVector(VectorizedRowBatch overflowBatch, overflowBatch.cols[outputColumn] = VectorizedBatchUtil.createColumnVector(typeInfo); if (isLogDebugEnabled) { - LOG.debug(taskName + ", " + getOperatorId() + " VectorMapJoinCommonOperator initializeOp overflowBatch outputColumn " + outputColumn + " class " + overflowBatch.cols[outputColumn].getClass().getSimpleName()); + LOG.debug(getLoggingPrefix() + " VectorMapJoinCommonOperator initializeOp overflowBatch outputColumn " + outputColumn + " class " + overflowBatch.cols[outputColumn].getClass().getSimpleName()); } } } @@ -723,9 +542,9 @@ protected void commonSetup(VectorizedRowBatch batch) throws HiveException { } protected void displayBatchColumns(VectorizedRowBatch batch, String batchName) { - LOG.debug("commonSetup " + batchName + " column count " + batch.numCols); + LOG.debug(getLoggingPrefix() + " VectorMapJoinCommonOperator commonSetup " + batchName + " column count " + batch.numCols); for (int column = 0; column < batch.numCols; column++) { - LOG.debug("commonSetup " + batchName + " column " + column + " type " + (batch.cols[column] == null ? "NULL" : batch.cols[column].getClass().getSimpleName())); + LOG.debug(getLoggingPrefix() + " VectorMapJoinCommonOperator commonSetup " + batchName + " column " + column + " type " + (batch.cols[column] == null ? 
"NULL" : batch.cols[column].getClass().getSimpleName())); } } diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinGenerateResultOperator.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinGenerateResultOperator.java index 469f86a..0ffaf3c 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinGenerateResultOperator.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinGenerateResultOperator.java @@ -28,8 +28,9 @@ import org.apache.hadoop.hive.ql.CompilationOpContext; import org.apache.hadoop.hive.ql.exec.persistence.HybridHashTableContainer; import org.apache.hadoop.hive.ql.exec.persistence.HybridHashTableContainer.HashPartition; -import org.apache.hadoop.hive.ql.exec.persistence.MapJoinBytesTableContainer; import org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainer; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashMapResult; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableResult; import org.apache.hadoop.hive.ql.exec.vector.ColumnVector; import org.apache.hadoop.hive.ql.exec.vector.VectorDeserializeRow; import org.apache.hadoop.hive.ql.exec.vector.VectorSerializeRow; @@ -37,9 +38,7 @@ import org.apache.hadoop.hive.ql.exec.vector.VectorizedBatchUtil; import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpression; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashTableResult; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashMapResult; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.optimized.VectorMapJoinOptimizedCreateHashTable; +import org.apache.hadoop.hive.ql.exec.vector.keyseries.VectorKeySeries; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.plan.OperatorDesc; import org.apache.hadoop.hive.serde2.SerDeException; @@ -47,11 +46,11 @@ import org.apache.hadoop.hive.serde2.lazybinary.fast.LazyBinaryDeserializeRead; import org.apache.hadoop.hive.serde2.lazybinary.fast.LazyBinarySerializeWrite; import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; -import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; -import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils; import org.apache.hadoop.hive.serde2.ByteStream.Output; +import com.google.common.base.Preconditions; + /** * This class has methods for generating vectorized join results and forwarding batchs. * @@ -73,8 +72,16 @@ public abstract class VectorMapJoinGenerateResultOperator extends VectorMapJoinCommonOperator { private static final long serialVersionUID = 1L; - private static final Logger LOG = LoggerFactory.getLogger(VectorMapJoinGenerateResultOperator.class.getName()); + + //------------------------------------------------------------------------------------------------ + private static final String CLASS_NAME = VectorMapJoinGenerateResultOperator.class.getName(); + private static final Logger LOG = LoggerFactory.getLogger(CLASS_NAME); + + protected String getLoggingPrefix(String className) { + // Use operator's class name. + return super.getLoggingPrefix(className); + } //------------------------------------------------------------------------------------------------ @@ -86,6 +93,13 @@ // Debug display. 
protected transient long batchCounter; + protected transient long spilledRowCounter; + protected transient long inputRowCounter; + protected transient long bigTableOutputRowCounter; + protected transient long overflowOutputRowCounter; + + protected transient long singleValueCounter; + protected transient long multiValueCounter; /** Kryo ctor. */ protected VectorMapJoinGenerateResultOperator() { @@ -105,13 +119,138 @@ protected void commonSetup(VectorizedRowBatch batch) throws HiveException { super.commonSetup(batch); batchCounter = 0; + spilledRowCounter = 0; + inputRowCounter = 0; + bigTableOutputRowCounter = 0; + overflowOutputRowCounter = 0; + + singleValueCounter = 0; + multiValueCounter = 0; + } + + //------------------------------------------------------------------------------------------------ + + protected static int makeSelectedByRemovingSeries( + boolean selectedInUse, int[] selected, int selectedSize, + int[] seriesLogicalIndices, int[] seriesDuplicateCounts, int keyCount, + boolean seriesSelectedInUse, int[] seriesSelected, int seriesSelectedSize, + int[] resultSelected) { + int resultCount = 0; + int logical = 0; + for (int i = 0; i < keyCount; i++) { + int batchIndex = (selectedInUse ? selected[logical] : logical); + + int seriesLogical = seriesLogicalIndices[i]; + int seriesBatchIndex = (seriesSelectedInUse ? seriesSelected[seriesLogical] : seriesLogical); + + // Add any selected batch indices before series batch index; + while (batchIndex < seriesBatchIndex) { + resultSelected[resultCount++] = batchIndex; + logical++; + batchIndex = (selectedInUse ? selected[logical] : logical); + } + Preconditions.checkState(batchIndex == seriesBatchIndex); + + // Skip series + logical += seriesDuplicateCounts[i]; + } + + // Grab non series after last series. + while (logical < selectedSize) { + int batchIndex = (selectedInUse ? selected[logical] : logical); + resultSelected[resultCount++] = batchIndex; + logical++; + } + return resultCount; + } + + protected static int makeSelectedByRemovingMultiValues( + boolean selectedInUse, int[] selected, int selectedSize, + int[] matchLogicalIndices, int[] matchDuplicateCounts, boolean[] matchIsSingleValue, + int matchSeriesCount, + boolean seriesSelectedInUse, int[] seriesSelected, int seriesSelectedSize, + int[] resultSelected) { + int resultCount = 0; + int logical = 0; + for (int i = 0; i < matchSeriesCount; i++) { + if (matchIsSingleValue[i]) { + continue; + } + + int batchIndex = (selectedInUse ? selected[logical] : logical); + int seriesLogical = matchLogicalIndices[i]; + int seriesBatchIndex = (seriesSelectedInUse ? seriesSelected[seriesLogical] : seriesLogical); + + // Add any selected batch indices before match series multi-value batch index, including + // match single-value series. + while (batchIndex < seriesBatchIndex) { + resultSelected[resultCount++] = batchIndex; + logical++; + batchIndex = (selectedInUse ? selected[logical] : logical); + } + Preconditions.checkState(batchIndex == seriesBatchIndex); + + // Skip series + logical += matchDuplicateCounts[i]; + } + + // Grab non series after last match series multi-value. + while (logical < selectedSize) { + int batchIndex = (selectedInUse ? 
selected[logical] : logical); + resultSelected[resultCount++] = batchIndex; + logical++; + } + return resultCount; + } + + + protected static int flattenLogicalSeriesIntoSelected( + boolean selectedInUse, int[] selected, int size, + int[] seriesLogicalIndices, int[] seriesDuplicateCounts, int keyCount, + int[] resultSelected) { + + int resultCount = 0; + for (int i = 0; i < keyCount; i++) { + int seriesLogical = seriesLogicalIndices[i]; + int seriesDuplicateCount = seriesDuplicateCounts[i]; + for (int s = seriesLogical; s < seriesLogical + seriesDuplicateCount; s++) { + int batchIndex = (selectedInUse ? selected[s] : s); + resultSelected[resultCount++] = batchIndex; + } + } + return resultCount; + } + + protected static int makeMatchSelectedWithoutMultiValues( + boolean selectedInUse, int[] selected, int size, + int[] matchLogicalIndices, int[] matchDuplicateCounts, boolean[] matchIsSingleValue, + int matchSeriesCount, + int[] resultSelected) { + + int resultCount = 0; + int matchLogical; + int matchDuplicateCount; + for (int i = 0; i < matchSeriesCount; i++) { + if (matchIsSingleValue[i]) { + // Only include single-value small table result series. + matchLogical = matchLogicalIndices[i]; + matchDuplicateCount = matchDuplicateCounts[i]; + for (int m = matchLogical; m < matchLogical + matchDuplicateCount; m++) { + int batchIndex = (selectedInUse ? selected[m] : m); + resultSelected[resultCount++] = batchIndex; + } + } + } + + return resultCount; } //------------------------------------------------------------------------------------------------ - protected void performValueExpressions(VectorizedRowBatch batch, - int[] allMatchs, int allMatchCount) { + protected void performValueExpressions(VectorizedRowBatch batch, int[] newSelected, + int newSelectedCount) { + /* * For the moment, pretend all matched are selected so we can evaluate the value * expressions. @@ -120,10 +259,10 @@ protected void performValueExpressions(VectorizedRowBatch batch, * selected and real batch size later... */ int[] saveSelected = batch.selected; - batch.selected = allMatchs; + batch.selected = newSelected; boolean saveSelectedInUse = batch.selectedInUse; batch.selectedInUse = true; - batch.size = allMatchCount; + batch.size = newSelectedCount; // Run our value expressions over whole batch. for(VectorExpression ve: bigTableValueExpressions) { @@ -135,7 +274,7 @@ protected void performValueExpressions(VectorizedRowBatch batch, } protected void doSmallTableDeserializeRow(VectorizedRowBatch batch, int batchIndex, - ByteSegmentRef byteSegmentRef, VectorMapJoinHashMapResult hashMapResult) + ByteSegmentRef byteSegmentRef, MapJoinHashMapResult hashMapResult) throws HiveException { byte[] bytes = byteSegmentRef.getBytes(); @@ -165,23 +304,22 @@ protected void doSmallTableDeserializeRow(VectorizedRowBatch batch, int batchInd * Generate join results for a single small table value match. * * @param batch - * The big table batch. + * The big table batch. * @param hashMapResult - * The hash map results for the matching key. - * @param allMatchs - * The selection array for all matches key. - * @param allMatchesIndex - * Index into allMatches of the matching key we are generating for. + * The hash map results for the matching key. + * @param logical * @param duplicateCount - * Number of equal key rows. - * @param numSel - * Current number of rows that are remaining in the big table for forwarding. - * @return - * The new count of selected rows. 
+ * @throws HiveException + * @throws IOException */ - protected int generateHashMapResultSingleValue(VectorizedRowBatch batch, - VectorMapJoinHashMapResult hashMapResult, int[] allMatchs, int allMatchesIndex, - int duplicateCount, int numSel) throws HiveException, IOException { + protected void generateHashMapResultSingleValue(VectorizedRowBatch batch, + MapJoinHashMapResult hashMapResult, int logical, int duplicateCount) + throws HiveException, IOException { + + singleValueCounter += duplicateCount; + + final boolean selectedInUse = batch.selectedInUse; + final int[] selected = batch.selected; // Read single value. @@ -189,9 +327,9 @@ protected int generateHashMapResultSingleValue(VectorizedRowBatch batch, // Generate result within big table batch itself. - for (int i = 0; i < duplicateCount; i++) { + for (int i = logical; i < logical + duplicateCount; i++) { - int batchIndex = allMatchs[allMatchesIndex + i]; + int batchIndex = (selectedInUse ? selected[i] : i); // Outer key copying is only used when we are using the input BigTable batch as the output. // @@ -205,32 +343,28 @@ protected int generateHashMapResultSingleValue(VectorizedRowBatch batch, byteSegmentRef, hashMapResult); } - // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, "generateHashMapResultSingleValue big table"); - - // Use the big table row as output. - batch.selected[numSel++] = batchIndex; + // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, getLoggingPrefix() + " generateHashMapResultSingleValue"); } - - return numSel; } /** * Generate results for a N x M cross product. * * @param batch - * The big table batch. * @param hashMapResult - * The hash map results for the matching key. - * @param allMatchs - * The all match selected array that contains (physical) batch indices. - * @param allMatchesIndex - * The index of the match key. + * @param logical * @param duplicateCount - * Number of equal key rows. + * @throws HiveException + * @throws IOException */ protected void generateHashMapResultMultiValue(VectorizedRowBatch batch, - VectorMapJoinHashMapResult hashMapResult, int[] allMatchs, int allMatchesIndex, - int duplicateCount) throws HiveException, IOException { + MapJoinHashMapResult hashMapResult, int logical, int duplicateCount) + throws HiveException, IOException { + + final boolean selectedInUse = batch.selectedInUse; + final int[] selected = batch.selected; + + multiValueCounter += duplicateCount; if (useOverflowRepeatedThreshold && hashMapResult.isCappedCountAvailable() && @@ -240,16 +374,19 @@ protected void generateHashMapResultMultiValue(VectorizedRowBatch batch, // row batch optimization in the overflow batch. generateHashMapResultLargeMultiValue( - batch, hashMapResult, allMatchs, allMatchesIndex, duplicateCount); + batch, + hashMapResult, + logical, + duplicateCount); return; } // We do the cross product of the N big table equal key row's values against the // small table matching key which has M value rows into overflow batch. - for (int i = 0; i < duplicateCount; i++) { + for (int i = logical; i < logical + duplicateCount; i++) { - int batchIndex = allMatchs[allMatchesIndex + i]; + int batchIndex = (selectedInUse ? selected[i] : i); ByteSegmentRef byteSegmentRef = hashMapResult.first(); while (byteSegmentRef != null) { @@ -284,19 +421,18 @@ protected void generateHashMapResultMultiValue(VectorizedRowBatch batch, * batch optimization. * * @param batch - * The big table batch. * @param hashMapResult - * The hash map results for the matching key. 
- * @param allMatchs - * The all match selected array that contains (physical) batch indices. - * @param allMatchesIndex - * The index of the match key. + * @param logical * @param duplicateCount - * Number of equal key rows. + * @throws HiveException + * @throws IOException */ private void generateHashMapResultLargeMultiValue(VectorizedRowBatch batch, - VectorMapJoinHashMapResult hashMapResult, int[] allMatchs, int allMatchesIndex, - int duplicateCount) throws HiveException, IOException { + MapJoinHashMapResult hashMapResult, int logical, int duplicateCount) + throws HiveException, IOException { + + final boolean selectedInUse = batch.selectedInUse; + final int[] selected = batch.selected; // Kick out previous overflow batch results. if (overflowBatch.size > 0) { @@ -332,9 +468,9 @@ private void generateHashMapResultLargeMultiValue(VectorizedRowBatch batch, // And, not set repeating every time... // - for (int i = 0; i < duplicateCount; i++) { + for (int i = logical; i < logical + duplicateCount; i++) { - int batchIndex = allMatchs[allMatchesIndex + i]; + int batchIndex = (selectedInUse ? selected[i] : i); if (bigTableRetainedVectorCopy != null) { // The one big table row's values repeat. @@ -367,41 +503,6 @@ private void generateHashMapResultLargeMultiValue(VectorizedRowBatch batch, overflowBatch.reset(); } - /** - * Generate optimized results when entire batch key is repeated and it matched the hash map. - * - * @param batch - * The big table batch. - * @param hashMapResult - * The hash map results for the repeated key. - */ - protected void generateHashMapResultRepeatedAll(VectorizedRowBatch batch, - VectorMapJoinHashMapResult hashMapResult) throws IOException, HiveException { - - int[] selected = batch.selected; - - if (batch.selectedInUse) { - // The selected array is already filled in as we want it. 
- } else { - for (int i = 0; i < batch.size; i++) { - selected[i] = i; - } - batch.selectedInUse = true; - } - - int numSel = 0; - if (hashMapResult.isSingleRow()) { - numSel = generateHashMapResultSingleValue(batch, hashMapResult, - batch.selected, 0, batch.size, numSel); - - } else { - generateHashMapResultMultiValue(batch, hashMapResult, - batch.selected, 0, batch.size); - } - - batch.size = numSel; - } - //----------------------------------------------------------------------------------------------- /* @@ -448,26 +549,42 @@ private void setupSpillSerDe(VectorizedRowBatch batch) throws HiveException { } private void spillSerializeRow(VectorizedRowBatch batch, int batchIndex, - VectorMapJoinHashTableResult hashTableResult) throws IOException { + MapJoinHashTableResult hashTableResult) throws IOException { - int partitionId = hashTableResult.spillPartitionId(); + int partitionId = hashTableResult.getSpillPartitionId(); HybridHashTableContainer ht = (HybridHashTableContainer) mapJoinTables[posSingleVectorMapJoinSmallTable]; HashPartition hp = ht.getHashPartitions()[partitionId]; VectorMapJoinRowBytesContainer rowBytesContainer = hp.getMatchfileRowBytesContainer(); Output output = rowBytesContainer.getOuputForRowBytes(); -// int offset = output.getLength(); bigTableVectorSerializeRow.setOutputAppend(output); bigTableVectorSerializeRow.serializeWrite(batch, batchIndex); -// int length = output.getLength() - offset; rowBytesContainer.finishRow(); + } + + protected void spillHashMapBatch(VectorizedRowBatch batch, int[] spillLogicalIndices, + int[] spillDuplicateCounts, MapJoinHashTableResult[] spillHashTableResults, + int spillCount, boolean selectedInUse, int[] selected, int size) + throws HiveException, IOException { + + if (bigTableVectorSerializeRow == null) { + setupSpillSerDe(batch); + } -// LOG.debug("spillSerializeRow spilled batchIndex " + batchIndex + ", length " + length); + for (int i = 0; i < spillCount; i++) { + int logical = spillLogicalIndices[i]; + int duplicateCount = spillDuplicateCounts[i]; + MapJoinHashTableResult hashTableResult = spillHashTableResults[i]; + for (int s = logical; s < logical + duplicateCount; s++) { + int batchIndex = (selectedInUse ? selected[s] : s); + spillSerializeRow(batch, batchIndex, hashTableResult); + } + } } protected void spillHashMapBatch(VectorizedRowBatch batch, - VectorMapJoinHashTableResult[] hashTableResults, + MapJoinHashTableResult[] hashTableResults, int[] spills, int[] spillHashTableResultIndices, int spillCount) throws HiveException, IOException { @@ -479,25 +596,30 @@ protected void spillHashMapBatch(VectorizedRowBatch batch, int batchIndex = spills[i]; int hashTableResultIndex = spillHashTableResultIndices[i]; - VectorMapJoinHashTableResult hashTableResult = hashTableResults[hashTableResultIndex]; + MapJoinHashTableResult hashTableResult = hashTableResults[hashTableResultIndex]; spillSerializeRow(batch, batchIndex, hashTableResult); } } - protected void spillBatchRepeated(VectorizedRowBatch batch, - VectorMapJoinHashTableResult hashTableResult) throws HiveException, IOException { + protected void spillRepeated(VectorizedRowBatch batch, VectorKeySeries keySeries, + MapJoinHashTableResult hashTableResult) + throws HiveException, IOException { if (bigTableVectorSerializeRow == null) { setupSpillSerDe(batch); } - int[] selected = batch.selected; - boolean selectedInUse = batch.selectedInUse; - - for (int logical = 0; logical < batch.size; logical++) { - int batchIndex = (selectedInUse ? 
selected[logical] : logical); - spillSerializeRow(batch, batchIndex, hashTableResult); + final int end = keySeries.currentDuplicateCount; + if (batch.selectedInUse) { + final int[] selected = batch.selected; + for (int logical = 1; logical < end; logical++) { + spillSerializeRow(batch, selected[logical], hashTableResult); + } + } else { + for (int batchIndex = 1; batchIndex < end; batchIndex++) { + spillSerializeRow(batch, batchIndex, hashTableResult); + } } } @@ -511,10 +633,9 @@ protected void reloadHashTable(byte pos, int partitionId) MapJoinTableContainer smallTable = spilledMapJoinTables[pos]; - vectorMapJoinHashTable = VectorMapJoinOptimizedCreateHashTable.createHashTable(conf, - smallTable); + vectorMapJoinHashTableFind = smallTable.getMapJoinHashTableFind(); + needHashTableSetup = true; - LOG.info("Created " + vectorMapJoinHashTable.getClass().getSimpleName() + " from " + this.getClass().getSimpleName()); if (isLogDebugEnabled) { LOG.debug(CLASS_NAME + " reloadHashTable!"); @@ -607,7 +728,10 @@ public void forwardBigTableBatch(VectorizedRowBatch batch) throws HiveException batch.projectionSize = outputProjection.length; batch.projectedColumns = outputProjection; + // VectorizedBatchUtil.debugDisplayBatch(batch, CLASS_NAME); + forward(batch, null); + bigTableOutputRowCounter += batch.size; // Revert the projected columns back, because batch can be re-used by our parent operators. batch.projectionSize = originalProjectionSize; @@ -620,6 +744,7 @@ public void forwardBigTableBatch(VectorizedRowBatch batch) throws HiveException */ protected void forwardOverflow() throws HiveException { forward(overflowBatch, null); + overflowOutputRowCounter += overflowBatch.size; overflowBatch.reset(); } @@ -628,6 +753,7 @@ protected void forwardOverflow() throws HiveException { */ private void forwardOverflowNoReset() throws HiveException { forward(overflowBatch, null); + overflowOutputRowCounter += overflowBatch.size; } /* @@ -640,11 +766,20 @@ private void forwardOverflowNoReset() throws HiveException { @Override public void closeOp(boolean aborted) throws HiveException { super.closeOp(aborted); - if (!aborted && overflowBatch.size > 0) { - forwardOverflow(); - } - if (isLogDebugEnabled) { - LOG.debug("VectorMapJoinInnerLongOperator closeOp " + batchCounter + " batches processed"); + if (!aborted) { + if (overflowBatch.size > 0) { + forwardOverflow(); + } + if (isLogDebugEnabled) { + LOG.debug(getLoggingPrefix() + " closeOp " + batchCounter + " batches processed, " + + inputRowCounter + " big table input rows, " + + totalNumSmallTableKeys + " small table keys, " + + spilledRowCounter + " spilled rows, " + + singleValueCounter + " single value rows, " + + multiValueCounter + " multiple value rows " + + bigTableOutputRowCounter + " big table output rows, " + + overflowOutputRowCounter + " overflow output rows"); + } } } diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinInnerBigOnlyGenerateResultOperator.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinInnerBigOnlyGenerateResultOperator.java index dfb5bf8..ea3f1fe 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinInnerBigOnlyGenerateResultOperator.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinInnerBigOnlyGenerateResultOperator.java @@ -23,16 +23,17 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.hive.ql.CompilationOpContext; -import org.apache.hadoop.hive.ql.exec.JoinUtil; +import 
org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashMultiSetResult; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableResult.MapJoinResult; import org.apache.hadoop.hive.ql.exec.vector.VectorizationContext; import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; -import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpression; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashMultiSet; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashTableResult; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashMultiSetResult; +import org.apache.hadoop.hive.ql.exec.vector.keyseries.VectorKeySeries; +import org.apache.hadoop.hive.ql.exec.vector.keyseries.VectorKeySeriesLong; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.plan.OperatorDesc; +import com.google.common.base.Preconditions; + /** * This class has methods for generating vectorized join results for the big table only * variation of inner joins. @@ -48,42 +49,41 @@ extends VectorMapJoinGenerateResultOperator { private static final long serialVersionUID = 1L; - private static final Logger LOG = LoggerFactory.getLogger(VectorMapJoinInnerBigOnlyGenerateResultOperator.class.getName()); - //--------------------------------------------------------------------------- + //------------------------------------------------------------------------------------------------ + + private static final String CLASS_NAME = VectorMapJoinInnerBigOnlyGenerateResultOperator.class.getName(); + private static final Logger LOG = LoggerFactory.getLogger(CLASS_NAME); + + protected String getLoggingPrefix() { + return super.getLoggingPrefix(CLASS_NAME); + } + + //------------------------------------------------------------------------------------------------ + + //------------------------------------------------------------------------------------------------ // Inner big-table only join specific members. // // An array of hash multi-set results so we can do lookups on the whole batch before output result // generation. - protected transient VectorMapJoinHashMultiSetResult hashMultiSetResults[]; + protected transient MapJoinHashMultiSetResult hashMultiSetResults[]; - // Pre-allocated member for storing the (physical) batch index of matching row (single- or - // multi-small-table-valued) indexes during a process call. - protected transient int[] allMatchs; - - /* - * Pre-allocated members for storing information on single- and multi-valued-small-table matches. - * - * ~ValueCounts - * Number of (empty) small table values. - * ~AllMatchIndices - * (Logical) indices into allMatchs to the first row of a match of a - * possible series of duplicate keys. - * ~DuplicateCounts - * The duplicate count for each matched key. - * - */ - protected transient long[] equalKeySeriesValueCounts; - protected transient int[] equalKeySeriesAllMatchIndices; - protected transient int[] equalKeySeriesDuplicateCounts; + // Pre-allocated member for storing the batch indices of the matched rows. + protected transient int[] matchSelected; + // Pre-allocated member for storing the new logical batch index (within newSelected) and + // series count of rows that matched. 
+ protected transient int[] matchLogicalIndices; + protected transient int[] matchDuplicateCounts; + protected transient long[] matchValueCounts; - // Pre-allocated member for storing the (physical) batch index of rows that need to be spilled. - protected transient int[] spills; + // Pre-allocated member for storing the logical batch index and series count of rows that spilled. + protected transient int[] spillLogicalIndices; + protected transient int[] spillDuplicateCounts; - // Pre-allocated member for storing index into the hashMultiSetResults for each spilled row. - protected transient int[] spillHashMapResultIndices; + // Pre-allocated member for storing the reference to the hash multi-set results of rows that spilled. + protected transient MapJoinHashMultiSetResult[] spillHashMultiSetResults; /** Kryo ctor. */ protected VectorMapJoinInnerBigOnlyGenerateResultOperator() { @@ -106,21 +106,20 @@ protected void commonSetup(VectorizedRowBatch batch) throws HiveException { super.commonSetup(batch); // Inner big-table only join specific. - VectorMapJoinHashMultiSet baseHashMultiSet = (VectorMapJoinHashMultiSet) vectorMapJoinHashTable; - - hashMultiSetResults = new VectorMapJoinHashMultiSetResult[batch.DEFAULT_SIZE]; + hashMultiSetResults = new MapJoinHashMultiSetResult[batch.DEFAULT_SIZE]; for (int i = 0; i < hashMultiSetResults.length; i++) { - hashMultiSetResults[i] = baseHashMultiSet.createHashMultiSetResult(); + hashMultiSetResults[i] = vectormapJoinHashTableFactory.createHashMultiSetResult(); } - allMatchs = new int[batch.DEFAULT_SIZE]; + matchSelected = new int[batch.DEFAULT_SIZE]; - equalKeySeriesValueCounts = new long[batch.DEFAULT_SIZE]; - equalKeySeriesAllMatchIndices = new int[batch.DEFAULT_SIZE]; - equalKeySeriesDuplicateCounts = new int[batch.DEFAULT_SIZE]; + matchLogicalIndices = new int[batch.DEFAULT_SIZE]; + matchDuplicateCounts = new int[batch.DEFAULT_SIZE]; + matchValueCounts = new long[batch.DEFAULT_SIZE]; - spills = new int[batch.DEFAULT_SIZE]; - spillHashMapResultIndices = new int[batch.DEFAULT_SIZE]; + spillLogicalIndices = new int[batch.DEFAULT_SIZE]; + spillDuplicateCounts = new int[batch.DEFAULT_SIZE]; + spillHashMultiSetResults = new MapJoinHashMultiSetResult[batch.DEFAULT_SIZE]; } //----------------------------------------------------------------------------------------------- @@ -135,50 +134,60 @@ protected void commonSetup(VectorizedRowBatch batch) throws HiveException { * @param batch * The big table batch with any matching and any non matching rows both as * selected in use. - * @param allMatchCount - * Number of matches in allMatchs. - * @param equalKeySeriesCount - * Number of single value matches. - * @param spillCount - * Number of spills in spills. - * @param hashTableResults - * The array of all hash table results for the batch. We need the - * VectorMapJoinHashTableResult for the spill information. - * @param hashMapResultCount - * Number of entries in hashMapResults. + * @param matchSeriesCount + * Number of matches in the match* arrays. + * @param spillSeriesCount + * Number of spills in spill* arrays.
* **/ - protected void finishInnerBigOnly(VectorizedRowBatch batch, - int allMatchCount, int equalKeySeriesCount, int spillCount, - VectorMapJoinHashTableResult[] hashTableResults, int hashMapResultCount) - throws HiveException, IOException { + protected void finishInnerBigOnly(VectorizedRowBatch batch, int matchSeriesCount, + int spillSeriesCount) throws HiveException, IOException { + + final int selectedSize = batch.size; + boolean selectedInUse = batch.selectedInUse; + int[] selected = batch.selected; - // Get rid of spills before we start modifying the batch. - if (spillCount > 0) { - spillHashMapBatch(batch, hashTableResults, - spills, spillHashMapResultIndices, spillCount); + // Dump out the spill rows now. + if (spillSeriesCount > 0) { + + spillHashMapBatch(batch, spillLogicalIndices, spillDuplicateCounts, spillHashMultiSetResults, + spillSeriesCount, selectedInUse, selected, selectedSize); + + if (spillSeriesCount == selectedSize) { + batch.size = 0; + batch.selectedInUse = false; + return; + } } + + /* * Optimize by running value expressions only over the matched rows. */ - if (allMatchCount > 0 && bigTableValueExpressions != null) { - performValueExpressions(batch, allMatchs, allMatchCount); + if (matchSeriesCount > 0 && bigTableValueExpressions != null) { + int matchSelectedCount = + flattenLogicalSeriesIntoSelected( + selectedInUse, selected, selectedSize, + matchLogicalIndices, matchDuplicateCounts, matchSeriesCount, + matchSelected); + performValueExpressions(batch, matchSelected, matchSelectedCount); } int numSel = 0; - for (int i = 0; i < equalKeySeriesCount; i++) { - long count = equalKeySeriesValueCounts[i]; - int allMatchesIndex = equalKeySeriesAllMatchIndices[i]; - int duplicateCount = equalKeySeriesDuplicateCounts[i]; + for (int i = 0; i < matchSeriesCount; i++) { + int logical = matchLogicalIndices[i]; + int duplicateCount = matchDuplicateCounts[i]; + long count = matchValueCounts[i]; if (count == 1) { - numSel = generateHashMultiSetResultSingleValue( - batch, allMatchs, allMatchesIndex, duplicateCount, numSel); + numSel = generateHashMultiSetResultSingleValue(batch, + logical, duplicateCount, selectedInUse, selected, numSel); } else { generateHashMultiSetResultMultiValue(batch, - allMatchs, allMatchesIndex, - duplicateCount, count); + logical, duplicateCount, selectedInUse, selected, count); } } batch.size = numSel; @@ -186,14 +195,64 @@ protected void finishInnerBigOnly(VectorizedRowBatch batch, } /** + * Generate the inner big table only join output results for one vectorized row batch that + * has a repeated key. + * + * @param batch + * The big table batch with any matching and any non matching rows both as + * selected in use. + * @param keySeries + * @param hashMultiSetResult + * @throws IOException + * @throws HiveException + */ + protected void finishInnerBigOnlyRepeated(VectorizedRowBatch batch, VectorKeySeries keySeries, + MapJoinHashMultiSetResult hashMultiSetResult) throws HiveException, IOException { + if (hashMultiSetResult.getMapJoinResult() == MapJoinResult.SPILL) { + spillRepeated(batch, keySeries, hashMultiSetResult); + return; + } + Preconditions.checkState(hashMultiSetResult.getMapJoinResult() == MapJoinResult.MATCH); + + // The whole batch is matched. + if (hashMultiSetResult.count() == 1) { + // Single value -- use batch as output. + } else { + + // CONSIDER: Since key is repeated, could do repeated column optimization. 
+ + final boolean selectedInUse = batch.selectedInUse; + final int[] selected = batch.selected; + final int begin = keySeries.currentLogical; + final int end = begin + keySeries.currentDuplicateCount; + for (int logical = begin; logical < end; logical++) { + + int batchIndex = (selectedInUse ? selected[logical] : logical); + + // Copy the BigTable values into the overflow batch. Since the overflow batch may + // not get flushed here, we must copy by value. + if (bigTableRetainedVectorCopy != null) { + bigTableRetainedVectorCopy.copyByValue(batch, batchIndex, + overflowBatch, overflowBatch.size); + } + + overflowBatch.size++; + if (overflowBatch.size == overflowBatch.DEFAULT_SIZE) { + forwardOverflow(); + } + } + + // Use nothing from big table batch. + batch.size = 0; + } + } + + /** * Generate the single value match inner big table only join output results for a match. * * @param batch * The big table batch. - * @param allMatchs - * A subset of the rows of the batch that are matches. - * @param allMatchesIndex - * The logical index into allMatchs of the first equal key. + * @param logical * @param duplicateCount * The number of duplicates or equal keys. * @param numSel @@ -202,24 +261,20 @@ protected void finishInnerBigOnly(VectorizedRowBatch batch, * @return * The new count of selected rows. */ - private int generateHashMultiSetResultSingleValue(VectorizedRowBatch batch, - int[] allMatchs, int allMatchesIndex, int duplicateCount, int numSel) + private int generateHashMultiSetResultSingleValue(VectorizedRowBatch batch, int logical, + int duplicateCount, boolean selectedInUse, int[] selected, int numSel) throws HiveException, IOException { - // LOG.debug("generateHashMultiSetResultSingleValue enter..."); - // Generate result within big table batch itself. - // LOG.debug("generateHashMultiSetResultSingleValue with big table..."); + for (int i = logical; i < logical + duplicateCount; i++) { - for (int i = 0; i < duplicateCount; i++) { - - int batchIndex = allMatchs[allMatchesIndex + i]; + int batchIndex = (selectedInUse ? selected[i] : i); // Use the big table row as output. batch.selected[numSel++] = batchIndex; } - + return numSel; } @@ -237,17 +292,14 @@ private int generateHashMultiSetResultSingleValue(VectorizedRowBatch batch, * @param count * Value count. */ - private void generateHashMultiSetResultMultiValue(VectorizedRowBatch batch, - int[] allMatchs, int allMatchesIndex, - int duplicateCount, long count) throws HiveException, IOException { - - // LOG.debug("generateHashMultiSetResultMultiValue allMatchesIndex " + allMatchesIndex + " duplicateCount " + duplicateCount + " count " + count); + private void generateHashMultiSetResultMultiValue(VectorizedRowBatch batch, int logical, + int duplicateCount, boolean selectedInUse, int[] selected, long count) throws HiveException, IOException { // TODO: Look at repeating optimizations... for (int i = 0; i < duplicateCount; i++) { - int batchIndex = allMatchs[allMatchesIndex + i]; + int batchIndex = (selectedInUse ? selected[i] : i); for (long l = 0; l < count; l++) { @@ -265,70 +317,4 @@ private void generateHashMultiSetResultMultiValue(VectorizedRowBatch batch, } } } - - /** - * Generate the inner big table only join output results for one vectorized row batch with - * a repeated key. - * - * @param batch - * The big table batch with any matching and any non matching rows both as - * selected in use. - * @param hashMultiSetResult - * The hash multi-set results for the batch. 
- */ - protected int generateHashMultiSetResultRepeatedAll(VectorizedRowBatch batch, - VectorMapJoinHashMultiSetResult hashMultiSetResult) throws HiveException { - - long count = hashMultiSetResult.count(); - - if (batch.selectedInUse) { - // The selected array is already filled in as we want it. - } else { - int[] selected = batch.selected; - for (int i = 0; i < batch.size; i++) { - selected[i] = i; - } - batch.selectedInUse = true; - } - - do { - forwardBigTableBatch(batch); - count--; - } while (count > 0); - - // We forwarded the batch in this method. - return 0; - } - - protected void finishInnerBigOnlyRepeated(VectorizedRowBatch batch, JoinUtil.JoinResult joinResult, - VectorMapJoinHashMultiSetResult hashMultiSetResult) throws HiveException, IOException { - - switch (joinResult) { - case MATCH: - - if (bigTableValueExpressions != null) { - // Run our value expressions over whole batch. - for(VectorExpression ve: bigTableValueExpressions) { - ve.evaluate(batch); - } - } - - // Generate special repeated case. - int numSel = generateHashMultiSetResultRepeatedAll(batch, hashMultiSetResult); - batch.size = numSel; - batch.selectedInUse = true; - break; - - case SPILL: - // Whole batch is spilled. - spillBatchRepeated(batch, (VectorMapJoinHashTableResult) hashMultiSetResult); - batch.size = 0; - break; - - case NOMATCH: - // No match for entire batch. - batch.size = 0; - break; - } - } } diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinInnerBigOnlyLongOperator.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinInnerBigOnlyLongOperator.java index 0bba141..a7fd9e2 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinInnerBigOnlyLongOperator.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinInnerBigOnlyLongOperator.java @@ -24,19 +24,17 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.hive.ql.CompilationOpContext; -import org.apache.hadoop.hive.ql.exec.JoinUtil; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashMultiSetResult; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableFind; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableResult; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableResult.MapJoinResult; import org.apache.hadoop.hive.ql.exec.vector.VectorizationContext; import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpression; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashTableResult; +import org.apache.hadoop.hive.ql.exec.vector.keyseries.VectorKeySeriesLong; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.plan.OperatorDesc; - -// Single-Column Long hash table import. -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinLongHashMultiSet; - -// Single-Column Long specific imports. 
-import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector; +import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; /* * Specialized class for doing a vectorized map join that is an inner join on a Single-Column Long @@ -45,8 +43,17 @@ public class VectorMapJoinInnerBigOnlyLongOperator extends VectorMapJoinInnerBigOnlyGenerateResultOperator { private static final long serialVersionUID = 1L; - private static final Logger LOG = LoggerFactory.getLogger(VectorMapJoinInnerBigOnlyLongOperator.class.getName()); + + //------------------------------------------------------------------------------------------------ + private static final String CLASS_NAME = VectorMapJoinInnerBigOnlyLongOperator.class.getName(); + private static final Logger LOG = LoggerFactory.getLogger(CLASS_NAME); + + protected String getLoggingPrefix() { + return super.getLoggingPrefix(CLASS_NAME); + } + + //------------------------------------------------------------------------------------------------ // (none) @@ -54,8 +61,10 @@ // transient. //--------------------------------------------------------------------------- - // The hash map for this specialized class. - private transient VectorMapJoinLongHashMultiSet hashMultiSet; + // CONSIDER using final class VectorMapJoinFastLongHashMultiSet object for better performance by + // avoiding using an interface (MapJoinHashTableFind). Would require Hybrid versions of + // operators that use final HybridHashTableContainer object.. + private transient MapJoinHashTableFind hashMultiSet; //--------------------------------------------------------------------------- // Single-Column Long specific members. @@ -68,6 +77,10 @@ // The column number for this one column join specialization. private transient int singleJoinColumn; + private transient PrimitiveTypeInfo singleJoinColumnPrimitiveTypeInfo; + + // The object that determines equal key series. It is a final class for good performance. + private transient VectorKeySeriesLong longKeySeries; //--------------------------------------------------------------------------- // Pass-thru constructors. @@ -108,6 +121,10 @@ public void process(Object row, int tag) throws HiveException { */ singleJoinColumn = bigTableKeyColumnMap[0]; + singleJoinColumnPrimitiveTypeInfo = (PrimitiveTypeInfo) bigTableKeyTypeInfos[0]; + + longKeySeries = + new VectorKeySeriesLong(singleJoinColumn, singleJoinColumnPrimitiveTypeInfo); needCommonSetup = false; } @@ -119,8 +136,7 @@ public void process(Object row, int tag) throws HiveException { /* * Get our Single-Column Long hash multi-set information for this specialized class. */ - - hashMultiSet = (VectorMapJoinLongHashMultiSet) vectorMapJoinHashTable; + hashMultiSet = vectorMapJoinHashTableFind; useMinMax = hashMultiSet.useMinMax(); if (useMinMax) { min = hashMultiSet.min(); @@ -131,6 +147,7 @@ public void process(Object row, int tag) throws HiveException { } batchCounter++; + inputRowCounter += batch.size; // Do the per-batch setup for an inner big-only join. @@ -142,9 +159,9 @@ public void process(Object row, int tag) throws HiveException { ve.evaluate(batch); } - final int inputLogicalSize = batch.size; + final int filteredSize = batch.size; - if (inputLogicalSize == 0) { + if (filteredSize == 0) { if (isLogDebugEnabled) { LOG.debug(CLASS_NAME + " batch #" + batchCounter + " empty"); } @@ -158,230 +175,116 @@ public void process(Object row, int tag) throws HiveException { } } - /* - * Single-Column Long specific declarations. 
- */ + longKeySeries.processBatch(batch); - // The one join column for this specialized class. - LongColumnVector joinColVector = (LongColumnVector) batch.cols[singleJoinColumn]; - long[] vector = joinColVector.vector; + MapJoinHashMultiSetResult hashMultiSetResult; + if (longKeySeries.keyCount == 1) { - /* - * Single-Column Long check for repeating. - */ + // Effectively, one repeating key. - // Check single column for repeating. - boolean allKeyInputColumnsRepeating = joinColVector.isRepeating; + if (longKeySeries.currentKeyIsNull) { - if (allKeyInputColumnsRepeating) { + // CONSIDER: Add support for NullSafe option. - /* - * Repeating. - */ - - // All key input columns are repeating. Generate key once. Lookup once. - // Since the key is repeated, we must use entry 0 regardless of selectedInUse. - - /* - * Single-Column Long specific repeated lookup. - */ - - JoinUtil.JoinResult joinResult; - if (!joinColVector.noNulls && joinColVector.isNull[0]) { - joinResult = JoinUtil.JoinResult.NOMATCH; + batch.size = 0; } else { - long key = vector[0]; - if (useMinMax && (key < min || key > max)) { - // Out of range for whole batch. - joinResult = JoinUtil.JoinResult.NOMATCH; + long repeatedKey = longKeySeries.getCurrentKey(); + if (useMinMax && (repeatedKey < min || repeatedKey > max)) { + batch.size = 0; } else { - joinResult = hashMultiSet.contains(key, hashMultiSetResults[0]); + hashMultiSetResult = hashMultiSetResults[0]; + hashMultiSet.hashMultiSetContains( + repeatedKey, + longKeySeries.currentHashCode, + hashMultiSetResult); + if (hashMultiSetResult.getMapJoinResult() != MapJoinResult.NO_MATCH) { + finishInnerBigOnlyRepeated(batch, longKeySeries, hashMultiSetResult); + } else { + batch.size = 0; + } } } - - /* - * Common repeated join result processing. - */ - - if (isLogDebugEnabled) { - LOG.debug(CLASS_NAME + " batch #" + batchCounter + " repeated joinResult " + joinResult.name()); - } - finishInnerBigOnlyRepeated(batch, joinResult, hashMultiSetResults[0]); } else { - /* - * NOT Repeating. - */ - - if (isLogDebugEnabled) { - LOG.debug(CLASS_NAME + " batch #" + batchCounter + " non-repeated"); - } - - // We remember any matching rows in matchs / matchSize. At the end of the loop, - // selected / batch.size will represent both matching and non-matching rows for outer join. - // Only deferred rows will have been removed from selected. - int selected[] = batch.selected; - boolean selectedInUse = batch.selectedInUse; - + int matchSeriesCount = 0; + int spillSeriesCount = 0; int hashMultiSetResultCount = 0; - int allMatchCount = 0; - int equalKeySeriesCount = 0; - int spillCount = 0; - /* - * Single-Column Long specific variables. - */ - - long saveKey = 0; + MapJoinHashTableResult.MapJoinResult containsResult; + long key; + do { + // Use the next hash multi-set result entry. + hashMultiSetResult = hashMultiSetResults[hashMultiSetResultCount]; - // We optimize performance by only looking up the first key in a series of equal keys. - boolean haveSaveKey = false; - JoinUtil.JoinResult saveJoinResult = JoinUtil.JoinResult.NOMATCH; + if (longKeySeries.currentKeyIsNull) { - // Logical loop over the rows in the batch since the batch may have selected in use. - for (int logical = 0; logical < inputLogicalSize; logical++) { - int batchIndex = (selectedInUse ? selected[logical] : logical); + // CONSIDER: Add support for NullSafe option. - /* - * Single-Column Long get key. 
- */ + containsResult = MapJoinHashTableResult.MapJoinResult.NO_MATCH; - long currentKey; - boolean isNull; - if (!joinColVector.noNulls && joinColVector.isNull[batchIndex]) { - currentKey = 0; - isNull = true; } else { - currentKey = vector[batchIndex]; - isNull = false; - } - - /* - * Equal key series checking. - */ - - if (isNull || !haveSaveKey || currentKey != saveKey) { - - // New key. - - if (haveSaveKey) { - // Move on with our counts. - switch (saveJoinResult) { - case MATCH: - // We have extracted the count from the hash multi-set result, so we don't keep it. - equalKeySeriesCount++; - break; - case SPILL: - // We keep the hash multi-set result for its spill information. - hashMultiSetResultCount++; - break; - case NOMATCH: - break; - } - } - if (isNull) { - saveJoinResult = JoinUtil.JoinResult.NOMATCH; - haveSaveKey = false; + key = longKeySeries.getCurrentKey(); + if (useMinMax && (key < min || key > max)) { + // Out of range for whole batch. + containsResult = MapJoinHashTableResult.MapJoinResult.NO_MATCH; } else { - // Regardless of our matching result, we keep that information to make multiple use - // of it for a possible series of equal keys. - haveSaveKey = true; - - /* - * Single-Column Long specific save key. - */ - - saveKey = currentKey; - - /* - * Single-Column Long specific lookup key. - */ - - if (useMinMax && (currentKey < min || currentKey > max)) { - // Key out of range for whole hash table. - saveJoinResult = JoinUtil.JoinResult.NOMATCH; - } else { - saveJoinResult = hashMultiSet.contains(currentKey, hashMultiSetResults[hashMultiSetResultCount]); - } + hashMultiSet.hashMultiSetContains( + key, + longKeySeries.currentHashCode, + hashMultiSetResult); + containsResult = hashMultiSetResult.getMapJoinResult(); } - /* - * Common inner big-only join result processing. - */ - - switch (saveJoinResult) { - case MATCH: - equalKeySeriesValueCounts[equalKeySeriesCount] = hashMultiSetResults[hashMultiSetResultCount].count(); - equalKeySeriesAllMatchIndices[equalKeySeriesCount] = allMatchCount; - equalKeySeriesDuplicateCounts[equalKeySeriesCount] = 1; - allMatchs[allMatchCount++] = batchIndex; - // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " MATCH isSingleValue " + equalKeySeriesIsSingleValue[equalKeySeriesCount] + " currentKey " + currentKey); - break; - - case SPILL: - spills[spillCount] = batchIndex; - spillHashMapResultIndices[spillCount] = hashMultiSetResultCount; - spillCount++; - break; - - case NOMATCH: - // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " NOMATCH" + " currentKey " + currentKey); - break; - } - } else { - // Series of equal keys. - - switch (saveJoinResult) { - case MATCH: - equalKeySeriesDuplicateCounts[equalKeySeriesCount]++; - allMatchs[allMatchCount++] = batchIndex; - // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " MATCH duplicate"); - break; - - case SPILL: - spills[spillCount] = batchIndex; - spillHashMapResultIndices[spillCount] = hashMultiSetResultCount; - spillCount++; - break; - - case NOMATCH: - // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " NOMATCH duplicate"); - break; - } } - } - if (haveSaveKey) { - // Update our counts for the last key. - switch (saveJoinResult) { + /* + * Common inner join result processing. + */ + + switch (containsResult) { case MATCH: - // We have extracted the count from the hash multi-set result, so we don't keep it. 
- equalKeySeriesCount++; + // We have extracted the existence and count from the hash multi-set result, so we don't + // keep it. + matchLogicalIndices[matchSeriesCount] = longKeySeries.currentLogical; + matchDuplicateCounts[matchSeriesCount] = longKeySeries.currentDuplicateCount; + matchValueCounts[matchSeriesCount] = hashMultiSetResult.count(); + matchSeriesCount++; break; + case SPILL: - // We keep the hash multi-set result for its spill information. + spillLogicalIndices[spillSeriesCount] = longKeySeries.currentLogical; + spillDuplicateCounts[spillSeriesCount] = longKeySeries.currentDuplicateCount; + spillHashMultiSetResults[spillSeriesCount] = hashMultiSetResult; + spillSeriesCount++; hashMultiSetResultCount++; + spilledRowCounter += longKeySeries.currentDuplicateCount; break; - case NOMATCH: + + case NO_MATCH: break; + + default: + throw new RuntimeException("Unexpected contains result " + containsResult.name()); } - } + + if (!longKeySeries.next()) { + break; + } + } while (true); if (isLogDebugEnabled) { - LOG.debug(CLASS_NAME + - " allMatchs " + intArrayToRangesString(allMatchs, allMatchCount) + - " equalKeySeriesValueCounts " + longArrayToRangesString(equalKeySeriesValueCounts, equalKeySeriesCount) + - " equalKeySeriesAllMatchIndices " + intArrayToRangesString(equalKeySeriesAllMatchIndices, equalKeySeriesCount) + - " equalKeySeriesDuplicateCounts " + intArrayToRangesString(equalKeySeriesDuplicateCounts, equalKeySeriesCount) + - " spills " + intArrayToRangesString(spills, spillCount) + - " spillHashMapResultIndices " + intArrayToRangesString(spillHashMapResultIndices, spillCount) + - " hashMapResults " + Arrays.toString(Arrays.copyOfRange(hashMultiSetResults, 0, hashMultiSetResultCount))); + LOG.debug(CLASS_NAME + " batch #" + batchCounter + " batch info " + + " filteredSize " + filteredSize + + " matchSeriesCount " + matchSeriesCount + + " matchLogicalIndices " + intArrayToRangesString(matchLogicalIndices, matchSeriesCount) + + " matchDuplicateCounts " + Arrays.toString(Arrays.copyOfRange(matchDuplicateCounts, 0, matchSeriesCount)) + + " spillSeriesCount " + spillSeriesCount + + " spillLogicalIndices " + intArrayToRangesString(spillLogicalIndices, spillSeriesCount) + + " spillDuplicateCounts " + Arrays.toString(Arrays.copyOfRange(spillDuplicateCounts, 0, spillSeriesCount))); } - finishInnerBigOnly(batch, - allMatchCount, equalKeySeriesCount, spillCount, - (VectorMapJoinHashTableResult[]) hashMultiSetResults, hashMultiSetResultCount); + finishInnerBigOnly(batch, matchSeriesCount, spillSeriesCount); } if (batch.size > 0) { diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinInnerBigOnlyMultiKeyOperator.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinInnerBigOnlyMultiKeyOperator.java index 621804b..7ce62da 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinInnerBigOnlyMultiKeyOperator.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinInnerBigOnlyMultiKeyOperator.java @@ -24,22 +24,22 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.hive.ql.CompilationOpContext; -import org.apache.hadoop.hive.ql.exec.JoinUtil; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashMultiSetResult; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableFind; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableResult; +import 
org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableResult.MapJoinResult; import org.apache.hadoop.hive.ql.exec.vector.VectorizationContext; import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpression; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashTableResult; +import org.apache.hadoop.hive.ql.exec.vector.keyseries.fast.VectorKeySeriesMultiFast; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.plan.OperatorDesc; -// Multi-Key hash table import. -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinBytesHashMultiSet; - // Multi-Key specific imports. -import org.apache.hadoop.hive.ql.exec.vector.VectorSerializeRow; -import org.apache.hadoop.hive.serde2.ByteStream.Output; import org.apache.hadoop.hive.serde2.binarysortable.fast.BinarySortableSerializeWrite; +import com.google.common.base.Preconditions; + /* * Specialized class for doing a vectorized map join that is an inner join on Multi-Key * and only big table columns appear in the join result so a hash multi-set is used. @@ -48,8 +48,17 @@ public class VectorMapJoinInnerBigOnlyMultiKeyOperator extends VectorMapJoinInnerBigOnlyGenerateResultOperator { private static final long serialVersionUID = 1L; - private static final Logger LOG = LoggerFactory.getLogger(VectorMapJoinInnerBigOnlyMultiKeyOperator.class.getName()); + + //------------------------------------------------------------------------------------------------ + private static final String CLASS_NAME = VectorMapJoinInnerBigOnlyMultiKeyOperator.class.getName(); + private static final Logger LOG = LoggerFactory.getLogger(CLASS_NAME); + + protected String getLoggingPrefix() { + return super.getLoggingPrefix(CLASS_NAME); + } + + //------------------------------------------------------------------------------------------------ // (none) @@ -57,22 +66,20 @@ // transient. //--------------------------------------------------------------------------- - // The hash map for this specialized class. - private transient VectorMapJoinBytesHashMultiSet hashMultiSet; + // CONSIDER using final class VectorMapJoinFastMultiKeyHashMultiSet object for better performance by + // avoiding using an interface (MapJoinHashTableFind). Would require Hybrid versions of + // operators that use final HybridHashTableContainer object.. + private transient MapJoinHashTableFind hashMultiSet; //--------------------------------------------------------------------------- // Multi-Key specific members. // - // Object that can take a set of columns in row in a vectorized row batch and serialized it. - // Known to not have any nulls. - private transient VectorSerializeRow keyVectorSerializeWrite; + // Binary sortable multi-key serializer. + private transient BinarySortableSerializeWrite binarySortableSerializeWrite; - // The BinarySortable serialization of the current key. - private transient Output currentKeyOutput; - - // The BinarySortable serialization of the saved key for a possible series of equal keys. - private transient Output saveKeyOutput; + // The object that determines equal key series. It is a final class for good performance. + private transient VectorKeySeriesMultiFast serializedMultiKeySeries; //--------------------------------------------------------------------------- // Pass-thru constructors. 
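The Multi-Key variant above replaces the per-row VectorSerializeRow/Output pair with a single binary-sortable serializer plus a serialized key series, so each distinct key is serialized and hashed once and equal keys are detected by comparing byte ranges. Below is a minimal sketch of that idea using plain byte arrays; the serializeKey method and the placeholder hash are invented for illustration and are not Hive's binary-sortable format or the hash used by the Fast hash tables.

import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;

// Illustrative sketch: serialize a multi-column key once per row, then detect
// equal-key runs by byte comparison and hash each distinct key only once.
public class SerializedMultiKeySketch {

  // Placeholder serialization: fixed-width long column followed by a
  // length-prefixed string column. Not Hive's BinarySortable encoding.
  static byte[] serializeKey(long col0, String col1) {
    byte[] str = col1.getBytes(StandardCharsets.UTF_8);
    ByteBuffer buf = ByteBuffer.allocate(8 + 4 + str.length);
    buf.putLong(col0);
    buf.putInt(str.length);
    buf.put(str);
    return buf.array();
  }

  public static void main(String[] args) {
    long[] col0 = {1, 1, 1, 2};
    String[] col1 = {"a", "a", "b", "b"};

    byte[] previous = null;
    int runLength = 0;
    for (int row = 0; row <= col0.length; row++) {
      byte[] current = row < col0.length ? serializeKey(col0[row], col1[row]) : null;
      if (previous != null && (current == null || !Arrays.equals(previous, current))) {
        // One hash computation and one lookup would be issued per run, not per row.
        System.out.println("key hash=" + Arrays.hashCode(previous) + " duplicateCount=" + runLength);
        runLength = 0;
      }
      previous = current;
      runLength++;
    }
  }
}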
@@ -112,12 +119,16 @@ public void process(Object row, int tag) throws HiveException { * Initialize Multi-Key members for this specialized class. */ - keyVectorSerializeWrite = new VectorSerializeRow( - new BinarySortableSerializeWrite(bigTableKeyColumnMap.length)); - keyVectorSerializeWrite.init(bigTableKeyTypeNames, bigTableKeyColumnMap); + binarySortableSerializeWrite = + new BinarySortableSerializeWrite(bigTableKeyColumnMap.length); - currentKeyOutput = new Output(); - saveKeyOutput = new Output(); + // For Multi-Key the Fast hash table computes the serialized key's hash code, + // so we don't use VectorKeySeriesMulti's hash code. + serializedMultiKeySeries = + new VectorKeySeriesMultiFast( + binarySortableSerializeWrite); + Preconditions.checkState(bigTableKeyTypeInfos.length > 0); + serializedMultiKeySeries.init(bigTableKeyTypeInfos, bigTableKeyColumnMap); needCommonSetup = false; } @@ -130,12 +141,13 @@ public void process(Object row, int tag) throws HiveException { * Get our Multi-Key hash multi-set information for this specialized class. */ - hashMultiSet = (VectorMapJoinBytesHashMultiSet) vectorMapJoinHashTable; + hashMultiSet = vectorMapJoinHashTableFind; needHashTableSetup = false; } batchCounter++; + inputRowCounter += batch.size; // Do the per-batch setup for an inner big-only join. @@ -147,9 +159,9 @@ public void process(Object row, int tag) throws HiveException { ve.evaluate(batch); } - final int inputLogicalSize = batch.size; + final int filteredSize = batch.size; - if (inputLogicalSize == 0) { + if (filteredSize == 0) { if (isLogDebugEnabled) { LOG.debug(CLASS_NAME + " batch #" + batchCounter + " empty"); } @@ -163,232 +175,111 @@ public void process(Object row, int tag) throws HiveException { } } - /* - * Multi-Key specific declarations. - */ - - // None. - - /* - * Multi-Key check for repeating. - */ - - // If all BigTable input columns to key expressions are isRepeating, then - // calculate key once; lookup once. - boolean allKeyInputColumnsRepeating; - if (bigTableKeyColumnMap.length == 0) { - allKeyInputColumnsRepeating = false; - } else { - allKeyInputColumnsRepeating = true; - for (int i = 0; i < bigTableKeyColumnMap.length; i++) { - if (!batch.cols[bigTableKeyColumnMap[i]].isRepeating) { - allKeyInputColumnsRepeating = false; - break; - } - } - } + serializedMultiKeySeries.processBatch(batch); - if (allKeyInputColumnsRepeating) { + MapJoinHashMultiSetResult hashMultiSetResult; + if (serializedMultiKeySeries.keyCount == 1) { - /* - * Repeating. - */ + // Effectively, one repeating key. - // All key input columns are repeating. Generate key once. Lookup once. - // Since the key is repeated, we must use entry 0 regardless of selectedInUse. + // NOTE: Any null column in the key columns is a non-match. + if (serializedMultiKeySeries.currentKeyIsNull) { - /* - * Multi-Key specific repeated lookup. - */ + // CONSIDER: Add support for NullSafe option. - keyVectorSerializeWrite.setOutput(currentKeyOutput); - keyVectorSerializeWrite.serializeWrite(batch, 0); - JoinUtil.JoinResult joinResult; - if (keyVectorSerializeWrite.getHasAnyNulls()) { - joinResult = JoinUtil.JoinResult.NOMATCH; + batch.size = 0; } else { - byte[] keyBytes = currentKeyOutput.getData(); - int keyLength = currentKeyOutput.getLength(); - joinResult = hashMultiSet.contains(keyBytes, 0, keyLength, hashMultiSetResults[0]); - } - - /* - * Common repeated join result processing. 
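Because only big-table columns appear in the join output of these "big-only" operators, the small table is loaded as a multi-set that records just a count per key; each matching big-table row is then emitted count times with no small-table values to copy. A minimal sketch of that contract follows, with invented names rather than the MapJoinHashMultiSetResult API.

import java.util.HashMap;
import java.util.Map;

// Illustrative sketch: an inner "big-only" join needs only a per-key count from
// the small table, so each matching big-table row is repeated count times.
public class BigOnlyMultiSetSketch {

  private final Map<Long, Long> counts = new HashMap<>();

  void addSmallTableKey(long key) {
    counts.merge(key, 1L, Long::sum);
  }

  /** Returns how many output rows one matching big-table row produces (0 = no match). */
  long contains(long bigTableKey) {
    return counts.getOrDefault(bigTableKey, 0L);
  }

  public static void main(String[] args) {
    BigOnlyMultiSetSketch multiSet = new BigOnlyMultiSetSketch();
    multiSet.addSmallTableKey(5L);
    multiSet.addSmallTableKey(5L);
    multiSet.addSmallTableKey(8L);

    long[] bigTableKeys = {5L, 6L, 8L};
    long outputRows = 0;
    for (long key : bigTableKeys) {
      outputRows += multiSet.contains(key);   // 2 + 0 + 1
    }
    System.out.println("output rows = " + outputRows);  // 3
  }
}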
- */ - - if (isLogDebugEnabled) { - LOG.debug(CLASS_NAME + " batch #" + batchCounter + " repeated joinResult " + joinResult.name()); + hashMultiSetResult = hashMultiSetResults[0]; + hashMultiSet.hashMultiSetContains( + serializedMultiKeySeries.serializedBytes, + serializedMultiKeySeries.serializedStart, + serializedMultiKeySeries.serializedLength, + serializedMultiKeySeries.currentHashCode, + hashMultiSetResult); + if (hashMultiSetResult.getMapJoinResult() != MapJoinResult.NO_MATCH) { + finishInnerBigOnlyRepeated(batch, serializedMultiKeySeries, hashMultiSetResult); + } else { + batch.size = 0; + } } - finishInnerBigOnlyRepeated(batch, joinResult, hashMultiSetResults[0]); } else { - /* - * NOT Repeating. - */ - - if (isLogDebugEnabled) { - LOG.debug(CLASS_NAME + " batch #" + batchCounter + " non-repeated"); - } - - // We remember any matching rows in matchs / matchSize. At the end of the loop, - // selected / batch.size will represent both matching and non-matching rows for outer join. - // Only deferred rows will have been removed from selected. - int selected[] = batch.selected; - boolean selectedInUse = batch.selectedInUse; - + int matchSeriesCount = 0; + int spillSeriesCount = 0; int hashMultiSetResultCount = 0; - int allMatchCount = 0; - int equalKeySeriesCount = 0; - int spillCount = 0; - /* - * Multi-Key specific variables. - */ + MapJoinHashTableResult.MapJoinResult containsResult; + do { + // Use the next hash multi-set result entry. + hashMultiSetResult = hashMultiSetResults[hashMultiSetResultCount]; - Output temp; + // NOTE: Any null column in the key columns is a non-match. + if (serializedMultiKeySeries.currentKeyIsNull) { - // We optimize performance by only looking up the first key in a series of equal keys. - boolean haveSaveKey = false; - JoinUtil.JoinResult saveJoinResult = JoinUtil.JoinResult.NOMATCH; + // CONSIDER: Add support for NullSafe option. - // Logical loop over the rows in the batch since the batch may have selected in use. - for (int logical = 0; logical < inputLogicalSize; logical++) { - int batchIndex = (selectedInUse ? selected[logical] : logical); + containsResult = MapJoinHashTableResult.MapJoinResult.NO_MATCH; - /* - * Multi-Key get key. - */ + LOG.info(CLASS_NAME + " logical " + serializedMultiKeySeries.currentLogical + " hasAnyNulls true"); + } else { + + hashMultiSet.hashMultiSetContains( + serializedMultiKeySeries.serializedBytes, + serializedMultiKeySeries.serializedStart, + serializedMultiKeySeries.serializedLength, + serializedMultiKeySeries.currentHashCode, + hashMultiSetResult); + containsResult = hashMultiSetResult.getMapJoinResult(); - // Generate binary sortable key for current row in vectorized row batch. - keyVectorSerializeWrite.setOutput(currentKeyOutput); - keyVectorSerializeWrite.serializeWrite(batch, batchIndex); - boolean isAnyNulls = keyVectorSerializeWrite.getHasAnyNulls(); + } /* - * Equal key series checking. + * Common inner join result processing. */ - if (isAnyNulls || !haveSaveKey || !saveKeyOutput.arraysEquals(currentKeyOutput)) { - - // New key. - - if (haveSaveKey) { - // Move on with our counts. - switch (saveJoinResult) { - case MATCH: - // We have extracted the count from the hash multi-set result, so we don't keep it. - equalKeySeriesCount++; - break; - case SPILL: - // We keep the hash multi-set result for its spill information. 
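The common result processing in this hunk records each key series in one of two sets of pre-allocated parallel arrays: matches keep the logical start, duplicate count, and value count, while spills additionally keep their hash-table result for a later re-join pass. The skeleton below shows the shape of that bookkeeping with plain ints; names such as SeriesClassifierSketch and SeriesOutcome are illustrative only.

// Illustrative skeleton of the match/spill bookkeeping used in the loops above:
// per key series we record its logical start and duplicate count; matches also
// keep a value count, spills are counted toward a spilled-row total.
public class SeriesClassifierSketch {

  enum SeriesOutcome { MATCH, SPILL, NO_MATCH }

  // Pre-allocated once (in Hive, sized to the default batch size) and reused per batch.
  final int[] matchLogicalIndices = new int[1024];
  final int[] matchDuplicateCounts = new int[1024];
  final long[] matchValueCounts = new long[1024];
  int matchSeriesCount;

  final int[] spillLogicalIndices = new int[1024];
  final int[] spillDuplicateCounts = new int[1024];
  int spillSeriesCount;
  long spilledRowCounter;

  void beginBatch() {
    matchSeriesCount = 0;
    spillSeriesCount = 0;
  }

  void recordSeries(SeriesOutcome outcome, int logicalStart, int duplicateCount, long valueCount) {
    switch (outcome) {
    case MATCH:
      matchLogicalIndices[matchSeriesCount] = logicalStart;
      matchDuplicateCounts[matchSeriesCount] = duplicateCount;
      matchValueCounts[matchSeriesCount] = valueCount;
      matchSeriesCount++;
      break;
    case SPILL:
      spillLogicalIndices[spillSeriesCount] = logicalStart;
      spillDuplicateCounts[spillSeriesCount] = duplicateCount;
      spillSeriesCount++;
      spilledRowCounter += duplicateCount;
      break;
    case NO_MATCH:
      break;          // nothing to remember; the rows are simply filtered out
    default:
      throw new IllegalStateException("Unexpected outcome " + outcome);
    }
  }
}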
- hashMultiSetResultCount++; - break; - case NOMATCH: - break; - } - } - - if (isAnyNulls) { - saveJoinResult = JoinUtil.JoinResult.NOMATCH; - haveSaveKey = false; - } else { - // Regardless of our matching result, we keep that information to make multiple use - // of it for a possible series of equal keys. - haveSaveKey = true; - - /* - * Multi-Key specific save key. - */ - - temp = saveKeyOutput; - saveKeyOutput = currentKeyOutput; - currentKeyOutput = temp; - - /* - * Single-Column Long specific lookup key. - */ - - byte[] keyBytes = saveKeyOutput.getData(); - int keyLength = saveKeyOutput.getLength(); - saveJoinResult = hashMultiSet.contains(keyBytes, 0, keyLength, hashMultiSetResults[hashMultiSetResultCount]); - } - - /* - * Common inner big-only join result processing. - */ - - switch (saveJoinResult) { - case MATCH: - equalKeySeriesValueCounts[equalKeySeriesCount] = hashMultiSetResults[hashMultiSetResultCount].count(); - equalKeySeriesAllMatchIndices[equalKeySeriesCount] = allMatchCount; - equalKeySeriesDuplicateCounts[equalKeySeriesCount] = 1; - allMatchs[allMatchCount++] = batchIndex; - // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " MATCH isSingleValue " + equalKeySeriesIsSingleValue[equalKeySeriesCount] + " currentKey " + currentKey); - break; - - case SPILL: - spills[spillCount] = batchIndex; - spillHashMapResultIndices[spillCount] = hashMultiSetResultCount; - spillCount++; - break; - - case NOMATCH: - // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " NOMATCH" + " currentKey " + currentKey); - break; - } - } else { - // Series of equal keys. - - switch (saveJoinResult) { - case MATCH: - equalKeySeriesDuplicateCounts[equalKeySeriesCount]++; - allMatchs[allMatchCount++] = batchIndex; - // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " MATCH duplicate"); - break; - - case SPILL: - spills[spillCount] = batchIndex; - spillHashMapResultIndices[spillCount] = hashMultiSetResultCount; - spillCount++; - break; - - case NOMATCH: - // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " NOMATCH duplicate"); - break; - } - } - } - - if (haveSaveKey) { - // Update our counts for the last key. - switch (saveJoinResult) { + switch (containsResult) { case MATCH: - // We have extracted the count from the hash multi-set result, so we don't keep it. - equalKeySeriesCount++; + // We have extracted the existence and count from the hash multi-set result, so we don't + // keep it. + matchLogicalIndices[matchSeriesCount] = serializedMultiKeySeries.currentLogical; + matchDuplicateCounts[matchSeriesCount] = serializedMultiKeySeries.currentDuplicateCount; + matchValueCounts[matchSeriesCount] = hashMultiSetResult.count(); + matchSeriesCount++; break; + case SPILL: - // We keep the hash multi-set result for its spill information. 
+ spillLogicalIndices[spillSeriesCount] = serializedMultiKeySeries.currentLogical; + spillDuplicateCounts[spillSeriesCount] = serializedMultiKeySeries.currentDuplicateCount; + spillHashMultiSetResults[spillSeriesCount] = hashMultiSetResult; + spillSeriesCount++; hashMultiSetResultCount++; + spilledRowCounter += serializedMultiKeySeries.currentDuplicateCount; break; - case NOMATCH: + + case NO_MATCH: break; + + default: + throw new RuntimeException("Unexpected contains result " + containsResult.name()); } - } + + if (!serializedMultiKeySeries.next()) { + break; + } + } while (true); if (isLogDebugEnabled) { - LOG.debug(CLASS_NAME + - " allMatchs " + intArrayToRangesString(allMatchs, allMatchCount) + - " equalKeySeriesValueCounts " + longArrayToRangesString(equalKeySeriesValueCounts, equalKeySeriesCount) + - " equalKeySeriesAllMatchIndices " + intArrayToRangesString(equalKeySeriesAllMatchIndices, equalKeySeriesCount) + - " equalKeySeriesDuplicateCounts " + intArrayToRangesString(equalKeySeriesDuplicateCounts, equalKeySeriesCount) + - " spills " + intArrayToRangesString(spills, spillCount) + - " spillHashMapResultIndices " + intArrayToRangesString(spillHashMapResultIndices, spillCount) + - " hashMapResults " + Arrays.toString(Arrays.copyOfRange(hashMultiSetResults, 0, hashMultiSetResultCount))); + LOG.debug(CLASS_NAME + " batch #" + batchCounter + " batch info " + + " filteredSize " + filteredSize + + " matchSeriesCount " + matchSeriesCount + + " matchLogicalIndices " + intArrayToRangesString(matchLogicalIndices, matchSeriesCount) + + " matchDuplicateCounts " + Arrays.toString(Arrays.copyOfRange(matchDuplicateCounts, 0, matchSeriesCount)) + + " spillSeriesCount " + spillSeriesCount + + " spillLogicalIndices " + intArrayToRangesString(spillLogicalIndices, spillSeriesCount) + + " spillDuplicateCounts " + Arrays.toString(Arrays.copyOfRange(spillDuplicateCounts, 0, spillSeriesCount))); } - finishInnerBigOnly(batch, - allMatchCount, equalKeySeriesCount, spillCount, - (VectorMapJoinHashTableResult[]) hashMultiSetResults, hashMultiSetResultCount); + finishInnerBigOnly(batch, matchSeriesCount, spillSeriesCount); } if (batch.size > 0) { @@ -402,4 +293,9 @@ public void process(Object row, int tag) throws HiveException { throw new HiveException(e); } } + + private String getCurentLogical() { + // TODO Auto-generated method stub + return null; + } } diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinInnerBigOnlyStringOperator.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinInnerBigOnlyStringOperator.java index 10e75ab..23000b7 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinInnerBigOnlyStringOperator.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinInnerBigOnlyStringOperator.java @@ -24,20 +24,18 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.hive.ql.CompilationOpContext; -import org.apache.hadoop.hive.ql.exec.JoinUtil; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashMultiSetResult; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableFind; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableResult; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableResult.MapJoinResult; import org.apache.hadoop.hive.ql.exec.vector.VectorizationContext; import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; import 
org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpression; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashTableResult; +import org.apache.hadoop.hive.ql.exec.vector.keyseries.VectorKeySeriesBytes; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.plan.OperatorDesc; // Single-Column String hash table import. -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinBytesHashMultiSet; - -// Single-Column String specific imports. -import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; -import org.apache.hadoop.hive.ql.exec.vector.expressions.StringExpr; /* * Specialized class for doing a vectorized map join that is an inner join on a Single-Column String @@ -46,8 +44,17 @@ public class VectorMapJoinInnerBigOnlyStringOperator extends VectorMapJoinInnerBigOnlyGenerateResultOperator { private static final long serialVersionUID = 1L; - private static final Logger LOG = LoggerFactory.getLogger(VectorMapJoinInnerBigOnlyStringOperator.class.getName()); + + //------------------------------------------------------------------------------------------------ + private static final String CLASS_NAME = VectorMapJoinInnerBigOnlyStringOperator.class.getName(); + private static final Logger LOG = LoggerFactory.getLogger(CLASS_NAME); + + protected String getLoggingPrefix() { + return super.getLoggingPrefix(CLASS_NAME); + } + + //------------------------------------------------------------------------------------------------ // (none) @@ -55,8 +62,10 @@ // transient. //--------------------------------------------------------------------------- - // The hash map for this specialized class. - private transient VectorMapJoinBytesHashMultiSet hashMultiSet; + // CONSIDER using final class VectorMapJoinFastStringHashMultiSet object for better performance by + // avoiding using an interface (MapJoinHashTableFind). Would require Hybrid versions of + // operators that use final HybridHashTableContainer object.. + private transient MapJoinHashTableFind hashMultiSet; //--------------------------------------------------------------------------- // Single-Column String specific members. @@ -65,6 +74,9 @@ // The column number for this one column join specialization. private transient int singleJoinColumn; + // The object that determines equal key series. It is a final class for good performance. + private transient VectorKeySeriesBytes bytesKeySeries; + //--------------------------------------------------------------------------- // Pass-thru constructors. // @@ -105,6 +117,8 @@ public void process(Object row, int tag) throws HiveException { singleJoinColumn = bigTableKeyColumnMap[0]; + bytesKeySeries = new VectorKeySeriesBytes(singleJoinColumn); + needCommonSetup = false; } @@ -116,12 +130,13 @@ public void process(Object row, int tag) throws HiveException { * Get our Single-Column String hash multi-set information for this specialized class. */ - hashMultiSet = (VectorMapJoinBytesHashMultiSet) vectorMapJoinHashTable; + hashMultiSet = vectorMapJoinHashTableFind; needHashTableSetup = false; } batchCounter++; + inputRowCounter += batch.size; // Do the per-batch setup for an inner big-only join. 
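For the single-column string case the key series works over byte ranges of a bytes column instead of longs; adjacent keys are compared byte-for-byte so that one lookup covers a run of equal strings. A small, hedged sketch of that comparison-and-grouping step follows (plain arrays, not the VectorKeySeriesBytes class).

import java.nio.charset.StandardCharsets;

// Illustrative sketch: find runs of equal byte-range keys in a bytes column,
// so a hash lookup is issued once per run instead of once per row.
public class BytesKeyRunsSketch {

  static boolean rangesEqual(byte[] a, int aStart, int aLen, byte[] b, int bStart, int bLen) {
    if (aLen != bLen) {
      return false;
    }
    for (int i = 0; i < aLen; i++) {
      if (a[aStart + i] != b[bStart + i]) {
        return false;
      }
    }
    return true;
  }

  public static void main(String[] args) {
    String[] column = {"ann", "ann", "bob", "bob", "bob", "cat"};
    byte[][] bytes = new byte[column.length][];
    for (int i = 0; i < column.length; i++) {
      bytes[i] = column[i].getBytes(StandardCharsets.UTF_8);
    }

    int runStart = 0;
    for (int row = 1; row <= column.length; row++) {
      boolean endOfRun = row == column.length
          || !rangesEqual(bytes[runStart], 0, bytes[runStart].length, bytes[row], 0, bytes[row].length);
      if (endOfRun) {
        System.out.println("key=" + column[runStart] + " duplicateCount=" + (row - runStart));
        runStart = row;
      }
    }
  }
}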
@@ -133,9 +148,9 @@ public void process(Object row, int tag) throws HiveException { ve.evaluate(batch); } - final int inputLogicalSize = batch.size; + final int filteredSize = batch.size; - if (inputLogicalSize == 0) { + if (filteredSize == 0) { if (isLogDebugEnabled) { LOG.debug(CLASS_NAME + " batch #" + batchCounter + " empty"); } @@ -149,225 +164,108 @@ public void process(Object row, int tag) throws HiveException { } } - // We rebuild in-place the selected array with rows destine to be forwarded. - int numSel = 0; - - /* - * Single-Column String specific declarations. - */ - - // The one join column for this specialized class. - BytesColumnVector joinColVector = (BytesColumnVector) batch.cols[singleJoinColumn]; - byte[][] vector = joinColVector.vector; - int[] start = joinColVector.start; - int[] length = joinColVector.length; + bytesKeySeries.processBatch(batch); - /* - * Single-Column String check for repeating. - */ + MapJoinHashMultiSetResult hashMultiSetResult; + if (bytesKeySeries.keyCount == 1) { - // Check single column for repeating. - boolean allKeyInputColumnsRepeating = joinColVector.isRepeating; - - if (allKeyInputColumnsRepeating) { - - /* - * Repeating. - */ + // Effectively, one repeating key. - // All key input columns are repeating. Generate key once. Lookup once. - // Since the key is repeated, we must use entry 0 regardless of selectedInUse. + if (bytesKeySeries.currentKeyIsNull) { - /* - * Single-Column String specific repeated lookup. - */ + // CONSIDER: Add support for NullSafe option. - JoinUtil.JoinResult joinResult; - if (!joinColVector.noNulls && joinColVector.isNull[0]) { - joinResult = JoinUtil.JoinResult.NOMATCH; + batch.size = 0; } else { - byte[] keyBytes = vector[0]; - int keyStart = start[0]; - int keyLength = length[0]; - joinResult = hashMultiSet.contains(keyBytes, keyStart, keyLength, hashMultiSetResults[0]); - } - - /* - * Common repeated join result processing. - */ - - if (isLogDebugEnabled) { - LOG.debug(CLASS_NAME + " batch #" + batchCounter + " repeated joinResult " + joinResult.name()); + hashMultiSetResult = hashMultiSetResults[0]; + hashMultiSet.hashMultiSetContains( + bytesKeySeries.currentBytes, + bytesKeySeries.currentStart, + bytesKeySeries.currentLength, + bytesKeySeries.currentHashCode, + hashMultiSetResult); + if (hashMultiSetResult.getMapJoinResult() != MapJoinResult.NO_MATCH) { + finishInnerBigOnlyRepeated(batch, bytesKeySeries, hashMultiSetResult); + } else { + batch.size = 0; + } } - finishInnerBigOnlyRepeated(batch, joinResult, hashMultiSetResults[0]); } else { - /* - * NOT Repeating. - */ - - if (isLogDebugEnabled) { - LOG.debug(CLASS_NAME + " batch #" + batchCounter + " non-repeated"); - } - - // We remember any matching rows in matchs / matchSize. At the end of the loop, - // selected / batch.size will represent both matching and non-matching rows for outer join. - // Only deferred rows will have been removed from selected. - int selected[] = batch.selected; - boolean selectedInUse = batch.selectedInUse; - - int hashMultiSetResultCount = 0; - int allMatchCount = 0; - int equalKeySeriesCount = 0; + int matchCount = 0; int spillCount = 0; + int hashMultiSetResultCount = 0; - /* - * Single-Column String specific variables. - */ + MapJoinHashTableResult.MapJoinResult containsResult; + do { + // Use the next hash multi-set result entry. 
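A detail worth calling out in the loop that starts here: the operator keeps a fixed pool of pre-allocated hash-table result objects and only advances hashMultiSetResultCount when a series spills. A MATCH consumes its count immediately, so the same pooled result object can be reused for the next series. The sketch below demonstrates this reuse pattern with an invented Result type; it is not the Hive result class.

// Illustrative sketch of the result-object pooling above: the pool index only
// advances when a result must be kept (a spill); matched results are consumed
// immediately, so the same pooled object is reused for the next key series.
public class ResultPoolSketch {

  static final class Result {
    long count;          // filled in by the lookup
    boolean spilled;     // true when the key's partition was spilled to disk
  }

  private final Result[] pool;
  private int nextKept;

  ResultPoolSketch(int capacity) {
    pool = new Result[capacity];
    for (int i = 0; i < capacity; i++) {
      pool[i] = new Result();       // allocate once, reuse for every batch
    }
  }

  /** Returns the result object to use for the next lookup. */
  Result currentResult() {
    return pool[nextKept];
  }

  /** Call after processing a series; only spilled results are retained. */
  void seriesProcessed(Result result) {
    if (result.spilled) {
      nextKept++;                   // keep this object; hand out a fresh one next time
    }
    // On MATCH or NO_MATCH the object is reused as-is.
  }

  public static void main(String[] args) {
    ResultPoolSketch poolSketch = new ResultPoolSketch(4);
    Result r1 = poolSketch.currentResult();
    r1.spilled = false;             // a match: count consumed, object reused
    poolSketch.seriesProcessed(r1);
    Result r2 = poolSketch.currentResult();
    System.out.println("reused same object: " + (r1 == r2));  // true
  }
}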
+ hashMultiSetResult = hashMultiSetResults[hashMultiSetResultCount]; - int saveKeyBatchIndex = -1; + if (bytesKeySeries.currentKeyIsNull) { - // We optimize performance by only looking up the first key in a series of equal keys. - boolean haveSaveKey = false; - JoinUtil.JoinResult saveJoinResult = JoinUtil.JoinResult.NOMATCH; + // CONSIDER: Add support for NullSafe option. - // Logical loop over the rows in the batch since the batch may have selected in use. - for (int logical = 0; logical < inputLogicalSize; logical++) { - int batchIndex = (selectedInUse ? selected[logical] : logical); + containsResult = MapJoinHashTableResult.MapJoinResult.NO_MATCH; - /* - * Single-Column String get key. - */ + } else { - // Implicit -- use batchIndex. - boolean isNull = !joinColVector.noNulls && joinColVector.isNull[batchIndex]; + hashMultiSet.hashMultiSetContains( + bytesKeySeries.currentBytes, + bytesKeySeries.currentStart, + bytesKeySeries.currentLength, + bytesKeySeries.currentHashCode, + hashMultiSetResult); + containsResult = hashMultiSetResult.getMapJoinResult(); + + } /* - * Equal key series checking. + * Common inner join result processing. */ - if (isNull || !haveSaveKey || - StringExpr.equal(vector[saveKeyBatchIndex], start[saveKeyBatchIndex], length[saveKeyBatchIndex], - vector[batchIndex], start[batchIndex], length[batchIndex]) == false) { - - // New key. - - if (haveSaveKey) { - // Move on with our counts. - switch (saveJoinResult) { - case MATCH: - // We have extracted the count from the hash multi-set result, so we don't keep it. - equalKeySeriesCount++; - break; - case SPILL: - // We keep the hash multi-set result for its spill information. - hashMultiSetResultCount++; - break; - case NOMATCH: - break; - } - } - - if (isNull) { - saveJoinResult = JoinUtil.JoinResult.NOMATCH; - haveSaveKey = false; - } else { - // Regardless of our matching result, we keep that information to make multiple use - // of it for a possible series of equal keys. - haveSaveKey = true; - - /* - * Single-Column String specific save key. - */ - - saveKeyBatchIndex = batchIndex; - - /* - * Single-Column String specific lookup key. - */ - - byte[] keyBytes = vector[batchIndex]; - int keyStart = start[batchIndex]; - int keyLength = length[batchIndex]; - saveJoinResult = hashMultiSet.contains(keyBytes, keyStart, keyLength, hashMultiSetResults[hashMultiSetResultCount]); - } - - /* - * Common inner big-only join result processing. - */ - - switch (saveJoinResult) { - case MATCH: - equalKeySeriesValueCounts[equalKeySeriesCount] = hashMultiSetResults[hashMultiSetResultCount].count(); - equalKeySeriesAllMatchIndices[equalKeySeriesCount] = allMatchCount; - equalKeySeriesDuplicateCounts[equalKeySeriesCount] = 1; - allMatchs[allMatchCount++] = batchIndex; - // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " MATCH isSingleValue " + equalKeySeriesIsSingleValue[equalKeySeriesCount] + " currentKey " + currentKey); - break; - - case SPILL: - spills[spillCount] = batchIndex; - spillHashMapResultIndices[spillCount] = hashMultiSetResultCount; - spillCount++; - break; - - case NOMATCH: - // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " NOMATCH" + " currentKey " + currentKey); - break; - } - } else { - // Series of equal keys. 
- - switch (saveJoinResult) { - case MATCH: - equalKeySeriesDuplicateCounts[equalKeySeriesCount]++; - allMatchs[allMatchCount++] = batchIndex; - // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " MATCH duplicate"); - break; - - case SPILL: - spills[spillCount] = batchIndex; - spillHashMapResultIndices[spillCount] = hashMultiSetResultCount; - spillCount++; - break; - - case NOMATCH: - // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " NOMATCH duplicate"); - break; - } - } - } - - if (haveSaveKey) { - // Update our counts for the last key. - switch (saveJoinResult) { + switch (containsResult) { case MATCH: - // We have extracted the count from the hash multi-set result, so we don't keep it. - equalKeySeriesCount++; + // We have extracted the existence and count from the hash multi-set result, so we don't + // keep it. + matchLogicalIndices[matchCount] = bytesKeySeries.currentLogical; + matchDuplicateCounts[matchCount] = bytesKeySeries.currentDuplicateCount; + matchValueCounts[matchCount] = hashMultiSetResult.count(); + matchCount++; break; + case SPILL: - // We keep the hash multi-set result for its spill information. + spillLogicalIndices[spillCount] = bytesKeySeries.currentLogical; + spillDuplicateCounts[spillCount] = bytesKeySeries.currentDuplicateCount; + spillHashMultiSetResults[spillCount] = hashMultiSetResult; + spillCount++; hashMultiSetResultCount++; + spilledRowCounter += bytesKeySeries.currentDuplicateCount; break; - case NOMATCH: + + case NO_MATCH: break; + + default: + throw new RuntimeException("Unexpected contains result " + containsResult.name()); } - } + + if (!bytesKeySeries.next()) { + break; + } + } while (true); if (isLogDebugEnabled) { - LOG.debug(CLASS_NAME + - " allMatchs " + intArrayToRangesString(allMatchs, allMatchCount) + - " equalKeySeriesValueCounts " + longArrayToRangesString(equalKeySeriesValueCounts, equalKeySeriesCount) + - " equalKeySeriesAllMatchIndices " + intArrayToRangesString(equalKeySeriesAllMatchIndices, equalKeySeriesCount) + - " equalKeySeriesDuplicateCounts " + intArrayToRangesString(equalKeySeriesDuplicateCounts, equalKeySeriesCount) + - " spills " + intArrayToRangesString(spills, spillCount) + - " spillHashMapResultIndices " + intArrayToRangesString(spillHashMapResultIndices, spillCount) + - " hashMapResults " + Arrays.toString(Arrays.copyOfRange(hashMultiSetResults, 0, hashMultiSetResultCount))); + LOG.debug(CLASS_NAME + " batch #" + batchCounter + " batch info " + + " filteredSize " + filteredSize + + " matchCount " + matchCount + + " matchLogicalIndices " + intArrayToRangesString(matchLogicalIndices, matchCount) + + " matchDuplicateCounts " + Arrays.toString(Arrays.copyOfRange(matchDuplicateCounts, 0, matchCount)) + + " spillCount " + spillCount + + " spillLogicalIndices " + intArrayToRangesString(spillLogicalIndices, spillCount) + + " spillDuplicateCounts " + Arrays.toString(Arrays.copyOfRange(spillDuplicateCounts, 0, spillCount))); } - finishInnerBigOnly(batch, - allMatchCount, equalKeySeriesCount, spillCount, - (VectorMapJoinHashTableResult[]) hashMultiSetResults, hashMultiSetResultCount); + finishInnerBigOnly(batch, matchCount, spillCount); } if (batch.size > 0) { diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinInnerGenerateResultOperator.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinInnerGenerateResultOperator.java index 319a2b0..c7c0935 100644 --- 
ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinInnerGenerateResultOperator.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinInnerGenerateResultOperator.java @@ -23,17 +23,18 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.hive.ql.CompilationOpContext; -import org.apache.hadoop.hive.ql.exec.JoinUtil; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashMapResult; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashMultiSetResult; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableResult.MapJoinResult; import org.apache.hadoop.hive.ql.exec.vector.ColumnVector; import org.apache.hadoop.hive.ql.exec.vector.VectorizationContext; import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; -import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpression; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashMap; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashTableResult; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashMapResult; +import org.apache.hadoop.hive.ql.exec.vector.keyseries.VectorKeySeries; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.plan.OperatorDesc; +import com.google.common.base.Preconditions; + /** * This class has methods for generating vectorized join results for inner joins. * @@ -59,36 +60,25 @@ // An array of hash map results so we can do lookups on the whole batch before output result // generation. - protected transient VectorMapJoinHashMapResult hashMapResults[]; + protected transient MapJoinHashMapResult[] hashMapResults; - // Pre-allocated member for storing the (physical) batch index of matching row (single- or - // multi-small-table-valued) indexes during a process call. - protected transient int[] allMatchs; + // Pre-allocated member for storing the new logical batch index (within matchSelected), + // series count of rows, and hash map results that matched. + protected transient int[] matchLogicalIndices; + protected transient int[] matchDuplicateCounts; + protected transient boolean[] matchIsSingleValue; + protected transient MapJoinHashMapResult[] matchHashMapResults; - /* - * Pre-allocated members for storing information equal key series for small-table matches. - * - * ~HashMapResultIndices - * Index into the hashMapResults array for the match. - * ~AllMatchIndices - * (Logical) indices into allMatchs to the first row of a match of a - * possible series of duplicate keys. - * ~IsSingleValue - * Whether there is 1 or multiple small table values. - * ~DuplicateCounts - * The duplicate count for each matched key. - * - */ - protected transient int[] equalKeySeriesHashMapResultIndices; - protected transient int[] equalKeySeriesAllMatchIndices; - protected transient boolean[] equalKeySeriesIsSingleValue; - protected transient int[] equalKeySeriesDuplicateCounts; + // Pre-allocated member for storing the logical batch index and series count of rows that spilled. + protected transient int[] spillLogicalIndices; + protected transient int[] spillDuplicateCounts; - // Pre-allocated member for storing the (physical) batch index of rows that need to be spilled. - protected transient int[] spills; + // Pre-allocated member for storing the reference to the hash map results of rows that spilled. 
+ protected transient MapJoinHashMapResult[] spillHashMapResults; - // Pre-allocated member for storing index into the hashMapResults for each spilled row. - protected transient int[] spillHashMapResultIndices; + // Pre-allocated member for storing the batch indices of the matched rows. + protected transient int[] matchSelected; + protected transient int[] resultSelected; /** Kryo ctor. */ protected VectorMapJoinInnerGenerateResultOperator() { @@ -111,22 +101,23 @@ protected void commonSetup(VectorizedRowBatch batch) throws HiveException { super.commonSetup(batch); // Inner join specific. - VectorMapJoinHashMap baseHashMap = (VectorMapJoinHashMap) vectorMapJoinHashTable; - - hashMapResults = new VectorMapJoinHashMapResult[batch.DEFAULT_SIZE]; + hashMapResults = new MapJoinHashMapResult[batch.DEFAULT_SIZE]; for (int i = 0; i < hashMapResults.length; i++) { - hashMapResults[i] = baseHashMap.createHashMapResult(); + hashMapResults[i] = vectormapJoinHashTableFactory.createHashMapResult(); } - allMatchs = new int[batch.DEFAULT_SIZE]; + matchLogicalIndices = new int[batch.DEFAULT_SIZE]; + matchDuplicateCounts = new int[batch.DEFAULT_SIZE]; + matchIsSingleValue = new boolean[batch.DEFAULT_SIZE]; + matchHashMapResults = new MapJoinHashMapResult[batch.DEFAULT_SIZE]; + + spillLogicalIndices = new int[batch.DEFAULT_SIZE]; + spillDuplicateCounts = new int[batch.DEFAULT_SIZE]; + spillHashMapResults = new MapJoinHashMapResult[batch.DEFAULT_SIZE]; - equalKeySeriesHashMapResultIndices = new int[batch.DEFAULT_SIZE]; - equalKeySeriesAllMatchIndices = new int[batch.DEFAULT_SIZE]; - equalKeySeriesIsSingleValue = new boolean[batch.DEFAULT_SIZE]; - equalKeySeriesDuplicateCounts = new int[batch.DEFAULT_SIZE]; + matchSelected = new int[batch.DEFAULT_SIZE]; + resultSelected = new int[batch.DEFAULT_SIZE]; - spills = new int[batch.DEFAULT_SIZE]; - spillHashMapResultIndices = new int[batch.DEFAULT_SIZE]; } /* @@ -153,82 +144,123 @@ protected void innerPerBatchSetup(VectorizedRowBatch batch) { * @param batch * The big table batch with any matching and any non matching rows both as * selected in use. - * @param allMatchCount - * Number of matches in allMatchs. - * @param equalKeySeriesCount - * Number of single value matches. - * @param spillCount - * Number of spills in spills. - * @param hashMapResultCount - * Number of entries in hashMapResults. + * @param matchSeriesCount + * Number of matches in the match* arrays. + * @param matchSelectedCount + * The selected count in matchSelected. + * @param spillSeriesCount + * Number of spills in spill* arrays. */ - protected void finishInner(VectorizedRowBatch batch, - int allMatchCount, int equalKeySeriesCount, int spillCount, int hashMapResultCount) - throws HiveException, IOException { + protected void finishInner(VectorizedRowBatch batch, int matchSeriesCount, int spillSeriesCount) + throws HiveException, IOException { + + final int size = batch.size; + boolean selectedInUse = batch.selectedInUse; + int[] selected = batch.selected; + + // Dump out the spill rows now. + if (spillSeriesCount > 0) { - int numSel = 0; + spillHashMapBatch(batch, spillLogicalIndices, spillDuplicateCounts, spillHashMapResults, + spillSeriesCount, selectedInUse, selected, size); + + if (spillSeriesCount == size) { + batch.size = 0; + batch.selectedInUse = false; + return; + } + } /* * Optimize by running value expressions only over the matched rows. 
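The new finishInner path in this hunk works in terms of (logical start, duplicate count) series, so before value expressions can run it has to expand those series back into physical batch indices through the batch's selected[] indirection. Below is a hedged sketch of that expansion step, corresponding in spirit to the flattenLogicalSeriesIntoSelected helper invoked further down; the sketch's names and exact behavior are illustrative, not the Hive implementation.

// Illustrative sketch: expand (logicalStart, duplicateCount) series back into
// physical batch indices, honoring the batch's selected[] indirection, so that
// later work (e.g. value expressions) runs only over matched rows.
public class FlattenSeriesSketch {

  static int flatten(boolean selectedInUse, int[] selected, int[] seriesLogicalStarts,
      int[] seriesDuplicateCounts, int seriesCount, int[] outSelected) {
    int out = 0;
    for (int s = 0; s < seriesCount; s++) {
      int logical = seriesLogicalStarts[s];
      for (int d = 0; d < seriesDuplicateCounts[s]; d++) {
        int physical = selectedInUse ? selected[logical + d] : logical + d;
        outSelected[out++] = physical;
      }
    }
    return out;   // number of matched rows, analogous to matchSelectedCount
  }

  public static void main(String[] args) {
    // Batch of 6 rows with selected-in-use: physical rows 2,3,5,7,8,9.
    int[] selected = {2, 3, 5, 7, 8, 9};
    // Two matched series: logical rows [1..2] and [4..5].
    int[] starts = {1, 4};
    int[] counts = {2, 2};
    int[] out = new int[6];
    int n = flatten(true, selected, starts, counts, 2, out);
    for (int i = 0; i < n; i++) {
      System.out.print(out[i] + " ");   // 3 5 8 9
    }
    System.out.println();
  }
}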
*/ - if (allMatchCount > 0 && bigTableValueExpressions != null) { - performValueExpressions(batch, allMatchs, allMatchCount); - } + int matchSelectedCount = 0; + if (matchSeriesCount > 0 && bigTableValueExpressions != null) { + + // Create matchSelected by adding the batch indices for the match logical index ranges from + // the input batch range. + matchSelectedCount = + flattenLogicalSeriesIntoSelected( + selectedInUse, selected, size, + matchLogicalIndices, matchDuplicateCounts, matchSeriesCount, + matchSelected); - for (int i = 0; i < equalKeySeriesCount; i++) { - int hashMapResultIndex = equalKeySeriesHashMapResultIndices[i]; - VectorMapJoinHashMapResult hashMapResult = hashMapResults[hashMapResultIndex]; - int allMatchesIndex = equalKeySeriesAllMatchIndices[i]; - boolean isSingleValue = equalKeySeriesIsSingleValue[i]; - int duplicateCount = equalKeySeriesDuplicateCounts[i]; + performValueExpressions(batch, matchSelected, matchSelectedCount); - if (isSingleValue) { - numSel = generateHashMapResultSingleValue( - batch, hashMapResult, allMatchs, allMatchesIndex, duplicateCount, numSel); + } + + // Output matches + for (int i = 0; i < matchSeriesCount; i++) { + MapJoinHashMapResult hashMapResult = matchHashMapResults[i]; + int logical = matchLogicalIndices[i]; + int duplicateCount = matchDuplicateCounts[i]; + + if (matchIsSingleValue[i]) { + generateHashMapResultSingleValue( + batch, + hashMapResult, + logical, + duplicateCount); } else { generateHashMapResultMultiValue( - batch, hashMapResult, allMatchs, allMatchesIndex, duplicateCount); + batch, + hashMapResult, + logical, + duplicateCount); } } - if (spillCount > 0) { - spillHashMapBatch(batch, (VectorMapJoinHashTableResult[]) hashMapResults, - spills, spillHashMapResultIndices, spillCount); - } + // Create selected by not including match logical indices for multi-value small table results + // from input batch logical range, and then putting the batch indices in selected. + + int numSel = + makeMatchSelectedWithoutMultiValues( + selectedInUse, selected, size, + matchLogicalIndices, matchDuplicateCounts, matchIsSingleValue, matchSeriesCount, + resultSelected); - batch.size = numSel; batch.selectedInUse = true; + batch.size = numSel; + if (numSel > 0) { + System.arraycopy(resultSelected, 0, selected, 0, numSel); + } } - protected void finishInnerRepeated(VectorizedRowBatch batch, JoinUtil.JoinResult joinResult, - VectorMapJoinHashTableResult hashMapResult) throws HiveException, IOException { - - int numSel = 0; - - switch (joinResult) { - case MATCH: - - if (bigTableValueExpressions != null) { - // Run our value expressions over whole batch. - for(VectorExpression ve: bigTableValueExpressions) { - ve.evaluate(batch); - } - } - - // Generate special repeated case. - generateHashMapResultRepeatedAll(batch, hashMapResults[0]); - break; - - case SPILL: - // Whole batch is spilled. - spillBatchRepeated(batch, (VectorMapJoinHashTableResult) hashMapResults[0]); - batch.size = 0; - break; - - case NOMATCH: - // No match for entire batch. + /** + * Generate the inner join output results for one vectorized row batch that + * has a repeated key. + * + * @param batch + * The big table batch with any matching and any non matching rows both as + * selected in use. 
+ * @param keySeries + * @param hashMapResult + * @throws IOException + * @throws HiveException + */ + protected void finishInnerRepeated(VectorizedRowBatch batch, VectorKeySeries keySeries, + MapJoinHashMapResult hashMapResult) throws HiveException, IOException { + if (hashMapResult.getMapJoinResult() == MapJoinResult.SPILL) { + spillRepeated(batch, keySeries, hashMapResult); + return; + } + Preconditions.checkState(hashMapResult.getMapJoinResult() == MapJoinResult.MATCH); + + // The whole batch is matched. + if (hashMapResult.cappedCount() == 1) { + // Single value -- use batch as output. + generateHashMapResultSingleValue( + batch, + hashMapResult, + keySeries.currentLogical, + keySeries.currentDuplicateCount); + } else { + // Use overflow for multiple values. + generateHashMapResultMultiValue(batch, + hashMapResult, + keySeries.currentLogical, + keySeries.currentDuplicateCount); batch.size = 0; - break; } } } diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinInnerLongOperator.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinInnerLongOperator.java index 804d69c..a2d3ee4 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinInnerLongOperator.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinInnerLongOperator.java @@ -24,18 +24,19 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.hive.ql.CompilationOpContext; -import org.apache.hadoop.hive.ql.exec.JoinUtil; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashMapResult; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableFind; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableResult; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableResult.MapJoinResult; import org.apache.hadoop.hive.ql.exec.vector.VectorizationContext; import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpression; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.plan.OperatorDesc; -// Single-Column Long hash table import. -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinLongHashMap; - // Single-Column Long specific imports. -import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.keyseries.VectorKeySeriesLong; +import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; /* * Specialized class for doing a vectorized map join that is an inner join on a Single-Column Long @@ -44,8 +45,17 @@ public class VectorMapJoinInnerLongOperator extends VectorMapJoinInnerGenerateResultOperator { private static final long serialVersionUID = 1L; - private static final Logger LOG = LoggerFactory.getLogger(VectorMapJoinInnerLongOperator.class.getName()); + + //------------------------------------------------------------------------------------------------ + private static final String CLASS_NAME = VectorMapJoinInnerLongOperator.class.getName(); + private static final Logger LOG = LoggerFactory.getLogger(CLASS_NAME); + + protected String getLoggingPrefix() { + return super.getLoggingPrefix(CLASS_NAME); + } + + //------------------------------------------------------------------------------------------------ // (none) @@ -53,8 +63,10 @@ // transient. 
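The finishInnerRepeated method above splits the repeated-key MATCH case on whether the small table holds one value or several for that key: with a single value the batch can be forwarded as-is (every row gets the same projected value), while multiple values force row duplication through an overflow path. The following sketch only illustrates the output-cardinality logic, under the assumption of a list of small-table values per key; it is not the Hive overflow-batch machinery.

import java.util.Arrays;
import java.util.Collections;
import java.util.List;

// Illustrative sketch: output cardinality for a batch whose key repeats.
// One small-table value lets the batch be forwarded as-is; several values
// require duplicating every big-table row once per value (the overflow path).
public class RepeatedKeyResultSketch {

  static long outputRowCount(int batchSize, List<String> smallTableValues) {
    if (smallTableValues.isEmpty()) {
      return 0;                                  // NO_MATCH: whole batch filtered
    }
    if (smallTableValues.size() == 1) {
      return batchSize;                          // single value: forward the batch as-is
    }
    return (long) batchSize * smallTableValues.size();  // multi value: duplicate rows
  }

  public static void main(String[] args) {
    System.out.println(outputRowCount(1024, Arrays.asList("v1")));          // 1024
    System.out.println(outputRowCount(1024, Arrays.asList("v1", "v2")));    // 2048
    System.out.println(outputRowCount(1024, Collections.<String>emptyList()));  // 0
  }
}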
//--------------------------------------------------------------------------- - // The hash map for this specialized class. - private transient VectorMapJoinLongHashMap hashMap; + // CONSIDER using final class VectorMapJoinFastLongHashMap object for better performance by + // avoiding using an interface (MapJoinHashTableFind). Would require Hybrid versions of + // operators that use final HybridHashTableContainer object.. + private transient MapJoinHashTableFind hashMap; //--------------------------------------------------------------------------- // Single-Column Long specific members. @@ -67,6 +79,10 @@ // The column number for this one column join specialization. private transient int singleJoinColumn; + private transient PrimitiveTypeInfo singleJoinColumnPrimitiveTypeInfo; + + // The object that determines equal key series. It is a final class for good performance. + private transient VectorKeySeriesLong longKeySeries; //--------------------------------------------------------------------------- // Pass-thru constructors. @@ -107,6 +123,10 @@ public void process(Object row, int tag) throws HiveException { */ singleJoinColumn = bigTableKeyColumnMap[0]; + singleJoinColumnPrimitiveTypeInfo = (PrimitiveTypeInfo) bigTableKeyTypeInfos[0]; + + longKeySeries = + new VectorKeySeriesLong(singleJoinColumn, singleJoinColumnPrimitiveTypeInfo); needCommonSetup = false; } @@ -119,7 +139,7 @@ public void process(Object row, int tag) throws HiveException { * Get our Single-Column Long hash map information for this specialized class. */ - hashMap = (VectorMapJoinLongHashMap) vectorMapJoinHashTable; + hashMap = vectorMapJoinHashTableFind; useMinMax = hashMap.useMinMax(); if (useMinMax) { min = hashMap.min(); @@ -130,6 +150,7 @@ public void process(Object row, int tag) throws HiveException { } batchCounter++; + inputRowCounter += batch.size; // Do the per-batch setup for an inner join. @@ -140,9 +161,9 @@ public void process(Object row, int tag) throws HiveException { ve.evaluate(batch); } - final int inputLogicalSize = batch.size; + final int filteredSize = batch.size; - if (inputLogicalSize == 0) { + if (filteredSize == 0) { if (isLogDebugEnabled) { LOG.debug(CLASS_NAME + " batch #" + batchCounter + " empty"); } @@ -156,229 +177,114 @@ public void process(Object row, int tag) throws HiveException { } } - /* - * Single-Column Long specific declarations. - */ - - // The one join column for this specialized class. - LongColumnVector joinColVector = (LongColumnVector) batch.cols[singleJoinColumn]; - long[] vector = joinColVector.vector; + longKeySeries.processBatch(batch); - /* - * Single-Column Long check for repeating. - */ + MapJoinHashMapResult hashMapResult; + if (longKeySeries.keyCount == 1) { - // Check single column for repeating. - boolean allKeyInputColumnsRepeating = joinColVector.isRepeating; + // Effectively, one repeating key. - if (allKeyInputColumnsRepeating) { + if (longKeySeries.currentKeyIsNull) { - /* - * Repeating. - */ - - // All key input columns are repeating. Generate key once. Lookup once. - // Since the key is repeated, we must use entry 0 regardless of selectedInUse. - - /* - * Single-Column Long specific repeated lookup. - */ + // CONSIDER: Add support for NullSafe option. - JoinUtil.JoinResult joinResult; - if (!joinColVector.noNulls && joinColVector.isNull[0]) { - joinResult = JoinUtil.JoinResult.NOMATCH; + batch.size = 0; } else { - long key = vector[0]; - if (useMinMax && (key < min || key > max)) { - // Out of range for whole batch. 
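The useMinMax/min/max calls in this hunk rely on the long-keyed hash table remembering the smallest and largest key it saw while the small table was loaded, so the probe side can reject out-of-range keys without touching the table. A small sketch of maintaining and exposing those bounds follows (a hypothetical class, not the Hive hash map).

import java.util.HashMap;
import java.util.Map;

// Illustrative sketch: track min/max while loading a long-keyed table so the
// probe side can skip lookups for keys that cannot possibly match.
public class MinMaxPruningSketch {

  private final Map<Long, String> table = new HashMap<>();
  private long min = Long.MAX_VALUE;
  private long max = Long.MIN_VALUE;

  void put(long key, String value) {
    table.put(key, value);
    if (key < min) {
      min = key;
    }
    if (key > max) {
      max = key;
    }
  }

  boolean useMinMax() {
    return !table.isEmpty();
  }

  long min() { return min; }

  long max() { return max; }

  String lookup(long key) {
    if (useMinMax() && (key < min || key > max)) {
      return null;                 // pruned: no hash computation, no probe
    }
    return table.get(key);
  }

  public static void main(String[] args) {
    MinMaxPruningSketch sketch = new MinMaxPruningSketch();
    sketch.put(100L, "a");
    sketch.put(200L, "b");
    System.out.println(sketch.lookup(150L));  // null (in range but absent)
    System.out.println(sketch.lookup(5L));    // null (pruned by min/max)
    System.out.println(sketch.lookup(200L));  // b
  }
}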
- joinResult = JoinUtil.JoinResult.NOMATCH; + long repeatedKey = longKeySeries.getCurrentKey(); + if (useMinMax && (repeatedKey < min || repeatedKey > max)) { + batch.size = 0; } else { - joinResult = hashMap.lookup(key, hashMapResults[0]); + hashMapResult = hashMapResults[0]; + hashMap.hashMapLookup( + repeatedKey, + longKeySeries.currentHashCode, + hashMapResult); + if (hashMapResult.getMapJoinResult() != MapJoinResult.NO_MATCH) { + finishInnerRepeated(batch, longKeySeries, hashMapResult); + } else { + batch.size = 0; + } } } - - /* - * Common repeated join result processing. - */ - - if (isLogDebugEnabled) { - LOG.debug(CLASS_NAME + " batch #" + batchCounter + " repeated joinResult " + joinResult.name()); - } - finishInnerRepeated(batch, joinResult, hashMapResults[0]); } else { - - /* - * NOT Repeating. - */ - - if (isLogDebugEnabled) { - LOG.debug(CLASS_NAME + " batch #" + batchCounter + " non-repeated"); - } - - // We remember any matching rows in matchs / matchSize. At the end of the loop, - // selected / batch.size will represent both matching and non-matching rows for outer join. - // Only deferred rows will have been removed from selected. - int selected[] = batch.selected; - boolean selectedInUse = batch.selectedInUse; - + int matchSeriesCount = 0; + int spillSeriesCount = 0; int hashMapResultCount = 0; - int allMatchCount = 0; - int equalKeySeriesCount = 0; - int spillCount = 0; - /* - * Single-Column Long specific variables. - */ - - long saveKey = 0; + MapJoinHashTableResult.MapJoinResult lookupResult; + long key; + do { + // Use the next hash map result entry. + hashMapResult = hashMapResults[hashMapResultCount]; - // We optimize performance by only looking up the first key in a series of equal keys. - boolean haveSaveKey = false; - JoinUtil.JoinResult saveJoinResult = JoinUtil.JoinResult.NOMATCH; + if (longKeySeries.currentKeyIsNull) { - // Logical loop over the rows in the batch since the batch may have selected in use. - for (int logical = 0; logical < inputLogicalSize; logical++) { - int batchIndex = (selectedInUse ? selected[logical] : logical); + // CONSIDER: Add support for NullSafe option. - /* - * Single-Column Long get key. - */ + lookupResult = MapJoinHashTableResult.MapJoinResult.NO_MATCH; - long currentKey; - boolean isNull; - if (!joinColVector.noNulls && joinColVector.isNull[batchIndex]) { - currentKey = 0; - isNull = true; } else { - currentKey = vector[batchIndex]; - isNull = false; - } - - /* - * Equal key series checking. - */ - - if (isNull || !haveSaveKey || currentKey != saveKey) { - - // New key. - - if (haveSaveKey) { - // Move on with our counts. - switch (saveJoinResult) { - case MATCH: - hashMapResultCount++; - equalKeySeriesCount++; - break; - case SPILL: - hashMapResultCount++; - break; - case NOMATCH: - break; - } - } - if (isNull) { - saveJoinResult = JoinUtil.JoinResult.NOMATCH; - haveSaveKey = false; + key = longKeySeries.getCurrentKey(); + if (useMinMax && (key < min || key > max)) { + lookupResult = MapJoinHashTableResult.MapJoinResult.NO_MATCH; } else { - // Regardless of our matching result, we keep that information to make multiple use - // of it for a possible series of equal keys. - haveSaveKey = true; - - /* - * Single-Column Long specific save key. - */ - - saveKey = currentKey; - - /* - * Single-Column Long specific lookup key. - */ - - if (useMinMax && (currentKey < min || currentKey > max)) { - // Key out of range for whole hash table. 
- saveJoinResult = JoinUtil.JoinResult.NOMATCH; - } else { - saveJoinResult = hashMap.lookup(currentKey, hashMapResults[hashMapResultCount]); - } + hashMap.hashMapLookup( + key, + longKeySeries.currentHashCode, + hashMapResult); + lookupResult = hashMapResult.getMapJoinResult(); } - /* - * Common inner join result processing. - */ - - switch (saveJoinResult) { - case MATCH: - equalKeySeriesHashMapResultIndices[equalKeySeriesCount] = hashMapResultCount; - equalKeySeriesAllMatchIndices[equalKeySeriesCount] = allMatchCount; - equalKeySeriesIsSingleValue[equalKeySeriesCount] = hashMapResults[hashMapResultCount].isSingleRow(); - equalKeySeriesDuplicateCounts[equalKeySeriesCount] = 1; - allMatchs[allMatchCount++] = batchIndex; - // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " MATCH isSingleValue " + equalKeySeriesIsSingleValue[equalKeySeriesCount] + " currentKey " + currentKey); - break; - - case SPILL: - spills[spillCount] = batchIndex; - spillHashMapResultIndices[spillCount] = hashMapResultCount; - spillCount++; - break; - - case NOMATCH: - // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " NOMATCH" + " currentKey " + currentKey); - break; - } - } else { - // Series of equal keys. - - switch (saveJoinResult) { - case MATCH: - equalKeySeriesDuplicateCounts[equalKeySeriesCount]++; - allMatchs[allMatchCount++] = batchIndex; - // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " MATCH duplicate"); - break; - - case SPILL: - spills[spillCount] = batchIndex; - spillHashMapResultIndices[spillCount] = hashMapResultCount; - spillCount++; - break; - - case NOMATCH: - // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " NOMATCH duplicate"); - break; - } } - } - if (haveSaveKey) { - // Update our counts for the last key. - switch (saveJoinResult) { + /* + * Common inner join result processing. 
+ */ + + switch (lookupResult) { case MATCH: + matchLogicalIndices[matchSeriesCount] = longKeySeries.currentLogical; + matchDuplicateCounts[matchSeriesCount] = longKeySeries.currentDuplicateCount; + matchIsSingleValue[matchSeriesCount] = hashMapResult.isSingleRow(); + matchHashMapResults[matchSeriesCount] = hashMapResult; + matchSeriesCount++; hashMapResultCount++; - equalKeySeriesCount++; break; + case SPILL: + spillLogicalIndices[spillSeriesCount] = longKeySeries.currentLogical; + spillDuplicateCounts[spillSeriesCount] = longKeySeries.currentDuplicateCount; + spillHashMapResults[spillSeriesCount] = hashMapResult; + spillSeriesCount++; hashMapResultCount++; + spilledRowCounter += longKeySeries.currentDuplicateCount; break; - case NOMATCH: + + case NO_MATCH: break; + + default: + throw new RuntimeException("Unexpected lookup result " + lookupResult.name()); } - } + + if (!longKeySeries.next()) { + break; + } + } while (true); if (isLogDebugEnabled) { - LOG.debug(CLASS_NAME + - " allMatchs " + intArrayToRangesString(allMatchs,allMatchCount) + - " equalKeySeriesHashMapResultIndices " + intArrayToRangesString(equalKeySeriesHashMapResultIndices, equalKeySeriesCount) + - " equalKeySeriesAllMatchIndices " + intArrayToRangesString(equalKeySeriesAllMatchIndices, equalKeySeriesCount) + - " equalKeySeriesIsSingleValue " + Arrays.toString(Arrays.copyOfRange(equalKeySeriesIsSingleValue, 0, equalKeySeriesCount)) + - " equalKeySeriesDuplicateCounts " + Arrays.toString(Arrays.copyOfRange(equalKeySeriesDuplicateCounts, 0, equalKeySeriesCount)) + - " spills " + intArrayToRangesString(spills, spillCount) + - " spillHashMapResultIndices " + intArrayToRangesString(spillHashMapResultIndices, spillCount) + - " hashMapResults " + Arrays.toString(Arrays.copyOfRange(hashMapResults, 0, hashMapResultCount))); + LOG.debug(CLASS_NAME + " batch #" + batchCounter + " batch info " + + " filteredSize " + filteredSize + + " matchSeriesCount " + matchSeriesCount + + " matchLogicalIndices " + intArrayToRangesString(matchLogicalIndices, matchSeriesCount) + + " matchDuplicateCounts " + Arrays.toString(Arrays.copyOfRange(matchDuplicateCounts, 0, matchSeriesCount)) + + " spillSeriesCount " + spillSeriesCount + + " spillLogicalIndices " + intArrayToRangesString(spillLogicalIndices, spillSeriesCount) + + " spillDuplicateCounts " + Arrays.toString(Arrays.copyOfRange(spillDuplicateCounts, 0, spillSeriesCount))); } - finishInner(batch, - allMatchCount, equalKeySeriesCount, spillCount, hashMapResultCount); + finishInner(batch, matchSeriesCount, spillSeriesCount); } if (batch.size > 0) { diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinInnerMultiKeyOperator.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinInnerMultiKeyOperator.java index fcfa0bd..30ba29c 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinInnerMultiKeyOperator.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinInnerMultiKeyOperator.java @@ -24,21 +24,23 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.hive.ql.CompilationOpContext; -import org.apache.hadoop.hive.ql.exec.JoinUtil; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashMapResult; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableFind; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableResult; +import 
org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableResult.MapJoinResult; import org.apache.hadoop.hive.ql.exec.vector.VectorizationContext; import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpression; +import org.apache.hadoop.hive.ql.exec.vector.keyseries.fast.VectorKeySeriesMultiFast; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.plan.OperatorDesc; -// Multi-Key hash table import. -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinBytesHashMap; // Multi-Key specific imports. -import org.apache.hadoop.hive.ql.exec.vector.VectorSerializeRow; -import org.apache.hadoop.hive.serde2.ByteStream.Output; import org.apache.hadoop.hive.serde2.binarysortable.fast.BinarySortableSerializeWrite; +import com.google.common.base.Preconditions; + /* * Specialized class for doing a vectorized map join that is an inner join on a Multi-Key * using a hash map. @@ -46,8 +48,17 @@ public class VectorMapJoinInnerMultiKeyOperator extends VectorMapJoinInnerGenerateResultOperator { private static final long serialVersionUID = 1L; - private static final Logger LOG = LoggerFactory.getLogger(VectorMapJoinInnerMultiKeyOperator.class.getName()); + + //------------------------------------------------------------------------------------------------ + private static final String CLASS_NAME = VectorMapJoinInnerMultiKeyOperator.class.getName(); + private static final Logger LOG = LoggerFactory.getLogger(VectorMapJoinInnerMultiKeyOperator.class.getName()); + + protected String getLoggingPrefix() { + return super.getLoggingPrefix(CLASS_NAME); + } + + //------------------------------------------------------------------------------------------------ // (none) @@ -55,22 +66,20 @@ // transient. //--------------------------------------------------------------------------- - // The hash map for this specialized class. - private transient VectorMapJoinBytesHashMap hashMap; + // CONSIDER using final class VectorMapJoinFastMultiKeyHashMap object for better performance by + // avoiding using an interface (MapJoinHashTableFind). Would require Hybrid versions of + // operators that use final HybridHashTableContainer object.. + private transient MapJoinHashTableFind hashMap; //--------------------------------------------------------------------------- // Multi-Key specific members. // - // Object that can take a set of columns in row in a vectorized row batch and serialized it. - // Known to not have any nulls. - private transient VectorSerializeRow keyVectorSerializeWrite; - - // The BinarySortable serialization of the current key. - private transient Output currentKeyOutput; + // Binary sortable multi-key serializer. + private transient BinarySortableSerializeWrite binarySortableSerializeWrite; - // The BinarySortable serialization of the saved key for a possible series of equal keys. - private transient Output saveKeyOutput; + // The object that determines equal key series. It is a final class for good performance. + private transient VectorKeySeriesMultiFast serializedMultiKeySeries; //--------------------------------------------------------------------------- // Pass-thru constructors. @@ -110,12 +119,16 @@ public void process(Object row, int tag) throws HiveException { * Initialize Multi-Key members for this specialized class. 
*/ - keyVectorSerializeWrite = new VectorSerializeRow( - new BinarySortableSerializeWrite(bigTableKeyColumnMap.length)); - keyVectorSerializeWrite.init(bigTableKeyTypeNames, bigTableKeyColumnMap); + binarySortableSerializeWrite = + new BinarySortableSerializeWrite(bigTableKeyColumnMap.length); - currentKeyOutput = new Output(); - saveKeyOutput = new Output(); + // For Multi-Key the Fast hash table computes the serialized key's hash code, + // so we don't use VectorKeySeriesMulti's hash code. + serializedMultiKeySeries = + new VectorKeySeriesMultiFast( + binarySortableSerializeWrite); + Preconditions.checkState(bigTableKeyTypeInfos.length > 0); + serializedMultiKeySeries.init(bigTableKeyTypeInfos, bigTableKeyColumnMap); needCommonSetup = false; } @@ -128,12 +141,13 @@ public void process(Object row, int tag) throws HiveException { * Get our Multi-Key hash map information for this specialized class. */ - hashMap = (VectorMapJoinBytesHashMap) vectorMapJoinHashTable; + hashMap = vectorMapJoinHashTableFind; needHashTableSetup = false; } batchCounter++; + inputRowCounter += batch.size; // Do the per-batch setup for an inner join. @@ -144,9 +158,9 @@ public void process(Object row, int tag) throws HiveException { ve.evaluate(batch); } - final int inputLogicalSize = batch.size; + final int filteredSize = batch.size; - if (inputLogicalSize == 0) { + if (filteredSize == 0) { if (isLogDebugEnabled) { LOG.debug(CLASS_NAME + " batch #" + batchCounter + " empty"); } @@ -160,231 +174,110 @@ public void process(Object row, int tag) throws HiveException { } } - /* - * Multi-Key specific declarations. - */ + serializedMultiKeySeries.processBatch(batch); - // None. + MapJoinHashMapResult hashMapResult; + if (serializedMultiKeySeries.keyCount == 1) { - /* - * Multi-Key check for repeating. - */ + // Effectively, one repeating key. - // If all BigTable input columns to key expressions are isRepeating, then - // calculate key once; lookup once. - boolean allKeyInputColumnsRepeating; - if (bigTableKeyColumnMap.length == 0) { - allKeyInputColumnsRepeating = false; - } else { - allKeyInputColumnsRepeating = true; - for (int i = 0; i < bigTableKeyColumnMap.length; i++) { - if (!batch.cols[bigTableKeyColumnMap[i]].isRepeating) { - allKeyInputColumnsRepeating = false; - break; - } - } - } + // NOTE: Any null column in the key columns is a non-match. + if (serializedMultiKeySeries.currentKeyIsNull) { - if (allKeyInputColumnsRepeating) { + // CONSIDER: Add support for NullSafe option. - /* - * Repeating. - */ - - // All key input columns are repeating. Generate key once. Lookup once. - // Since the key is repeated, we must use entry 0 regardless of selectedInUse. - - /* - * Multi-Key specific repeated lookup. - */ - - keyVectorSerializeWrite.setOutput(currentKeyOutput); - keyVectorSerializeWrite.serializeWrite(batch, 0); - JoinUtil.JoinResult joinResult; - if (keyVectorSerializeWrite.getHasAnyNulls()) { - joinResult = JoinUtil.JoinResult.NOMATCH; + batch.size = 0; } else { - byte[] keyBytes = currentKeyOutput.getData(); - int keyLength = currentKeyOutput.getLength(); - joinResult = hashMap.lookup(keyBytes, 0, keyLength, hashMapResults[0]); - } - - /* - * Common repeated join result processing. 
- */ - - if (isLogDebugEnabled) { - LOG.debug(CLASS_NAME + " batch #" + batchCounter + " repeated joinResult " + joinResult.name()); + hashMapResult = hashMapResults[0]; + hashMap.hashMapLookup( + serializedMultiKeySeries.serializedBytes, + serializedMultiKeySeries.serializedStart, + serializedMultiKeySeries.serializedLength, + serializedMultiKeySeries.currentHashCode, + hashMapResult); + if (hashMapResult.getMapJoinResult() != MapJoinResult.NO_MATCH) { + finishInnerRepeated(batch, serializedMultiKeySeries, hashMapResult); + } else { + batch.size = 0; + } } - finishInnerRepeated(batch, joinResult, hashMapResults[0]); } else { - /* - * NOT Repeating. - */ - - if (isLogDebugEnabled) { - LOG.debug(CLASS_NAME + " batch #" + batchCounter + " non-repeated"); - } - - // We remember any matching rows in matchs / matchSize. At the end of the loop, - // selected / batch.size will represent both matching and non-matching rows for outer join. - // Only deferred rows will have been removed from selected. - int selected[] = batch.selected; - boolean selectedInUse = batch.selectedInUse; - + int matchSeriesCount = 0; + int spillSeriesCount = 0; int hashMapResultCount = 0; - int allMatchCount = 0; - int equalKeySeriesCount = 0; - int spillCount = 0; - /* - * Multi-Key specific variables. - */ + MapJoinHashTableResult.MapJoinResult lookupResult; + do { + // Use the next hash map result entry. + hashMapResult = hashMapResults[hashMapResultCount]; - Output temp; + // NOTE: Any null column in the key columns is a non-match. + if (serializedMultiKeySeries.currentKeyIsNull) { - // We optimize performance by only looking up the first key in a series of equal keys. - boolean haveSaveKey = false; - JoinUtil.JoinResult saveJoinResult = JoinUtil.JoinResult.NOMATCH; + // CONSIDER: Add support for NullSafe option. - // Logical loop over the rows in the batch since the batch may have selected in use. - for (int logical = 0; logical < inputLogicalSize; logical++) { - int batchIndex = (selectedInUse ? selected[logical] : logical); + lookupResult = MapJoinHashTableResult.MapJoinResult.NO_MATCH; - /* - * Multi-Key get key. - */ + } else { - // Generate binary sortable key for current row in vectorized row batch. - keyVectorSerializeWrite.setOutput(currentKeyOutput); - keyVectorSerializeWrite.serializeWrite(batch, batchIndex); - boolean isAnyNull = keyVectorSerializeWrite.getHasAnyNulls(); + hashMap.hashMapLookup( + serializedMultiKeySeries.serializedBytes, + serializedMultiKeySeries.serializedStart, + serializedMultiKeySeries.serializedLength, + serializedMultiKeySeries.currentHashCode, + hashMapResult); + lookupResult = hashMapResult.getMapJoinResult(); + + } /* - * Equal key series checking. + * Common inner join result processing. */ - if (isAnyNull || !haveSaveKey || !saveKeyOutput.arraysEquals(currentKeyOutput)) { - - // New key. - - if (haveSaveKey) { - // Move on with our counts. - switch (saveJoinResult) { - case MATCH: - hashMapResultCount++; - equalKeySeriesCount++; - break; - case SPILL: - hashMapResultCount++; - break; - case NOMATCH: - break; - } - } - - if (isAnyNull) { - saveJoinResult = JoinUtil.JoinResult.NOMATCH; - haveSaveKey = false; - } else { - // Regardless of our matching result, we keep that information to make multiple use - // of it for a possible series of equal keys. - haveSaveKey = true; - - /* - * Multi-Key specific save key. - */ - - temp = saveKeyOutput; - saveKeyOutput = currentKeyOutput; - currentKeyOutput = temp; - - /* - * Multi-Key specific lookup key. 
- */ - - byte[] keyBytes = saveKeyOutput.getData(); - int keyLength = saveKeyOutput.getLength(); - saveJoinResult = hashMap.lookup(keyBytes, 0, keyLength, hashMapResults[hashMapResultCount]); - } - - /* - * Common inner join result processing. - */ - - switch (saveJoinResult) { - case MATCH: - equalKeySeriesHashMapResultIndices[equalKeySeriesCount] = hashMapResultCount; - equalKeySeriesAllMatchIndices[equalKeySeriesCount] = allMatchCount; - equalKeySeriesIsSingleValue[equalKeySeriesCount] = hashMapResults[hashMapResultCount].isSingleRow(); - equalKeySeriesDuplicateCounts[equalKeySeriesCount] = 1; - allMatchs[allMatchCount++] = batchIndex; - // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " MATCH isSingleValue " + equalKeySeriesIsSingleValue[equalKeySeriesCount] + " currentKey " + currentKey); - break; - - case SPILL: - spills[spillCount] = batchIndex; - spillHashMapResultIndices[spillCount] = hashMapResultCount; - spillCount++; - break; - - case NOMATCH: - // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " NOMATCH" + " currentKey " + currentKey); - break; - } - } else { - // Series of equal keys. - - switch (saveJoinResult) { - case MATCH: - equalKeySeriesDuplicateCounts[equalKeySeriesCount]++; - allMatchs[allMatchCount++] = batchIndex; - // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " MATCH duplicate"); - break; - - case SPILL: - spills[spillCount] = batchIndex; - spillHashMapResultIndices[spillCount] = hashMapResultCount; - spillCount++; - break; - - case NOMATCH: - // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " NOMATCH duplicate"); - break; - } - } - } - - if (haveSaveKey) { - // Update our counts for the last key. - switch (saveJoinResult) { + switch (lookupResult) { case MATCH: + matchLogicalIndices[matchSeriesCount] = serializedMultiKeySeries.currentLogical; + matchDuplicateCounts[matchSeriesCount] = serializedMultiKeySeries.currentDuplicateCount; + matchIsSingleValue[matchSeriesCount] = hashMapResult.isSingleRow(); + matchHashMapResults[matchSeriesCount] = hashMapResult; + matchSeriesCount++; hashMapResultCount++; - equalKeySeriesCount++; break; + case SPILL: + spillLogicalIndices[spillSeriesCount] = serializedMultiKeySeries.currentLogical; + spillDuplicateCounts[spillSeriesCount] = serializedMultiKeySeries.currentDuplicateCount; + spillHashMapResults[spillSeriesCount] = hashMapResult; + spillSeriesCount++; hashMapResultCount++; + spilledRowCounter += serializedMultiKeySeries.currentDuplicateCount; break; - case NOMATCH: + + case NO_MATCH: break; + + default: + throw new RuntimeException("Unexpected lookup result " + lookupResult.name()); } - } + + if (!serializedMultiKeySeries.next()) { + break; + } + } while (true); if (isLogDebugEnabled) { - LOG.debug(CLASS_NAME + - " allMatchs " + intArrayToRangesString(allMatchs,allMatchCount) + - " equalKeySeriesHashMapResultIndices " + intArrayToRangesString(equalKeySeriesHashMapResultIndices, equalKeySeriesCount) + - " equalKeySeriesAllMatchIndices " + intArrayToRangesString(equalKeySeriesAllMatchIndices, equalKeySeriesCount) + - " equalKeySeriesIsSingleValue " + Arrays.toString(Arrays.copyOfRange(equalKeySeriesIsSingleValue, 0, equalKeySeriesCount)) + - " equalKeySeriesDuplicateCounts " + Arrays.toString(Arrays.copyOfRange(equalKeySeriesDuplicateCounts, 0, equalKeySeriesCount)) + - " spills " + intArrayToRangesString(spills, spillCount) + - " spillHashMapResultIndices " + intArrayToRangesString(spillHashMapResultIndices, spillCount) + 
- " hashMapResults " + Arrays.toString(Arrays.copyOfRange(hashMapResults, 0, hashMapResultCount))); + LOG.debug(CLASS_NAME + " batch #" + batchCounter + " batch info " + + " filteredSize " + filteredSize + + " matchSeriesCount " + matchSeriesCount + + " matchLogicalIndices " + intArrayToRangesString(matchLogicalIndices, matchSeriesCount) + + " matchDuplicateCounts " + Arrays.toString(Arrays.copyOfRange(matchDuplicateCounts, 0, matchSeriesCount)) + + " spillSeriesCount " + spillSeriesCount + + " spillLogicalIndices " + intArrayToRangesString(spillLogicalIndices, spillSeriesCount) + + " spillDuplicateCounts " + Arrays.toString(Arrays.copyOfRange(spillDuplicateCounts, 0, spillSeriesCount))); } - finishInner(batch, - allMatchCount, equalKeySeriesCount, spillCount, hashMapResultCount); + finishInner(batch, matchSeriesCount, spillSeriesCount); } if (batch.size > 0) { diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinInnerStringOperator.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinInnerStringOperator.java index 0f9baae..4025f24 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinInnerStringOperator.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinInnerStringOperator.java @@ -24,7 +24,10 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.hive.ql.CompilationOpContext; -import org.apache.hadoop.hive.ql.exec.JoinUtil; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashMapResult; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableFind; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableResult; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableResult.MapJoinResult; import org.apache.hadoop.hive.ql.exec.vector.VectorizationContext; import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpression; @@ -32,11 +35,9 @@ import org.apache.hadoop.hive.ql.plan.OperatorDesc; // Single-Column String hash table import. -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinBytesHashMap; +import org.apache.hadoop.hive.ql.exec.vector.keyseries.VectorKeySeriesBytes; // Single-Column String specific imports. -import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; -import org.apache.hadoop.hive.ql.exec.vector.expressions.StringExpr; /* * Specialized class for doing a vectorized map join that is an inner join on a Single-Column String @@ -45,8 +46,17 @@ public class VectorMapJoinInnerStringOperator extends VectorMapJoinInnerGenerateResultOperator { private static final long serialVersionUID = 1L; - private static final Logger LOG = LoggerFactory.getLogger(VectorMapJoinInnerStringOperator.class.getName()); + + //------------------------------------------------------------------------------------------------ + private static final String CLASS_NAME = VectorMapJoinInnerStringOperator.class.getName(); + private static final Logger LOG = LoggerFactory.getLogger(CLASS_NAME); + + protected String getLoggingPrefix() { + return super.getLoggingPrefix(CLASS_NAME); + } + + //------------------------------------------------------------------------------------------------ // (none) @@ -54,8 +64,10 @@ // transient. //--------------------------------------------------------------------------- - // The hash map for this specialized class. 
- private transient VectorMapJoinBytesHashMap hashMap; + // CONSIDER using final class VectorMapJoinFastStringHashMap object for better performance by + // avoiding using an interface (MapJoinHashTableFind). Would require Hybrid versions of + // operators that use final HybridHashTableContainer object.. + private transient MapJoinHashTableFind hashMap; //--------------------------------------------------------------------------- // Single-Column String specific members. @@ -64,6 +76,9 @@ // The column number for this one column join specialization. private transient int singleJoinColumn; + // The object that determines equal key series. It is a final class for good performance. + private transient VectorKeySeriesBytes bytesKeySeries; + //--------------------------------------------------------------------------- // Pass-thru constructors. // @@ -104,6 +119,8 @@ public void process(Object row, int tag) throws HiveException { singleJoinColumn = bigTableKeyColumnMap[0]; + bytesKeySeries = new VectorKeySeriesBytes(singleJoinColumn); + needCommonSetup = false; } @@ -115,12 +132,13 @@ public void process(Object row, int tag) throws HiveException { * Get our Single-Column String hash map information for this specialized class. */ - hashMap = (VectorMapJoinBytesHashMap) vectorMapJoinHashTable; + hashMap = vectorMapJoinHashTableFind; needHashTableSetup = false; } batchCounter++; + inputRowCounter += batch.size; // Do the per-batch setup for an inner join. @@ -131,9 +149,9 @@ public void process(Object row, int tag) throws HiveException { ve.evaluate(batch); } - final int inputLogicalSize = batch.size; + final int filteredSize = batch.size; - if (inputLogicalSize == 0) { + if (filteredSize == 0) { if (isLogDebugEnabled) { LOG.debug(CLASS_NAME + " batch #" + batchCounter + " empty"); } @@ -147,220 +165,112 @@ public void process(Object row, int tag) throws HiveException { } } - /* - * Single-Column String specific declarations. - */ - - // The one join column for this specialized class. - BytesColumnVector joinColVector = (BytesColumnVector) batch.cols[singleJoinColumn]; - byte[][] vector = joinColVector.vector; - int[] start = joinColVector.start; - int[] length = joinColVector.length; - - /* - * Single-Column String check for repeating. - */ + bytesKeySeries.processBatch(batch); - // Check single column for repeating. - boolean allKeyInputColumnsRepeating = joinColVector.isRepeating; + MapJoinHashMapResult hashMapResult; + if (bytesKeySeries.keyCount == 1) { - if (allKeyInputColumnsRepeating) { + // Effectively, one repeating key. - /* - * Repeating. - */ + if (bytesKeySeries.currentKeyIsNull) { - // All key input columns are repeating. Generate key once. Lookup once. - // Since the key is repeated, we must use entry 0 regardless of selectedInUse. + // CONSIDER: Add support for NullSafe option. - /* - * Single-Column String specific repeated lookup. - */ - JoinUtil.JoinResult joinResult; - if (!joinColVector.noNulls && joinColVector.isNull[0]) { - joinResult = JoinUtil.JoinResult.NOMATCH; + batch.size = 0; } else { - byte[] keyBytes = vector[0]; - int keyStart = start[0]; - int keyLength = length[0]; - joinResult = hashMap.lookup(keyBytes, keyStart, keyLength, hashMapResults[0]); - } - - /* - * Common repeated join result processing. 
- */ - - if (isLogDebugEnabled) { - LOG.debug(CLASS_NAME + " batch #" + batchCounter + " repeated joinResult " + joinResult.name()); + hashMapResult = hashMapResults[0]; + hashMap.hashMapLookup( + bytesKeySeries.currentBytes, + bytesKeySeries.currentStart, + bytesKeySeries.currentLength, + bytesKeySeries.currentHashCode, + hashMapResult); + if (hashMapResult.getMapJoinResult() != MapJoinResult.NO_MATCH) { + finishInnerRepeated(batch, bytesKeySeries, hashMapResult); + } else { + batch.size = 0; + } } - finishInnerRepeated(batch, joinResult, hashMapResults[0]); } else { - /* - * NOT Repeating. - */ - - if (isLogDebugEnabled) { - LOG.debug(CLASS_NAME + " batch #" + batchCounter + " non-repeated"); - } - - // We remember any matching rows in matchs / matchSize. At the end of the loop, - // selected / batch.size will represent both matching and non-matching rows for outer join. - // Only deferred rows will have been removed from selected. - int selected[] = batch.selected; - boolean selectedInUse = batch.selectedInUse; - + int matchSeriesCount = 0; + int spillSeriesCount = 0; int hashMapResultCount = 0; - int allMatchCount = 0; - int equalKeySeriesCount = 0; - int spillCount = 0; - /* - * Single-Column String specific variables. - */ + MapJoinHashTableResult.MapJoinResult lookupResult; + do { + // Use the next hash map result entry. + hashMapResult = hashMapResults[hashMapResultCount]; - int saveKeyBatchIndex = -1; + if (bytesKeySeries.currentKeyIsNull) { - // We optimize performance by only looking up the first key in a series of equal keys. - boolean haveSaveKey = false; - JoinUtil.JoinResult saveJoinResult = JoinUtil.JoinResult.NOMATCH; + // CONSIDER: Add support for NullSafe option. - // Logical loop over the rows in the batch since the batch may have selected in use. - for (int logical = 0; logical < inputLogicalSize; logical++) { - int batchIndex = (selectedInUse ? selected[logical] : logical); + lookupResult = MapJoinHashTableResult.MapJoinResult.NO_MATCH; - /* - * Single-Column String get key. - */ + } else { - // Implicit -- use batchIndex. - boolean isNull = !joinColVector.noNulls && joinColVector.isNull[batchIndex]; + hashMap.hashMapLookup( + bytesKeySeries.currentBytes, + bytesKeySeries.currentStart, + bytesKeySeries.currentLength, + bytesKeySeries.currentHashCode, + hashMapResult); + lookupResult = hashMapResult.getMapJoinResult(); - /* - * Equal key series checking. - */ + // LOG.info(CLASS_NAME + " " + + // VectorizedBatchUtil.displayBytes(bytesKeySeries.currentBytes, bytesKeySeries.currentStart, + // bytesKeySeries.currentLength) + " hashCode " + Integer.toHexString(bytesKeySeries.getCurrentHashCode()) + + // " lookupResult " + lookupResult.name()); - if (isNull || !haveSaveKey || - StringExpr.equal(vector[saveKeyBatchIndex], start[saveKeyBatchIndex], length[saveKeyBatchIndex], - vector[batchIndex], start[batchIndex], length[batchIndex]) == false) { - - // New key. - - if (haveSaveKey) { - // Move on with our counts. - switch (saveJoinResult) { - case MATCH: - hashMapResultCount++; - equalKeySeriesCount++; - break; - case SPILL: - hashMapResultCount++; - break; - case NOMATCH: - break; - } - } - - if (isNull) { - saveJoinResult = JoinUtil.JoinResult.NOMATCH; - haveSaveKey = false; - } else { - // Regardless of our matching result, we keep that information to make multiple use - // of it for a possible series of equal keys. - haveSaveKey = true; - - /* - * Single-Column String specific save key. 
- */ - - saveKeyBatchIndex = batchIndex; - - /* - * Single-Column String specific lookup key. - */ - - byte[] keyBytes = vector[batchIndex]; - int keyStart = start[batchIndex]; - int keyLength = length[batchIndex]; - saveJoinResult = hashMap.lookup(keyBytes, keyStart, keyLength, hashMapResults[hashMapResultCount]); - } - - /* - * Common inner join result processing. - */ - - switch (saveJoinResult) { - case MATCH: - equalKeySeriesHashMapResultIndices[equalKeySeriesCount] = hashMapResultCount; - equalKeySeriesAllMatchIndices[equalKeySeriesCount] = allMatchCount; - equalKeySeriesIsSingleValue[equalKeySeriesCount] = hashMapResults[hashMapResultCount].isSingleRow(); - equalKeySeriesDuplicateCounts[equalKeySeriesCount] = 1; - allMatchs[allMatchCount++] = batchIndex; - // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " MATCH isSingleValue " + equalKeySeriesIsSingleValue[equalKeySeriesCount] + " currentKey " + currentKey); - break; - - case SPILL: - spills[spillCount] = batchIndex; - spillHashMapResultIndices[spillCount] = hashMapResultCount; - spillCount++; - break; - - case NOMATCH: - // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " NOMATCH" + " currentKey " + currentKey); - break; - } - } else { - // Series of equal keys. - - switch (saveJoinResult) { - case MATCH: - equalKeySeriesDuplicateCounts[equalKeySeriesCount]++; - allMatchs[allMatchCount++] = batchIndex; - // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " MATCH duplicate"); - break; - - case SPILL: - spills[spillCount] = batchIndex; - spillHashMapResultIndices[spillCount] = hashMapResultCount; - spillCount++; - break; - - case NOMATCH: - // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " NOMATCH duplicate"); - break; - } } - } - if (haveSaveKey) { - // Update our counts for the last key. - switch (saveJoinResult) { + /* + * Common inner join result processing. 
+ */ + switch (lookupResult) { case MATCH: + matchLogicalIndices[matchSeriesCount] = bytesKeySeries.currentLogical; + matchDuplicateCounts[matchSeriesCount] = bytesKeySeries.currentDuplicateCount; + matchIsSingleValue[matchSeriesCount] = hashMapResult.isSingleRow(); + matchHashMapResults[matchSeriesCount] = hashMapResult; + matchSeriesCount++; hashMapResultCount++; - equalKeySeriesCount++; break; + case SPILL: + spillLogicalIndices[spillSeriesCount] = bytesKeySeries.currentLogical; + spillDuplicateCounts[spillSeriesCount] = bytesKeySeries.currentDuplicateCount; + spillHashMapResults[spillSeriesCount] = hashMapResult; + spillSeriesCount++; hashMapResultCount++; + spilledRowCounter += bytesKeySeries.currentDuplicateCount; break; - case NOMATCH: + + case NO_MATCH: break; + + default: + throw new RuntimeException("Unexpected lookup result " + lookupResult.name()); } - } + + if (!bytesKeySeries.next()) { + break; + } + } while (true); if (isLogDebugEnabled) { - LOG.debug(CLASS_NAME + - " allMatchs " + intArrayToRangesString(allMatchs,allMatchCount) + - " equalKeySeriesHashMapResultIndices " + intArrayToRangesString(equalKeySeriesHashMapResultIndices, equalKeySeriesCount) + - " equalKeySeriesAllMatchIndices " + intArrayToRangesString(equalKeySeriesAllMatchIndices, equalKeySeriesCount) + - " equalKeySeriesIsSingleValue " + Arrays.toString(Arrays.copyOfRange(equalKeySeriesIsSingleValue, 0, equalKeySeriesCount)) + - " equalKeySeriesDuplicateCounts " + Arrays.toString(Arrays.copyOfRange(equalKeySeriesDuplicateCounts, 0, equalKeySeriesCount)) + - " spills " + intArrayToRangesString(spills, spillCount) + - " spillHashMapResultIndices " + intArrayToRangesString(spillHashMapResultIndices, spillCount) + - " hashMapResults " + Arrays.toString(Arrays.copyOfRange(hashMapResults, 0, hashMapResultCount))); + LOG.debug(CLASS_NAME + " batch #" + batchCounter + " batch info " + + " filteredSize " + filteredSize + + " matchSeriesCount " + matchSeriesCount + + " matchLogicalIndices " + intArrayToRangesString(matchLogicalIndices, matchSeriesCount) + + " matchDuplicateCounts " + Arrays.toString(Arrays.copyOfRange(matchDuplicateCounts, 0, matchSeriesCount)) + + " spillSeriesCount " + spillSeriesCount + + " spillLogicalIndices " + intArrayToRangesString(spillLogicalIndices, spillSeriesCount) + + " spillDuplicateCounts " + Arrays.toString(Arrays.copyOfRange(spillDuplicateCounts, 0, spillSeriesCount))); } - finishInner(batch, - allMatchCount, equalKeySeriesCount, spillCount, hashMapResultCount); + finishInner(batch, matchSeriesCount, spillSeriesCount); } if (batch.size > 0) { diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinLeftSemiGenerateResultOperator.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinLeftSemiGenerateResultOperator.java index c71ebba..626afcd 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinLeftSemiGenerateResultOperator.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinLeftSemiGenerateResultOperator.java @@ -23,16 +23,17 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.hive.ql.CompilationOpContext; -import org.apache.hadoop.hive.ql.exec.JoinUtil; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashMapResult; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashSetResult; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableResult.MapJoinResult; import 
org.apache.hadoop.hive.ql.exec.vector.VectorizationContext; import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; -import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpression; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashSet; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashSetResult; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashTableResult; +import org.apache.hadoop.hive.ql.exec.vector.keyseries.VectorKeySeries; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.plan.OperatorDesc; +import com.google.common.base.Preconditions; + /** * This class has methods for generating vectorized join results for left semi joins. * @@ -59,17 +60,22 @@ // An array of hash set results so we can do lookups on the whole batch before output result // generation. - protected transient VectorMapJoinHashSetResult hashSetResults[]; + protected transient MapJoinHashSetResult hashSetResults[]; + + // Pre-allocated member for storing the batch indices of the matched rows. + protected transient int[] matchSelected; - // Pre-allocated member for storing the (physical) batch index of matching row (single- or - // multi-small-table-valued) indexes during a process call. - protected transient int[] allMatchs; + // Pre-allocated member for storing the new logical batch index (within newSelected) and + // series count of rows that matched. + protected transient int[] matchLogicalIndices; + protected transient int[] matchDuplicateCounts; - // Pre-allocated member for storing the (physical) batch index of rows that need to be spilled. - protected transient int[] spills; + // Pre-allocated member for storing the logical batch index and series count of rows that spilled. + protected transient int[] spillLogicalIndices; + protected transient int[] spillDuplicateCounts; - // Pre-allocated member for storing index into the hashSetResults for each spilled row. - protected transient int[] spillHashMapResultIndices; + // Pre-allocated member for storing the reference to the hash set results of rows that spilled. + protected transient MapJoinHashSetResult[] spillHashSetResults; /** Kryo ctor. */ protected VectorMapJoinLeftSemiGenerateResultOperator() { @@ -92,17 +98,19 @@ protected void commonSetup(VectorizedRowBatch batch) throws HiveException { super.commonSetup(batch); // Semi join specific.
- VectorMapJoinHashSet baseHashSet = (VectorMapJoinHashSet) vectorMapJoinHashTable; - - hashSetResults = new VectorMapJoinHashSetResult[batch.DEFAULT_SIZE]; + hashSetResults = new MapJoinHashSetResult[batch.DEFAULT_SIZE]; for (int i = 0; i < hashSetResults.length; i++) { - hashSetResults[i] = baseHashSet.createHashSetResult(); + hashSetResults[i] = vectormapJoinHashTableFactory.createHashSetResult(); } - allMatchs = new int[batch.DEFAULT_SIZE]; + matchSelected = new int[batch.DEFAULT_SIZE]; - spills = new int[batch.DEFAULT_SIZE]; - spillHashMapResultIndices = new int[batch.DEFAULT_SIZE]; + matchLogicalIndices = new int[batch.DEFAULT_SIZE]; + matchDuplicateCounts = new int[batch.DEFAULT_SIZE]; + + spillLogicalIndices = new int[batch.DEFAULT_SIZE]; + spillDuplicateCounts = new int[batch.DEFAULT_SIZE]; + spillHashSetResults = new MapJoinHashSetResult[batch.DEFAULT_SIZE]; } //----------------------------------------------------------------------------------------------- @@ -117,37 +125,75 @@ protected void commonSetup(VectorizedRowBatch batch) throws HiveException { * @param batch * The big table batch with any matching and any non matching rows both as * selected in use. - * @param allMatchCount - * Number of matches in allMatchs. - * @param spillCount - * Number of spills in spills. - * @param hashTableResults - * The array of all hash table results for the batch. We need the - * VectorMapJoinHashTableResult for the spill information. + * @param matchSeriesCount + * Number of matches in the match* arrays. + * @param matchSelectedCount + * The selected count in matchSelected. + * @param spillSeriesCount + * Number of spills in spill* arrays. */ - protected void finishLeftSemi(VectorizedRowBatch batch, - int allMatchCount, int spillCount, - VectorMapJoinHashTableResult[] hashTableResults) throws HiveException, IOException { - - // Get rid of spills before we start modifying the batch. - if (spillCount > 0) { - spillHashMapBatch(batch, hashTableResults, - spills, spillHashMapResultIndices, spillCount); + protected void finishLeftSemi(VectorizedRowBatch batch, int matchSeriesCount, + int spillSeriesCount) throws HiveException, IOException { + + final int selectedSize = batch.size; + boolean selectedInUse = batch.selectedInUse; + int[] selected = batch.selected; + + // Dump out the spill rows now. + if (spillSeriesCount > 0) { + + spillHashMapBatch(batch, spillLogicalIndices, spillDuplicateCounts, spillHashSetResults, + spillSeriesCount, selectedInUse, selected, selectedSize); + + if (spillSeriesCount == selectedSize) { + batch.size = 0; + batch.selectedInUse = false; + return; + } } + int matchSelectedCount = + flattenLogicalSeriesIntoSelected( + selectedInUse, selected, selectedSize, + matchLogicalIndices, matchDuplicateCounts, matchSeriesCount, + matchSelected); + /* * Optimize by running value expressions only over the matched rows. */ - if (allMatchCount > 0 && bigTableValueExpressions != null) { - performValueExpressions(batch, allMatchs, allMatchCount); + if (matchSeriesCount > 0 && bigTableValueExpressions != null) { + performValueExpressions(batch, matchSelected, matchSelectedCount); } - int numSel = generateHashSetResults(batch, allMatchs, allMatchCount); + int numSel = generateHashSetResults(batch, matchSelected, matchSelectedCount); batch.size = numSel; batch.selectedInUse = true; } /** + * Generate the left semi join output results for one vectorized row batch that + * has a repeated key. 
+ * + * @param batch + * The big table batch with any matching and any non matching rows both as + * selected in use. + * @param keySeries + * @param hashSetResult + * @throws IOException + * @throws HiveException + */ + protected void finishLeftSemiRepeated(VectorizedRowBatch batch, VectorKeySeries keySeries, + MapJoinHashSetResult hashSetResult) throws HiveException, IOException { + if (hashSetResult.getMapJoinResult() == MapJoinResult.SPILL) { + spillRepeated(batch, keySeries, hashSetResult); + return; + } + Preconditions.checkState(hashSetResult.getMapJoinResult() == MapJoinResult.MATCH); + + // The whole batch is matched (no small table side for left semi). + } + + /** * Generate the matching left semi join output results of a vectorized row batch. * * @param batch @@ -175,57 +221,4 @@ private int generateHashSetResults(VectorizedRowBatch batch, return numSel; } - - /** - * Generate the left semi join output results for one vectorized row batch with a repeated key. - * - * @param batch - * The big table batch whose repeated key matches. - */ - protected int generateHashSetResultRepeatedAll(VectorizedRowBatch batch) throws HiveException { - - if (batch.selectedInUse) { - // The selected array is already filled in as we want it. - } else { - int[] selected = batch.selected; - for (int i = 0; i < batch.size; i++) { - selected[i] = i; - } - batch.selectedInUse = true; - } - - return batch.size; - } - - protected void finishLeftSemiRepeated(VectorizedRowBatch batch, JoinUtil.JoinResult joinResult, - VectorMapJoinHashTableResult hashSetResult) throws HiveException, IOException { - - switch (joinResult) { - case MATCH: - - if (bigTableValueExpressions != null) { - // Run our value expressions over whole batch. - for(VectorExpression ve: bigTableValueExpressions) { - ve.evaluate(batch); - } - } - - // Generate special repeated case. - int numSel = generateHashSetResultRepeatedAll(batch); - batch.size = numSel; - batch.selectedInUse = true; - break; - - case SPILL: - // Whole batch is spilled. - spillBatchRepeated(batch, (VectorMapJoinHashTableResult) hashSetResult); - batch.size = 0; - break; - - case NOMATCH: - // No match for entire batch. 
- batch.size = 0; - break; - } - } } diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinLeftSemiLongOperator.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinLeftSemiLongOperator.java index 1149a9d..8b8380a 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinLeftSemiLongOperator.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinLeftSemiLongOperator.java @@ -24,19 +24,19 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.hive.ql.CompilationOpContext; -import org.apache.hadoop.hive.ql.exec.JoinUtil; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashSetResult; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableFind; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableResult; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableResult.MapJoinResult; import org.apache.hadoop.hive.ql.exec.vector.VectorizationContext; import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpression; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashTableResult; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.plan.OperatorDesc; // Single-Column Long hash table import. -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinLongHashSet; - -// Single-Column Long specific imports. -import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.keyseries.VectorKeySeriesLong; +import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; /* * Specialized class for doing a vectorized map join that is an left semi join on a Single-Column Long @@ -45,8 +45,17 @@ public class VectorMapJoinLeftSemiLongOperator extends VectorMapJoinLeftSemiGenerateResultOperator { private static final long serialVersionUID = 1L; - private static final Logger LOG = LoggerFactory.getLogger(VectorMapJoinInnerBigOnlyLongOperator.class.getName()); + + //------------------------------------------------------------------------------------------------ + private static final String CLASS_NAME = VectorMapJoinLeftSemiLongOperator.class.getName(); + private static final Logger LOG = LoggerFactory.getLogger(CLASS_NAME); + + protected String getLoggingPrefix() { + return super.getLoggingPrefix(CLASS_NAME); + } + + //------------------------------------------------------------------------------------------------ // (none) @@ -54,8 +63,10 @@ // transient. //--------------------------------------------------------------------------- - // The hash map for this specialized class. - private transient VectorMapJoinLongHashSet hashSet; + // CONSIDER using final class VectorMapJoinFastLongHashSet object for better performance by + // avoiding using an interface (MapJoinHashTableFind). Would require Hybrid versions of + // operators that use final HybridHashTableContainer object.. + private transient MapJoinHashTableFind hashSet; //--------------------------------------------------------------------------- // Single-Column Long specific members. @@ -68,6 +79,10 @@ // The column number for this one column join specialization. 
private transient int singleJoinColumn; + private transient PrimitiveTypeInfo singleJoinColumnPrimitiveTypeInfo; + + // The object that determines equal key series. It is a final class for good performance. + private transient VectorKeySeriesLong longKeySeries; //--------------------------------------------------------------------------- // Pass-thru constructors. @@ -108,6 +123,10 @@ public void process(Object row, int tag) throws HiveException { */ singleJoinColumn = bigTableKeyColumnMap[0]; + singleJoinColumnPrimitiveTypeInfo = (PrimitiveTypeInfo) bigTableKeyTypeInfos[0]; + + longKeySeries = + new VectorKeySeriesLong(singleJoinColumn, singleJoinColumnPrimitiveTypeInfo); needCommonSetup = false; } @@ -120,7 +139,7 @@ public void process(Object row, int tag) throws HiveException { * Get our Single-Column Long hash set information for this specialized class. */ - hashSet = (VectorMapJoinLongHashSet) vectorMapJoinHashTable; + hashSet = vectorMapJoinHashTableFind; useMinMax = hashSet.useMinMax(); if (useMinMax) { min = hashSet.min(); @@ -131,6 +150,7 @@ public void process(Object row, int tag) throws HiveException { } batchCounter++; + inputRowCounter += batch.size; // Do the per-batch setup for an left semi join. @@ -142,9 +162,9 @@ public void process(Object row, int tag) throws HiveException { ve.evaluate(batch); } - final int inputLogicalSize = batch.size; + final int filteredSize = batch.size; - if (inputLogicalSize == 0) { + if (filteredSize == 0) { if (isLogDebugEnabled) { LOG.debug(CLASS_NAME + " batch #" + batchCounter + " empty"); } @@ -158,220 +178,114 @@ public void process(Object row, int tag) throws HiveException { } } - /* - * Single-Column Long specific declarations. - */ - - // The one join column for this specialized class. - LongColumnVector joinColVector = (LongColumnVector) batch.cols[singleJoinColumn]; - long[] vector = joinColVector.vector; - /* - * Single-Column Long check for repeating. - */ + longKeySeries.processBatch(batch); - // Check single column for repeating. - boolean allKeyInputColumnsRepeating = joinColVector.isRepeating; + MapJoinHashSetResult hashSetResult; + if (longKeySeries.keyCount == 1) { - if (allKeyInputColumnsRepeating) { + // Effectively, one repeating key. - /* - * Repeating. - */ + if (longKeySeries.currentKeyIsNull) { - // All key input columns are repeating. Generate key once. Lookup once. - // Since the key is repeated, we must use entry 0 regardless of selectedInUse. - - /* - * Single-Column Long specific repeated lookup. - */ + // CONSIDER: Add support for NullSafe option. - JoinUtil.JoinResult joinResult; - if (!joinColVector.noNulls && joinColVector.isNull[0]) { - joinResult = JoinUtil.JoinResult.NOMATCH; + batch.size = 0; } else { - long key = vector[0]; - if (useMinMax && (key < min || key > max)) { - // Out of range for whole batch. - joinResult = JoinUtil.JoinResult.NOMATCH; + long repeatedKey = longKeySeries.getCurrentKey(); + if (useMinMax && (repeatedKey < min || repeatedKey > max)) { + batch.size = 0; } else { - joinResult = hashSet.contains(key, hashSetResults[0]); + hashSetResult = hashSetResults[0]; + hashSet.hashSetContains( + repeatedKey, + longKeySeries.currentHashCode, + hashSetResult); + if (hashSetResult.getMapJoinResult() != MapJoinResult.NO_MATCH) { + finishLeftSemiRepeated(batch, longKeySeries, hashSetResult); + } else { + batch.size = 0; + } } } - - /* - * Common repeated join result processing. 
- */ - - if (isLogDebugEnabled) { - LOG.debug(CLASS_NAME + " batch #" + batchCounter + " repeated joinResult " + joinResult.name()); - } - finishLeftSemiRepeated(batch, joinResult, hashSetResults[0]); } else { - /* - * NOT Repeating. - */ - - if (isLogDebugEnabled) { - LOG.debug(CLASS_NAME + " batch #" + batchCounter + " non-repeated"); - } - - // We remember any matching rows in matchs / matchSize. At the end of the loop, - // selected / batch.size will represent both matching and non-matching rows for outer join. - // Only deferred rows will have been removed from selected. - int selected[] = batch.selected; - boolean selectedInUse = batch.selectedInUse; - + int matchSeriesCount = 0; + int spillSeriesCount = 0; int hashSetResultCount = 0; - int allMatchCount = 0; - int spillCount = 0; - /* - * Single-Column Long specific variables. - */ + MapJoinHashTableResult.MapJoinResult containsResult; + long key; + do { + // Use the next hash set result entry. + hashSetResult = hashSetResults[hashSetResultCount]; - long saveKey = 0; + if (longKeySeries.currentKeyIsNull) { - // We optimize performance by only looking up the first key in a series of equal keys. - boolean haveSaveKey = false; - JoinUtil.JoinResult saveJoinResult = JoinUtil.JoinResult.NOMATCH; + // CONSIDER: Add support for NullSafe option. - // Logical loop over the rows in the batch since the batch may have selected in use. - for (int logical = 0; logical < inputLogicalSize; logical++) { - int batchIndex = (selectedInUse ? selected[logical] : logical); - - /* - * Single-Column Long get key. - */ + containsResult = MapJoinHashTableResult.MapJoinResult.NO_MATCH; - long currentKey; - boolean isNull; - if (!joinColVector.noNulls && joinColVector.isNull[batchIndex]) { - currentKey = 0; - isNull = true; } else { - currentKey = vector[batchIndex]; - isNull = false; - } - - /* - * Equal key series checking. - */ - - if (isNull || !haveSaveKey || currentKey != saveKey) { - - // New key. - - if (haveSaveKey) { - // Move on with our counts. - switch (saveJoinResult) { - case MATCH: - // We have extracted the existence from the hash set result, so we don't keep it. - break; - case SPILL: - // We keep the hash set result for its spill information. - hashSetResultCount++; - break; - case NOMATCH: - break; - } - } - if (isNull) { - saveJoinResult = JoinUtil.JoinResult.NOMATCH; - haveSaveKey = false; + key = longKeySeries.getCurrentKey(); + if (useMinMax && (key < min || key > max)) { + // Out of range for whole batch. + containsResult = MapJoinHashTableResult.MapJoinResult.NO_MATCH; } else { - // Regardless of our matching result, we keep that information to make multiple use - // of it for a possible series of equal keys. - haveSaveKey = true; - - /* - * Single-Column Long specific save key. - */ - - saveKey = currentKey; - - /* - * Single-Column Long specific lookup key. - */ - - if (useMinMax && (currentKey < min || currentKey > max)) { - // Key out of range for whole hash table. - saveJoinResult = JoinUtil.JoinResult.NOMATCH; - } else { - saveJoinResult = hashSet.contains(currentKey, hashSetResults[hashSetResultCount]); - } + hashSet.hashSetContains( + key, + longKeySeries.currentHashCode, + hashSetResult); + containsResult = hashSetResult.getMapJoinResult(); } - /* - * Common left-semi join result processing. 
- */ - - switch (saveJoinResult) { - case MATCH: - allMatchs[allMatchCount++] = batchIndex; - // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " MATCH isSingleValue " + equalKeySeriesIsSingleValue[equalKeySeriesCount] + " currentKey " + currentKey); - break; - - case SPILL: - spills[spillCount] = batchIndex; - spillHashMapResultIndices[spillCount] = hashSetResultCount; - spillCount++; - break; - - case NOMATCH: - // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " NOMATCH" + " currentKey " + currentKey); - break; - } - } else { - // Series of equal keys. - - switch (saveJoinResult) { - case MATCH: - allMatchs[allMatchCount++] = batchIndex; - // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " MATCH duplicate"); - break; - - case SPILL: - spills[spillCount] = batchIndex; - spillHashMapResultIndices[spillCount] = hashSetResultCount; - spillCount++; - break; - - case NOMATCH: - // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " NOMATCH duplicate"); - break; - } } - } - if (haveSaveKey) { - // Update our counts for the last key. - switch (saveJoinResult) { + /* + * Common left-semi join result processing. + */ + switch (containsResult) { case MATCH: - // We have extracted the existence from the hash set result, so we don't keep it. + // We have extracted the existence from the hash set result, so we don't keep it. + matchLogicalIndices[matchSeriesCount] = longKeySeries.currentLogical; + matchDuplicateCounts[matchSeriesCount] = longKeySeries.currentDuplicateCount; + matchSeriesCount++; break; + case SPILL: - // We keep the hash set result for its spill information. + spillLogicalIndices[spillSeriesCount] = longKeySeries.currentLogical; + spillDuplicateCounts[spillSeriesCount] = longKeySeries.currentDuplicateCount; + spillHashSetResults[spillSeriesCount] = hashSetResult; + spillSeriesCount++; hashSetResultCount++; + spilledRowCounter += longKeySeries.currentDuplicateCount; break; - case NOMATCH: + + case NO_MATCH: break; + + default: + throw new RuntimeException("Unexpected contains result " + containsResult.name()); } - } + + if (!longKeySeries.next()) { + break; + } + } while (true); if (isLogDebugEnabled) { - LOG.debug(CLASS_NAME + - " allMatchs " + intArrayToRangesString(allMatchs, allMatchCount) + - " spills " + intArrayToRangesString(spills, spillCount) + - " spillHashMapResultIndices " + intArrayToRangesString(spillHashMapResultIndices, spillCount) + - " hashMapResults " + Arrays.toString(Arrays.copyOfRange(hashSetResults, 0, hashSetResultCount))); + LOG.debug(CLASS_NAME + " batch #" + batchCounter + " batch info " + + " filteredSize " + filteredSize + + " matchSeriesCount " + matchSeriesCount + + " matchLogicalIndices " + intArrayToRangesString(matchLogicalIndices, matchSeriesCount) + + " matchDuplicateCounts " + Arrays.toString(Arrays.copyOfRange(matchDuplicateCounts, 0, matchSeriesCount)) + + " spillSeriesCount " + spillSeriesCount + + " spillLogicalIndices " + intArrayToRangesString(spillLogicalIndices, spillSeriesCount) + + " spillDuplicateCounts " + Arrays.toString(Arrays.copyOfRange(spillDuplicateCounts, 0, spillSeriesCount))); } - finishLeftSemi(batch, - allMatchCount, spillCount, - (VectorMapJoinHashTableResult[]) hashSetResults); + finishLeftSemi(batch, matchSeriesCount, spillSeriesCount); } if (batch.size > 0) { diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinLeftSemiMultiKeyOperator.java
ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinLeftSemiMultiKeyOperator.java index e0baebc..fa32fa3 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinLeftSemiMultiKeyOperator.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinLeftSemiMultiKeyOperator.java @@ -24,22 +24,22 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.hive.ql.CompilationOpContext; -import org.apache.hadoop.hive.ql.exec.JoinUtil; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashSetResult; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableFind; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableResult; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableResult.MapJoinResult; import org.apache.hadoop.hive.ql.exec.vector.VectorizationContext; import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpression; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashTableResult; +import org.apache.hadoop.hive.ql.exec.vector.keyseries.fast.VectorKeySeriesMultiFast; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.plan.OperatorDesc; -// Multi-Key hash table import. -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinBytesHashSet; - // Multi-Key specific imports. -import org.apache.hadoop.hive.ql.exec.vector.VectorSerializeRow; -import org.apache.hadoop.hive.serde2.ByteStream.Output; import org.apache.hadoop.hive.serde2.binarysortable.fast.BinarySortableSerializeWrite; +import com.google.common.base.Preconditions; + /* * Specialized class for doing a vectorized map join that is an left semi join on Multi-Key * using hash set. @@ -47,8 +47,17 @@ public class VectorMapJoinLeftSemiMultiKeyOperator extends VectorMapJoinLeftSemiGenerateResultOperator { private static final long serialVersionUID = 1L; - private static final Logger LOG = LoggerFactory.getLogger(VectorMapJoinInnerBigOnlyLongOperator.class.getName()); + + //------------------------------------------------------------------------------------------------ + private static final String CLASS_NAME = VectorMapJoinLeftSemiMultiKeyOperator.class.getName(); + private static final Logger LOG = LoggerFactory.getLogger(CLASS_NAME); + + protected String getLoggingPrefix() { + return super.getLoggingPrefix(CLASS_NAME); + } + + //------------------------------------------------------------------------------------------------ // (none) @@ -56,22 +65,20 @@ // transient. //--------------------------------------------------------------------------- - // The hash map for this specialized class. - private transient VectorMapJoinBytesHashSet hashSet; + // CONSIDER using final class VectorMapJoinFastMultiKeyHashSet object for better performance by + // avoiding using an interface (MapJoinHashTableFind). Would require Hybrid versions of + // operators that use final HybridHashTableContainer object.. + private transient MapJoinHashTableFind hashSet; //--------------------------------------------------------------------------- // Multi-Key specific members. // - // Object that can take a set of columns in row in a vectorized row batch and serialized it. - // Known to not have any nulls. - private transient VectorSerializeRow keyVectorSerializeWrite; + // Binary sortable multi-key serializer. 
+ private transient BinarySortableSerializeWrite binarySortableSerializeWrite; - // The BinarySortable serialization of the current key. - private transient Output currentKeyOutput; - - // The BinarySortable serialization of the saved key for a possible series of equal keys. - private transient Output saveKeyOutput; + // The object that determines equal key series. It is a final class for good performance. + private transient VectorKeySeriesMultiFast serializedMultiKeySeries; //--------------------------------------------------------------------------- // Pass-thru constructors. @@ -111,12 +118,16 @@ public void process(Object row, int tag) throws HiveException { * Initialize Multi-Key members for this specialized class. */ - keyVectorSerializeWrite = new VectorSerializeRow( - new BinarySortableSerializeWrite(bigTableKeyColumnMap.length)); - keyVectorSerializeWrite.init(bigTableKeyTypeNames, bigTableKeyColumnMap); + binarySortableSerializeWrite = + new BinarySortableSerializeWrite(bigTableKeyColumnMap.length); - currentKeyOutput = new Output(); - saveKeyOutput = new Output(); + // For Multi-Key the Fast hash table computes the serialized key's hash code, + // so we don't use VectorKeySeriesMulti's hash code. + serializedMultiKeySeries = + new VectorKeySeriesMultiFast( + binarySortableSerializeWrite); + Preconditions.checkState(bigTableKeyTypeInfos.length > 0); + serializedMultiKeySeries.init(bigTableKeyTypeInfos, bigTableKeyColumnMap); needCommonSetup = false; } @@ -129,12 +140,13 @@ public void process(Object row, int tag) throws HiveException { * Get our Multi-Key hash set information for this specialized class. */ - hashSet = (VectorMapJoinBytesHashSet) vectorMapJoinHashTable; + hashSet = vectorMapJoinHashTableFind; needHashTableSetup = false; } batchCounter++; + inputRowCounter += batch.size; // Do the per-batch setup for an left semi join. @@ -146,9 +158,9 @@ public void process(Object row, int tag) throws HiveException { ve.evaluate(batch); } - final int inputLogicalSize = batch.size; + final int filteredSize = batch.size; - if (inputLogicalSize == 0) { + if (filteredSize == 0) { if (isLogDebugEnabled) { LOG.debug(CLASS_NAME + " batch #" + batchCounter + " empty"); } @@ -162,226 +174,107 @@ public void process(Object row, int tag) throws HiveException { } } - /* - * Multi-Key specific declarations. - */ + serializedMultiKeySeries.processBatch(batch); - // None. + MapJoinHashSetResult hashSetResult; + if (serializedMultiKeySeries.keyCount == 1) { - /* - * Multi-Key Long check for repeating. - */ + // Effectively, one repeating key. - // If all BigTable input columns to key expressions are isRepeating, then - // calculate key once; lookup once. - boolean allKeyInputColumnsRepeating; - if (bigTableKeyColumnMap.length == 0) { - allKeyInputColumnsRepeating = false; - } else { - allKeyInputColumnsRepeating = true; - for (int i = 0; i < bigTableKeyColumnMap.length; i++) { - if (!batch.cols[bigTableKeyColumnMap[i]].isRepeating) { - allKeyInputColumnsRepeating = false; - break; - } - } - } + // NOTE: Any null column in the key columns is a non-match. + if (serializedMultiKeySeries.currentKeyIsNull) { - if (allKeyInputColumnsRepeating) { + // CONSIDER: Add support for NullSafe option. - /* - * Repeating. - */ - - // All key input columns are repeating. Generate key once. Lookup once. - // Since the key is repeated, we must use entry 0 regardless of selectedInUse. - - /* - * Multi-Key specific repeated lookup. 
- */ - - keyVectorSerializeWrite.setOutput(currentKeyOutput); - keyVectorSerializeWrite.serializeWrite(batch, 0); - JoinUtil.JoinResult joinResult; - if (keyVectorSerializeWrite.getHasAnyNulls()) { - joinResult = JoinUtil.JoinResult.NOMATCH; + batch.size = 0; } else { - byte[] keyBytes = currentKeyOutput.getData(); - int keyLength = currentKeyOutput.getLength(); - // LOG.debug(CLASS_NAME + " processOp all " + displayBytes(keyBytes, 0, keyLength)); - joinResult = hashSet.contains(keyBytes, 0, keyLength, hashSetResults[0]); - } - - /* - * Common repeated join result processing. - */ - - if (isLogDebugEnabled) { - LOG.debug(CLASS_NAME + " batch #" + batchCounter + " repeated joinResult " + joinResult.name()); + hashSetResult = hashSetResults[0]; + hashSet.hashSetContains( + serializedMultiKeySeries.serializedBytes, + serializedMultiKeySeries.serializedStart, + serializedMultiKeySeries.serializedLength, + serializedMultiKeySeries.currentHashCode, + hashSetResult); + if (hashSetResult.getMapJoinResult() != MapJoinResult.NO_MATCH) { + finishLeftSemiRepeated(batch, serializedMultiKeySeries, hashSetResult); + } else { + batch.size = 0; + } } - finishLeftSemiRepeated(batch, joinResult, hashSetResults[0]); } else { - /* - * NOT Repeating. - */ - - if (isLogDebugEnabled) { - LOG.debug(CLASS_NAME + " batch #" + batchCounter + " non-repeated"); - } - - // We remember any matching rows in matchs / matchSize. At the end of the loop, - // selected / batch.size will represent both matching and non-matching rows for outer join. - // Only deferred rows will have been removed from selected. - int selected[] = batch.selected; - boolean selectedInUse = batch.selectedInUse; - + int matchSeriesCount = 0; + int spillSeriesCount = 0; int hashSetResultCount = 0; - int allMatchCount = 0; - int spillCount = 0; - /* - * Multi-Key specific variables. - */ + MapJoinHashTableResult.MapJoinResult containsResult; + do { + // Use the next hash set result entry. + hashSetResult = hashSetResults[hashSetResultCount]; - Output temp; + // NOTE: Any null column in the key for inner/left-semi join is a non-match. + if (serializedMultiKeySeries.currentKeyIsNull) { - // We optimize performance by only looking up the first key in a series of equal keys. - boolean haveSaveKey = false; - JoinUtil.JoinResult saveJoinResult = JoinUtil.JoinResult.NOMATCH; + // CONSIDER: Add support for NullSafe option. - // Logical loop over the rows in the batch since the batch may have selected in use. - for (int logical = 0; logical < inputLogicalSize; logical++) { - int batchIndex = (selectedInUse ? selected[logical] : logical); + containsResult = MapJoinHashTableResult.MapJoinResult.NO_MATCH; - /* - * Multi-Key get key. - */ + } else { - // Generate binary sortable key for current row in vectorized row batch. - keyVectorSerializeWrite.setOutput(currentKeyOutput); - keyVectorSerializeWrite.serializeWrite(batch, batchIndex); - boolean isAnyNull = keyVectorSerializeWrite.getHasAnyNulls(); + hashSet.hashSetContains( + serializedMultiKeySeries.serializedBytes, + serializedMultiKeySeries.serializedStart, + serializedMultiKeySeries.serializedLength, + serializedMultiKeySeries.currentHashCode, + hashSetResult); + containsResult = hashSetResult.getMapJoinResult(); - // LOG.debug(CLASS_NAME + " currentKey " + - // VectorizedBatchUtil.displayBytes(currentKeyOutput.getData(), 0, currentKeyOutput.getLength())); + } /* - * Equal key series checking. + * Common inner join result processing. 
*/ - - if (isAnyNull || !haveSaveKey || !saveKeyOutput.arraysEquals(currentKeyOutput)) { - - // New key. - - if (haveSaveKey) { - // Move on with our counts. - switch (saveJoinResult) { - case MATCH: - // We have extracted the existence from the hash set result, so we don't keep it. - break; - case SPILL: - // We keep the hash set result for its spill information. - hashSetResultCount++; - break; - case NOMATCH: - break; - } - } - - if (isAnyNull) { - saveJoinResult = JoinUtil.JoinResult.NOMATCH; - haveSaveKey = false; - } else { - // Regardless of our matching result, we keep that information to make multiple use - // of it for a possible series of equal keys. - haveSaveKey = true; - - /* - * Multi-Key specific save key and lookup. - */ - - temp = saveKeyOutput; - saveKeyOutput = currentKeyOutput; - currentKeyOutput = temp; - - /* - * Multi-key specific lookup key. - */ - - byte[] keyBytes = saveKeyOutput.getData(); - int keyLength = saveKeyOutput.getLength(); - saveJoinResult = hashSet.contains(keyBytes, 0, keyLength, hashSetResults[hashSetResultCount]); - } - - /* - * Common left-semi join result processing. - */ - - switch (saveJoinResult) { - case MATCH: - allMatchs[allMatchCount++] = batchIndex; - // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " MATCH isSingleValue " + equalKeySeriesIsSingleValue[equalKeySeriesCount] + " currentKey " + currentKey); - break; - - case SPILL: - spills[spillCount] = batchIndex; - spillHashMapResultIndices[spillCount] = hashSetResultCount; - spillCount++; - break; - - case NOMATCH: - // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " NOMATCH" + " currentKey " + currentKey); - break; - } - } else { - // Series of equal keys. - - switch (saveJoinResult) { - case MATCH: - allMatchs[allMatchCount++] = batchIndex; - // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " MATCH duplicate"); - break; - - case SPILL: - spills[spillCount] = batchIndex; - spillHashMapResultIndices[spillCount] = hashSetResultCount; - spillCount++; - break; - - case NOMATCH: - // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " NOMATCH duplicate"); - break; - } - } - } - - if (haveSaveKey) { - // Update our counts for the last key. - switch (saveJoinResult) { + switch (containsResult) { case MATCH: - // We have extracted the existence from the hash set result, so we don't keep it. + // We have extracted the existence from the hash set result, so we don't keep it. + matchLogicalIndices[matchSeriesCount] = serializedMultiKeySeries.currentLogical; + matchDuplicateCounts[matchSeriesCount] = serializedMultiKeySeries.currentDuplicateCount; + matchSeriesCount++; break; + case SPILL: - // We keep the hash set result for its spill information. 
+ spillLogicalIndices[spillSeriesCount] = serializedMultiKeySeries.currentLogical; + spillDuplicateCounts[spillSeriesCount] = serializedMultiKeySeries.currentDuplicateCount; + spillHashSetResults[spillSeriesCount] = hashSetResult; + spillSeriesCount++; hashSetResultCount++; + spilledRowCounter += serializedMultiKeySeries.currentDuplicateCount; break; - case NOMATCH: + + case NO_MATCH: break; + + default: + throw new RuntimeException("Unexpected contains result " + containsResult.name()); } - } + + if (!serializedMultiKeySeries.next()) { + break; + } + } while (true); if (isLogDebugEnabled) { - LOG.debug(CLASS_NAME + - " allMatchs " + intArrayToRangesString(allMatchs, allMatchCount) + - " spills " + intArrayToRangesString(spills, spillCount) + - " spillHashMapResultIndices " + intArrayToRangesString(spillHashMapResultIndices, spillCount) + - " hashMapResults " + Arrays.toString(Arrays.copyOfRange(hashSetResults, 0, hashSetResultCount))); + LOG.debug(CLASS_NAME + " batch #" + batchCounter + " batch info " + + " filteredSize " + filteredSize + + " matchSeriesCount " + matchSeriesCount + + " matchLogicalIndices " + intArrayToRangesString(matchLogicalIndices, matchSeriesCount) + + " matchDuplicateCounts " + Arrays.toString(Arrays.copyOfRange(matchDuplicateCounts, 0, matchSeriesCount)) + + " spillSeriesCount " + spillSeriesCount + + " spillLogicalIndices " + intArrayToRangesString(spillLogicalIndices, spillSeriesCount) + + " spillDuplicateCounts " + Arrays.toString(Arrays.copyOfRange(spillDuplicateCounts, 0, spillSeriesCount))); } - finishLeftSemi(batch, - allMatchCount, spillCount, - (VectorMapJoinHashTableResult[]) hashSetResults); + finishLeftSemi(batch, matchSeriesCount, spillSeriesCount); } if (batch.size > 0) { diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinLeftSemiStringOperator.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinLeftSemiStringOperator.java index 49e1177..7b02f19 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinLeftSemiStringOperator.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinLeftSemiStringOperator.java @@ -24,20 +24,18 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.hive.ql.CompilationOpContext; -import org.apache.hadoop.hive.ql.exec.JoinUtil; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashSetResult; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableFind; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableResult; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableResult.MapJoinResult; import org.apache.hadoop.hive.ql.exec.vector.VectorizationContext; import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpression; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashTableResult; -import org.apache.hadoop.hive.ql.metadata.HiveException; -import org.apache.hadoop.hive.ql.plan.OperatorDesc; // Single-Column String hash table import. -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinBytesHashSet; - -// Single-Column String specific imports. 
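Reviewer sketch for the equal-key-series left-semi flow above (illustrative only, not part of the patch): a standalone rendering of the per-series bookkeeping, where a run of adjacent equal keys costs a single hash-set probe instead of one probe per row. The KeySeries and Result types are hypothetical stand-ins for VectorKeySeriesMultiFast / MapJoinHashSetResult; the real operator additionally tracks hashSetResultCount and a spilled-row counter.

final class LeftSemiSeriesSketch {
  enum Result { MATCH, SPILL, NO_MATCH }

  /** Hypothetical stand-in for one equal-key series walked by a VectorKeySeries object. */
  interface KeySeries {
    boolean currentKeyIsNull();   // any null key column means a non-match
    int currentLogical();         // logical start index of the current series
    int currentDuplicateCount();  // number of rows sharing the current key
    Result contains();            // one hash-set probe for the current serialized key
    boolean next();               // advance; false when the batch's series are exhausted
  }

  /** Walks every series once, recording match and spill series (not individual rows). */
  static void walkSeries(KeySeries series,
      int[] matchLogicalIndices, int[] matchDuplicateCounts,
      int[] spillLogicalIndices, int[] spillDuplicateCounts,
      int[] seriesCounts /* out: {matchSeriesCount, spillSeriesCount} */) {
    int matchSeriesCount = 0;
    int spillSeriesCount = 0;
    do {
      Result r = series.currentKeyIsNull() ? Result.NO_MATCH : series.contains();
      switch (r) {
      case MATCH:
        matchLogicalIndices[matchSeriesCount] = series.currentLogical();
        matchDuplicateCounts[matchSeriesCount] = series.currentDuplicateCount();
        matchSeriesCount++;
        break;
      case SPILL:
        spillLogicalIndices[spillSeriesCount] = series.currentLogical();
        spillDuplicateCounts[spillSeriesCount] = series.currentDuplicateCount();
        spillSeriesCount++;
        break;
      case NO_MATCH:
        break;
      }
    } while (series.next());
    seriesCounts[0] = matchSeriesCount;
    seriesCounts[1] = spillSeriesCount;
  }
}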
-import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; -import org.apache.hadoop.hive.ql.exec.vector.expressions.StringExpr; +import org.apache.hadoop.hive.ql.exec.vector.keyseries.VectorKeySeriesBytes; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.plan.OperatorDesc; /* * Specialized class for doing a vectorized map join that is an left semi join on a Single-Column String @@ -46,8 +44,17 @@ public class VectorMapJoinLeftSemiStringOperator extends VectorMapJoinLeftSemiGenerateResultOperator { private static final long serialVersionUID = 1L; - private static final Logger LOG = LoggerFactory.getLogger(VectorMapJoinInnerBigOnlyLongOperator.class.getName()); + + //------------------------------------------------------------------------------------------------ + private static final String CLASS_NAME = VectorMapJoinLeftSemiStringOperator.class.getName(); + private static final Logger LOG = LoggerFactory.getLogger(CLASS_NAME); + + protected String getLoggingPrefix() { + return super.getLoggingPrefix(CLASS_NAME); + } + + //------------------------------------------------------------------------------------------------ // (none) @@ -55,8 +62,10 @@ // transient. //--------------------------------------------------------------------------- - // The hash map for this specialized class. - private transient VectorMapJoinBytesHashSet hashSet; + // CONSIDER using final class VectorMapJoinFastStringHashSet object for better performance by + // avoiding using an interface (MapJoinHashTableFind). Would require Hybrid versions of + // operators that use final HybridHashTableContainer object.. + private transient MapJoinHashTableFind hashSet; //--------------------------------------------------------------------------- // Single-Column String specific members. @@ -65,6 +74,9 @@ // The column number for this one column join specialization. private transient int singleJoinColumn; + // The object that determines equal key series. It is a final class for good performance. + private transient VectorKeySeriesBytes bytesKeySeries; + //--------------------------------------------------------------------------- // Pass-thru constructors. // @@ -105,6 +117,8 @@ public void process(Object row, int tag) throws HiveException { singleJoinColumn = bigTableKeyColumnMap[0]; + bytesKeySeries = new VectorKeySeriesBytes(singleJoinColumn); + needCommonSetup = false; } @@ -116,12 +130,13 @@ public void process(Object row, int tag) throws HiveException { * Get our Single-Column String hash set information for this specialized class. */ - hashSet = (VectorMapJoinBytesHashSet) vectorMapJoinHashTable; + hashSet = vectorMapJoinHashTableFind; needHashTableSetup = false; } batchCounter++; + inputRowCounter += batch.size; // Do the per-batch setup for an left semi join. @@ -133,9 +148,9 @@ public void process(Object row, int tag) throws HiveException { ve.evaluate(batch); } - final int inputLogicalSize = batch.size; + final int filteredSize = batch.size; - if (inputLogicalSize == 0) { + if (filteredSize == 0) { if (isLogDebugEnabled) { LOG.debug(CLASS_NAME + " batch #" + batchCounter + " empty"); } @@ -149,212 +164,103 @@ public void process(Object row, int tag) throws HiveException { } } - /* - * Single-Column String specific declarations. - */ + bytesKeySeries.processBatch(batch); - // The one join column for this specialized class. 
- BytesColumnVector joinColVector = (BytesColumnVector) batch.cols[singleJoinColumn]; - byte[][] vector = joinColVector.vector; - int[] start = joinColVector.start; - int[] length = joinColVector.length; + MapJoinHashSetResult hashSetResult; + if (bytesKeySeries.keyCount == 1) { - /* - * Single-Column Long check for repeating. - */ + // Effectively, one repeating key. - // Check single column for repeating. - boolean allKeyInputColumnsRepeating = joinColVector.isRepeating; + if (bytesKeySeries.currentKeyIsNull) { - if (allKeyInputColumnsRepeating) { + // CONSIDER: Add support for NullSafe option. - /* - * Repeating. - */ - - // All key input columns are repeating. Generate key once. Lookup once. - // Since the key is repeated, we must use entry 0 regardless of selectedInUse. - - /* - * Single-Column String specific repeated lookup. - */ - - JoinUtil.JoinResult joinResult; - if (!joinColVector.noNulls && joinColVector.isNull[0]) { - joinResult = JoinUtil.JoinResult.NOMATCH; + batch.size = 0; } else { - byte[] keyBytes = vector[0]; - int keyStart = start[0]; - int keyLength = length[0]; - joinResult = hashSet.contains(keyBytes, keyStart, keyLength, hashSetResults[0]); - } - - /* - * Common repeated join result processing. - */ - - if (isLogDebugEnabled) { - LOG.debug(CLASS_NAME + " batch #" + batchCounter + " repeated joinResult " + joinResult.name()); + hashSetResult = hashSetResults[0]; + hashSet.hashSetContains( + bytesKeySeries.currentBytes, + bytesKeySeries.currentStart, + bytesKeySeries.currentLength, + bytesKeySeries.currentHashCode, + hashSetResult); + if (hashSetResult.getMapJoinResult() != MapJoinResult.NO_MATCH) { + finishLeftSemiRepeated(batch, bytesKeySeries, hashSetResult); + } else { + batch.size = 0; + } } - finishLeftSemiRepeated(batch, joinResult, hashSetResults[0]); } else { - /* - * NOT Repeating. - */ - - if (isLogDebugEnabled) { - LOG.debug(CLASS_NAME + " batch #" + batchCounter + " non-repeated"); - } - - // We remember any matching rows in matchs / matchSize. At the end of the loop, - // selected / batch.size will represent both matching and non-matching rows for outer join. - // Only deferred rows will have been removed from selected. - int selected[] = batch.selected; - boolean selectedInUse = batch.selectedInUse; - + int matchSeriesCount = 0; + int spillSeriesCount = 0; int hashSetResultCount = 0; - int allMatchCount = 0; - int spillCount = 0; - /* - * Single-Column String specific variables. - */ - - int saveKeyBatchIndex = -1; + MapJoinHashTableResult.MapJoinResult containsResult; + do { + // Use the next hash set result entry. + hashSetResult = hashSetResults[hashSetResultCount]; - // We optimize performance by only looking up the first key in a series of equal keys. - boolean haveSaveKey = false; - JoinUtil.JoinResult saveJoinResult = JoinUtil.JoinResult.NOMATCH; - - // Logical loop over the rows in the batch since the batch may have selected in use. - for (int logical = 0; logical < inputLogicalSize; logical++) { - int batchIndex = (selectedInUse ? selected[logical] : logical); - - /* - * Single-Column String get key. - */ + if (bytesKeySeries.currentKeyIsNull) { - // Implicit -- use batchIndex. - boolean isNull = !joinColVector.noNulls && joinColVector.isNull[batchIndex]; + // CONSIDER: Add support for NullSafe option. - /* - * Equal key series checking. 
- */ + containsResult = MapJoinHashTableResult.MapJoinResult.NO_MATCH; - if (isNull || !haveSaveKey || - StringExpr.equal(vector[saveKeyBatchIndex], start[saveKeyBatchIndex], length[saveKeyBatchIndex], - vector[batchIndex], start[batchIndex], length[batchIndex]) == false) { - - // New key. - - if (haveSaveKey) { - // Move on with our counts. - switch (saveJoinResult) { - case MATCH: - // We have extracted the existence from the hash set result, so we don't keep it. - break; - case SPILL: - // We keep the hash set result for its spill information. - hashSetResultCount++; - break; - case NOMATCH: - break; - } - } - - if (isNull) { - saveJoinResult = JoinUtil.JoinResult.NOMATCH; - haveSaveKey = false; - } else { - // Regardless of our matching result, we keep that information to make multiple use - // of it for a possible series of equal keys. - haveSaveKey = true; - - /* - * Single-Column String specific save key and lookup. - */ - - saveKeyBatchIndex = batchIndex; - - /* - * Single-Column String specific lookup key. - */ - - byte[] keyBytes = vector[batchIndex]; - int keyStart = start[batchIndex]; - int keyLength = length[batchIndex]; - saveJoinResult = hashSet.contains(keyBytes, keyStart, keyLength, hashSetResults[hashSetResultCount]); - } - - /* - * Common left-semi join result processing. - */ - - switch (saveJoinResult) { - case MATCH: - allMatchs[allMatchCount++] = batchIndex; - // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " MATCH isSingleValue " + equalKeySeriesIsSingleValue[equalKeySeriesCount] + " currentKey " + currentKey); - break; - - case SPILL: - spills[spillCount] = batchIndex; - spillHashMapResultIndices[spillCount] = hashSetResultCount; - spillCount++; - break; - - case NOMATCH: - // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " NOMATCH" + " currentKey " + currentKey); - break; - } } else { - // Series of equal keys. - - switch (saveJoinResult) { - case MATCH: - allMatchs[allMatchCount++] = batchIndex; - // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " MATCH duplicate"); - break; - - case SPILL: - spills[spillCount] = batchIndex; - spillHashMapResultIndices[spillCount] = hashSetResultCount; - spillCount++; - break; - - case NOMATCH: - // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " NOMATCH duplicate"); - break; - } + hashSet.hashSetContains( + bytesKeySeries.currentBytes, + bytesKeySeries.currentStart, + bytesKeySeries.currentLength, + bytesKeySeries.currentHashCode, + hashSetResult); + containsResult = hashSetResult.getMapJoinResult(); } - } - if (haveSaveKey) { - // Update our counts for the last key. - switch (saveJoinResult) { + /* + * Common inner join result processing. + */ + switch (containsResult) { case MATCH: - // We have extracted the existence from the hash set result, so we don't keep it. + // We have extracted the existence from the hash set result, so we don't keep it. + matchLogicalIndices[matchSeriesCount] = bytesKeySeries.currentLogical; + matchDuplicateCounts[matchSeriesCount] = bytesKeySeries.currentDuplicateCount; + matchSeriesCount++; break; + case SPILL: - // We keep the hash set result for its spill information. 
+ spillLogicalIndices[spillSeriesCount] = bytesKeySeries.currentLogical; + spillDuplicateCounts[spillSeriesCount] = bytesKeySeries.currentDuplicateCount; + spillHashSetResults[spillSeriesCount] = hashSetResult; + spillSeriesCount++; hashSetResultCount++; + spilledRowCounter += bytesKeySeries.currentDuplicateCount; break; - case NOMATCH: + + case NO_MATCH: break; + + default: + throw new RuntimeException("Unexpected contains result " + containsResult.name()); } - } + + if (!bytesKeySeries.next()) { + break; + } + } while (true); if (isLogDebugEnabled) { - LOG.debug(CLASS_NAME + - " allMatchs " + intArrayToRangesString(allMatchs, allMatchCount) + - " spills " + intArrayToRangesString(spills, spillCount) + - " spillHashMapResultIndices " + intArrayToRangesString(spillHashMapResultIndices, spillCount) + - " hashMapResults " + Arrays.toString(Arrays.copyOfRange(hashSetResults, 0, hashSetResultCount))); + LOG.debug(CLASS_NAME + " batch #" + batchCounter + " batch info " + + " filteredSize " + filteredSize + + " matchSeriesCount " + matchSeriesCount + + " matchLogicalIndices " + intArrayToRangesString(matchLogicalIndices, matchSeriesCount) + + " matchDuplicateCounts " + Arrays.toString(Arrays.copyOfRange(matchDuplicateCounts, 0, matchSeriesCount)) + + " spillSeriesCount " + spillSeriesCount + + " spillLogicalIndices " + intArrayToRangesString(spillLogicalIndices, spillSeriesCount) + + " spillDuplicateCounts " + Arrays.toString(Arrays.copyOfRange(spillDuplicateCounts, 0, spillSeriesCount))); } - finishLeftSemi(batch, - allMatchCount, spillCount, - (VectorMapJoinHashTableResult[]) hashSetResults); + finishLeftSemi(batch, matchSeriesCount, spillSeriesCount); } if (batch.size > 0) { diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinOuterGenerateResultOperator.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinOuterGenerateResultOperator.java index 0e2d65a..e635e50 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinOuterGenerateResultOperator.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinOuterGenerateResultOperator.java @@ -19,20 +19,21 @@ package org.apache.hadoop.hive.ql.exec.vector.mapjoin; import java.io.IOException; + import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.hive.ql.CompilationOpContext; -import org.apache.hadoop.hive.ql.exec.JoinUtil; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashMapResult; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableResult.MapJoinResult; import org.apache.hadoop.hive.ql.exec.vector.ColumnVector; import org.apache.hadoop.hive.ql.exec.vector.VectorizationContext; import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpression; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashMap; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashMapResult; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashTableResult; +import org.apache.hadoop.hive.ql.exec.vector.keyseries.VectorKeySeries; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.plan.OperatorDesc; -import org.apache.hadoop.hive.serde2.WriteBuffers.ByteSegmentRef; + +import com.google.common.base.Preconditions; /** * This class has methods for generating vectorized join results for outer joins. 
@@ -67,50 +68,44 @@ // Outer join specific members. // + protected transient long outerJoinNullKeyCounter; + protected transient long outerJoinFilteredOutCounter; + // An array of hash map results so we can do lookups on the whole batch before output result // generation. - protected transient VectorMapJoinHashMapResult hashMapResults[]; + protected transient MapJoinHashMapResult hashMapResults[]; - // Pre-allocated member for remembering the big table's selected array at the beginning of - // the process method before applying any filter. For outer join we need to remember which - // rows did not match since they will appear the in outer join result with NULLs for the - // small table. - protected transient int[] inputSelected; + // For outer join, we must some how retain our input row selection before ON expression + // filtering and before hash table matching so we can generate results for all rows (matching + // and non matching) later. Since we are knocking rows out in different phases, we use a + // copy of the selected array. + protected int inputLogicalSize; + protected int filteredSize; + protected boolean inputSelectedInUse; - // Pre-allocated member for storing the (physical) batch index of matching row (single- or - // multi-small-table-valued) indexes during a process call. - protected transient int[] allMatchs; + // Pre-allocated member for storing the batch indices of the input batch rows. + protected transient int[] inputSelected; - /* - * Pre-allocated members for storing information equal key series for small-table matches. - * - * ~HashMapResultIndices - * Index into the hashMapResults array for the match. - * ~AllMatchIndices - * (Logical) indices into allMatchs to the first row of a match of a - * possible series of duplicate keys. - * ~IsSingleValue - * Whether there is 1 or multiple small table values. - * ~DuplicateCounts - * The duplicate count for each matched key. - * - */ - protected transient int[] equalKeySeriesHashMapResultIndices; - protected transient int[] equalKeySeriesAllMatchIndices; - protected transient boolean[] equalKeySeriesIsSingleValue; - protected transient int[] equalKeySeriesDuplicateCounts; + // Pre-allocated member for storing the logical batch index (within input batch), + // series count of rows, and hash map results that matched. + protected transient int[] matchLogicalIndices; + protected transient int[] matchDuplicateCounts; + protected transient boolean[] matchIsSingleValue; + protected transient MapJoinHashMapResult[] matchHashMapResults; - // Pre-allocated member for storing the (physical) batch index of rows that need to be spilled. - protected transient int[] spills; + // Pre-allocated member for storing the logical batch index and series count of rows that spilled. + protected transient int[] spillLogicalIndices; + protected transient int[] spillDuplicateCounts; - // Pre-allocated member for storing index into the hashSetResults for each spilled row. - protected transient int[] spillHashMapResultIndices; + // Pre-allocated member for storing the reference to the hash map results of rows that spilled. + protected transient MapJoinHashMapResult[] spillHashMapResults; - // Pre-allocated member for storing any non-spills, non-matches, or merged row indexes during a - // process method call. - protected transient int[] nonSpills; - protected transient int[] noMatchs; - protected transient int[] merged; + // Pre-allocated member for storing any non-spills, matches, and non-matches batch indexes during + // a process method call. 
+ protected transient int[] nonSpillSelected; + protected transient int[] matchSelected; + protected transient int[] nonMatchSelected; + protected transient int[] resultSelected; /** Kryo ctor. */ protected VectorMapJoinOuterGenerateResultOperator() { @@ -132,29 +127,30 @@ public VectorMapJoinOuterGenerateResultOperator(CompilationOpContext ctx, protected void commonSetup(VectorizedRowBatch batch) throws HiveException { super.commonSetup(batch); - // Outer join specific. - VectorMapJoinHashMap baseHashMap = (VectorMapJoinHashMap) vectorMapJoinHashTable; + outerJoinNullKeyCounter = 0; + outerJoinFilteredOutCounter = 0; - hashMapResults = new VectorMapJoinHashMapResult[batch.DEFAULT_SIZE]; + // Outer join specific. + hashMapResults = new MapJoinHashMapResult[batch.DEFAULT_SIZE]; for (int i = 0; i < hashMapResults.length; i++) { - hashMapResults[i] = baseHashMap.createHashMapResult(); + hashMapResults[i] = vectormapJoinHashTableFactory.createHashMapResult(); } inputSelected = new int[batch.DEFAULT_SIZE]; - allMatchs = new int[batch.DEFAULT_SIZE]; - - equalKeySeriesHashMapResultIndices = new int[batch.DEFAULT_SIZE]; - equalKeySeriesAllMatchIndices = new int[batch.DEFAULT_SIZE]; - equalKeySeriesIsSingleValue = new boolean[batch.DEFAULT_SIZE]; - equalKeySeriesDuplicateCounts = new int[batch.DEFAULT_SIZE]; + matchLogicalIndices = new int[batch.DEFAULT_SIZE]; + matchDuplicateCounts = new int[batch.DEFAULT_SIZE]; + matchIsSingleValue = new boolean[batch.DEFAULT_SIZE]; + matchHashMapResults = new MapJoinHashMapResult[batch.DEFAULT_SIZE]; - spills = new int[batch.DEFAULT_SIZE]; - spillHashMapResultIndices = new int[batch.DEFAULT_SIZE]; + spillLogicalIndices = new int[batch.DEFAULT_SIZE]; + spillDuplicateCounts = new int[batch.DEFAULT_SIZE]; + spillHashMapResults = new MapJoinHashMapResult[batch.DEFAULT_SIZE]; - nonSpills = new int[batch.DEFAULT_SIZE]; - noMatchs = new int[batch.DEFAULT_SIZE]; - merged = new int[batch.DEFAULT_SIZE]; + nonSpillSelected = new int[batch.DEFAULT_SIZE]; + matchSelected = new int[batch.DEFAULT_SIZE]; + nonMatchSelected = new int[batch.DEFAULT_SIZE]; + resultSelected = new int[batch.DEFAULT_SIZE]; } @@ -167,7 +163,20 @@ protected void commonSetup(VectorizedRowBatch batch) throws HiveException { /** * Do the per-batch setup for an outer join. */ - protected void outerPerBatchSetup(VectorizedRowBatch batch) { + protected boolean outerPerBatchSetup(VectorizedRowBatch batch) { + + inputLogicalSize = batch.size; + if (inputLogicalSize == 0) { + return false; + } + + inputSelectedInUse = batch.selectedInUse; + if (inputSelectedInUse) { + // if (!verifyMonotonicallyIncreasing(batch.selected, batch.size)) { + // throw new HiveException("batch.selected is not in sort order and unique"); + // } + System.arraycopy(batch.selected, 0, inputSelected, 0, inputLogicalSize); + } // For join operators that can generate small table results, reset their // (target) scratch columns. @@ -181,136 +190,198 @@ protected void outerPerBatchSetup(VectorizedRowBatch batch) { ColumnVector bigTableOuterKeyColumn = batch.cols[column]; bigTableOuterKeyColumn.reset(); } + + // Filtering for outer join just removes rows available for hash table matching. + if (bigTableFilterExpressions.length > 0) { + // Since the input + for (VectorExpression ve : bigTableFilterExpressions) { + ve.evaluate(batch); + } + + // Since outer join outputs non matches, we do not return here on filteredSize == 0. 
+ filteredSize = batch.size; + outerJoinFilteredOutCounter += (inputLogicalSize - filteredSize); + } else { + filteredSize = inputLogicalSize; + } + + return true; } /** - * Apply the value expression to rows in the (original) input selected array. + * Generate the outer join output results for one vectorized row batch. * * @param batch - * The vectorized row batch. - * @param inputSelectedInUse - * Whether the (original) input batch is selectedInUse. - * @param inputLogicalSize - * The (original) input batch size. + * The big table batch with any matching and any non matching rows both as + * selected in use. + * @param matchSeriesCount + * Number of match duplicate key series. + * @param spillSeriesCount + * Number of spill duplicate key series. */ - private void doValueExprOnInputSelected(VectorizedRowBatch batch, - boolean inputSelectedInUse, int inputLogicalSize) { + protected void finishOuter(VectorizedRowBatch batch, int matchSeriesCount, int spillSeriesCount) + throws IOException, HiveException { - int saveBatchSize = batch.size; - int[] saveSelected = batch.selected; - boolean saveSelectedInUse = batch.selectedInUse; + // The match and spill information is with respect to the batch selected not inputSelected. + boolean selectedInUse = batch.selectedInUse; + int[] selected = batch.selected; + final int selectedSize = batch.size; - batch.size = inputLogicalSize; - batch.selected = inputSelected; - batch.selectedInUse = inputSelectedInUse; + // Dump out the spill rows now and determine what is left. + int nonSpillCount = 0; + if (spillSeriesCount > 0) { - if (bigTableValueExpressions != null) { - for(VectorExpression ve: bigTableValueExpressions) { - ve.evaluate(batch); + spillHashMapBatch(batch, spillLogicalIndices, spillDuplicateCounts, spillHashMapResults, + spillSeriesCount, selectedInUse, selected, selectedSize); + + if (spillSeriesCount == inputLogicalSize) { + batch.size = 0; + batch.selectedInUse = false; + return; } + + // Create nonSpillSelected by not including spill batch indices. Note the spill series + // logical indices are with respect to the batch selected not inputSelected. + + nonSpillCount = + makeSelectedByRemovingSeries( + /* Remove batch indices from input batch: */ + inputSelectedInUse, inputSelected, inputLogicalSize, + /* That are in the series: */ + spillLogicalIndices, spillDuplicateCounts, spillSeriesCount, + selectedInUse, selected, selectedSize, + /* Result: */ + nonSpillSelected); + } - batch.size = saveBatchSize; - batch.selected = saveSelected; - batch.selectedInUse = saveSelectedInUse; - } + /* + * Optimize by running value expressions only over the matched rows. + */ + if (matchSeriesCount > 0 && bigTableValueExpressions != null) { - /** - * Apply the value expression to rows specified by a selected array. - * - * @param batch - * The vectorized row batch. - * @param selected - * The (physical) batch indices to apply the expression to. - * @param size - * The size of selected. - */ - private void doValueExpr(VectorizedRowBatch batch, - int[] selected, int size) { + // Create matchSelected by adding the batch indices for the match logical index ranges from + // the input batch range. 
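Reviewer sketch for the matchSelected construction described in the comment just above (illustrative only, not part of the patch): flattenLogicalSeriesIntoSelected is assumed to expand each (logical start, duplicate count) series back into physical batch row indices along these lines.

final class FlattenSeriesSketch {
  /**
   * Expands each (logical start, duplicate count) series into physical batch row indices.
   * Logical indices are positions within the batch's selected view, so they are mapped
   * through the selected array when it is in use.
   */
  static int flatten(boolean selectedInUse, int[] selected,
      int[] seriesLogicalIndices, int[] seriesDuplicateCounts, int seriesCount,
      int[] outSelected) {
    int outCount = 0;
    for (int s = 0; s < seriesCount; s++) {
      int logicalStart = seriesLogicalIndices[s];
      for (int d = 0; d < seriesDuplicateCounts[s]; d++) {
        int logical = logicalStart + d;
        outSelected[outCount++] = selectedInUse ? selected[logical] : logical;
      }
    }
    return outCount;
  }
}

For example, with selected = {3, 5, 6, 9} and one match series (logical start 1, duplicate count 2), the flattened result is batch rows {5, 6}.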
- int saveBatchSize = batch.size; - int[] saveSelected = batch.selected; - boolean saveSelectedInUse = batch.selectedInUse; + int matchSelectedCount = + flattenLogicalSeriesIntoSelected( + selectedInUse, selected, selectedSize, + matchLogicalIndices, matchDuplicateCounts, matchSeriesCount, + matchSelected); - batch.size = size; - batch.selected = selected; - batch.selectedInUse = true; + performValueExpressions(batch, matchSelected, matchSelectedCount); - if (bigTableValueExpressions != null) { - for(VectorExpression ve: bigTableValueExpressions) { - ve.evaluate(batch); - } } - batch.size = saveBatchSize; - batch.selected = saveSelected; - batch.selectedInUse = saveSelectedInUse; - } + int nonMatchSelectedCount = 0; + if (spillSeriesCount > 0) { - /** - * Remove (subtract) members from the input selected array and produce the results into - * a difference array. - * - * @param inputSelectedInUse - * Whether the (original) input batch is selectedInUse. - * @param inputLogicalSize - * The (original) input batch size. - * @param remove - * The indices to remove. They must all be present in input selected array. - * @param removeSize - * The size of remove. - * @param difference - * The resulting difference -- the input selected array indices not in the - * remove array. - * @return - * The resulting size of the difference array. - * @throws HiveException - */ - private int subtractFromInputSelected(boolean inputSelectedInUse, int inputLogicalSize, - int[] remove, int removeSize, int[] difference) throws HiveException { - - // if (!verifyMonotonicallyIncreasing(remove, removeSize)) { - // throw new HiveException("remove is not in sort order and unique"); - // } - - int differenceCount = 0; - - // Determine which rows are left. - int removeIndex = 0; - if (inputSelectedInUse) { - for (int i = 0; i < inputLogicalSize; i++) { - int candidateIndex = inputSelected[i]; - if (removeIndex < removeSize && candidateIndex == remove[removeIndex]) { - removeIndex++; - } else { - difference[differenceCount++] = candidateIndex; - } - } - } else { - for (int candidateIndex = 0; candidateIndex < inputLogicalSize; candidateIndex++) { - if (removeIndex < removeSize && candidateIndex == remove[removeIndex]) { - removeIndex++; - } else { - difference[differenceCount++] = candidateIndex; - } - } - } + // Create non match selected by not including match batch indices from non spill selected. + // Note the spill series logical indices are with respect to the batch selected not + // inputSelected. - if (removeIndex != removeSize) { - throw new HiveException("Not all batch indices removed"); - } + nonMatchSelectedCount = + makeSelectedByRemovingSeries( + /* Remove batch indices from non spill: */ + true, nonSpillSelected, nonSpillCount, + /* That are in the series: */ + matchLogicalIndices, matchDuplicateCounts, matchSeriesCount, + selectedInUse, selected, selectedSize, + /* Result: */ + nonMatchSelected); - // if (!verifyMonotonicallyIncreasing(difference, differenceCount)) { - // throw new HiveException("difference is not in sort order and unique"); - // } + } else { + + // Create non match selected by not including match batch indices from input batch. + // Note the spill series logical indices are with respect to the batch selected + // not inputSelected. 
+ + nonMatchSelectedCount = + makeSelectedByRemovingSeries( + /* Remove batch indices from input batch: */ + inputSelectedInUse, inputSelected, inputLogicalSize, + /* That are in the series: */ + matchLogicalIndices, matchDuplicateCounts, matchSeriesCount, + selectedInUse, selected, selectedSize, + /* Result: */ + nonMatchSelected); + } + + for (int i = 0; i < matchSeriesCount; i++) { + MapJoinHashMapResult hashMapResult = matchHashMapResults[i]; + int logical = matchLogicalIndices[i]; + int duplicateCount = matchDuplicateCounts[i]; + + + // Logical indices are with respect to the batch selected not inputSelected. + if (matchIsSingleValue[i]) { + generateHashMapResultSingleValue( + batch, + hashMapResult, + logical, + duplicateCount); + } else { + generateHashMapResultMultiValue( + batch, + hashMapResult, + logical, + duplicateCount); + } + } - return differenceCount; - } + // Output non matches + if (nonMatchSelectedCount > 0) { + generateOuterNulls(batch, nonMatchSelected, nonMatchSelectedCount); + } + + // Create selected by not including match logical indices for multi-value small table results + // from input batch logical range, and then putting the batch indices in selected. + + int numSel; + if (spillSeriesCount > 0) { + + // Create non match selected by not including match batch indices from non spills. + // Note the spill series logical indices are with respect to the batch selected + // not inputSelected. + + numSel = + makeSelectedByRemovingMultiValues( + /* Remove batch indices from non spill: */ + true, nonSpillSelected, nonSpillCount, + /* That are in the series: */ + matchLogicalIndices, matchDuplicateCounts, matchIsSingleValue, matchSeriesCount, + selectedInUse, selected, selectedSize, + /* Result: */ + resultSelected); + } else { + + // Create non match selected by not including match batch indices from input batch. + // Note the spill series logical indices are with respect to the batch selected + // not inputSelected. + + numSel = + makeSelectedByRemovingMultiValues( + /* Remove batch indices from input batch: */ + inputSelectedInUse, inputSelected, inputLogicalSize, + /* That are in the series: */ + matchLogicalIndices, matchDuplicateCounts, matchIsSingleValue, matchSeriesCount, + selectedInUse, selected, selectedSize, + /* Result: */ + resultSelected); + } + + batch.selectedInUse = true; + batch.size = numSel; + if (numSel > 0) { + System.arraycopy(resultSelected, 0, selected, 0, numSel); + } + } /** * Remove (subtract) members from an array and produce the results into * a difference array. - + * + * @param allSelectedInUse * @param all * The selected array containing all members. * @param allSize @@ -324,21 +395,26 @@ private int subtractFromInputSelected(boolean inputSelectedInUse, int inputLogic * remove array. * @return * The resulting size of the difference array. - * @throws HiveException */ - private int subtract(int[] all, int allSize, - int[] remove, int removeSize, int[] difference) throws HiveException { + private int subtract(boolean allSelectedInUse, int[] all, int allSize, + int[] remove, int removeSize, int[] difference) { + + Preconditions.checkState((all != remove) && (remove != difference) && (difference != all)); - // if (!verifyMonotonicallyIncreasing(remove, removeSize)) { - // throw new HiveException("remove is not in sort order and unique"); - // } + // Comment out these checks when we are happy.. 
+ if (allSelectedInUse && !verifyMonotonicallyIncreasing(all, allSize)) { + throw new RuntimeException("all is not in sort order and unique"); + } + if (!verifyMonotonicallyIncreasing(remove, removeSize)) { + throw new RuntimeException("remove is not in sort order and unique"); + } int differenceCount = 0; // Determine which rows are left. int removeIndex = 0; for (int i = 0; i < allSize; i++) { - int candidateIndex = all[i]; + int candidateIndex = (allSelectedInUse ? all[i] : i); if (removeIndex < removeSize && candidateIndex == remove[removeIndex]) { removeIndex++; } else { @@ -347,206 +423,98 @@ private int subtract(int[] all, int allSize, } if (removeIndex != removeSize) { - throw new HiveException("Not all batch indices removed"); + throw new RuntimeException("Not all batch indices removed"); } - return differenceCount; - } - - /** - * Sort merge two select arrays so the resulting array is ordered by (batch) index. - * - * @param selected1 - * @param selected1Count - * @param selected2 - * @param selected2Count - * @param sortMerged - * The resulting sort merge of selected1 and selected2. - * @return - * The resulting size of the sortMerged array. - * @throws HiveException - */ - private int sortMerge(int[] selected1, int selected1Count, - int[] selected2, int selected2Count, int[] sortMerged) throws HiveException { - - // if (!verifyMonotonicallyIncreasing(selected1, selected1Count)) { - // throw new HiveException("selected1 is not in sort order and unique"); - // } - - // if (!verifyMonotonicallyIncreasing(selected2, selected2Count)) { - // throw new HiveException("selected1 is not in sort order and unique"); - // } - - - int sortMergeCount = 0; - - int selected1Index = 0; - int selected2Index = 0; - for (int i = 0; i < selected1Count + selected2Count; i++) { - if (selected1Index < selected1Count && selected2Index < selected2Count) { - if (selected1[selected1Index] < selected2[selected2Index]) { - sortMerged[sortMergeCount++] = selected1[selected1Index++]; - } else { - sortMerged[sortMergeCount++] = selected2[selected2Index++]; - } - } else if (selected1Index < selected1Count) { - sortMerged[sortMergeCount++] = selected1[selected1Index++]; - } else { - sortMerged[sortMergeCount++] = selected2[selected2Index++]; - } + if (!verifyMonotonicallyIncreasing(difference, differenceCount)) { + throw new RuntimeException("difference is not in sort order and unique"); } - // if (!verifyMonotonicallyIncreasing(sortMerged, sortMergeCount)) { - // throw new HiveException("sortMerged is not in sort order and unique"); - // } - - return sortMergeCount; + return differenceCount; } /** - * Generate the outer join output results for one vectorized row batch. + * Generate the outer join output results for one vectorized row batch that + * has a repeated key. * * @param batch * The big table batch with any matching and any non matching rows both as * selected in use. - * @param allMatchCount - * Number of matches in allMatchs. - * @param equalKeySeriesCount - * Number of single value matches. - * @param atLeastOneNonMatch - * Whether at least one row was a non-match. - * @param inputSelectedInUse - * A copy of the batch's selectedInUse flag on input to the process method. - * @param inputLogicalSize - * The batch's size on input to the process method. - * @param spillCount - * Number of spills in spills. - * @param hashMapResultCount - * Number of entries in hashMapResults. 
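Reviewer example for the difference logic above (illustrative only, not part of the patch): a self-contained copy of the subtract() contract plus a tiny usage showing the expected output. Both inputs are ascending, unique row indices, and every index in remove must occur in the all view.

final class SelectedSubtractExample {
  /** Mirrors the contract of the operator's private subtract(). */
  static int subtract(boolean allSelectedInUse, int[] all, int allSize,
      int[] remove, int removeSize, int[] difference) {
    int differenceCount = 0;
    int removeIndex = 0;
    for (int i = 0; i < allSize; i++) {
      int candidate = allSelectedInUse ? all[i] : i;  // implicit 0..allSize-1 when not selectedInUse
      if (removeIndex < removeSize && candidate == remove[removeIndex]) {
        removeIndex++;                                 // drop this index
      } else {
        difference[differenceCount++] = candidate;     // keep this index
      }
    }
    if (removeIndex != removeSize) {
      throw new IllegalStateException("Not all indices to remove were present");
    }
    return differenceCount;
  }

  public static void main(String[] args) {
    int[] all = {1, 4, 6, 9};      // e.g. non-spilled batch rows
    int[] remove = {4, 9};         // e.g. rows already handled elsewhere
    int[] difference = new int[all.length];
    int n = subtract(true, all, all.length, remove, remove.length, difference);
    // Prints: remaining = [1, 6]
    System.out.println("remaining = "
        + java.util.Arrays.toString(java.util.Arrays.copyOf(difference, n)));
  }
}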
+ * @param keySeries + * @param hashMapResult + * @throws IOException + * @throws HiveException */ - public void finishOuter(VectorizedRowBatch batch, - int allMatchCount, int equalKeySeriesCount, boolean atLeastOneNonMatch, - boolean inputSelectedInUse, int inputLogicalSize, - int spillCount, int hashMapResultCount) throws IOException, HiveException { - - // Get rid of spills before we start modifying the batch. - if (spillCount > 0) { - spillHashMapBatch(batch, (VectorMapJoinHashTableResult[]) hashMapResults, - spills, spillHashMapResultIndices, spillCount); + protected void finishOuterRepeated(VectorizedRowBatch batch, VectorKeySeries keySeries, + MapJoinHashMapResult hashMapResult) throws HiveException, IOException { + + // If there was filtering, we need to generate non matching results. + if (batch.size < inputLogicalSize) { + // Subtract current batch selected from input + int nonMatchCount = subtract(inputSelectedInUse, inputSelected, inputLogicalSize, + batch.selected, batch.size, + nonMatchSelected); + generateOuterNulls(batch, nonMatchSelected, nonMatchCount); } - int noMatchCount = 0; - if (spillCount > 0) { - - // Subtract the spills to get all match and non-match rows. - int nonSpillCount = subtractFromInputSelected( - inputSelectedInUse, inputLogicalSize, spills, spillCount, nonSpills); - - if (isLogDebugEnabled) { - LOG.debug("finishOuter spillCount > 0" + - " nonSpills " + intArrayToRangesString(nonSpills, nonSpillCount)); - } - - // Big table value expressions apply to ALL matching and non-matching rows. - if (bigTableValueExpressions != null) { - - doValueExpr(batch, nonSpills, nonSpillCount); - - } - - if (atLeastOneNonMatch) { - noMatchCount = subtract(nonSpills, nonSpillCount, allMatchs, allMatchCount, - noMatchs); - - if (isLogDebugEnabled) { - LOG.debug("finishOuter spillCount > 0" + - " noMatchs " + intArrayToRangesString(noMatchs, noMatchCount)); - } - - } - } else { - - // Run value expressions over original (whole) input batch. - doValueExprOnInputSelected(batch, inputSelectedInUse, inputLogicalSize); - - if (atLeastOneNonMatch) { - noMatchCount = subtractFromInputSelected( - inputSelectedInUse, inputLogicalSize, allMatchs, allMatchCount, noMatchs); - - if (isLogDebugEnabled) { - LOG.debug("finishOuter spillCount == 0" + - " noMatchs " + intArrayToRangesString(noMatchs, noMatchCount)); - } - } + if (hashMapResult.getMapJoinResult() == MapJoinResult.SPILL) { + spillRepeated(batch, keySeries, hashMapResult); + return; } - - // When we generate results into the overflow batch, we may still end up with fewer rows - // in the big table batch. So, nulSel and the batch's selected array will be rebuilt with - // just the big table rows that need to be forwarded, minus any rows processed with the - // overflow batch. 
- if (allMatchCount > 0) { - - int numSel = 0; - for (int i = 0; i < equalKeySeriesCount; i++) { - int hashMapResultIndex = equalKeySeriesHashMapResultIndices[i]; - VectorMapJoinHashMapResult hashMapResult = hashMapResults[hashMapResultIndex]; - int allMatchesIndex = equalKeySeriesAllMatchIndices[i]; - boolean isSingleValue = equalKeySeriesIsSingleValue[i]; - int duplicateCount = equalKeySeriesDuplicateCounts[i]; - - if (isSingleValue) { - numSel = generateHashMapResultSingleValue( - batch, hashMapResult, allMatchs, allMatchesIndex, duplicateCount, numSel); - } else { - generateHashMapResultMultiValue( - batch, hashMapResult, allMatchs, allMatchesIndex, duplicateCount); - } - } - - // The number of single value rows that were generated in the big table batch. - batch.size = numSel; - batch.selectedInUse = true; - - if (isLogDebugEnabled) { - LOG.debug("finishOuter allMatchCount > 0" + - " batch.selected " + intArrayToRangesString(batch.selected, batch.size)); - } - + Preconditions.checkState(hashMapResult.getMapJoinResult() == MapJoinResult.MATCH); + + // The whole batch is matched. + if (hashMapResult.cappedCount() == 1) { + // Single value -- use batch as output. + generateHashMapResultSingleValue( + batch, + hashMapResult, + keySeries.currentLogical, + keySeries.currentDuplicateCount); } else { + // Use overflow for multiple values. + generateHashMapResultMultiValue( + batch, + hashMapResult, + keySeries.currentLogical, + keySeries.currentDuplicateCount); batch.size = 0; } + } - if (noMatchCount > 0) { - if (batch.size > 0) { - - generateOuterNulls(batch, noMatchs, noMatchCount); - - // Merge noMatchs and (match) selected. - int mergeCount = sortMerge( - noMatchs, noMatchCount, batch.selected, batch.size, merged); - - if (isLogDebugEnabled) { - LOG.debug("finishOuter noMatchCount > 0 && batch.size > 0" + - " merged " + intArrayToRangesString(merged, mergeCount)); - } - - System.arraycopy(merged, 0, batch.selected, 0, mergeCount); - batch.size = mergeCount; - batch.selectedInUse = true; - } else { - - // We can use the whole batch for output of no matches. - - generateOuterNullsRepeatedAll(batch); - - System.arraycopy(noMatchs, 0, batch.selected, 0, noMatchCount); - batch.size = noMatchCount; - batch.selectedInUse = true; + /** + * Generate the outer join output results for one vectorized row batch that + * has a repeated key when no match. + * + * To include an filtered out rows, we generate for the entire input batch. + * + * @param batch + * The big table batch with any matching and any non matching rows both as + * selected in use. + * @param keySeries + * @throws IOException + * @throws HiveException + */ + protected void finishOuterRepeatedNoMatch(VectorizedRowBatch batch) + throws HiveException, IOException { + + // The entire input batch is not matched (filtered plus repeated key non match). + for (int logical = 0; logical < inputLogicalSize; logical++) { + int batchIndex = (inputSelectedInUse ? inputSelected[logical] : logical); + + // Mark any scratch small table scratch columns that would normally receive a copy of the + // key as null, too. + for (int column : bigTableOuterKeyOutputVectorColumns) { + ColumnVector colVector = batch.cols[column]; + colVector.noNulls = false; + colVector.isNull[batchIndex] = true; + } - if (isLogDebugEnabled) { - LOG.debug("finishOuter noMatchCount > 0 && batch.size == 0" + - " batch.selected " + intArrayToRangesString(batch.selected, batch.size)); - } + // Small table values are set to null. 
+ for (int column : smallTableOutputVectorColumns) { + ColumnVector colVector = batch.cols[column]; + colVector.noNulls = false; + colVector.isNull[batchIndex] = true; } } } @@ -559,18 +527,18 @@ public void finishOuter(VectorizedRowBatch batch, * @param batch * The big table batch with any matching and any non matching rows both as * selected in use. - * @param noMatchs + * @param nonMatchSel * A subset of the rows of the batch that are non matches. * @param noMatchSize * Number of non matches in noMatchs. */ - protected void generateOuterNulls(VectorizedRowBatch batch, int[] noMatchs, - int noMatchSize) throws IOException, HiveException { + protected void generateOuterNulls(VectorizedRowBatch batch, int[] nonMatchSel, int noMatchSize) + throws IOException, HiveException { // Set null information in the small table results area. for (int i = 0; i < noMatchSize; i++) { - int batchIndex = noMatchs[i]; + int batchIndex = nonMatchSel[i]; // Mark any scratch small table scratch columns that would normally receive a copy of the // key as null, too. @@ -589,163 +557,15 @@ protected void generateOuterNulls(VectorizedRowBatch batch, int[] noMatchs, } } - /** - * Generate the outer join output results for one vectorized row batch with a repeated key. - * - * Any filter expressions will apply now since hash map lookup for outer join is complete. - * - * @param batch - * The big table batch with any matching and any non matching rows both as - * selected in use. - * @param joinResult - * The hash map lookup result for the repeated key. - * @param hashMapResults - * The array of all hash map results for the batch. - * @param someRowsFilteredOut - * Whether some rows of the repeated key batch were knocked out by the filter. - * @param inputSelectedInUse - * A copy of the batch's selectedInUse flag on input to the process method. - * @param inputLogicalSize - * The batch's size on input to the process method. - * @param scratch1 - * Pre-allocated storage to internal use. - * @param scratch2 - * Pre-allocated storage to internal use. - */ - public void finishOuterRepeated(VectorizedRowBatch batch, JoinUtil.JoinResult joinResult, - VectorMapJoinHashMapResult hashMapResult, boolean someRowsFilteredOut, - boolean inputSelectedInUse, int inputLogicalSize) - throws IOException, HiveException { - - // LOG.debug("finishOuterRepeated batch #" + batchCounter + " " + joinResult.name() + " batch.size " + batch.size + " someRowsFilteredOut " + someRowsFilteredOut); - - switch (joinResult) { - case MATCH: - - // Rows we looked up as one repeated key are a match. But filtered out rows - // need to be generated as non-matches, too. - - if (someRowsFilteredOut) { - - // For the filtered out rows that didn't (logically) get looked up in the hash table, - // we need to generate no match results for those too... - - // Run value expressions over original (whole) input batch. - doValueExprOnInputSelected(batch, inputSelectedInUse, inputLogicalSize); - - // Now calculate which rows were filtered out (they are logically no matches). - - // Determine which rows are non matches by determining the delta between inputSelected and - // (current) batch selected. - - int noMatchCount = subtractFromInputSelected( - inputSelectedInUse, inputLogicalSize, batch.selected, batch.size, noMatchs); - - generateOuterNulls(batch, noMatchs, noMatchCount); - - // Now generate the matchs. Single small table values will be put into the big table - // batch and come back in matchs. 
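Reviewer sketch of the null bookkeeping performed by generateOuterNulls and finishOuterRepeatedNoMatch (illustrative only, not part of the patch): NullableColumn is a toy stand-in for the noNulls/isNull fields of Hive's ColumnVector; the outer-key output columns are padded the same way as the small-table output columns.

final class OuterNullPaddingSketch {
  /** Toy stand-in for the null bookkeeping fields of a column vector. */
  static final class NullableColumn {
    boolean noNulls = true;
    final boolean[] isNull;
    NullableColumn(int size) { isNull = new boolean[size]; }
  }

  /** Marks the given batch rows as NULL in every small-table-side output column. */
  static void padNonMatches(NullableColumn[] smallTableSideColumns,
      int[] nonMatchSelected, int nonMatchCount) {
    for (int i = 0; i < nonMatchCount; i++) {
      int batchIndex = nonMatchSelected[i];
      for (NullableColumn col : smallTableSideColumns) {
        col.noNulls = false;            // the column now contains at least one null
        col.isNull[batchIndex] = true;  // this row has no small-table match
      }
    }
  }
}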
Any multiple small table value results will go into - // the overflow batch. - generateHashMapResultRepeatedAll(batch, hashMapResult); - - // Merge noMatchs and (match) selected. - int mergeCount = sortMerge( - noMatchs, noMatchCount, batch.selected, batch.size, merged); - - System.arraycopy(merged, 0, batch.selected, 0, mergeCount); - batch.size = mergeCount; - batch.selectedInUse = true; - } else { - - // Just run our value expressions over input batch. - - if (bigTableValueExpressions != null) { - for(VectorExpression ve: bigTableValueExpressions) { - ve.evaluate(batch); - } - } - - generateHashMapResultRepeatedAll(batch, hashMapResult); - } - break; - - case SPILL: - - // Rows we looked up as one repeated key need to spill. But filtered out rows - // need to be generated as non-matches, too. - - spillBatchRepeated(batch, (VectorMapJoinHashTableResult) hashMapResult); - - // After using selected to generate spills, generate non-matches, if any. - if (someRowsFilteredOut) { - - // Determine which rows are non matches by determining the delta between inputSelected and - // (current) batch selected. - - int noMatchCount = subtractFromInputSelected( - inputSelectedInUse, inputLogicalSize, batch.selected, batch.size, noMatchs); - - System.arraycopy(noMatchs, 0, batch.selected, 0, noMatchCount); - batch.size = noMatchCount; - batch.selectedInUse = true; - - generateOuterNullsRepeatedAll(batch); - } else { - batch.size = 0; - } - - break; - - case NOMATCH: - - if (someRowsFilteredOut) { - - // When the repeated no match is due to filtering, we need to restore the - // selected information. - - if (inputSelectedInUse) { - System.arraycopy(inputSelected, 0, batch.selected, 0, inputLogicalSize); - } - batch.size = inputLogicalSize; - } - - // Run our value expressions over whole batch. - if (bigTableValueExpressions != null) { - for(VectorExpression ve: bigTableValueExpressions) { - ve.evaluate(batch); - } - } - - generateOuterNullsRepeatedAll(batch); - break; - } - } - - /** - * Generate the non-match outer join output results for the whole repeating vectorized - * row batch. - * - * Each row will get nulls for all small table values. - * - * @param batch - * The big table batch. - */ - protected void generateOuterNullsRepeatedAll(VectorizedRowBatch batch) throws HiveException { - - for (int column : smallTableOutputVectorColumns) { - ColumnVector colVector = batch.cols[column]; - colVector.noNulls = false; - colVector.isNull[0] = true; - colVector.isRepeating = true; - } - - // Mark any scratch small table scratch columns that would normally receive a copy of the key - // as null, too. 
- for (int column : bigTableOuterKeyOutputVectorColumns) { - ColumnVector colVector = batch.cols[column]; - colVector.noNulls = false; - colVector.isNull[0] = true; - colVector.isRepeating = true; - } - } + @Override + public void closeOp(boolean aborted) throws HiveException { + super.closeOp(aborted); + if (!aborted) { + if (isLogDebugEnabled) { + LOG.debug(getLoggingPrefix() + " closeOp outer join " + + outerJoinNullKeyCounter + " null keys, " + + outerJoinFilteredOutCounter + " filtered out rows"); + } + } + } } diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinOuterLongOperator.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinOuterLongOperator.java index 58bd0ab..b41d753 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinOuterLongOperator.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinOuterLongOperator.java @@ -24,20 +24,20 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.hive.ql.CompilationOpContext; -import org.apache.hadoop.hive.ql.exec.JoinUtil; -import org.apache.hadoop.hive.ql.exec.vector.ColumnVector; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashMapResult; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableFind; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableResult; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableResult.MapJoinResult; import org.apache.hadoop.hive.ql.exec.vector.VectorizationContext; -import org.apache.hadoop.hive.ql.exec.vector.VectorizedBatchUtil; import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; -import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpression; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.plan.OperatorDesc; // Single-Column Long hash table import. -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinLongHashMap; +import org.apache.hadoop.hive.ql.exec.vector.keyseries.VectorKeySeriesLong; +import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; -// Single-Column Long specific imports. -import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector; +import com.google.common.base.Preconditions; /* * Specialized class for doing a vectorized map join that is an outer join on a Single-Column Long @@ -45,8 +45,17 @@ */ public class VectorMapJoinOuterLongOperator extends VectorMapJoinOuterGenerateResultOperator { private static final long serialVersionUID = 1L; - private static final Logger LOG = LoggerFactory.getLogger(VectorMapJoinOuterLongOperator.class.getName()); + + //------------------------------------------------------------------------------------------------ + private static final String CLASS_NAME = VectorMapJoinOuterLongOperator.class.getName(); + private static final Logger LOG = LoggerFactory.getLogger(CLASS_NAME); + + protected String getLoggingPrefix() { + return super.getLoggingPrefix(CLASS_NAME); + } + + //------------------------------------------------------------------------------------------------ // (none) @@ -54,8 +63,10 @@ // transient. //--------------------------------------------------------------------------- - // The hash map for this specialized class. 
- private transient VectorMapJoinLongHashMap hashMap; + // CONSIDER using final class VectorMapJoinFastLongHashMap object for better performance by + // avoiding using an interface (MapJoinHashTableFind). Would require Hybrid versions of + // operators that use final HybridHashTableContainer object.. + private transient MapJoinHashTableFind hashMap; //--------------------------------------------------------------------------- // Single-Column Long specific members. @@ -68,6 +79,10 @@ // The column number for this one column join specialization. private transient int singleJoinColumn; + private transient PrimitiveTypeInfo singleJoinColumnPrimitiveTypeInfo; + + // The object that determines equal key series. It is a final class for good performance. + private transient VectorKeySeriesLong longKeySeries; //--------------------------------------------------------------------------- // Pass-thru constructors. @@ -108,6 +123,10 @@ public void process(Object row, int tag) throws HiveException { */ singleJoinColumn = bigTableKeyColumnMap[0]; + singleJoinColumnPrimitiveTypeInfo = (PrimitiveTypeInfo) bigTableKeyTypeInfos[0]; + + longKeySeries = + new VectorKeySeriesLong(singleJoinColumn, singleJoinColumnPrimitiveTypeInfo); needCommonSetup = false; } @@ -120,7 +139,7 @@ public void process(Object row, int tag) throws HiveException { * Get our Single-Column Long hash map information for this specialized class. */ - hashMap = (VectorMapJoinLongHashMap) vectorMapJoinHashTable; + hashMap = vectorMapJoinHashTableFind; useMinMax = hashMap.useMinMax(); if (useMinMax) { min = hashMap.min(); @@ -131,313 +150,154 @@ public void process(Object row, int tag) throws HiveException { } batchCounter++; + inputRowCounter += batch.size; - final int inputLogicalSize = batch.size; + /* + * Do the per-batch setup for an Outer join. + * + * Remember the input selected array so all non matches can be processed correctly + * (i.e. generate result rows with NULLs for the small table). + * + * Execute filter expression if present. + * + * Reset some output columns. + */ - if (inputLogicalSize == 0) { + if (!outerPerBatchSetup(batch)) { if (isLogDebugEnabled) { - LOG.debug(CLASS_NAME + " batch #" + batchCounter + " empty"); + LOG.info(getLoggingPrefix() + " batch #" + batchCounter + " empty"); } return; } - // Do the per-batch setup for an outer join. - - outerPerBatchSetup(batch); - - // For outer join, remember our input rows before ON expression filtering or before - // hash table matching so we can generate results for all rows (matching and non matching) - // later. - boolean inputSelectedInUse = batch.selectedInUse; - if (inputSelectedInUse) { - // if (!verifyMonotonicallyIncreasing(batch.selected, batch.size)) { - // throw new HiveException("batch.selected is not in sort order and unique"); - // } - System.arraycopy(batch.selected, 0, inputSelected, 0, inputLogicalSize); - } - - // Filtering for outer join just removes rows available for hash table matching. 
- boolean someRowsFilteredOut = false; - if (bigTableFilterExpressions.length > 0) { - // Since the input - for (VectorExpression ve : bigTableFilterExpressions) { - ve.evaluate(batch); - } - someRowsFilteredOut = (batch.size != inputLogicalSize); - if (isLogDebugEnabled) { - if (batch.selectedInUse) { - if (inputSelectedInUse) { - LOG.debug(CLASS_NAME + - " inputSelected " + intArrayToRangesString(inputSelected, inputLogicalSize) + - " filtered batch.selected " + intArrayToRangesString(batch.selected, batch.size)); - } else { - LOG.debug(CLASS_NAME + - " inputLogicalSize " + inputLogicalSize + - " filtered batch.selected " + intArrayToRangesString(batch.selected, batch.size)); - } - } - } - } - - // Perform any key expressions. Results will go into scratch columns. - if (bigTableKeyExpressions != null) { - for (VectorExpression ve : bigTableKeyExpressions) { - ve.evaluate(batch); - } - } + if (batch.size == 0) { - /* - * Single-Column Long specific declarations. - */ + // All rows were filtered out. But for Outer join they are considered non matches that + // generate result rows with NULLs for the small table. - // The one join column for this specialized class. - LongColumnVector joinColVector = (LongColumnVector) batch.cols[singleJoinColumn]; - long[] vector = joinColVector.vector; + Preconditions.checkState(inputLogicalSize > 0); + finishOuterRepeatedNoMatch(batch); + } else { - /* - * Single-Column Long check for repeating. - */ + // We have rows to lookup in the hash table. - // Check single column for repeating. - boolean allKeyInputColumnsRepeating = joinColVector.isRepeating; + longKeySeries.processBatch(batch); - if (allKeyInputColumnsRepeating) { + MapJoinHashMapResult hashMapResult; + if (longKeySeries.keyCount == 1) { - /* - * Repeating. - */ + // Effectively, one repeating key. - // All key input columns are repeating. Generate key once. Lookup once. - // Since the key is repeated, we must use entry 0 regardless of selectedInUse. + if (longKeySeries.currentKeyIsNull) { - /* - * Single-Column Long specific repeated lookup. - */ + // CONSIDER: Add support for NullSafe option. - JoinUtil.JoinResult joinResult; - if (batch.size == 0) { - // Whole repeated key batch was filtered out. - joinResult = JoinUtil.JoinResult.NOMATCH; - } else if (!joinColVector.noNulls && joinColVector.isNull[0]) { - // Any (repeated) null key column is no match for whole batch. - joinResult = JoinUtil.JoinResult.NOMATCH; - } else { - // Handle *repeated* join key, if found. - long key = vector[0]; - // LOG.debug(CLASS_NAME + " repeated key " + key); - if (useMinMax && (key < min || key > max)) { - // Out of range for whole batch. - joinResult = JoinUtil.JoinResult.NOMATCH; + finishOuterRepeatedNoMatch(batch); } else { - joinResult = hashMap.lookup(key, hashMapResults[0]); + long repeatedKey = longKeySeries.getCurrentKey(); + if (useMinMax && (repeatedKey < min || repeatedKey > max)) { + finishOuterRepeatedNoMatch(batch); + } else { + hashMapResult = hashMapResults[0]; + hashMap.hashMapLookup( + repeatedKey, + longKeySeries.currentHashCode, + hashMapResult); + if (hashMapResult.getMapJoinResult() == MapJoinResult.NO_MATCH) { + finishOuterRepeatedNoMatch(batch); + } else { + finishOuterRepeated(batch, longKeySeries, hashMapResult); + } + } } - } - - /* - * Common repeated join result processing. 
- */ - - if (isLogDebugEnabled) { - LOG.debug(CLASS_NAME + " batch #" + batchCounter + " repeated joinResult " + joinResult.name()); - } - finishOuterRepeated(batch, joinResult, hashMapResults[0], someRowsFilteredOut, - inputSelectedInUse, inputLogicalSize); - } else { - - /* - * NOT Repeating. - */ - - if (isLogDebugEnabled) { - LOG.debug(CLASS_NAME + " batch #" + batchCounter + " non-repeated"); - } - - int selected[] = batch.selected; - boolean selectedInUse = batch.selectedInUse; - - int hashMapResultCount = 0; - int allMatchCount = 0; - int equalKeySeriesCount = 0; - int spillCount = 0; - - boolean atLeastOneNonMatch = someRowsFilteredOut; - - /* - * Single-Column Long specific variables. - */ - - long saveKey = 0; - - // We optimize performance by only looking up the first key in a series of equal keys. - boolean haveSaveKey = false; - JoinUtil.JoinResult saveJoinResult = JoinUtil.JoinResult.NOMATCH; + } else { - // Logical loop over the rows in the batch since the batch may have selected in use. - for (int logical = 0; logical < batch.size; logical++) { - int batchIndex = (selectedInUse ? selected[logical] : logical); + // Multiple key series. - // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, taskName + ", " + getOperatorId() + " candidate " + CLASS_NAME + " batch"); + int matchSeriesCount = 0; + int spillSeriesCount = 0; + int hashMapResultCount = 0; - /* - * Single-Column Long outer null detection. - */ + MapJoinHashTableResult.MapJoinResult lookupResult; + long key; + do { + // Use the next hash map result entry. + hashMapResult = hashMapResults[hashMapResultCount]; - boolean isNull = !joinColVector.noNulls && joinColVector.isNull[batchIndex]; + if (longKeySeries.currentKeyIsNull) { - if (isNull) { + // CONSIDER: Add support for NullSafe option. - // Have that the NULL does not interfere with the current equal key series, if there - // is one. We do not set saveJoinResult. - // - // Let a current MATCH equal key series keep going, or - // Let a current SPILL equal key series keep going, or - // Let a current NOMATCH keep not matching. + lookupResult = MapJoinHashTableResult.MapJoinResult.NO_MATCH; + outerJoinNullKeyCounter += longKeySeries.currentDuplicateCount; - atLeastOneNonMatch = true; + } else { - // LOG.debug(CLASS_NAME + " logical " + logical + " batchIndex " + batchIndex + " NULL"); - } else { + key = longKeySeries.getCurrentKey(); - /* - * Single-Column Long outer get key. - */ + if (useMinMax && (key < min || key > max)) { + // Out of range for whole batch. + lookupResult = MapJoinHashTableResult.MapJoinResult.NO_MATCH; + } else { + hashMap.hashMapLookup( + key, + longKeySeries.currentHashCode, + hashMapResult); + lookupResult = hashMapResult.getMapJoinResult(); + } - long currentKey = vector[batchIndex]; + } /* - * Equal key series checking. + * Common inner join result processing. */ - if (!haveSaveKey || currentKey != saveKey) { - // New key. - - if (haveSaveKey) { - // Move on with our counts. - switch (saveJoinResult) { - case MATCH: - hashMapResultCount++; - equalKeySeriesCount++; - break; - case SPILL: - hashMapResultCount++; - break; - case NOMATCH: - break; - } - } - - // Regardless of our matching result, we keep that information to make multiple use - // of it for a possible series of equal keys. - haveSaveKey = true; - - /* - * Single-Column Long specific save key. - */ - - saveKey = currentKey; - - /* - * Single-Column Long specific lookup key. 
- */ - - if (useMinMax && (currentKey < min || currentKey > max)) { - // Key out of range for whole hash table. - saveJoinResult = JoinUtil.JoinResult.NOMATCH; - } else { - saveJoinResult = hashMap.lookup(currentKey, hashMapResults[hashMapResultCount]); - } + switch (lookupResult) { + case MATCH: + matchLogicalIndices[matchSeriesCount] = longKeySeries.currentLogical; + matchDuplicateCounts[matchSeriesCount] = longKeySeries.currentDuplicateCount; + matchIsSingleValue[matchSeriesCount] = hashMapResult.isSingleRow(); + matchHashMapResults[matchSeriesCount] = hashMapResult; + matchSeriesCount++; + hashMapResultCount++; + // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, getLoggingPrefix() + " MATCH"); + break; + + case SPILL: + spillLogicalIndices[spillSeriesCount] = longKeySeries.currentLogical; + spillDuplicateCounts[spillSeriesCount] = longKeySeries.currentDuplicateCount; + spillHashMapResults[spillSeriesCount] = hashMapResult; + spillSeriesCount++; + hashMapResultCount++; + spilledRowCounter += longKeySeries.currentDuplicateCount; + break; + + case NO_MATCH: + // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, getLoggingPrefix() + " NOMATCH"); + break; + + default: + throw new RuntimeException("Unexpected lookup result " + lookupResult.name()); + } - // LOG.debug(CLASS_NAME + " logical " + logical + " batchIndex " + batchIndex + " New Key " + currentKey + " " + saveJoinResult.name()); - - /* - * Common outer join result processing. - */ - - switch (saveJoinResult) { - case MATCH: - equalKeySeriesHashMapResultIndices[equalKeySeriesCount] = hashMapResultCount; - equalKeySeriesAllMatchIndices[equalKeySeriesCount] = allMatchCount; - equalKeySeriesIsSingleValue[equalKeySeriesCount] = hashMapResults[hashMapResultCount].isSingleRow(); - equalKeySeriesDuplicateCounts[equalKeySeriesCount] = 1; - allMatchs[allMatchCount++] = batchIndex; - // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " MATCH isSingleValue " + equalKeySeriesIsSingleValue[equalKeySeriesCount] + " currentKey " + currentKey); - break; - - case SPILL: - spills[spillCount] = batchIndex; - spillHashMapResultIndices[spillCount] = hashMapResultCount; - spillCount++; - break; - - case NOMATCH: - atLeastOneNonMatch = true; - // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " NOMATCH" + " currentKey " + currentKey); - break; - } - } else { - // LOG.debug(CLASS_NAME + " logical " + logical + " batchIndex " + batchIndex + " Key Continues " + saveKey + " " + saveJoinResult.name()); - - // Series of equal keys. - - switch (saveJoinResult) { - case MATCH: - equalKeySeriesDuplicateCounts[equalKeySeriesCount]++; - allMatchs[allMatchCount++] = batchIndex; - // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " MATCH duplicate"); - break; - - case SPILL: - spills[spillCount] = batchIndex; - spillHashMapResultIndices[spillCount] = hashMapResultCount; - spillCount++; - break; - - case NOMATCH: - // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " NOMATCH duplicate"); - break; - } + if (!longKeySeries.next()) { + break; } - // if (!verifyMonotonicallyIncreasing(allMatchs, allMatchCount)) { - // throw new HiveException("allMatchs is not in sort order and unique"); - // } - } - } + } while (true); - if (haveSaveKey) { - // Update our counts for the last key. 
- switch (saveJoinResult) { - case MATCH: - hashMapResultCount++; - equalKeySeriesCount++; - break; - case SPILL: - hashMapResultCount++; - break; - case NOMATCH: - break; + if (isLogDebugEnabled) { + LOG.debug(getLoggingPrefix() + " batch #" + batchCounter + " batch info " + + " inputLogicalSize " + inputLogicalSize + + " matchSeriesCount " + matchSeriesCount + + " matchLogicalIndices " + intArrayToRangesString(matchLogicalIndices, matchSeriesCount) + + " matchDuplicateCounts " + Arrays.toString(Arrays.copyOfRange(matchDuplicateCounts, 0, matchSeriesCount)) + + " matchHashMapResults " + Arrays.toString(Arrays.copyOfRange(matchHashMapResults, 0, matchSeriesCount)) + + " spillSeriesCount " + spillSeriesCount + + " spillLogicalIndices " + intArrayToRangesString(spillLogicalIndices, spillSeriesCount) + + " spillDuplicateCounts " + Arrays.toString(Arrays.copyOfRange(spillDuplicateCounts, 0, spillSeriesCount))); } + finishOuter(batch, matchSeriesCount, spillSeriesCount); } - - if (isLogDebugEnabled) { - LOG.debug(CLASS_NAME + " batch #" + batchCounter + - " allMatchs " + intArrayToRangesString(allMatchs,allMatchCount) + - " equalKeySeriesHashMapResultIndices " + intArrayToRangesString(equalKeySeriesHashMapResultIndices, equalKeySeriesCount) + - " equalKeySeriesAllMatchIndices " + intArrayToRangesString(equalKeySeriesAllMatchIndices, equalKeySeriesCount) + - " equalKeySeriesIsSingleValue " + Arrays.toString(Arrays.copyOfRange(equalKeySeriesIsSingleValue, 0, equalKeySeriesCount)) + - " equalKeySeriesDuplicateCounts " + Arrays.toString(Arrays.copyOfRange(equalKeySeriesDuplicateCounts, 0, equalKeySeriesCount)) + - " atLeastOneNonMatch " + atLeastOneNonMatch + - " inputSelectedInUse " + inputSelectedInUse + - " inputLogicalSize " + inputLogicalSize + - " spills " + intArrayToRangesString(spills, spillCount) + - " spillHashMapResultIndices " + intArrayToRangesString(spillHashMapResultIndices, spillCount) + - " hashMapResults " + Arrays.toString(Arrays.copyOfRange(hashMapResults, 0, hashMapResultCount))); - } - - // We will generate results for all matching and non-matching rows. 
- finishOuter(batch, - allMatchCount, equalKeySeriesCount, atLeastOneNonMatch, - inputSelectedInUse, inputLogicalSize, - spillCount, hashMapResultCount); } if (batch.size > 0) { diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinOuterMultiKeyOperator.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinOuterMultiKeyOperator.java index 7f9afd2..57477a8 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinOuterMultiKeyOperator.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinOuterMultiKeyOperator.java @@ -24,22 +24,20 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.hive.ql.CompilationOpContext; -import org.apache.hadoop.hive.ql.exec.JoinUtil; -import org.apache.hadoop.hive.ql.exec.vector.ColumnVector; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashMapResult; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableFind; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableResult; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableResult.MapJoinResult; import org.apache.hadoop.hive.ql.exec.vector.VectorizationContext; import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; -import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpression; +import org.apache.hadoop.hive.ql.exec.vector.keyseries.fast.VectorKeySeriesMultiFast; +import org.apache.hadoop.hive.ql.exec.vector.mapjoin.fast.VectorMapJoinFastMultiKeyHashMap; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.plan.OperatorDesc; - -// Multi-Key hash table import. -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinBytesHashMap; - -// Multi-Key specific imports. -import org.apache.hadoop.hive.ql.exec.vector.VectorSerializeRow; -import org.apache.hadoop.hive.serde2.ByteStream.Output; import org.apache.hadoop.hive.serde2.binarysortable.fast.BinarySortableSerializeWrite; +import com.google.common.base.Preconditions; + /* * Specialized class for doing a vectorized map join that is an outer join on Multi-Key * using a hash map. @@ -47,8 +45,17 @@ public class VectorMapJoinOuterMultiKeyOperator extends VectorMapJoinOuterGenerateResultOperator { private static final long serialVersionUID = 1L; - private static final Logger LOG = LoggerFactory.getLogger(VectorMapJoinOuterMultiKeyOperator.class.getName()); + + //------------------------------------------------------------------------------------------------ + private static final String CLASS_NAME = VectorMapJoinOuterMultiKeyOperator.class.getName(); + private static final Logger LOG = LoggerFactory.getLogger(CLASS_NAME); + + protected String getLoggingPrefix() { + return super.getLoggingPrefix(CLASS_NAME); + } + + //------------------------------------------------------------------------------------------------ // (none) @@ -56,21 +63,20 @@ // transient. //--------------------------------------------------------------------------- - // The hash map for this specialized class. - private transient VectorMapJoinBytesHashMap hashMap; + // CONSIDER using final class VectorMapJoinFastMultiKeyHashMap object for better performance by + // avoiding using an interface (MapJoinHashTableFind). Would require Hybrid versions of + // operators that use final HybridHashTableContainer object.. 
+ private transient MapJoinHashTableFind hashMap; //--------------------------------------------------------------------------- // Multi-Key specific members. // - // Object that can take a set of columns in row in a vectorized row batch and serialized it. - private transient VectorSerializeRow keyVectorSerializeWrite; + // Binary sortable multi-key serializer. + private transient BinarySortableSerializeWrite binarySortableSerializeWrite; - // The BinarySortable serialization of the current key. - private transient Output currentKeyOutput; - - // The BinarySortable serialization of the saved key for a possible series of equal keys. - private transient Output saveKeyOutput; + // The object that determines equal key series. It is a final class for good performance. + private transient VectorKeySeriesMultiFast serializedMultiKeySeries; //--------------------------------------------------------------------------- // Pass-thru constructors. @@ -110,12 +116,16 @@ public void process(Object row, int tag) throws HiveException { * Initialize Multi-Key members for this specialized class. */ - keyVectorSerializeWrite = new VectorSerializeRow( - new BinarySortableSerializeWrite(bigTableKeyColumnMap.length)); - keyVectorSerializeWrite.init(bigTableKeyTypeNames, bigTableKeyColumnMap); + binarySortableSerializeWrite = + new BinarySortableSerializeWrite(bigTableKeyColumnMap.length); - currentKeyOutput = new Output(); - saveKeyOutput = new Output(); + // For Multi-Key the Fast hash table computes the serialized key's hash code, + // so we don't use VectorKeySeriesMulti's hash code. + serializedMultiKeySeries = + new VectorKeySeriesMultiFast( + binarySortableSerializeWrite); + Preconditions.checkState(bigTableKeyTypeInfos.length > 0); + serializedMultiKeySeries.init(bigTableKeyTypeInfos, bigTableKeyColumnMap); needCommonSetup = false; } @@ -128,332 +138,152 @@ public void process(Object row, int tag) throws HiveException { * Get our Multi-Key hash map information for this specialized class. */ - hashMap = (VectorMapJoinBytesHashMap) vectorMapJoinHashTable; + hashMap = vectorMapJoinHashTableFind; needHashTableSetup = false; } batchCounter++; + inputRowCounter += batch.size; - final int inputLogicalSize = batch.size; + /* + * Do the per-batch setup for an Outer join. + * + * Remember the input selected array so all non matches can be processed correctly + * (i.e. generate result rows with NULLs for the small table). + * + * Execute filter expression if present. + * + * Reset some output columns. + */ - if (inputLogicalSize == 0) { + if (!outerPerBatchSetup(batch)) { if (isLogDebugEnabled) { LOG.debug(CLASS_NAME + " batch #" + batchCounter + " empty"); } return; } - // Do the per-batch setup for an outer join. - - outerPerBatchSetup(batch); - - // For outer join, remember our input rows before ON expression filtering or before - // hash table matching so we can generate results for all rows (matching and non matching) - // later. - boolean inputSelectedInUse = batch.selectedInUse; - if (inputSelectedInUse) { - // if (!verifyMonotonicallyIncreasing(batch.selected, batch.size)) { - // throw new HiveException("batch.selected is not in sort order and unique"); - // } - System.arraycopy(batch.selected, 0, inputSelected, 0, inputLogicalSize); - } - - // Filtering for outer join just removes rows available for hash table matching. 
- boolean someRowsFilteredOut = false; - if (bigTableFilterExpressions.length > 0) { - // Since the input - for (VectorExpression ve : bigTableFilterExpressions) { - ve.evaluate(batch); - } - someRowsFilteredOut = (batch.size != inputLogicalSize); - if (isLogDebugEnabled) { - if (batch.selectedInUse) { - if (inputSelectedInUse) { - LOG.debug(CLASS_NAME + - " inputSelected " + intArrayToRangesString(inputSelected, inputLogicalSize) + - " filtered batch.selected " + intArrayToRangesString(batch.selected, batch.size)); - } else { - LOG.debug(CLASS_NAME + - " inputLogicalSize " + inputLogicalSize + - " filtered batch.selected " + intArrayToRangesString(batch.selected, batch.size)); - } - } - } - } - - // Perform any key expressions. Results will go into scratch columns. - if (bigTableKeyExpressions != null) { - for (VectorExpression ve : bigTableKeyExpressions) { - ve.evaluate(batch); - } - } - - /* - * Multi-Key specific declarations. - */ - - // None. - - /* - * Multi-Key Long check for repeating. - */ - - // If all BigTable input columns to key expressions are isRepeating, then - // calculate key once; lookup once. - // Also determine if any nulls are present since for a join that means no match. - boolean allKeyInputColumnsRepeating; - boolean someKeyInputColumnIsNull = false; // Only valid if allKeyInputColumnsRepeating is true. - if (bigTableKeyColumnMap.length == 0) { - allKeyInputColumnsRepeating = false; - } else { - allKeyInputColumnsRepeating = true; - for (int i = 0; i < bigTableKeyColumnMap.length; i++) { - ColumnVector colVector = batch.cols[bigTableKeyColumnMap[i]]; - if (!colVector.isRepeating) { - allKeyInputColumnsRepeating = false; - break; - } - if (!colVector.noNulls && colVector.isNull[0]) { - someKeyInputColumnIsNull = true; - } - } - } - - if (allKeyInputColumnsRepeating) { + if (batch.size == 0) { - /* - * Repeating. - */ - - // All key input columns are repeating. Generate key once. Lookup once. - // Since the key is repeated, we must use entry 0 regardless of selectedInUse. - - /* - * Multi-Key specific repeated lookup. - */ - - JoinUtil.JoinResult joinResult; - if (batch.size == 0) { - // Whole repeated key batch was filtered out. - joinResult = JoinUtil.JoinResult.NOMATCH; - } else if (someKeyInputColumnIsNull) { - // Any (repeated) null key column is no match for whole batch. - joinResult = JoinUtil.JoinResult.NOMATCH; - } else { - - // All key input columns are repeating. Generate key once. Lookup once. - keyVectorSerializeWrite.setOutput(currentKeyOutput); - keyVectorSerializeWrite.serializeWrite(batch, 0); - byte[] keyBytes = currentKeyOutput.getData(); - int keyLength = currentKeyOutput.getLength(); - joinResult = hashMap.lookup(keyBytes, 0, keyLength, hashMapResults[0]); - } + // All rows were filtered out. But for Outer join they are considered non matches that + // generate result rows with NULLs for the small table. - /* - * Common repeated join result processing. - */ - - if (isLogDebugEnabled) { - LOG.debug(CLASS_NAME + " batch #" + batchCounter + " repeated joinResult " + joinResult.name()); - } - finishOuterRepeated(batch, joinResult, hashMapResults[0], someRowsFilteredOut, - inputSelectedInUse, inputLogicalSize); + Preconditions.checkState(inputLogicalSize > 0); + finishOuterRepeatedNoMatch(batch); } else { - /* - * NOT Repeating. - */ + // We have rows to lookup in the hash table. 
- if (isLogDebugEnabled) { - LOG.debug(CLASS_NAME + " batch #" + batchCounter + " non-repeated"); - } + serializedMultiKeySeries.processBatch(batch); - int selected[] = batch.selected; - boolean selectedInUse = batch.selectedInUse; + MapJoinHashMapResult hashMapResult; + if (serializedMultiKeySeries.keyCount == 1) { - int hashMapResultCount = 0; - int allMatchCount = 0; - int equalKeySeriesCount = 0; - int spillCount = 0; + // Effectively, one repeating key. - boolean atLeastOneNonMatch = someRowsFilteredOut; + // NOTE: Checking if the whole key is null is different than + // Inner, InnerBigTable, and LeftSemi... Why? + if (serializedMultiKeySeries.currentKeyIsNull) { - /* - * Multi-Key specific variables. - */ + // CONSIDER: Add support for NullSafe option. - Output temp; + finishOuterRepeatedNoMatch(batch); + } else { + hashMapResult = hashMapResults[0]; + hashMap.hashMapLookup( + serializedMultiKeySeries.serializedBytes, + serializedMultiKeySeries.serializedStart, + serializedMultiKeySeries.serializedLength, + serializedMultiKeySeries.currentHashCode, + hashMapResult); + if (hashMapResult.getMapJoinResult() == MapJoinResult.NO_MATCH) { + finishOuterRepeatedNoMatch(batch); + } else { + finishOuterRepeated(batch, serializedMultiKeySeries, hashMapResult); + } + } + } else { - // We optimize performance by only looking up the first key in a series of equal keys. - boolean haveSaveKey = false; - JoinUtil.JoinResult saveJoinResult = JoinUtil.JoinResult.NOMATCH; + // Multiple key series. - // Logical loop over the rows in the batch since the batch may have selected in use. - for (int logical = 0; logical < batch.size; logical++) { - int batchIndex = (selectedInUse ? selected[logical] : logical); + int matchSeriesCount = 0; + int spillSeriesCount = 0; + int hashMapResultCount = 0; - // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, taskName + ", " + getOperatorId() + " candidate " + CLASS_NAME + " batch"); + MapJoinHashTableResult.MapJoinResult lookupResult; + do { + // Use the next hash map result entry. + hashMapResult = hashMapResults[hashMapResultCount]; - /* - * Multi-Key outer null detection. - */ + // NOTE: Any null column in the key columns is a non-match. + if (serializedMultiKeySeries.currentKeyIsNull) { - // Generate binary sortable key for current row in vectorized row batch. - keyVectorSerializeWrite.setOutput(currentKeyOutput); - keyVectorSerializeWrite.serializeWrite(batch, batchIndex); - if (keyVectorSerializeWrite.getHasAnyNulls()) { + // CONSIDER: Add support for NullSafe option. - // Have that the NULL does not interfere with the current equal key series, if there - // is one. We do not set saveJoinResult. - // - // Let a current MATCH equal key series keep going, or - // Let a current SPILL equal key series keep going, or - // Let a current NOMATCH keep not matching. + lookupResult = MapJoinHashTableResult.MapJoinResult.NO_MATCH; + outerJoinNullKeyCounter += serializedMultiKeySeries.currentDuplicateCount; - atLeastOneNonMatch = true; + } else { - // LOG.debug(CLASS_NAME + " logical " + logical + " batchIndex " + batchIndex + " NULL"); - } else { + hashMap.hashMapLookup( + serializedMultiKeySeries.serializedBytes, + serializedMultiKeySeries.serializedStart, + serializedMultiKeySeries.serializedLength, + serializedMultiKeySeries.currentHashCode, + hashMapResult); + lookupResult = hashMapResult.getMapJoinResult(); + } /* - * Multi-Key outer get key. + * Common inner join result processing. */ - // Generated earlier to get possible null(s). 
- - /* - * Equal key series checking. - */ + switch (lookupResult) { + case MATCH: + matchLogicalIndices[matchSeriesCount] = serializedMultiKeySeries.currentLogical; + matchDuplicateCounts[matchSeriesCount] = serializedMultiKeySeries.currentDuplicateCount; + matchIsSingleValue[matchSeriesCount] = hashMapResult.isSingleRow(); + matchHashMapResults[matchSeriesCount] = hashMapResult; + matchSeriesCount++; + hashMapResultCount++; + break; + + case SPILL: + spillLogicalIndices[spillSeriesCount] = serializedMultiKeySeries.currentLogical; + spillDuplicateCounts[spillSeriesCount] = serializedMultiKeySeries.currentDuplicateCount; + spillHashMapResults[spillSeriesCount] = hashMapResult; + spillSeriesCount++; + hashMapResultCount++; + spilledRowCounter += serializedMultiKeySeries.currentDuplicateCount; + break; + + case NO_MATCH: + break; + + default: + throw new RuntimeException("Unexpected lookup result " + lookupResult.name()); + } - if (!haveSaveKey || !saveKeyOutput.arraysEquals(currentKeyOutput)) { - - // New key. - - if (haveSaveKey) { - // Move on with our counts. - switch (saveJoinResult) { - case MATCH: - hashMapResultCount++; - equalKeySeriesCount++; - break; - case SPILL: - hashMapResultCount++; - break; - case NOMATCH: - break; - } - } - - // Regardless of our matching result, we keep that information to make multiple use - // of it for a possible series of equal keys. - haveSaveKey = true; - - /* - * Multi-Key specific save key. - */ - - temp = saveKeyOutput; - saveKeyOutput = currentKeyOutput; - currentKeyOutput = temp; - - /* - * Multi-Key specific lookup key. - */ - - byte[] keyBytes = saveKeyOutput.getData(); - int keyLength = saveKeyOutput.getLength(); - saveJoinResult = hashMap.lookup(keyBytes, 0, keyLength, hashMapResults[hashMapResultCount]); - - /* - * Common outer join result processing. - */ - - switch (saveJoinResult) { - case MATCH: - equalKeySeriesHashMapResultIndices[equalKeySeriesCount] = hashMapResultCount; - equalKeySeriesAllMatchIndices[equalKeySeriesCount] = allMatchCount; - equalKeySeriesIsSingleValue[equalKeySeriesCount] = hashMapResults[hashMapResultCount].isSingleRow(); - equalKeySeriesDuplicateCounts[equalKeySeriesCount] = 1; - allMatchs[allMatchCount++] = batchIndex; - // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " MATCH isSingleValue " + equalKeySeriesIsSingleValue[equalKeySeriesCount] + " currentKey " + currentKey); - break; - - case SPILL: - spills[spillCount] = batchIndex; - spillHashMapResultIndices[spillCount] = hashMapResultCount; - spillCount++; - break; - - case NOMATCH: - atLeastOneNonMatch = true; - // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " NOMATCH" + " currentKey " + currentKey); - break; - } - } else { - // LOG.debug(CLASS_NAME + " logical " + logical + " batchIndex " + batchIndex + " Key Continues " + saveKey + " " + saveJoinResult.name()); - - // Series of equal keys. 
- - switch (saveJoinResult) { - case MATCH: - equalKeySeriesDuplicateCounts[equalKeySeriesCount]++; - allMatchs[allMatchCount++] = batchIndex; - // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " MATCH duplicate"); - break; - - case SPILL: - spills[spillCount] = batchIndex; - spillHashMapResultIndices[spillCount] = hashMapResultCount; - spillCount++; - break; - - case NOMATCH: - // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " NOMATCH duplicate"); - break; - } + if (!serializedMultiKeySeries.next()) { + break; } - // if (!verifyMonotonicallyIncreasing(allMatchs, allMatchCount)) { - // throw new HiveException("allMatchs is not in sort order and unique"); - // } - } - } + } while (true); - if (haveSaveKey) { - // Update our counts for the last key. - switch (saveJoinResult) { - case MATCH: - hashMapResultCount++; - equalKeySeriesCount++; - break; - case SPILL: - hashMapResultCount++; - break; - case NOMATCH: - break; + if (isLogDebugEnabled) { + LOG.debug(CLASS_NAME + " batch #" + batchCounter + " batch info " + + " inputLogicalSize " + inputLogicalSize + + " matchSeriesCount " + matchSeriesCount + + " matchLogicalIndices " + intArrayToRangesString(matchLogicalIndices, matchSeriesCount) + + " matchDuplicateCounts " + Arrays.toString(Arrays.copyOfRange(matchDuplicateCounts, 0, matchSeriesCount)) + + " matchHashMapResults " + Arrays.toString(Arrays.copyOfRange(matchHashMapResults, 0, matchSeriesCount)) + + " spillSeriesCount " + spillSeriesCount + + " spillLogicalIndices " + intArrayToRangesString(spillLogicalIndices, spillSeriesCount) + + " spillDuplicateCounts " + Arrays.toString(Arrays.copyOfRange(spillDuplicateCounts, 0, spillSeriesCount))); } - } - if (isLogDebugEnabled) { - LOG.debug(CLASS_NAME + " batch #" + batchCounter + - " allMatchs " + intArrayToRangesString(allMatchs,allMatchCount) + - " equalKeySeriesHashMapResultIndices " + intArrayToRangesString(equalKeySeriesHashMapResultIndices, equalKeySeriesCount) + - " equalKeySeriesAllMatchIndices " + intArrayToRangesString(equalKeySeriesAllMatchIndices, equalKeySeriesCount) + - " equalKeySeriesIsSingleValue " + Arrays.toString(Arrays.copyOfRange(equalKeySeriesIsSingleValue, 0, equalKeySeriesCount)) + - " equalKeySeriesDuplicateCounts " + Arrays.toString(Arrays.copyOfRange(equalKeySeriesDuplicateCounts, 0, equalKeySeriesCount)) + - " atLeastOneNonMatch " + atLeastOneNonMatch + - " inputSelectedInUse " + inputSelectedInUse + - " inputLogicalSize " + inputLogicalSize + - " spills " + intArrayToRangesString(spills, spillCount) + - " spillHashMapResultIndices " + intArrayToRangesString(spillHashMapResultIndices, spillCount) + - " hashMapResults " + Arrays.toString(Arrays.copyOfRange(hashMapResults, 0, hashMapResultCount))); + finishOuter(batch, matchSeriesCount, spillSeriesCount); } - - // We will generate results for all matching and non-matching rows. 
- finishOuter(batch, - allMatchCount, equalKeySeriesCount, atLeastOneNonMatch, - inputSelectedInUse, inputLogicalSize, - spillCount, hashMapResultCount); } if (batch.size > 0) { diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinOuterStringOperator.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinOuterStringOperator.java index 8ed1ed4..0b05bf3 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinOuterStringOperator.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinOuterStringOperator.java @@ -24,19 +24,18 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.hive.ql.CompilationOpContext; -import org.apache.hadoop.hive.ql.exec.JoinUtil; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashMapResult; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableFind; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableResult; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableResult.MapJoinResult; import org.apache.hadoop.hive.ql.exec.vector.VectorizationContext; import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; -import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpression; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.plan.OperatorDesc; // Single-Column String hash table import. -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinBytesHashMap; - -// Single-Column String specific imports. -import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; -import org.apache.hadoop.hive.ql.exec.vector.expressions.StringExpr; +import org.apache.hadoop.hive.ql.exec.vector.keyseries.VectorKeySeriesBytes; +import com.google.common.base.Preconditions; /* * Specialized class for doing a vectorized map join that is an outer join on a Single-Column String @@ -45,8 +44,17 @@ public class VectorMapJoinOuterStringOperator extends VectorMapJoinOuterGenerateResultOperator { private static final long serialVersionUID = 1L; - private static final Logger LOG = LoggerFactory.getLogger(VectorMapJoinOuterStringOperator.class.getName()); + + //------------------------------------------------------------------------------------------------ + private static final String CLASS_NAME = VectorMapJoinOuterStringOperator.class.getName(); + private static final Logger LOG = LoggerFactory.getLogger(CLASS_NAME); + + protected String getLoggingPrefix() { + return super.getLoggingPrefix(CLASS_NAME); + } + + //------------------------------------------------------------------------------------------------ // (none) @@ -54,8 +62,10 @@ // transient. //--------------------------------------------------------------------------- - // The hash map for this specialized class. - private transient VectorMapJoinBytesHashMap hashMap; + // CONSIDER using final class VectorMapJoinFastStringHashMap object for better performance by + // avoiding using an interface (MapJoinHashTableFind). Would require Hybrid versions of + // operators that use final HybridHashTableContainer object.. + private transient MapJoinHashTableFind hashMap; //--------------------------------------------------------------------------- // Single-Column String specific members. @@ -64,6 +74,9 @@ // The column number for this one column join specialization. 
private transient int singleJoinColumn; + // The object that determines equal key series. It is a final class for good performance. + private transient VectorKeySeriesBytes bytesKeySeries; + //--------------------------------------------------------------------------- // Pass-thru constructors. // @@ -104,6 +117,8 @@ public void process(Object row, int tag) throws HiveException { singleJoinColumn = bigTableKeyColumnMap[0]; + bytesKeySeries = new VectorKeySeriesBytes(singleJoinColumn); + needCommonSetup = false; } @@ -115,315 +130,150 @@ public void process(Object row, int tag) throws HiveException { * Get our Single-Column String hash map information for this specialized class. */ - hashMap = (VectorMapJoinBytesHashMap) vectorMapJoinHashTable; + hashMap = vectorMapJoinHashTableFind; needHashTableSetup = false; } batchCounter++; + inputRowCounter += batch.size; - final int inputLogicalSize = batch.size; + /* + * Do the per-batch setup for an Outer join. + * + * Remember the input selected array so all non matches can be processed correctly + * (i.e. generate result rows with NULLs for the small table). + * + * Execute filter expression if present. + * + * Reset some output columns. + */ - if (inputLogicalSize == 0) { + if (!outerPerBatchSetup(batch)) { if (isLogDebugEnabled) { LOG.debug(CLASS_NAME + " batch #" + batchCounter + " empty"); } return; } - // Do the per-batch setup for an outer join. - - outerPerBatchSetup(batch); - - // For outer join, remember our input rows before ON expression filtering or before - // hash table matching so we can generate results for all rows (matching and non matching) - // later. - boolean inputSelectedInUse = batch.selectedInUse; - if (inputSelectedInUse) { - // if (!verifyMonotonicallyIncreasing(batch.selected, batch.size)) { - // throw new HiveException("batch.selected is not in sort order and unique"); - // } - System.arraycopy(batch.selected, 0, inputSelected, 0, inputLogicalSize); - } - - // Filtering for outer join just removes rows available for hash table matching. - boolean someRowsFilteredOut = false; - if (bigTableFilterExpressions.length > 0) { - // Since the input - for (VectorExpression ve : bigTableFilterExpressions) { - ve.evaluate(batch); - } - someRowsFilteredOut = (batch.size != inputLogicalSize); - if (isLogDebugEnabled) { - if (batch.selectedInUse) { - if (inputSelectedInUse) { - LOG.debug(CLASS_NAME + - " inputSelected " + intArrayToRangesString(inputSelected, inputLogicalSize) + - " filtered batch.selected " + intArrayToRangesString(batch.selected, batch.size)); - } else { - LOG.debug(CLASS_NAME + - " inputLogicalSize " + inputLogicalSize + - " filtered batch.selected " + intArrayToRangesString(batch.selected, batch.size)); - } - } - } - } - - // Perform any key expressions. Results will go into scratch columns. - if (bigTableKeyExpressions != null) { - for (VectorExpression ve : bigTableKeyExpressions) { - ve.evaluate(batch); - } - } - - /* - * Single-Column String specific declarations. - */ - - // The one join column for this specialized class. - BytesColumnVector joinColVector = (BytesColumnVector) batch.cols[singleJoinColumn]; - byte[][] vector = joinColVector.vector; - int[] start = joinColVector.start; - int[] length = joinColVector.length; - - /* - * Single-Column String check for repeating. - */ - - // Check single column for repeating. - boolean allKeyInputColumnsRepeating = joinColVector.isRepeating; - - if (allKeyInputColumnsRepeating) { - - /* - * Repeating. 
- */ + if (batch.size == 0) { - // All key input columns are repeating. Generate key once. Lookup once. - // Since the key is repeated, we must use entry 0 regardless of selectedInUse. + // All rows were filtered out. But for Outer join they are considered non matches that + // generate result rows with NULLs for the small table. - /* - * Single-Column String specific repeated lookup. - */ - - JoinUtil.JoinResult joinResult; - if (batch.size == 0) { - // Whole repeated key batch was filtered out. - joinResult = JoinUtil.JoinResult.NOMATCH; - } else if (!joinColVector.noNulls && joinColVector.isNull[0]) { - // Any (repeated) null key column is no match for whole batch. - joinResult = JoinUtil.JoinResult.NOMATCH; - } else { - // Handle *repeated* join key, if found. - byte[] keyBytes = vector[0]; - int keyStart = start[0]; - int keyLength = length[0]; - joinResult = hashMap.lookup(keyBytes, keyStart, keyLength, hashMapResults[0]); - } - - /* - * Common repeated join result processing. - */ - - if (isLogDebugEnabled) { - LOG.debug(CLASS_NAME + " batch #" + batchCounter + " repeated joinResult " + joinResult.name()); - } - finishOuterRepeated(batch, joinResult, hashMapResults[0], someRowsFilteredOut, - inputSelectedInUse, inputLogicalSize); + Preconditions.checkState(inputLogicalSize > 0); + finishOuterRepeatedNoMatch(batch); } else { - /* - * NOT Repeating. - */ + // We have rows to lookup in the hash table. - if (isLogDebugEnabled) { - LOG.debug(CLASS_NAME + " batch #" + batchCounter + " non-repeated"); - } + bytesKeySeries.processBatch(batch); - int selected[] = batch.selected; - boolean selectedInUse = batch.selectedInUse; + MapJoinHashMapResult hashMapResult; + if (bytesKeySeries.keyCount == 1) { - int hashMapResultCount = 0; - int allMatchCount = 0; - int equalKeySeriesCount = 0; - int spillCount = 0; + // Effectively, one repeating key. - boolean atLeastOneNonMatch = someRowsFilteredOut; + if (bytesKeySeries.currentKeyIsNull) { - /* - * Single-Column String specific variables. - */ + // CONSIDER: Add support for NullSafe option. - int saveKeyBatchIndex = -1; + finishOuterRepeatedNoMatch(batch); + } else { + hashMapResult = hashMapResults[0]; + hashMap.hashMapLookup( + bytesKeySeries.currentBytes, + bytesKeySeries.currentStart, + bytesKeySeries.currentLength, + bytesKeySeries.currentHashCode, + hashMapResult); + if (hashMapResult.getMapJoinResult() == MapJoinResult.NO_MATCH) { + finishOuterRepeatedNoMatch(batch); + } else { + finishOuterRepeated(batch, bytesKeySeries, hashMapResult); + } + } + } else { - // We optimize performance by only looking up the first key in a series of equal keys. - boolean haveSaveKey = false; - JoinUtil.JoinResult saveJoinResult = JoinUtil.JoinResult.NOMATCH; + // Multiple key series. - // Logical loop over the rows in the batch since the batch may have selected in use. - for (int logical = 0; logical < batch.size; logical++) { - int batchIndex = (selectedInUse ? selected[logical] : logical); + int matchSeriesCount = 0; + int spillSeriesCount = 0; + int hashMapResultCount = 0; - // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, taskName + ", " + getOperatorId() + " candidate " + CLASS_NAME + " batch"); + MapJoinHashTableResult.MapJoinResult lookupResult; + do { + // Use the next hash map result entry. + hashMapResult = hashMapResults[hashMapResultCount]; - /* - * Single-Column String outer null detection. 
- */ + if (bytesKeySeries.currentKeyIsNull) { - boolean isNull = !joinColVector.noNulls && joinColVector.isNull[batchIndex]; + // CONSIDER: Add support for NullSafe option. - if (isNull) { + lookupResult = MapJoinHashTableResult.MapJoinResult.NO_MATCH; + outerJoinNullKeyCounter += bytesKeySeries.currentDuplicateCount; - // Have that the NULL does not interfere with the current equal key series, if there - // is one. We do not set saveJoinResult. - // - // Let a current MATCH equal key series keep going, or - // Let a current SPILL equal key series keep going, or - // Let a current NOMATCH keep not matching. + } else { - atLeastOneNonMatch = true; + hashMap.hashMapLookup( + bytesKeySeries.currentBytes, + bytesKeySeries.currentStart, + bytesKeySeries.currentLength, + bytesKeySeries.currentHashCode, + hashMapResult); + lookupResult = hashMapResult.getMapJoinResult(); - // LOG.debug(CLASS_NAME + " logical " + logical + " batchIndex " + batchIndex + " NULL"); - } else { + } /* - * Single-Column String outer get key. + * Common inner join result processing. */ - // Implicit -- use batchIndex. - - /* - * Equal key series checking. - */ + switch (lookupResult) { + case MATCH: + matchLogicalIndices[matchSeriesCount] = bytesKeySeries.currentLogical; + matchDuplicateCounts[matchSeriesCount] = bytesKeySeries.currentDuplicateCount; + matchIsSingleValue[matchSeriesCount] = hashMapResult.isSingleRow(); + matchHashMapResults[matchSeriesCount] = hashMapResult; + matchSeriesCount++; + hashMapResultCount++; + break; + + case SPILL: + spillLogicalIndices[spillSeriesCount] = bytesKeySeries.currentLogical; + spillDuplicateCounts[spillSeriesCount] = bytesKeySeries.currentDuplicateCount; + spillHashMapResults[spillSeriesCount] = hashMapResult; + spillSeriesCount++; + hashMapResultCount++; + spilledRowCounter += bytesKeySeries.currentDuplicateCount; + break; + + case NO_MATCH: + break; + + default: + throw new RuntimeException("Unexpected lookup result " + lookupResult.name()); + } - if (!haveSaveKey || - StringExpr.equal(vector[saveKeyBatchIndex], start[saveKeyBatchIndex], length[saveKeyBatchIndex], - vector[batchIndex], start[batchIndex], length[batchIndex]) == false) { - // New key. - - if (haveSaveKey) { - // Move on with our counts. - switch (saveJoinResult) { - case MATCH: - hashMapResultCount++; - equalKeySeriesCount++; - break; - case SPILL: - hashMapResultCount++; - break; - case NOMATCH: - break; - } - } - - // Regardless of our matching result, we keep that information to make multiple use - // of it for a possible series of equal keys. - haveSaveKey = true; - - /* - * Single-Column String specific save key. - */ - - saveKeyBatchIndex = batchIndex; - - /* - * Single-Column Long specific lookup key. - */ - - byte[] keyBytes = vector[batchIndex]; - int keyStart = start[batchIndex]; - int keyLength = length[batchIndex]; - saveJoinResult = hashMap.lookup(keyBytes, keyStart, keyLength, hashMapResults[hashMapResultCount]); - - /* - * Common outer join result processing. 
- */ - - switch (saveJoinResult) { - case MATCH: - equalKeySeriesHashMapResultIndices[equalKeySeriesCount] = hashMapResultCount; - equalKeySeriesAllMatchIndices[equalKeySeriesCount] = allMatchCount; - equalKeySeriesIsSingleValue[equalKeySeriesCount] = hashMapResults[hashMapResultCount].isSingleRow(); - equalKeySeriesDuplicateCounts[equalKeySeriesCount] = 1; - allMatchs[allMatchCount++] = batchIndex; - // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " MATCH isSingleValue " + equalKeySeriesIsSingleValue[equalKeySeriesCount] + " currentKey " + currentKey); - break; - - case SPILL: - spills[spillCount] = batchIndex; - spillHashMapResultIndices[spillCount] = hashMapResultCount; - spillCount++; - break; - - case NOMATCH: - atLeastOneNonMatch = true; - // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " NOMATCH" + " currentKey " + currentKey); - break; - } - } else { - // LOG.debug(CLASS_NAME + " logical " + logical + " batchIndex " + batchIndex + " Key Continues " + saveKey + " " + saveJoinResult.name()); - - // Series of equal keys. - - switch (saveJoinResult) { - case MATCH: - equalKeySeriesDuplicateCounts[equalKeySeriesCount]++; - allMatchs[allMatchCount++] = batchIndex; - // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " MATCH duplicate"); - break; - - case SPILL: - spills[spillCount] = batchIndex; - spillHashMapResultIndices[spillCount] = hashMapResultCount; - spillCount++; - break; - - case NOMATCH: - // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " NOMATCH duplicate"); - break; - } + if (!bytesKeySeries.next()) { + break; } - // if (!verifyMonotonicallyIncreasing(allMatchs, allMatchCount)) { - // throw new HiveException("allMatchs is not in sort order and unique"); - // } - } - } + } while (true); - if (haveSaveKey) { - // Update our counts for the last key. 
- switch (saveJoinResult) { - case MATCH: - hashMapResultCount++; - equalKeySeriesCount++; - break; - case SPILL: - hashMapResultCount++; - break; - case NOMATCH: - break; + if (isLogDebugEnabled) { + LOG.debug(CLASS_NAME + " batch #" + batchCounter + " batch info " + + " inputLogicalSize " + inputLogicalSize + + " matchSeriesCount " + matchSeriesCount + + " matchLogicalIndices " + intArrayToRangesString(matchLogicalIndices, matchSeriesCount) + + " matchDuplicateCounts " + Arrays.toString(Arrays.copyOfRange(matchDuplicateCounts, 0, matchSeriesCount)) + + " matchHashMapResults " + Arrays.toString(Arrays.copyOfRange(matchHashMapResults, 0, matchSeriesCount)) + + " spillSeriesCount " + spillSeriesCount + + " spillLogicalIndices " + intArrayToRangesString(spillLogicalIndices, spillSeriesCount) + + " spillDuplicateCounts " + Arrays.toString(Arrays.copyOfRange(spillDuplicateCounts, 0, spillSeriesCount))); } - } - if (isLogDebugEnabled) { - LOG.debug(CLASS_NAME + " batch #" + batchCounter + - " allMatchs " + intArrayToRangesString(allMatchs,allMatchCount) + - " equalKeySeriesHashMapResultIndices " + intArrayToRangesString(equalKeySeriesHashMapResultIndices, equalKeySeriesCount) + - " equalKeySeriesAllMatchIndices " + intArrayToRangesString(equalKeySeriesAllMatchIndices, equalKeySeriesCount) + - " equalKeySeriesIsSingleValue " + Arrays.toString(Arrays.copyOfRange(equalKeySeriesIsSingleValue, 0, equalKeySeriesCount)) + - " equalKeySeriesDuplicateCounts " + Arrays.toString(Arrays.copyOfRange(equalKeySeriesDuplicateCounts, 0, equalKeySeriesCount)) + - " atLeastOneNonMatch " + atLeastOneNonMatch + - " inputSelectedInUse " + inputSelectedInUse + - " inputLogicalSize " + inputLogicalSize + - " spills " + intArrayToRangesString(spills, spillCount) + - " spillHashMapResultIndices " + intArrayToRangesString(spillHashMapResultIndices, spillCount) + - " hashMapResults " + Arrays.toString(Arrays.copyOfRange(hashMapResults, 0, hashMapResultCount))); + finishOuter(batch, matchSeriesCount, spillSeriesCount); } - - // We will generate results for all matching and non-matching rows. - finishOuter(batch, - allMatchCount, equalKeySeriesCount, atLeastOneNonMatch, - inputSelectedInUse, inputLogicalSize, - spillCount, hashMapResultCount); } if (batch.size > 0) { diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastBytesHashMap.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastBytesHashMap.java index d878f65..06069e1 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastBytesHashMap.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastBytesHashMap.java @@ -18,93 +18,216 @@ package org.apache.hadoop.hive.ql.exec.vector.mapjoin.fast; -import java.io.IOException; - import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import org.apache.hadoop.hive.ql.exec.JoinUtil; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinBytesHashMap; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashMapResult; -import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashMapResult; import org.apache.hadoop.io.BytesWritable; -import org.apache.hive.common.util.HashCodeUtil; - -import com.google.common.annotations.VisibleForTesting; /* * An bytes key hash map optimized for vector map join. 
* * This is the abstract base for the multi-key and string bytes key hash map implementations. */ -public abstract class VectorMapJoinFastBytesHashMap - extends VectorMapJoinFastBytesHashTable - implements VectorMapJoinBytesHashMap { +public abstract class VectorMapJoinFastBytesHashMap extends VectorMapJoinFastBytesHashTable { private static final Logger LOG = LoggerFactory.getLogger(VectorMapJoinFastBytesHashMap.class); - private VectorMapJoinFastValueStore valueStore; + private VectorMapJoinFastKeyAndValueStore keyAndValueStore; - protected BytesWritable testValueBytesWritable; + public void add(byte[] keyBytes, int keyStart, int keyLength, int hashCode, BytesWritable currentValue) { - @Override - public VectorMapJoinHashMapResult createHashMapResult() { - return new VectorMapJoinFastValueStore.HashMapResult(); - } + if (resizeThreshold <= keysAssigned) { + expandAndRehash(); + } - @Override - public void assignSlot(int slot, byte[] keyBytes, int keyStart, int keyLength, - long hashCode, boolean isNewKey, BytesWritable currentValue) { + int slot = (hashCode & logicalHashBucketMask); + long probeSlot = slot; + int i = 0; + int pairIndex; + long part1Word; + long part2Word = -1; + boolean isNewKey; + while (true) { + pairIndex = 2 * slot; + part1Word = slotPairs[pairIndex]; + if (part1Word == 0) { + isNewKey = true;; + break; + } + part2Word = slotPairs[pairIndex + 1]; + if (hashCode == VectorMapJoinFastKeyAndValueStore.getHashCode(part1Word) && + keyAndValueStore.equalKey(part1Word, part2Word, keyBytes, keyStart, keyLength)) { + isNewKey = false; + break; + } + + // Some other key (collision) - keep probing. + metricPutConflict++; + probeSlot += (++i); + slot = (int) (probeSlot & logicalHashBucketMask); + } + + if (largestNumberOfSteps < i) { + if (isLogDebugEnabled) { + LOG.debug("Probed " + i + " slots (the longest so far) to find space"); + } + largestNumberOfSteps = i; + // debugDumpKeyProbe(keyOffset, keyLength, hashCode, slot); + } byte[] valueBytes = currentValue.getBytes(); int valueLength = currentValue.getLength(); - int tripleIndex = 3 * slot; if (isNewKey) { // First entry. - slotTriples[tripleIndex] = keyStore.add(keyBytes, keyStart, keyLength); - slotTriples[tripleIndex + 1] = hashCode; - slotTriples[tripleIndex + 2] = valueStore.addFirst(valueBytes, 0, valueLength); - // LOG.debug("VectorMapJoinFastBytesHashMap add first keyRefWord " + Long.toHexString(slotTriples[tripleIndex]) + " hashCode " + Long.toHexString(slotTriples[tripleIndex + 1]) + " valueRefWord " + Long.toHexString(slotTriples[tripleIndex + 2])); + keyAndValueStore.addFirst(hashCode, keyBytes, keyStart, keyLength, + valueBytes, 0, valueLength); + + // Save 128 key and value reference that includes hash code. + slotPairs[pairIndex] = keyAndValueStore.addPart1Word; + slotPairs[pairIndex + 1] = keyAndValueStore.addPart2Word; + keysAssigned++; } else { // Add another value. - // LOG.debug("VectorMapJoinFastBytesHashMap add more keyRefWord " + Long.toHexString(slotTriples[tripleIndex]) + " hashCode " + Long.toHexString(slotTriples[tripleIndex + 1]) + " valueRefWord " + Long.toHexString(slotTriples[tripleIndex + 2])); - slotTriples[tripleIndex + 2] = valueStore.addMore(slotTriples[tripleIndex + 2], valueBytes, 0, valueLength); - // LOG.debug("VectorMapJoinFastBytesHashMap add more new valueRefWord " + Long.toHexString(slotTriples[tripleIndex + 2])); + keyAndValueStore.addMore(part1Word, part2Word, valueBytes, 0, valueLength); + + // Update. 
+ slotPairs[pairIndex] = keyAndValueStore.addPart1Word; } + numValues++; } @Override - public JoinUtil.JoinResult lookup(byte[] keyBytes, int keyStart, int keyLength, VectorMapJoinHashMapResult hashMapResult) { - VectorMapJoinFastValueStore.HashMapResult optimizedHashMapResult = - (VectorMapJoinFastValueStore.HashMapResult) hashMapResult; - - optimizedHashMapResult.forget(); + protected void expandAndRehashImpl(int capacity) { + + long expandTime = System.currentTimeMillis(); + int newLogicalHashBucketCount = capacity; + int newLogicalHashBucketMask = newLogicalHashBucketCount - 1; + int newMetricPutConflict = 0; + int newLargestNumberOfSteps = 0; + + int newSlotPairArraySize = newLogicalHashBucketCount * 2; + long[] newslotPairs = new long[newSlotPairArraySize]; + + for (int slot = 0; slot < logicalHashBucketCount; slot++) { + int pairIndex = slot * 2; + long part1Word = slotPairs[pairIndex]; + if (part1Word != 0) { + int hashCode = VectorMapJoinFastKeyAndValueStore.getHashCode(part1Word); + long part2Word = slotPairs[pairIndex + 1]; + + // Copy to new slot table. + int newSlot = hashCode & newLogicalHashBucketMask; + long newProbeSlot = newSlot; + int newPairIndex; + int i = 0; + while (true) { + newPairIndex = newSlot * 2; + long newPair1Word = newslotPairs[newPairIndex]; + if (newPair1Word == 0) { + break; + } + ++newMetricPutConflict; + // Some other key (collision) - keep probing. + newProbeSlot += (++i); + newSlot = (int)(newProbeSlot & newLogicalHashBucketMask); + } + + if (newLargestNumberOfSteps < i) { + if (isLogDebugEnabled) { + LOG.debug("Probed " + i + " slots (the longest so far) to find space"); + } + newLargestNumberOfSteps = i; + // debugDumpKeyProbe(keyOffset, keyLength, hashCode, slot); + } + + // Use old reference words. + newslotPairs[newPairIndex] = part1Word; + newslotPairs[newPairIndex + 1] = part2Word; + } + } - long hashCode = HashCodeUtil.murmurHash(keyBytes, keyStart, keyLength); - long valueRefWord = findReadSlot(keyBytes, keyStart, keyLength, hashCode, hashMapResult.getReadPos()); - JoinUtil.JoinResult joinResult; - if (valueRefWord == -1) { - joinResult = JoinUtil.JoinResult.NOMATCH; - } else { - // LOG.debug("VectorMapJoinFastBytesHashMap lookup hashCode " + Long.toHexString(hashCode) + " valueRefWord " + Long.toHexString(valueRefWord) + " (valueStore != null) " + (valueStore != null)); + slotPairs = newslotPairs; + logicalHashBucketCount = newLogicalHashBucketCount; + logicalHashBucketMask = newLogicalHashBucketMask; + metricPutConflict = newMetricPutConflict; + largestNumberOfSteps = newLargestNumberOfSteps; + resizeThreshold = (int)(logicalHashBucketCount * loadFactor); + metricExpandsMs += (System.currentTimeMillis() - expandTime); + metricExpands++; + } - optimizedHashMapResult.set(valueStore, valueRefWord); + protected int getLongsPerSlot() { + return 2; + } - joinResult = JoinUtil.JoinResult.MATCH; - } + /* + * The hash table slots. For a bytes key hash table, each slot is 2 longs and the array is + * 2X sized. + * + * The slot pair is a 128 bit key and value reference that includes 32 bits for the hash code.. 
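A compact, self-contained sketch of the expand-and-rehash pattern used by expandAndRehashImpl() above, under the simplifying assumption that each slot is a single long whose non-zero value doubles as the cached hash code. The point it illustrates: entries move into a doubled table using only the hash code already stored in the slot, with no key bytes re-read.

import java.util.Arrays;

public class RehashDemo {
  public static void main(String[] args) {
    // Old table: one long per slot holding a toy "hash code" (0 == empty).
    long[] oldSlots = { 0, 17, 0, 33, 5, 0, 0, 12 };
    int newCount = oldSlots.length * 2;          // doubled, still a power of two
    int newMask = newCount - 1;
    long[] newSlots = new long[newCount];

    for (long word : oldSlots) {
      if (word == 0) {
        continue;                                // empty slot, nothing to move
      }
      int hashCode = (int) word;                 // hash code is cached in the slot word
      int slot = hashCode & newMask;
      long probeSlot = slot;
      int i = 0;
      while (newSlots[slot] != 0) {              // probe with growing increments for a free slot
        probeSlot += (++i);
        slot = (int) (probeSlot & newMask);
      }
      newSlots[slot] = word;                     // reuse the old word unchanged
    }
    System.out.println(Arrays.toString(newSlots));
  }
}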
+ */ + protected long[] slotPairs; - optimizedHashMapResult.setJoinResult(joinResult); + @Override + public void allocateBucketArray() { + int slotPairArraySize = 2 * logicalHashBucketCount; + slotPairs = new long[slotPairArraySize]; + } - return joinResult; + @Override + public void hashMapLookup(byte[] keyBytes, int keyStart, int keyLength, + int hashCode, MapJoinHashMapResult hashMapResult) { + + VectorMapJoinFastKeyAndValueStore.HashMapResult fastHashMapResult = + (VectorMapJoinFastKeyAndValueStore.HashMapResult) hashMapResult; + + fastHashMapResult.forget(); + + int slot = (hashCode & logicalHashBucketMask); + long probeSlot = slot; + int i = 0; + int pairIndex; + long part1Word; + long part2Word = -1; + while (true) { + pairIndex = slot * 2; + part1Word = slotPairs[pairIndex]; + if (part1Word == 0) { + // Given that we do not delete, an empty slot means no match. + hashMapResult.setNoMatch(); + return; + } else if (hashCode == VectorMapJoinFastKeyAndValueStore.getHashCode(part1Word)) { + // Finally, verify the key bytes match. + part2Word = slotPairs[pairIndex + 1]; + fastHashMapResult.setKey(keyAndValueStore, part1Word, part2Word); + if (fastHashMapResult.equalKey(keyBytes, keyStart, keyLength)) { + fastHashMapResult.setMatch(); + return; + } + } + // Some other key (collision) - keep probing. + metricGetConflict++; + probeSlot += (++i); + if (i > largestNumberOfSteps) { + // We know we never went that far when we were inserting. + hashMapResult.setNoMatch(); + return; + } + slot = (int)(probeSlot & logicalHashBucketMask); + } } public VectorMapJoinFastBytesHashMap( - int initialCapacity, float loadFactor, int writeBuffersSize) { - super(initialCapacity, loadFactor, writeBuffersSize); + VectorMapJoinFastHashTableFactory mapJoinHashTableFactory, + int initialCapacity, float loadFactor, int writeBuffersSize, long memUsage) { + super(mapJoinHashTableFactory, initialCapacity, loadFactor, writeBuffersSize, memUsage); - valueStore = new VectorMapJoinFastValueStore(writeBuffersSize); + keyAndValueStore = new VectorMapJoinFastKeyAndValueStore(writeBuffersSize); + } - // Share the same write buffers with our value store. 
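A minimal sketch of the miss-detection rule hashMapLookup() relies on: because entries are never deleted, a lookup can stop as soon as it hits an empty slot, or once its probe distance exceeds largestNumberOfSteps (no insert ever probed further), so a miss never degenerates into a long scan.

public class LookupCutoffDemo {
  public static void main(String[] args) {
    // Toy 8-slot table of stored hash codes (0 == empty).
    // 42 homes at slot 2; 10 also homes at slot 2 and probed one step to slot 3,
    // so the longest probe distance seen during insertion is 1.
    long[] slots = { 0, 0, 42, 10, 0, 0, 0, 0 };
    int mask = slots.length - 1;
    int largestNumberOfSteps = 1;
    long missingHash = 18;                      // also homes at slot 2, but was never inserted

    int slot = (int) (missingHash & mask);
    long probeSlot = slot;
    int i = 0;
    while (true) {
      if (slots[slot] == 0) {
        System.out.println("no match (empty slot)");
        break;
      }
      if (slots[slot] == missingHash) {
        System.out.println("match at slot " + slot);
        break;
      }
      probeSlot += (++i);
      if (i > largestNumberOfSteps) {           // no insert ever probed this far: give up
        System.out.println("no match (probe cutoff)");
        break;
      }
      slot = (int) (probeSlot & mask);
    }
  }
}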
- keyStore = new VectorMapJoinFastKeyStore(valueStore.writeBuffers()); + @Override + public void debugDumpTable() { + // UNDONE } } diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastBytesHashMultiSet.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastBytesHashMultiSet.java index b328efd..b3dd90c 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastBytesHashMultiSet.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastBytesHashMultiSet.java @@ -22,13 +22,12 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import org.apache.hadoop.hive.ql.exec.JoinUtil; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinBytesHashMultiSet; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashMultiSetResult; -import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashMultiSetResult; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashMultiSetResultImpl; +import org.apache.hadoop.hive.serde2.WriteBuffers; import org.apache.hadoop.io.BytesWritable; -import org.apache.hive.common.util.HashCodeUtil; +import com.google.common.base.Preconditions; import com.google.common.annotations.VisibleForTesting; /* @@ -36,65 +35,312 @@ * * This is the abstract base for the multi-key and string bytes key hash multi-set implementations. */ -public abstract class VectorMapJoinFastBytesHashMultiSet - extends VectorMapJoinFastBytesHashTable - implements VectorMapJoinBytesHashMultiSet { +public abstract class VectorMapJoinFastBytesHashMultiSet extends VectorMapJoinFastBytesHashTable { - private static final Logger LOG = LoggerFactory.getLogger(VectorMapJoinFastBytesHashMultiSet.class); + private static final String CLASS_NAME = VectorMapJoinFastBytesHashMultiSet.class.getName(); + private static final Logger LOG = LoggerFactory.getLogger(CLASS_NAME); - @Override - public VectorMapJoinHashMultiSetResult createHashMultiSetResult() { - return new VectorMapJoinFastHashMultiSet.HashMultiSetResult(); + public static class HashMultiSetResult extends MapJoinHashMultiSetResultImpl { + + protected WriteBuffers.Position readPos; + + public HashMultiSetResult() { + super(); + readPos = new WriteBuffers.Position(); + } + + public WriteBuffers.Position getReadPos() { + return readPos; + } + } + + private VectorMapJoinFastKeyStore keyStore; + + private final static class HashCodeAndCounterWord { + + // Lowest field. + private final class HashCode { + private static final int bitLength = 32; + private static final long allBitsOn = (1L << bitLength) - 1; + private static final long bitMask = allBitsOn; + } + + public static int getHashCode(long hashCodeAndCounterWord) { + return (int) ((hashCodeAndCounterWord & HashCode.bitMask)); + } + + private final class Counter { + private static final int bitLength = 31; + private static final int allBitsOn = (1 << bitLength) - 1; + private static final int bitShift = HashCode.bitLength; + private static final long bitMask = ((long) allBitsOn) << bitShift; + } + + public static int getCounter(long hashCodeAndCounterWord) { + return (int) ((hashCodeAndCounterWord & Counter.bitMask) >> Counter.bitShift); + } + + // This bit should not be on for valid value references. We use -1 for a no value marker. 
+ private final class IsInvalidFlag { + private static final int bitShift = 63; + private static final long flagOnMask = 1L << bitShift; + } + + public static long newWord(int hashCode) { + long hashCodeAndCounterWord = ((long) hashCode) & HashCode.bitMask; + hashCodeAndCounterWord |= ((long) 1 << Counter.bitShift); + return hashCodeAndCounterWord; + } + + public static long incrementCounter(long hashCodeAndCounterWord) { + int counter = (int) ((hashCodeAndCounterWord & Counter.bitMask) >> Counter.bitShift); + hashCodeAndCounterWord &= ~Counter.bitMask; + hashCodeAndCounterWord |= ((long) counter + 1) << Counter.bitShift; + return hashCodeAndCounterWord; + } } @Override - public void assignSlot(int slot, byte[] keyBytes, int keyStart, int keyLength, - long hashCode, boolean isNewKey, BytesWritable currentValue) { + protected void add(byte[] keyBytes, int keyStart, int keyLength, int hashCode, + BytesWritable currentValue) { + + if (resizeThreshold <= keysAssigned) { + expandAndRehash(); + } + + int slot = (hashCode & logicalHashBucketMask); + long probeSlot = slot; + int i = 0; + int pairIndex; + long keyRef; + long hashCodeAndCounterWord = -1; + boolean isNewKey; + while (true) { + pairIndex = 2 * slot; + keyRef = slotPairs[pairIndex]; + if (keyRef == 0) { + isNewKey = true;; + break; + } + hashCodeAndCounterWord = slotPairs[pairIndex + 1]; + if (hashCode == HashCodeAndCounterWord.getHashCode(hashCodeAndCounterWord) && + keyStore.unsafeEqualKey(keyRef, keyBytes, keyStart, keyLength)) { + isNewKey = false; + break; + } + + // Some other key (collision) - keep probing. + metricPutConflict++; + probeSlot += (++i); + slot = (int) (probeSlot & logicalHashBucketMask); + } + + if (largestNumberOfSteps < i) { + if (isLogDebugEnabled) { + LOG.debug("Probed " + i + " slots (the longest so far) to find space"); + } + largestNumberOfSteps = i; + } - int tripleIndex = 3 * slot; if (isNewKey) { - // First entry. - slotTriples[tripleIndex] = keyStore.add(keyBytes, keyStart, keyLength); - slotTriples[tripleIndex + 1] = hashCode; - slotTriples[tripleIndex + 2] = 1; // Count. - // LOG.debug("VectorMapJoinFastBytesHashMap add first keyRefWord " + Long.toHexString(slotTriples[tripleIndex]) + " hashCode " + Long.toHexString(slotTriples[tripleIndex + 1]) + " valueRefWord " + Long.toHexString(slotTriples[tripleIndex + 2])); + slotPairs[pairIndex] = keyStore.add(keyBytes, keyStart, keyLength); + slotPairs[pairIndex + 1] = HashCodeAndCounterWord.newWord(hashCode); + keysAssigned++; } else { - // Add another value. 
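A standalone re-derivation (illustrative only, not the class itself) of the word layout HashCodeAndCounterWord describes above: the 32-bit hash code occupies the low bits, a 31-bit counter sits above it, a fresh word starts with the counter at 1, and incrementCounter bumps only the counter field.

public class HashCodeCounterWordDemo {
  private static final long HASH_MASK = (1L << 32) - 1;                     // low 32 bits: hash code
  private static final long COUNTER_MASK = ((long) ((1 << 31) - 1)) << 32;  // next 31 bits: counter

  static long newWord(int hashCode) {
    long word = ((long) hashCode) & HASH_MASK;
    return word | (1L << 32);                   // counter starts at 1
  }

  static long incrementCounter(long word) {
    long counter = (word & COUNTER_MASK) >>> 32;
    word &= ~COUNTER_MASK;
    return word | ((counter + 1) << 32);
  }

  static int getHashCode(long word) {
    return (int) (word & HASH_MASK);
  }

  static long getCounter(long word) {
    return (word & COUNTER_MASK) >>> 32;
  }

  public static void main(String[] args) {
    long word = newWord(-123456789);            // negative hash codes pack fine
    word = incrementCounter(incrementCounter(word));
    // Prints: hashCode -123456789 counter 3
    System.out.println("hashCode " + getHashCode(word) + " counter " + getCounter(word));
  }
}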
- // LOG.debug("VectorMapJoinFastBytesHashMap add more keyRefWord " + Long.toHexString(slotTriples[tripleIndex]) + " hashCode " + Long.toHexString(slotTriples[tripleIndex + 1]) + " valueRefWord " + Long.toHexString(slotTriples[tripleIndex + 2])); - slotTriples[tripleIndex + 2]++; + slotPairs[pairIndex + 1] = HashCodeAndCounterWord.incrementCounter(hashCodeAndCounterWord); } + numValues++; } @Override - public JoinUtil.JoinResult contains(byte[] keyBytes, int keyStart, int keyLength, - VectorMapJoinHashMultiSetResult hashMultiSetResult) { + protected void expandAndRehashImpl(int capacity) { - VectorMapJoinFastHashMultiSet.HashMultiSetResult optimizedHashMultiSetResult = - (VectorMapJoinFastHashMultiSet.HashMultiSetResult) hashMultiSetResult; + long expandTime = System.currentTimeMillis(); + int newLogicalHashBucketCount = capacity; + int newLogicalHashBucketMask = newLogicalHashBucketCount - 1; + int newMetricPutConflict = 0; + int newLargestNumberOfSteps = 0; - optimizedHashMultiSetResult.forget(); + int newSlotPairArraySize = newLogicalHashBucketCount * 3; + long[] newSlotPairs = new long[newSlotPairArraySize]; - long hashCode = HashCodeUtil.murmurHash(keyBytes, keyStart, keyLength); - long count = findReadSlot(keyBytes, keyStart, keyLength, hashCode, hashMultiSetResult.getReadPos()); - JoinUtil.JoinResult joinResult; - if (count == -1) { - joinResult = JoinUtil.JoinResult.NOMATCH; - } else { + int pairIndex; + long keyRef; + long hashCodeAndCounterWord = -1; + for (int slot = 0; slot < logicalHashBucketCount; slot++) { + pairIndex = slot * 2; + keyRef = slotPairs[pairIndex]; + if (keyRef != 0) { + + hashCodeAndCounterWord = slotPairs[pairIndex + 1]; + Preconditions.checkState( + (hashCodeAndCounterWord & HashCodeAndCounterWord.IsInvalidFlag.flagOnMask) == 0); - optimizedHashMultiSetResult.set(count); + // Copy to new slot table. + int hashCode = HashCodeAndCounterWord.getHashCode(hashCodeAndCounterWord); + int newSlot = hashCode & newLogicalHashBucketMask; + long newProbeSlot = newSlot; + int newPairIndex; + int i = 0; + while (true) { + newPairIndex = newSlot * 2; + long newKeyRef = newSlotPairs[newPairIndex]; + if (newKeyRef == 0) { + break; + } + ++newMetricPutConflict; + // Some other key (collision) - keep probing. + newProbeSlot += (++i); + newSlot = (int)(newProbeSlot & newLogicalHashBucketMask); + } - joinResult = JoinUtil.JoinResult.MATCH; + if (newLargestNumberOfSteps < i) { + if (isLogDebugEnabled) { + LOG.debug("Probed " + i + " slots (the longest so far) to find space"); + } + newLargestNumberOfSteps = i; + } + + // Use old words. + newSlotPairs[newPairIndex] = keyRef; + newSlotPairs[newPairIndex + 1] = hashCodeAndCounterWord; + } } - optimizedHashMultiSetResult.setJoinResult(joinResult); + slotPairs = newSlotPairs; + logicalHashBucketCount = newLogicalHashBucketCount; + logicalHashBucketMask = newLogicalHashBucketMask; + metricPutConflict = newMetricPutConflict; + largestNumberOfSteps = newLargestNumberOfSteps; + resizeThreshold = (int)(logicalHashBucketCount * loadFactor); + metricExpandsMs += (System.currentTimeMillis() - expandTime); + metricExpands++; + } + + protected int getLongsPerSlot() { + return 2; + } + + /* + * The hash table slots. For a bytes key hash counter table, each slot is 2 longs and the + * array is 2X sized. + * + * The slot pair is a 64 bit key reference and the 2nd 64 bit word has the 32 bits for the + * hash code and a 32 bit counter. 
+ */ + protected long[] slotPairs; + + @Override + public void allocateBucketArray() { + int slotPairArraySize = 2 * logicalHashBucketCount; + slotPairs = new long[slotPairArraySize]; + } - return joinResult; + public static String displayBytes(byte[] bytes, int start, int length) { + StringBuilder sb = new StringBuilder(); + for (int i = start; i < start + length; i++) { + char ch = (char) bytes[i]; + if (ch < ' ' || ch > '~') { + sb.append(String.format("\\%03d", bytes[i] & 0xff)); + } else { + sb.append(ch); + } + } + return sb.toString(); + } + + @Override + public void hashMultiSetContains(byte[] keyBytes, int keyStart, int keyLength, int hashCode, + MapJoinHashMultiSetResult hashMultiSetResult) { + + VectorMapJoinFastBytesHashMultiSet.HashMultiSetResult fastHashMultiSetResult = + (VectorMapJoinFastBytesHashMultiSet.HashMultiSetResult) hashMultiSetResult; + + fastHashMultiSetResult.forget(); + + int slot = (hashCode & logicalHashBucketMask); + long probeSlot = slot; + int i = 0; + int pairIndex; + long keyRef; + long hashCodeAndCounterWord; + while (true) { + pairIndex = slot * 2; + keyRef = slotPairs[pairIndex]; + if (keyRef == 0) { + // Given that we do not delete, an empty slot means no match. + fastHashMultiSetResult.setNoMatch(); + return; + } else { + hashCodeAndCounterWord = slotPairs[pairIndex + 1]; + + if (hashCode == HashCodeAndCounterWord.getHashCode(hashCodeAndCounterWord)) { + + // Finally, verify the key bytes match. + if (keyStore.equalKey(keyRef, keyBytes, keyStart, keyLength, fastHashMultiSetResult.getReadPos())) { + hashMultiSetResult.setMatch(HashCodeAndCounterWord.getCounter(hashCodeAndCounterWord)); + return; + } + } + } + // Some other key (collision) - keep probing. + metricGetConflict++; + probeSlot += (++i); + if (i > largestNumberOfSteps) { + // We know we never went that far when we were inserting. 
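A self-contained copy of the displayBytes() helper above together with a usage example: printable ASCII is emitted as-is and everything else becomes a \NNN decimal escape, which is what the debug-dump code prints for raw key bytes.

public class DisplayBytesDemo {
  // Standalone copy of the displayBytes helper from the patch, for illustration only.
  public static String displayBytes(byte[] bytes, int start, int length) {
    StringBuilder sb = new StringBuilder();
    for (int i = start; i < start + length; i++) {
      char ch = (char) bytes[i];
      if (ch < ' ' || ch > '~') {
        sb.append(String.format("\\%03d", bytes[i] & 0xff));   // escape non-printable bytes
      } else {
        sb.append(ch);
      }
    }
    return sb.toString();
  }

  public static void main(String[] args) {
    byte[] bytes = { 'k', 'e', 'y', 0, (byte) 0xff };
    System.out.println(displayBytes(bytes, 0, bytes.length));  // prints: key\000\255
  }
}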
+ hashMultiSetResult.setNoMatch(); + return; + } + slot = (int)(probeSlot & logicalHashBucketMask); + } } public VectorMapJoinFastBytesHashMultiSet( - int initialCapacity, float loadFactor, int writeBuffersSize) { - super(initialCapacity, loadFactor, writeBuffersSize); + VectorMapJoinFastHashTableFactory mapJoinHashTableFactory, + int initialCapacity, float loadFactor, int writeBuffersSize, long memUsage) { + super(mapJoinHashTableFactory, initialCapacity, loadFactor, writeBuffersSize, memUsage); keyStore = new VectorMapJoinFastKeyStore(writeBuffersSize); } + + @Override + public long memorySize() { + return keyStore.writeBuffers().size() + slotPairs.length * (Long.SIZE/Byte.SIZE) + 100; + } + + @Override + public void clear() { + // UNDONE + } + + @Override + public void debugDumpTable() { + StringBuilder dump = new StringBuilder(keysAssigned + " keys\n"); + + int pairIndex; + long keyRef; + int hashCode; + long hashCodeAndCounterWord; + int examined = 0; + for (int i = 0; i < logicalHashBucketCount; i++) { + pairIndex = i * 2; + keyRef = slotPairs[pairIndex]; + if (keyRef != 0) { + hashCodeAndCounterWord = slotPairs[pairIndex + 1]; + examined++; + hashCode = HashCodeAndCounterWord.getHashCode(hashCodeAndCounterWord); + dump.append("slot "); + dump.append(i); + dump.append(" hashCode "); + dump.append(Integer.toHexString(hashCode)); + dump.append("\n"); + + // UNDONE: Dump key and count + } + } + if (examined != keysAssigned) { + dump.append("Found " + examined + " keys!\n"); + } + LOG.info("Vector MapJoin Fast BytesHashMultiSet dump:\n " + dump.toString()); + } } diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastBytesHashSet.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastBytesHashSet.java index c9b23bf..ce82b4f 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastBytesHashSet.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastBytesHashSet.java @@ -20,68 +20,258 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import org.apache.hadoop.hive.ql.exec.JoinUtil; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinBytesHashSet; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashSetResult; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashSetResult; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashSetResultImpl; +import org.apache.hadoop.hive.serde2.WriteBuffers; import org.apache.hadoop.io.BytesWritable; -import org.apache.hive.common.util.HashCodeUtil; /* * An bytes key hash set optimized for vector map join. * * This is the abstract base for the multi-key and string bytes key hash set implementations. 
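A rough, standalone illustration of the slot-array term in memorySize() above: each bucket contributes two longs, so the array alone is 16 bytes per bucket. The full estimate in the patch also adds keyStore.writeBuffers().size() and a small constant.

public class SlotArrayMemoryDemo {
  public static void main(String[] args) {
    int logicalHashBucketCount = 1 << 20;               // 1M buckets, illustrative only
    long[] slotPairs = new long[2 * logicalHashBucketCount];

    // Mirrors the slot-array part of memorySize(): number of longs times 8 bytes.
    long slotArrayBytes = slotPairs.length * (Long.SIZE / Byte.SIZE);
    System.out.println("slot array bytes " + slotArrayBytes);   // 16 MB for 2 longs per bucket
  }
}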
*/ -public abstract class VectorMapJoinFastBytesHashSet - extends VectorMapJoinFastBytesHashTable - implements VectorMapJoinBytesHashSet { +public abstract class VectorMapJoinFastBytesHashSet extends VectorMapJoinFastBytesHashTable { private static final Logger LOG = LoggerFactory.getLogger(VectorMapJoinFastBytesHashSet.class); - @Override - public VectorMapJoinHashSetResult createHashSetResult() { - return new VectorMapJoinFastHashSet.HashSetResult(); + public static class HashSetResult extends MapJoinHashSetResultImpl { + + protected WriteBuffers.Position readPos; + + public HashSetResult() { + super(); + readPos = new WriteBuffers.Position(); + } + + public WriteBuffers.Position getReadPos() { + return readPos; + } } + private VectorMapJoinFastKeyStore keyStore; + @Override - public void assignSlot(int slot, byte[] keyBytes, int keyStart, int keyLength, - long hashCode, boolean isNewKey, BytesWritable currentValue) { + protected void add(byte[] keyBytes, int keyStart, int keyLength, int hashCode, + BytesWritable currentValue) { + + if (resizeThreshold <= keysAssigned) { + expandAndRehash(); + } + + int slot = (hashCode & logicalHashBucketMask); + long probeSlot = slot; + int i = 0; + int pairIndex; + long keyRef; + boolean isNewKey; + long longHashCode = (long) hashCode; // Allow sign extension. + while (true) { + pairIndex = 2 * slot; + keyRef = slotPairs[pairIndex]; + if (keyRef == 0) { + isNewKey = true;; + break; + } + + if (longHashCode == slotPairs[pairIndex + 1] && + keyStore.unsafeEqualKey(keyRef, keyBytes, keyStart, keyLength)) { + isNewKey = false; + break; + } + + // Some other key (collision) - keep probing. + metricPutConflict++; + probeSlot += (++i); + slot = (int) (probeSlot & logicalHashBucketMask); + } + + if (largestNumberOfSteps < i) { + if (isLogDebugEnabled) { + LOG.debug("Probed " + i + " slots (the longest so far) to find space"); + } + largestNumberOfSteps = i; + } - int tripleIndex = 3 * slot; if (isNewKey) { - // First entry. - slotTriples[tripleIndex] = keyStore.add(keyBytes, keyStart, keyLength); - slotTriples[tripleIndex + 1] = hashCode; - slotTriples[tripleIndex + 2] = 1; // Existence + slotPairs[pairIndex] = keyStore.add(keyBytes, keyStart, keyLength); + slotPairs[pairIndex + 1] = longHashCode; + keysAssigned++; + } else { + // Entry exists. 
} + numValues++; } @Override - public JoinUtil.JoinResult contains(byte[] keyBytes, int keyStart, int keyLength, - VectorMapJoinHashSetResult hashSetResult) { + protected void expandAndRehashImpl(int capacity) { - VectorMapJoinFastHashSet.HashSetResult optimizedHashSetResult = - (VectorMapJoinFastHashSet.HashSetResult) hashSetResult; + long expandTime = System.currentTimeMillis(); + int newLogicalHashBucketCount = capacity; + int newLogicalHashBucketMask = newLogicalHashBucketCount - 1; + int newMetricPutConflict = 0; + int newLargestNumberOfSteps = 0; - optimizedHashSetResult.forget(); + int newSlotPairArraySize = newLogicalHashBucketCount * 3; + long[] newSlotPairs = new long[newSlotPairArraySize]; - long hashCode = HashCodeUtil.murmurHash(keyBytes, keyStart, keyLength); - long existance = findReadSlot(keyBytes, keyStart, keyLength, hashCode, hashSetResult.getReadPos()); - JoinUtil.JoinResult joinResult; - if (existance == -1) { - joinResult = JoinUtil.JoinResult.NOMATCH; - } else { - joinResult = JoinUtil.JoinResult.MATCH; + int pairIndex; + long keyRef; + long longHashCode = -1; + int intHashCode = -1; + for (int slot = 0; slot < logicalHashBucketCount; slot++) { + pairIndex = slot * 2; + keyRef = slotPairs[pairIndex]; + if (keyRef != 0) { + + longHashCode = slotPairs[pairIndex + 1]; + intHashCode = (int) longHashCode; + + // Copy to new slot table. + int newSlot = intHashCode & newLogicalHashBucketMask; + long newProbeSlot = newSlot; + int newPairIndex; + int i = 0; + while (true) { + newPairIndex = newSlot * 2; + long newKeyRef = newSlotPairs[newPairIndex]; + if (newKeyRef == 0) { + break; + } + ++newMetricPutConflict; + // Some other key (collision) - keep probing. + newProbeSlot += (++i); + newSlot = (int)(newProbeSlot & newLogicalHashBucketMask); + } + + if (newLargestNumberOfSteps < i) { + if (isLogDebugEnabled) { + LOG.debug("Probed " + i + " slots (the longest so far) to find space"); + } + newLargestNumberOfSteps = i; + } + + // Use old words. + newSlotPairs[newPairIndex] = keyRef; + newSlotPairs[newPairIndex + 1] = longHashCode; + } } - optimizedHashSetResult.setJoinResult(joinResult); + slotPairs = newSlotPairs; + logicalHashBucketCount = newLogicalHashBucketCount; + logicalHashBucketMask = newLogicalHashBucketMask; + metricPutConflict = newMetricPutConflict; + largestNumberOfSteps = newLargestNumberOfSteps; + resizeThreshold = (int)(logicalHashBucketCount * loadFactor); + metricExpandsMs += (System.currentTimeMillis() - expandTime); + metricExpands++; + } + + protected int getLongsPerSlot() { + return 2; + } + + /* + * The hash table slots. For a bytes key hash counter table, each slot is 2 longs and the + * array is 2X sized. + * + * The slot pair is a 64 bit key reference and the 2nd 64 bit word has the 32 bits for the + * hash code. + */ + protected long[] slotPairs; + + @Override + public void allocateBucketArray() { + int slotPairArraySize = 2 * logicalHashBucketCount; + slotPairs = new long[slotPairArraySize]; + } + + @Override + public void hashSetContains(byte[] keyBytes, int keyStart, int keyLength, int hashCode, + MapJoinHashSetResult hashSetResult) { + + VectorMapJoinFastBytesHashSet.HashSetResult fastHashSetResult = + (VectorMapJoinFastBytesHashSet.HashSetResult) hashSetResult; + + fastHashSetResult.forget(); - return joinResult; + int slot = (hashCode & logicalHashBucketMask); + long probeSlot = slot; + int i = 0; + int pairIndex; + long keyRef; + long longHashCode = (long) hashCode; // Allow sign extension. 
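A small sketch of why the "Allow sign extension" cast works for the hash-set slots: add() stores the sign-extended int hash as a full long, contains() promotes the probe hash the same way before comparing, and rehashing recovers the original 32-bit value by narrowing.

public class SignExtendedHashDemo {
  public static void main(String[] args) {
    int hashCode = -1935810327;                 // e.g. a negative 32-bit murmur hash
    long stored = (long) hashCode;              // what the hash-set slot stores (sign extended)

    // A later lookup with the same int hash code compares equal after the same promotion.
    long probe = (long) hashCode;
    System.out.println(stored == probe);        // true

    // Rehashing recovers the original 32-bit value by narrowing.
    int recovered = (int) stored;
    System.out.println(recovered == hashCode);  // true
  }
}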
+ while (true) { + pairIndex = slot * 2; + keyRef = slotPairs[pairIndex]; + if (keyRef == 0) { + // Given that we do not delete, an empty slot means no match. + fastHashSetResult.setNoMatch(); + return; + } else if (longHashCode == slotPairs[pairIndex + 1]) { + // Finally, verify the key bytes match. + if (keyStore.equalKey(keyRef, keyBytes, keyStart, keyLength, fastHashSetResult.getReadPos())) { + hashSetResult.setMatch(); + return; + } + } + // Some other key (collision) - keep probing. + metricGetConflict++; + probeSlot += (++i); + if (i > largestNumberOfSteps) { + // We know we never went that far when we were inserting. + hashSetResult.setNoMatch(); + return; + } + slot = (int)(probeSlot & logicalHashBucketMask); + } } public VectorMapJoinFastBytesHashSet( - int initialCapacity, float loadFactor, int writeBuffersSize) { - super(initialCapacity, loadFactor, writeBuffersSize); + VectorMapJoinFastHashTableFactory mapJoinHashTableFactory, + int initialCapacity, float loadFactor, int writeBuffersSize, long memUsage) { + super(mapJoinHashTableFactory, initialCapacity, loadFactor, writeBuffersSize, memUsage); keyStore = new VectorMapJoinFastKeyStore(writeBuffersSize); } + + @Override + public long memorySize() { + return keyStore.writeBuffers().size() + slotPairs.length * (Long.SIZE/Byte.SIZE) + 100; + } + + @Override + public void clear() { + // UNDONE + } + + @Override + public void debugDumpTable() { + StringBuilder dump = new StringBuilder(keysAssigned + " keys\n"); + + int pairIndex; + long keyRef; + long longHashCode; + int hashCode; + int examined = 0; + for (int i = 0; i < logicalHashBucketCount; i++) { + pairIndex = i * 2; + keyRef = slotPairs[pairIndex]; + if (keyRef != 0) { + longHashCode = slotPairs[pairIndex + 1]; + examined++; + hashCode = (int) longHashCode; + dump.append("slot "); + dump.append(i); + dump.append(" hashCode "); + dump.append(Integer.toHexString(hashCode)); + dump.append("\n"); + + // UNDONE: Dump key + } + } + if (examined != keysAssigned) { + dump.append("Found " + examined + " keys!\n"); + } + LOG.info("Vector MapJoin Fast BytesHashSet dump:\n " + dump.toString()); + } } diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastBytesHashTable.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastBytesHashTable.java index dc0476b..cf3b517 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastBytesHashTable.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastBytesHashTable.java @@ -22,9 +22,7 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinBytesHashTable; import org.apache.hadoop.hive.ql.metadata.HiveException; -import org.apache.hadoop.hive.serde2.WriteBuffers; import org.apache.hadoop.io.BytesWritable; import org.apache.hive.common.util.HashCodeUtil; @@ -33,185 +31,31 @@ /* * An single byte array value hash map optimized for vector map join. 
*/ -public abstract class VectorMapJoinFastBytesHashTable - extends VectorMapJoinFastHashTable - implements VectorMapJoinBytesHashTable { +public abstract class VectorMapJoinFastBytesHashTable extends VectorMapJoinFastHashTable { private static final Logger LOG = LoggerFactory.getLogger(VectorMapJoinFastBytesHashTable.class); - private final boolean isLogDebugEnabled = LOG.isDebugEnabled(); + public final boolean isLogDebugEnabled = LOG.isDebugEnabled(); - protected VectorMapJoinFastKeyStore keyStore; + protected BytesWritable testValueBytesWritable; - protected BytesWritable testKeyBytesWritable; - - @Override - public void putRow(BytesWritable currentKey, BytesWritable currentValue) throws HiveException, IOException { - // No deserialization of key(s) here -- just get reference to bytes. - byte[] keyBytes = currentKey.getBytes(); - int keyLength = currentKey.getLength(); - add(keyBytes, 0, keyLength, currentValue); - } - - protected abstract void assignSlot(int slot, byte[] keyBytes, int keyStart, int keyLength, - long hashCode, boolean isNewKey, BytesWritable currentValue); - - public void add(byte[] keyBytes, int keyStart, int keyLength, BytesWritable currentValue) { - - if (resizeThreshold <= keysAssigned) { - expandAndRehash(); - } - - long hashCode = HashCodeUtil.murmurHash(keyBytes, keyStart, keyLength); - int intHashCode = (int) hashCode; - int slot = (intHashCode & logicalHashBucketMask); - long probeSlot = slot; - int i = 0; - boolean isNewKey; - while (true) { - int tripleIndex = 3 * slot; - if (slotTriples[tripleIndex] == 0) { - // LOG.debug("VectorMapJoinFastBytesHashMap findWriteSlot slot " + slot + " tripleIndex " + tripleIndex + " empty"); - isNewKey = true;; - break; - } - if (hashCode == slotTriples[tripleIndex + 1] && - keyStore.unsafeEqualKey(slotTriples[tripleIndex], keyBytes, keyStart, keyLength)) { - // LOG.debug("VectorMapJoinFastBytesHashMap findWriteSlot slot " + slot + " tripleIndex " + tripleIndex + " existing"); - isNewKey = false; - break; - } - // TODO - ++metricPutConflict; - // Some other key (collision) - keep probing. - probeSlot += (++i); - slot = (int) (probeSlot & logicalHashBucketMask); - } - - if (largestNumberOfSteps < i) { - if (isLogDebugEnabled) { - LOG.debug("Probed " + i + " slots (the longest so far) to find space"); - } - largestNumberOfSteps = i; - // debugDumpKeyProbe(keyOffset, keyLength, hashCode, slot); - } - - assignSlot(slot, keyBytes, keyStart, keyLength, hashCode, isNewKey, currentValue); - - if (isNewKey) { - keysAssigned++; - } - } - - private void expandAndRehash() { - - int newLogicalHashBucketCount = logicalHashBucketCount * 2; - int newLogicalHashBucketMask = newLogicalHashBucketCount - 1; - int newMetricPutConflict = 0; - int newLargestNumberOfSteps = 0; - - int newSlotTripleArraySize = newLogicalHashBucketCount * 3; - long[] newSlotTriples = new long[newSlotTripleArraySize]; - - for (int slot = 0; slot < logicalHashBucketCount; slot++) { - int tripleIndex = slot * 3; - long keyRef = slotTriples[tripleIndex]; - if (keyRef != 0) { - long hashCode = slotTriples[tripleIndex + 1]; - long valueRef = slotTriples[tripleIndex + 2]; - - // Copy to new slot table. - int intHashCode = (int) hashCode; - int newSlot = intHashCode & newLogicalHashBucketMask; - long newProbeSlot = newSlot; - int newTripleIndex; - int i = 0; - while (true) { - newTripleIndex = newSlot * 3; - long newKeyRef = newSlotTriples[newTripleIndex]; - if (newKeyRef == 0) { - break; - } - ++newMetricPutConflict; - // Some other key (collision) - keep probing. 
- newProbeSlot += (++i); - newSlot = (int)(newProbeSlot & newLogicalHashBucketMask); - } - - if (newLargestNumberOfSteps < i) { - if (isLogDebugEnabled) { - LOG.debug("Probed " + i + " slots (the longest so far) to find space"); - } - newLargestNumberOfSteps = i; - // debugDumpKeyProbe(keyOffset, keyLength, hashCode, slot); - } - - // Use old value reference word. - // LOG.debug("VectorMapJoinFastLongHashTable expandAndRehash key " + tableKey + " slot " + newSlot + " newPairIndex " + newPairIndex + " empty slot (i = " + i + ")"); - - newSlotTriples[newTripleIndex] = keyRef; - newSlotTriples[newTripleIndex + 1] = hashCode; - newSlotTriples[newTripleIndex + 2] = valueRef; - } + @VisibleForTesting + public void putRow(byte[] currentKey, byte[] currentValue) throws HiveException, IOException { + if (testValueBytesWritable == null) { + testValueBytesWritable = new BytesWritable(); } - - slotTriples = newSlotTriples; - logicalHashBucketCount = newLogicalHashBucketCount; - logicalHashBucketMask = newLogicalHashBucketMask; - metricPutConflict = newMetricPutConflict; - largestNumberOfSteps = newLargestNumberOfSteps; - resizeThreshold = (int)(logicalHashBucketCount * loadFactor); - metricExpands++; - // LOG.debug("VectorMapJoinFastLongHashTable expandAndRehash new logicalHashBucketCount " + logicalHashBucketCount + " resizeThreshold " + resizeThreshold + " metricExpands " + metricExpands); + testValueBytesWritable.set(currentValue, 0, currentValue.length); + int hashCode = HashCodeUtil.murmurHash(currentKey, 0, currentKey.length); + add(currentKey, 0, currentKey.length, hashCode, testValueBytesWritable); } - protected final long findReadSlot( - byte[] keyBytes, int keyStart, int keyLength, long hashCode, WriteBuffers.Position readPos) { - - int intHashCode = (int) hashCode; - int slot = (intHashCode & logicalHashBucketMask); - long probeSlot = slot; - int i = 0; - while (true) { - int tripleIndex = slot * 3; - // LOG.debug("VectorMapJoinFastBytesHashMap findReadSlot slot keyRefWord " + Long.toHexString(slotTriples[tripleIndex]) + " hashCode " + Long.toHexString(hashCode) + " entry hashCode " + Long.toHexString(slotTriples[tripleIndex + 1]) + " valueRefWord " + Long.toHexString(slotTriples[tripleIndex + 2])); - if (slotTriples[tripleIndex] == 0) { - // Given that we do not delete, an empty slot means no match. - return -1; - } else if (hashCode == slotTriples[tripleIndex + 1]) { - // Finally, verify the key bytes match. - - if (keyStore.equalKey(slotTriples[tripleIndex], keyBytes, keyStart, keyLength, readPos)) { - return slotTriples[tripleIndex + 2]; - } - } - // Some other key (collision) - keep probing. - probeSlot += (++i); - if (i > largestNumberOfSteps) { - // We know we never went that far when we were inserting. - return -1; - } - slot = (int)(probeSlot & logicalHashBucketMask); - } - } - - /* - * The hash table slots. For a bytes key hash table, each slot is 3 longs and the array is - * 3X sized. - * - * The slot triple is 1) a non-zero reference word to the key bytes, 2) the key hash code, and - * 3) a non-zero reference word to the first value bytes. 
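A usage-style sketch of the new @VisibleForTesting put path above: a single BytesWritable is lazily allocated and reused per row, and the key is hashed once with HashCodeUtil.murmurHash before the add. The class name is hypothetical and the println stands in for the real call into add().

import java.nio.charset.StandardCharsets;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hive.common.util.HashCodeUtil;

public class PutRowSketch {
  private static BytesWritable testValueBytesWritable;

  static void putRow(byte[] currentKey, byte[] currentValue) {
    if (testValueBytesWritable == null) {
      testValueBytesWritable = new BytesWritable();       // allocate once, reuse per row
    }
    testValueBytesWritable.set(currentValue, 0, currentValue.length);
    int hashCode = HashCodeUtil.murmurHash(currentKey, 0, currentKey.length);
    System.out.println("key hash " + hashCode + " value length " + testValueBytesWritable.getLength());
    // A real table would now call add(currentKey, 0, currentKey.length, hashCode, testValueBytesWritable).
  }

  public static void main(String[] args) {
    putRow("key1".getBytes(StandardCharsets.UTF_8), "value1".getBytes(StandardCharsets.UTF_8));
  }
}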
- */ - protected long[] slotTriples; - - private void allocateBucketArray() { - int slotTripleArraySize = 3 * logicalHashBucketCount; - slotTriples = new long[slotTripleArraySize]; - } + protected abstract void add(byte[] keyBytes, int keyStart, int keyLength, int hashCode, + BytesWritable currentValue); public VectorMapJoinFastBytesHashTable( - int initialCapacity, float loadFactor, int writeBuffersSize) { - super(initialCapacity, loadFactor, writeBuffersSize); + VectorMapJoinFastHashTableFactory mapJoinHashTableFactory, + int initialCapacity, float loadFactor, int writeBuffersSize, long memUsage) { + super(mapJoinHashTableFactory, initialCapacity, loadFactor, writeBuffersSize, memUsage); allocateBucketArray(); } } diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastBytesHashUtil.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastBytesHashUtil.java index 80126ad..d0e303e 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastBytesHashUtil.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastBytesHashUtil.java @@ -18,8 +18,6 @@ package org.apache.hadoop.hive.ql.exec.vector.mapjoin.fast; -import org.apache.hadoop.hive.serde2.WriteBuffers; - public class VectorMapJoinFastBytesHashUtil { public static String displayBytes(byte[] bytes, int start, int length) { diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastHashMap.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastHashMap.java index 262b619..836d0d8 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastHashMap.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastHashMap.java @@ -18,21 +18,12 @@ package org.apache.hadoop.hive.ql.exec.vector.mapjoin.fast; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashMap; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashMapResult; - -public abstract class VectorMapJoinFastHashMap - extends VectorMapJoinFastHashTable - implements VectorMapJoinHashMap { - - @Override - public VectorMapJoinHashMapResult createHashMapResult() { - return new VectorMapJoinFastValueStore.HashMapResult(); - } +public abstract class VectorMapJoinFastHashMap extends VectorMapJoinFastHashTable { public VectorMapJoinFastHashMap( - boolean isOuterJoin, - int initialCapacity, float loadFactor, int writeBuffersSize) { - super(initialCapacity, loadFactor, writeBuffersSize); + VectorMapJoinFastHashTableFactory mapJoinHashTableFactory, + boolean isOuterJoin, + int initialCapacity, float loadFactor, int writeBuffersSize, int maxProbeSize) { + super(mapJoinHashTableFactory, initialCapacity, loadFactor, writeBuffersSize, maxProbeSize); } } \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastHashMultiSet.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastHashMultiSet.java index 5f7c6a7..a0c11e1 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastHashMultiSet.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastHashMultiSet.java @@ -18,31 +18,12 @@ package org.apache.hadoop.hive.ql.exec.vector.mapjoin.fast; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashMultiSet; -import 
org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashMultiSetResult; - -public abstract class VectorMapJoinFastHashMultiSet - extends VectorMapJoinFastHashTable implements VectorMapJoinHashMultiSet { - - @Override - public VectorMapJoinHashMultiSetResult createHashMultiSetResult() { - return new HashMultiSetResult(); - } - - public static class HashMultiSetResult extends VectorMapJoinHashMultiSetResult { - - HashMultiSetResult() { - super(); - } - - public void set(long count) { - this.count = count; - } - } +public abstract class VectorMapJoinFastHashMultiSet extends VectorMapJoinFastHashTable { public VectorMapJoinFastHashMultiSet( - boolean isOuterJoin, - int initialCapacity, float loadFactor, int writeBuffersSize) { - super(initialCapacity, loadFactor, writeBuffersSize); + VectorMapJoinFastHashTableFactory mapJoinHashTableFactory, + boolean isOuterJoin, + int initialCapacity, float loadFactor, int writeBuffersSize, int maxProbeSize) { + super(mapJoinHashTableFactory, initialCapacity, loadFactor, writeBuffersSize, maxProbeSize); } } \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastHashSet.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastHashSet.java index 8509971..d85d5c8 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastHashSet.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastHashSet.java @@ -18,27 +18,12 @@ package org.apache.hadoop.hive.ql.exec.vector.mapjoin.fast; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashSet; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashSetResult; - -public abstract class VectorMapJoinFastHashSet - extends VectorMapJoinFastHashTable implements VectorMapJoinHashSet { - - @Override - public VectorMapJoinHashSetResult createHashSetResult() { - return new HashSetResult(); - } - - public static class HashSetResult extends VectorMapJoinHashSetResult { - - HashSetResult() { - super(); - } - } +public abstract class VectorMapJoinFastHashSet extends VectorMapJoinFastHashTable { public VectorMapJoinFastHashSet( - boolean isOuterJoin, - int initialCapacity, float loadFactor, int writeBuffersSize) { - super(initialCapacity, loadFactor, writeBuffersSize); + VectorMapJoinFastHashTableFactory mapJoinHashTableFactory, + boolean isOuterJoin, + int initialCapacity, float loadFactor, int writeBuffersSize, int maxProbeSize) { + super(mapJoinHashTableFactory, initialCapacity, loadFactor, writeBuffersSize, maxProbeSize); } } \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastHashTable.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastHashTable.java index 7df9eed..24dd5b4 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastHashTable.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastHashTable.java @@ -18,24 +18,69 @@ package org.apache.hadoop.hive.ql.exec.vector.mapjoin.fast; +import java.io.IOException; + import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashTable; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashMapResult; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashMultiSetResult; 
+import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashSetResult; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTable; +import org.apache.hadoop.hive.serde2.SerDeException; + +/** + * Different Implementation Variations for the fast Vector Map Join hash tables: + * + * Key Hash Table Kind Key and/or Value Class longs/Slot Comments + * --- --------------- ---------------------- ---------- --------------------------------- + * + * Bytes HashMap ~KeyAndValueStore 2 96-bit key and value store + * reference plus 32 bits for hash + * code. + * + * HashMultiSet ~KeyStore 2 64 bit key store reference. + * 2nd 64 bits has 32 bit hash code + * and 31 bit multi-set counter. + * + * HashSet ~KeyStore 2 64 bit key store reference. + * 2nd 64 bits just has 32 bit hash + * code. Existence for set is + * implicit with non-zero key store + * reference. + * + * Long HashMap ~ValueStore 2 64 bit value store reference. + * 2nd 64 bits has long key. + * + * HashMultiSet (none) 2 64 bits has 31 bit multi-set + * counter. 2nd 64 bits has long + * key. + * + * HashSet (none) 2 64 bits has 1 bit of existence + * for set. 2nd 64 bits has long + * key. + * + */ +public abstract class VectorMapJoinFastHashTable implements MapJoinHashTable { -public abstract class VectorMapJoinFastHashTable implements VectorMapJoinHashTable { public static final Logger LOG = LoggerFactory.getLogger(VectorMapJoinFastHashTable.class); + protected VectorMapJoinFastHashTableFactory mapJoinHashTableFactory; + protected int logicalHashBucketCount; protected int logicalHashBucketMask; protected float loadFactor; - protected final int writeBuffersSize; + protected int writeBuffersSize; - protected int metricPutConflict; protected int largestNumberOfSteps; protected int keysAssigned; + protected int numValues; protected int resizeThreshold; + + protected int metricPutConflict; + protected int metricGetConflict; protected int metricExpands; + protected int metricExpandsMs; private static void validateCapacity(long capacity) { if (Long.bitCount(capacity) != 1) { @@ -46,12 +91,19 @@ private static void validateCapacity(long capacity) { } } - private static int nextHighestPowerOfTwo(int v) { + protected static int nextHighestPowerOfTwo(int v) { return Integer.highestOneBit(v) << 1; } + protected abstract int getLongsPerSlot(); + + protected abstract void allocateBucketArray(); + public VectorMapJoinFastHashTable( - int initialCapacity, float loadFactor, int writeBuffersSize) { + VectorMapJoinFastHashTableFactory mapJoinHashTableFactory, + int initialCapacity, float loadFactor, int writeBuffersSize, long memUsage) { + + this.mapJoinHashTableFactory = mapJoinHashTableFactory; initialCapacity = (Long.bitCount(initialCapacity) == 1) ? initialCapacity : nextHighestPowerOfTwo(initialCapacity); @@ -64,10 +116,151 @@ public VectorMapJoinFastHashTable( this.loadFactor = loadFactor; this.writeBuffersSize = writeBuffersSize; + + keysAssigned = 0; + numValues = 0; + + metricPutConflict = 0; + metricGetConflict = 0; + metricExpands = 0; + metricExpandsMs= 0; + + allocateBucketArray(); + } + + protected void expandAndRehash() { + expandAndRehashImpl(logicalHashBucketCount << 1); + } + + protected abstract void expandAndRehashImpl(int capacity); + + @Override + public void expandAndRehashToTarget(int estimateNewRowCount) { + int oldCount = logicalHashBucketCount; + int newCount = oldCount + estimateNewRowCount; + if (resizeThreshold <= newCount) { + newCount = + (Long.bitCount(newCount) == 1) ? 
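A standalone sketch of the capacity rounding the base-class constructor performs with nextHighestPowerOfTwo(): a requested capacity that is already a power of two is kept, anything else is rounded up to the next power of two so the bucket mask stays valid.

public class CapacityRoundingDemo {
  static int nextHighestPowerOfTwo(int v) {
    return Integer.highestOneBit(v) << 1;       // e.g. 1000 -> 1024
  }

  public static void main(String[] args) {
    int[] requested = { 1000, 1024, 100000 };
    for (int initialCapacity : requested) {
      int capacity = (Long.bitCount(initialCapacity) == 1)
          ? initialCapacity                     // already a power of two, keep it
          : nextHighestPowerOfTwo(initialCapacity);
      System.out.println(initialCapacity + " -> " + capacity);
    }
  }
}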
estimateNewRowCount : nextHighestPowerOfTwo(newCount); + expandAndRehashImpl(newCount); + LOG.info("Expand and rehash to " + newCount + " from " + oldCount); + } } + /** + * Number of keys in the hashmap + * @return number of keys + */ @Override public int size() { return keysAssigned; } + + /** + * Number of values in the hashmap + * This is equal to or bigger than number of keys, since some values may share the same key + * @return number of values + */ + @Override + public int getNumValues() { + return numValues; + } + + @Override + public void seal() { + // Nothing to seal in base class. + } + + @Override + public void clear() { + // This will make the object completely unusable. Semantics of clear are not defined... + this.keysAssigned = 0; + this.numValues = 0; + } + + //---------------------------- COMMON LONG METHODS (Begin)---------------------------------------- + + @Override + public boolean useMinMax() { + throw new RuntimeException("Expected this method to be overriden " + mapJoinHashTableFactory.toString()); + } + + @Override + public long min() { + throw new RuntimeException("Expected this method to be overriden " + mapJoinHashTableFactory.toString()); + } + + @Override + public long max() { + throw new RuntimeException("Expected this method to be overriden " + mapJoinHashTableFactory.toString()); + } + + //----------------------------- COMMON LONG METHODS (End)----------------------------------------- + + //-------------------------------- HASH MAP (Begin)----------------------------------------------- + + @Override + public void hashMapLookup(byte[] keyBytes, int keyStart, int keyLength, int hashCode, + MapJoinHashMapResult hashMapResult) + throws IOException { + throw new RuntimeException("Expected this method to be overriden " + mapJoinHashTableFactory.toString()); + } + + @Override + public void hashMapLookup(long key, int hashCode, + MapJoinHashMapResult hashMapResult) throws IOException { + throw new RuntimeException("Expected this method to be overriden " + mapJoinHashTableFactory.toString()); + } + + //-------------------------------- HASH MAP (End) ------------------------------------------------ + + //---------------------------- HASH MULTI-SET (Begin) ------------------------------------------- + + @Override + public void hashMultiSetContains(byte[] keyBytes, int keyStart, int keyLength, int hashCode, + MapJoinHashMultiSetResult hashMultiSetResult) + throws IOException { + throw new RuntimeException("Expected this method to be overriden " + mapJoinHashTableFactory.toString()); + } + + @Override + public void hashMultiSetContains(long key, int hashCode, + MapJoinHashMultiSetResult hashMultiSetResult) throws IOException { + throw new RuntimeException("Expected this method to be overriden " + mapJoinHashTableFactory.toString()); + } + + //----------------------------- HASH MULTI-SET (End) -------------------------------------------- + + //------------------------------- HASH SET (Begin) ---------------------------------------------- + + @Override + public void hashSetContains(byte[] keyBytes, int keyStart, int keyLength, int hashCode, + MapJoinHashSetResult hashSetResult) throws IOException { + throw new RuntimeException("Expected this method to be overriden " + mapJoinHashTableFactory.toString()); + } + + @Override + public void hashSetContains(long key, int hashCode, + MapJoinHashSetResult hashSetResult) throws IOException { + throw new RuntimeException("Expected this method to be overriden " + mapJoinHashTableFactory.toString()); + } + + 
//--------------------------------- HASH SET (End) ---------------------------------------------- + + @Override + public void put(KeyValuePut keyValuePut) throws SerDeException { + throw new RuntimeException("Expected this method to be overriden " + mapJoinHashTableFactory.toString()); + } + + @Override + public long memorySize() { + throw new RuntimeException("Expected this method to be overriden " + mapJoinHashTableFactory.toString()); + } + + @Override + public void debugDumpMetrics() { + LOG.info("Map metrics: keys allocated " + logicalHashBucketCount +", keys assigned " + keysAssigned + + ", write conflict " + metricPutConflict + ", write max dist " + largestNumberOfSteps + + ", read conflict " + metricGetConflict + + ", expanded " + metricExpands + " times in " + metricExpandsMs + "ms"); + } } diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastHashTableFactory.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastHashTableFactory.java new file mode 100644 index 0000000..155b7c6 --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastHashTableFactory.java @@ -0,0 +1,288 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.vector.mapjoin.fast; + +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashMapResult; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashMultiSetResult; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashMultiSetResultImpl; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashSetResult; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashSetResultImpl; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTable; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableFactory; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableManage.KeyValuePut; +import org.apache.hadoop.hive.ql.plan.MapJoinDesc; +import org.apache.hadoop.hive.ql.plan.VectorMapJoinDesc; +import org.apache.hadoop.hive.ql.plan.VectorMapJoinDesc.HashTableKeyType; +import org.apache.hadoop.hive.ql.plan.VectorMapJoinDesc.HashTableKind; + +import com.google.common.annotations.VisibleForTesting; + +/* + * Factory for creating a fast vector map join hash table (which could be a hash map, hash multi-set, + * or hash set) with long, string, or multi-key key. + * + * And, associated objects (e.g. hash map result). + * + * Implements the standard map join interface for creating hash tables. 
+ * + */ +public class VectorMapJoinFastHashTableFactory implements MapJoinHashTableFactory { + + private final boolean isOuterJoin; + private final HashTableKind hashTableKind; + private final HashTableKeyType hashTableKeyType; + private final boolean useMinMax; + + @VisibleForTesting + public VectorMapJoinFastHashTableFactory(HashTableKind hashTableKind, + HashTableKeyType hashTableKeyType) { + isOuterJoin = false; + this.hashTableKind = hashTableKind; + this.hashTableKeyType = hashTableKeyType; + useMinMax = false; + } + + public VectorMapJoinFastHashTableFactory(MapJoinDesc desc) { + + VectorMapJoinDesc vectorDesc = desc.getVectorDesc(); + + isOuterJoin = !desc.isNoOuterJoin(); + hashTableKind = vectorDesc.hashTableKind(); + hashTableKeyType = vectorDesc.hashTableKeyType(); + useMinMax = vectorDesc.minMaxEnabled() && + (hashTableKeyType == HashTableKeyType.BOOLEAN || + hashTableKeyType == HashTableKeyType.BYTE || + hashTableKeyType == HashTableKeyType.SHORT || + hashTableKeyType == HashTableKeyType.INT || + hashTableKeyType == HashTableKeyType.LONG); + } + + @Override + public MapJoinHashTable createHashTable(int initialCapacity, float loadFactor, + int writeBuffersSize, long memUsage) { + + MapJoinHashTable MapJoinHashTableFind = null; + switch (hashTableKeyType) { + case BOOLEAN: + case BYTE: + case SHORT: + case INT: + case LONG: + switch (hashTableKind) { + case HASH_MAP: + MapJoinHashTableFind = new VectorMapJoinFastLongHashMap( + this, + useMinMax, isOuterJoin, hashTableKeyType, + initialCapacity, loadFactor, writeBuffersSize, memUsage); + break; + case HASH_MULTISET: + MapJoinHashTableFind = new VectorMapJoinFastLongHashMultiSet( + this, + useMinMax, isOuterJoin, hashTableKeyType, + initialCapacity, loadFactor, writeBuffersSize, memUsage); + break; + case HASH_SET: + MapJoinHashTableFind = new VectorMapJoinFastLongHashSet( + this, + useMinMax, isOuterJoin, hashTableKeyType, + initialCapacity, loadFactor, writeBuffersSize, memUsage); + break; + default: + throw new RuntimeException("Unexpected vector map join hash table kind " + hashTableKind.name()); + } + break; + + case STRING: + switch (hashTableKind) { + case HASH_MAP: + MapJoinHashTableFind = new VectorMapJoinFastStringHashMap( + this, + isOuterJoin, + initialCapacity, loadFactor, writeBuffersSize, memUsage); + break; + case HASH_MULTISET: + MapJoinHashTableFind = new VectorMapJoinFastStringHashMultiSet( + this, + isOuterJoin, + initialCapacity, loadFactor, writeBuffersSize, memUsage); + break; + case HASH_SET: + MapJoinHashTableFind = new VectorMapJoinFastStringHashSet( + this, + isOuterJoin, + initialCapacity, loadFactor, writeBuffersSize, memUsage); + break; + default: + throw new RuntimeException("Unexpected vector map join hash table kind " + hashTableKind.name()); + } + break; + + case MULTI_KEY: + switch (hashTableKind) { + case HASH_MAP: + MapJoinHashTableFind = new VectorMapJoinFastMultiKeyHashMap( + this, + isOuterJoin, + initialCapacity, loadFactor, writeBuffersSize, memUsage); + break; + case HASH_MULTISET: + MapJoinHashTableFind = new VectorMapJoinFastMultiKeyHashMultiSet( + this, + isOuterJoin, + initialCapacity, loadFactor, writeBuffersSize, memUsage); + break; + case HASH_SET: + MapJoinHashTableFind = new VectorMapJoinFastMultiKeyHashSet( + this, + isOuterJoin, + initialCapacity, loadFactor, writeBuffersSize, memUsage); + break; + default: + throw new RuntimeException("Unexpected vector map join hash table kind " + hashTableKind.name()); + } + break; + default: + throw new RuntimeException("Unexpected vector 
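A hedged usage sketch of the factory via its @VisibleForTesting constructor: kind and key type select the concrete long/string/multi-key table. The numeric arguments (capacity, load factor, write-buffer size, memUsage) are illustrative values, not recommendations from the patch.

package org.apache.hadoop.hive.ql.exec.vector.mapjoin.fast;

import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTable;
import org.apache.hadoop.hive.ql.plan.VectorMapJoinDesc.HashTableKeyType;
import org.apache.hadoop.hive.ql.plan.VectorMapJoinDesc.HashTableKind;

public class FactoryUsageSketch {
  public static void main(String[] args) {
    // Test-only constructor: just the kind and key type, no MapJoinDesc needed.
    VectorMapJoinFastHashTableFactory factory =
        new VectorMapJoinFastHashTableFactory(HashTableKind.HASH_MAP, HashTableKeyType.LONG);

    // Illustrative sizing parameters only.
    MapJoinHashTable table = factory.createHashTable(1024, 0.75f, 8 * 1024 * 1024, -1);
    System.out.println(factory + " -> " + table.getClass().getSimpleName());
  }
}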
map join hash table key type " + hashTableKeyType.name()); + } + + return MapJoinHashTableFind; + } + + /* + * @return A new hash map result implementation specific object. + * + * The object can be used to access the values when there is a match, or + * access spill information when the partition with the key is currently spilled. + */ + @Override + public MapJoinHashMapResult createHashMapResult() { + switch (hashTableKind) { + case HASH_MAP: + switch (hashTableKeyType) { + case BOOLEAN: + case BYTE: + case SHORT: + case INT: + case LONG: + return new VectorMapJoinFastValueStore.HashMapResult(); + case STRING: + case MULTI_KEY: + return new VectorMapJoinFastKeyAndValueStore.HashMapResult(); + default: + throw new RuntimeException("Unexpected vector map join hash table key type " + hashTableKeyType.name()); + } + case HASH_MULTISET: + case HASH_SET: + throw new RuntimeException("Hash Map result only for Hash Map tables"); + default: + throw new RuntimeException("Unexpected vector map join hash table kind " + hashTableKind.name()); + } + } + + /* + * @return A new hash multi-set result implementation specific object. + * + * The object can be used to access the *count* of values when the key is contained in the + * multi-set, or access spill information when the partition with the key is currently spilled. + */ + @Override + public MapJoinHashMultiSetResult createHashMultiSetResult() { + switch (hashTableKind) { + case HASH_MULTISET: + switch (hashTableKeyType) { + case BOOLEAN: + case BYTE: + case SHORT: + case INT: + case LONG: + return new MapJoinHashMultiSetResultImpl(); + case STRING: + case MULTI_KEY: + return new VectorMapJoinFastBytesHashMultiSet.HashMultiSetResult(); + default: + throw new RuntimeException("Unexpected vector map join hash table key type " + hashTableKeyType.name()); + } + case HASH_MAP: + case HASH_SET: + throw new RuntimeException("Hash Multi-Set result only for Hash Multi-Set tables"); + default: + throw new RuntimeException("Unexpected vector map join hash table kind " + hashTableKind.name()); + } + } + + /* + * @return A new hash set result implementation specific object. + * + * The object can be used to access access spill information when the partition with the key + * is currently spilled. 
+ */ + @Override + public MapJoinHashSetResult createHashSetResult() { + switch (hashTableKind) { + case HASH_SET: + switch (hashTableKeyType) { + case BOOLEAN: + case BYTE: + case SHORT: + case INT: + case LONG: + return new MapJoinHashSetResultImpl(); + case STRING: + case MULTI_KEY: + return new VectorMapJoinFastBytesHashSet.HashSetResult(); + default: + throw new RuntimeException("Unexpected vector map join hash table key type " + hashTableKeyType.name()); + } + case HASH_MAP: + case HASH_MULTISET: + throw new RuntimeException("Hash Set result only for Hash Set tables"); + default: + throw new RuntimeException("Unexpected vector map join hash table kind " + hashTableKind.name()); + } + } + + @Override + public boolean keyValuePutHelperIsExternal() { + return true; + } + + @Override + public KeyValuePut createKeyValuePut() { + return new VectorMapJoinFastKeyValuePut(hashTableKind, hashTableKeyType, isOuterJoin); + } + + @Override + public boolean useMinMax() { + return useMinMax; + } + + @Override + public String toString() { + StringBuilder sb = new StringBuilder(); + sb.append("hashTableKind "); + sb.append(hashTableKind.name()); + sb.append(", hashTableKeyType "); + sb.append(hashTableKeyType.name()); + sb.append(", isOuterJoin "); + sb.append(isOuterJoin); + sb.append(", useMinMax "); + sb.append(useMinMax); + return sb.toString(); + } +} diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastHashTableLoader.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastHashTableLoader.java deleted file mode 100644 index 49ecdd1..0000000 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastHashTableLoader.java +++ /dev/null @@ -1,114 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.hadoop.hive.ql.exec.vector.mapjoin.fast; - -import java.io.IOException; -import java.util.Collections; -import java.util.Map; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.hive.conf.HiveConf; -import org.apache.hadoop.hive.ql.exec.MapJoinOperator; -import org.apache.hadoop.hive.ql.exec.MapredContext; -import org.apache.hadoop.hive.ql.exec.mr.ExecMapperContext; -import org.apache.hadoop.hive.ql.exec.persistence.HashMapWrapper; -import org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainer; -import org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainerSerDe; -import org.apache.hadoop.hive.ql.exec.tez.TezContext; -import org.apache.hadoop.hive.ql.metadata.HiveException; -import org.apache.hadoop.hive.ql.plan.MapJoinDesc; -import org.apache.hadoop.hive.serde2.SerDeException; -import org.apache.hadoop.io.BytesWritable; -import org.apache.tez.runtime.api.Input; -import org.apache.tez.runtime.api.LogicalInput; -import org.apache.tez.runtime.library.api.KeyValueReader; - -/** - * HashTableLoader for Tez constructs the hashtable from records read from - * a broadcast edge. - */ -public class VectorMapJoinFastHashTableLoader implements org.apache.hadoop.hive.ql.exec.HashTableLoader { - - private static final Logger LOG = LoggerFactory.getLogger(VectorMapJoinFastHashTableLoader.class.getName()); - - private Configuration hconf; - protected MapJoinDesc desc; - private TezContext tezContext; - - @Override - public void init(ExecMapperContext context, MapredContext mrContext, - Configuration hconf, MapJoinOperator joinOp) { - this.tezContext = (TezContext) mrContext; - this.hconf = hconf; - this.desc = joinOp.getConf(); - } - - @Override - public void load(MapJoinTableContainer[] mapJoinTables, - MapJoinTableContainerSerDe[] mapJoinTableSerdes) - throws HiveException { - - Map parentToInput = desc.getParentToInput(); - Map parentKeyCounts = desc.getParentKeyCounts(); - - for (int pos = 0; pos < mapJoinTables.length; pos++) { - if (pos == desc.getPosBigTable()) { - continue; - } - - String inputName = parentToInput.get(pos); - LogicalInput input = tezContext.getInput(inputName); - - try { - input.start(); - tezContext.getTezProcessorContext().waitForAnyInputReady( - Collections. singletonList(input)); - } catch (Exception e) { - throw new HiveException(e); - } - - try { - KeyValueReader kvReader = (KeyValueReader) input.getReader(); - - Long keyCountObj = parentKeyCounts.get(pos); - long keyCount = (keyCountObj == null) ? -1 : keyCountObj.longValue(); - - VectorMapJoinFastTableContainer vectorMapJoinFastTableContainer = - new VectorMapJoinFastTableContainer(desc, hconf, keyCount); - - vectorMapJoinFastTableContainer.setSerde(null, null); // No SerDes here. 
- while (kvReader.next()) { - vectorMapJoinFastTableContainer.putRow((BytesWritable)kvReader.getCurrentKey(), - (BytesWritable)kvReader.getCurrentValue()); - } - - vectorMapJoinFastTableContainer.seal(); - mapJoinTables[pos] = (MapJoinTableContainer) vectorMapJoinFastTableContainer; - - } catch (IOException e) { - throw new HiveException(e); - } catch (SerDeException e) { - throw new HiveException(e); - } catch (Exception e) { - throw new HiveException(e); - } - } - } -} diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastKeyAndValueStore.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastKeyAndValueStore.java new file mode 100644 index 0000000..90e12d7 --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastKeyAndValueStore.java @@ -0,0 +1,720 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.vector.mapjoin.fast; + +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashMapResult; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableResultImpl; +import org.apache.hadoop.hive.serde2.WriteBuffers; +import org.apache.hadoop.hive.serde2.WriteBuffers.ByteSegmentRef; +import org.apache.hadoop.hive.serde2.WriteBuffers.Position; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.google.common.base.Preconditions; + +/* + * Used by VectorMapJoinFastBytesHashMap to store the key and values for a hash map with a bytes + * key. + */ +public class VectorMapJoinFastKeyAndValueStore { + + private static final String CLASS_NAME = VectorMapJoinFastKeyAndValueStore.class.getName(); + private static final Logger LOG = LoggerFactory.getLogger(CLASS_NAME); + + private WriteBuffers writeBuffers; + + private WriteBuffers.Position readPos; + + /** + * A store for a key and a list of arbitrary length values in memory. + * + * The memory is a "infinite" byte array or WriteBuffers object. + * + * We give the client (i.e. hash map) a 128-bit key and value reference to keep that has + * the offset within the "infinite" byte array of the last value inserted in a list. The 128 bits + * includes the hash code. + * + * We optimize the common case when the key is short, the value is short, and the value list + * has 1 element and store that information in the 128 bits. + * + * We also support keeping the value count (up to a limit or cap) so help with join result + * generation algorithms. + * + * When there are more than 1 value, the zero padding is overwritten with a relative offset to + * the next value. The next value always includes the value length. 
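As a concrete illustration of the 128-bit key and value reference described above, the sketch below packs and unpacks the first reference word using the field widths this patch defines in KeyAndValueRefPart1 (32-bit hash code, 10-bit capped count, 20-bit small key length). The class name is hypothetical and the code is a standalone sketch, not part of the patch.

public class Part1WordSketch {

  static final int HASH_BITS = 32;
  static final int COUNT_BITS = 10;
  static final int KEY_LEN_BITS = 20;

  static final long HASH_MASK = (1L << HASH_BITS) - 1;
  static final int COUNT_SHIFT = HASH_BITS;
  static final long COUNT_MASK = ((1L << COUNT_BITS) - 1) << COUNT_SHIFT;
  static final int KEY_LEN_SHIFT = COUNT_SHIFT + COUNT_BITS;
  static final long KEY_LEN_MASK = ((1L << KEY_LEN_BITS) - 1) << KEY_LEN_SHIFT;

  // Pack the three fields into one 64-bit word: hash code in the low bits,
  // capped value count above it, then the small key length.
  static long pack(int hashCode, int cappedCount, int smallKeyLength) {
    long word = ((long) hashCode) & HASH_MASK;
    word |= ((long) cappedCount) << COUNT_SHIFT;
    word |= ((long) smallKeyLength) << KEY_LEN_SHIFT;
    return word;
  }

  static int hashCodeOf(long word)    { return (int) (word & HASH_MASK); }
  static int cappedCountOf(long word) { return (int) ((word & COUNT_MASK) >>> COUNT_SHIFT); }
  static int keyLengthOf(long word)   { return (int) ((word & KEY_LEN_MASK) >>> KEY_LEN_SHIFT); }

  public static void main(String[] args) {
    long word = pack(0xCAFEBABE, 1, 17);
    System.out.println(hashCodeOf(word) == 0xCAFEBABE);  // true
    System.out.println(cappedCountOf(word) == 1);        // true
    System.out.println(keyLengthOf(word) == 17);         // true
  }
}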
+ * + * Cases: + * + * 1) One element when key and value lengths are small (and stored in reference words): + * + * Key and Value Reference + * | + * | absoluteOffset + * | + * v + * <5 0's Padding for Next Rel Offset> + * NEXT (NONE) KEY VALUE + * + * 2) One element, general: + * + * Key and Value Reference + * | + * | absoluteOffset + * | + * v + * <5 0's Padding for Next Rel Offset> [Big Key Len] [Big Value Len] + * NEXT (NONE) optional KEY optional VALUE + * + * 3) Two elements when key and value lengths are small (and stored in reference words): + * + * Key and Value Reference + * | + * | absoluteOffset + * | + * v + * + * | NEXT KEY VALUE + * | + * | first record absolute offset + relative offset + * | + * -------- + * | + * v + * <5 0's Padding for Next Value Ref> + * NEXT (NONE) + * + * 4) Three elements showing how first record updated to point to new value and + * new value points to most recent (additional) value: + * + * Key and Value Reference + * | + * | absoluteOffset + * | + * v + * + * | NEXT KEY VALUE + * | + * | first record absolute offset + relative offset + * | + * | + * | <5 0's Padding for Next Value Ref> + * | ^ NEXT (NONE) VALUE + * | | + * | ------ + * | | + * | | new record absolute offset - (minus) relative offset + * | | + * -----> + * NEXT VALUE + * + * + * 5) Four elements showing how first record is again updated to point to new value and + * new value points to most recent (additional) value: + * + * Key and Value Reference + * | + * | absoluteOffset + * | + * v + * + * | NEXT KEY VALUE + * | + * | first record absolute offset + relative offset + * | + * | + * | <5 0's Padding for Next Value Ref> + * | ^ NEXT (NONE) VALUE + * | | + * | ------ + * | | record absolute offset - (minus) relative offset + * | | + * | + * | ^ NEXT VALUE + * | | + * | ------ + * | | + * | | new record absolute offset - (minus) relative offset + * | | + * -----> + * NEXT VALUE + * + * + * You get the idea. + */ + + public WriteBuffers writeBuffers() { + return writeBuffers; + } + + /** + * A hash map result that can read values stored by the key and value store, one-by-one. + * It also has support routines for checking the hash code and key equality. + * + * It implements the standard map join hash map result interface. + * + */ + public static class HashMapResult extends MapJoinHashTableResultImpl + implements MapJoinHashMapResult { + + private VectorMapJoinFastKeyAndValueStore keyAndValueStore; + + private long absoluteOffset; + private int keyLength; + + private boolean hasRows; + private long part1Word; + private long part2Word; + private boolean isSingleRow; + private int cappedCount; + private long keyAbsoluteOffset; + private int firstValueLength; + private long firstValueAbsoluteOffset; + + private int readIndex; + private boolean isNextEof; + + long nextAbsoluteValueOffset; + + private ByteSegmentRef byteSegmentRef; + private Position readPos; + + public HashMapResult() { + super(); + part1Word = -1; + part2Word = -1; + hasRows = false; + byteSegmentRef = new ByteSegmentRef(); + readPos = new Position(); + } + + /** + * Setup for reading the key of an entry with the equalKey method. 
+ * @param keyAndValueStore + * @param part1Word + * @param part2Word + */ + public void setKey(VectorMapJoinFastKeyAndValueStore keyAndValueStore, long part1Word, + long part2Word) { + + Preconditions.checkState(!KeyAndValueRefPart1.getIsInvalidFlag(part1Word)); + Preconditions.checkState(!KeyAndValueRefPart2.getIsInvalidFlag(part2Word)); + + this.keyAndValueStore = keyAndValueStore; + + this.part1Word = part1Word; + this.part2Word = part2Word; + + absoluteOffset = KeyAndValueRefPart2.getAbsoluteOffset(part2Word); + + // Position after next relative offset (fixed length) to the key. + keyAndValueStore.writeBuffers.setReadPoint( + absoluteOffset + RelativeOffset.byteLength, readPos); + + keyLength = KeyAndValueRefPart1.getSmallKeyLength(part1Word); + boolean isKeyLengthSmall = (keyLength != KeyAndValueRefPart1.SmallKeyLength.allBitsOn); + if (!isKeyLengthSmall) { + // And, if current value is big we must read it. + keyLength = keyAndValueStore.writeBuffers.readVInt(readPos); + } + + // Reading is positioned before the key bytes. + keyAbsoluteOffset = keyAndValueStore.writeBuffers.getReadPoint(readPos); + } + + /** + * Compare a key with the key positioned with the setKey method. + * @param keyBytes + * @param keyStart + * @param keyLength + * @return + */ + public boolean equalKey(byte[] keyBytes, int keyStart, int keyLength) { + + if (this.keyLength != keyLength) { + return false; + } + + // Our reading was positioned to the key. + if (!keyAndValueStore.writeBuffers.isEqual(keyBytes, keyStart, readPos, keyLength)) { + return false; + } + + // NOTE: WriteBuffers.isEqual does not advance the read position... + + return true; + } + + /** + * Mark the key matched with equalKey as a match and set up for reading the values. + * Afterward, methods isSingleRow, cappedCount, first, next, etc may be called. + */ + public void setMatch() { + hasRows = true; + cappedCount = KeyAndValueRefPart1.getCappedCount(part1Word); + isSingleRow = (cappedCount == 1); + + // We must set the position since equalKey does not leave us posiitoned correctly. + keyAndValueStore.writeBuffers.setReadPoint( + keyAbsoluteOffset + keyLength, readPos); + + firstValueLength = KeyAndValueRefPart2.getSmallValueLength(part2Word); + boolean isFirstValueLengthSmall = + (firstValueLength != KeyAndValueRefPart2.SmallValueLength.allBitsOn); + if (!isFirstValueLengthSmall) { + + // And, if current value is big we must read it. + firstValueLength = keyAndValueStore.writeBuffers.readVInt(readPos); + } + + // Save first value absolute offset... + firstValueAbsoluteOffset = keyAndValueStore.writeBuffers.getReadPoint(readPos); + + // Position to beginning. + readIndex = 0; + isNextEof = false; + mapJoinResult = MapJoinResult.MATCH; + } + + @Override + public boolean hasRows() { + return hasRows; + } + + @Override + public boolean isSingleRow() { + if (!hasRows) { + return false; + } + + return isSingleRow; + } + + @Override + public boolean isCappedCountAvailable() { + return true; + } + + @Override + public int cappedCount() { + if (!hasRows) { + return 0; + } + + return cappedCount; + } + + @Override + public ByteSegmentRef first() { + if (!hasRows) { + return null; + } + + // Position to beginning. 
+ readIndex = 0; + isNextEof = false; + + return internalRead(); + } + + @Override + public ByteSegmentRef next() { + if (!hasRows || isNextEof) { + return null; + } + + return internalRead(); + } + + public ByteSegmentRef internalRead() { + + int nextValueLength; + + if (readIndex == 0) { + if (isSingleRow) { + isNextEof = true; + nextAbsoluteValueOffset = -1; + } else { + + // Read the next relative offset the last inserted value record. + keyAndValueStore.writeBuffers.setReadPoint(absoluteOffset, readPos); + long relativeNextValueOffset = + keyAndValueStore.writeBuffers.readNByteLong( + KeyAndValueRefPart2.AbsoluteOffset.byteLength, readPos); + Preconditions.checkState(relativeNextValueOffset != 0); + isNextEof = false; + + // Use positive relative offset from first record to last inserted value record. + nextAbsoluteValueOffset = absoluteOffset + relativeNextValueOffset; + } + + // Position past the key to first value. + keyAndValueStore.writeBuffers.setReadPoint(firstValueAbsoluteOffset, readPos); + nextValueLength = firstValueLength; + } else { + + // Position to the next value record. + Preconditions.checkState(nextAbsoluteValueOffset >= 0); + keyAndValueStore.writeBuffers.setReadPoint(nextAbsoluteValueOffset, readPos); + + // Read the next relative offset. + long relativeNextValueOffset = + keyAndValueStore.writeBuffers.readNByteLong( + RelativeOffset.byteLength, readPos); + if (relativeNextValueOffset == 0) { + isNextEof = true; + nextAbsoluteValueOffset = -1; + } else { + isNextEof = false; + + // The way we insert causes our chain to backwards from the last inserted value record... + nextAbsoluteValueOffset = nextAbsoluteValueOffset - relativeNextValueOffset; + } + nextValueLength = keyAndValueStore.writeBuffers.readVInt(readPos); + + // Now positioned to the value. + } + + // Capture a ByteSegmentRef to the current value position and length. + keyAndValueStore.writeBuffers.getByteSegmentRefToCurrent(byteSegmentRef, nextValueLength, readPos); + + readIndex++; + return byteSegmentRef; + } + + @Override + public boolean isAliasFilterAvailable() { + return false; + } + + @Override + public byte aliasFilter() { + return 0; + } + + @Override + public String toString() { + StringBuilder sb = new StringBuilder(); + sb.append("(" + super.toString() + ", "); + sb.append("cappedCount " + cappedCount() + ")"); + return sb.toString(); + } + + @Override + public String getDetailedHashMapResultPositionString() { + // TODO Auto-generated method stub + return "VectorMapJoinFastKeyAndValueStore.HashMapResult undone"; + } + } + + /** + * Retrieve the hash code for an entry. + * @param part1Word + * @return + */ + public static int getHashCode(long part1Word) { + return KeyAndValueRefPart1.getHashCode(part1Word); + } + + public boolean equalKey(long part1Word, long part2Word, byte[] keyBytes, int keyStart, + int keyLength) { + + Preconditions.checkState((part1Word & KeyAndValueRefPart1.IsInvalidFlag.flagOnMask) == 0); + Preconditions.checkState((part2Word & KeyAndValueRefPart2.IsInvalidFlag.flagOnMask) == 0); + + long absoluteOffset = KeyAndValueRefPart2.getAbsoluteOffset(part2Word); + + // Position after next relative offset (fixed length) to the key. + writeBuffers.setReadPoint( + absoluteOffset + RelativeOffset.byteLength, readPos); + + int actualKeyLength = KeyAndValueRefPart1.getSmallKeyLength(part1Word); + boolean isKeyLengthSmall = (actualKeyLength != KeyAndValueRefPart1.SmallKeyLength.allBitsOn); + if (!isKeyLengthSmall) { + + // And, if current value is big we must read it. 
+ actualKeyLength = writeBuffers.readVInt(readPos); + } + + if (actualKeyLength != keyLength) { + return false; + } + + // Our reading was positioned to the key. + if (!writeBuffers.isEqual(keyBytes, keyStart, readPos, keyLength)) { + return false; + } + + return true; + } + + /** + * Bit-length fields within a 128-bit (2 long) key and value reference that includes the + * 32 bit hash code. + * + * First 64 bit long (Part 1): + * + * Lowest field: The 32 bit hash code. + * + * 2nd field: A value count, up to a limit (a cap). Have a count helps the join result + * algorithms determine which optimization to use for M x N result cross products. + * A special constant indicates if the value count is >= the cap. + * + * 3rd field: For short keys, the length of the key. Otherwise, a special constant + * indicating a big value whose length is stored with the key and value. + * + * (Invalid flag field: high bit indicating whether the word is valid). + * + * Second 64 bit long (Part 2): + * + * Lowest field: An absolute byte offset to the key and value in the WriteBuffers. + * + * 2nd field: For short values, the length of the value. Otherwise, a special constant + * indicating a big value whose length is stored with the value. + * + * (Invalid flag field: high bit indicating whether the word is valid). + */ + private final static class KeyAndValueRefPart1 { + + // Lowest field. + private final class HashCode { + private static final int bitLength = 32; + private static final long allBitsOn = (1L << bitLength) - 1; + private static final long bitMask = allBitsOn; + } + + public static int getHashCode(long part1Word) { + return (int) ((part1Word & HashCode.bitMask)); + } + + private final class CappedCount { + private static final int bitLength = 10; + private static final int allBitsOn = (1 << bitLength) - 1; + private static final int limit = allBitsOn; + private static final int bitShift = HashCode.bitLength; + private static final long bitMask = ((long) allBitsOn) << bitShift; + } + + public static int getCappedCount(long part1Word) { + return (int) ((part1Word & CappedCount.bitMask) >> CappedCount.bitShift); + } + + private final class SmallKeyLength { + private static final int bitLength = 20; + private static final int allBitsOn = (1 << bitLength) - 1; + private static final int threshold = allBitsOn; // Lower this for big value testing. + private static final int bitShift = CappedCount.bitShift + CappedCount.bitLength; + private static final long bitMask = ((long) allBitsOn) << bitShift; + private static final long allBitsOnBitShifted = ((long) allBitsOn) << bitShift; + } + + public static int getSmallKeyLength(long part1Word) { + return (int) ((part1Word & SmallKeyLength.bitMask) >> SmallKeyLength.bitShift); + } + + // This bit should not be on for valid value references. We use -1 for a no value marker. + private final class IsInvalidFlag { + private static final int bitShift = 63; + private static final long flagOnMask = 1L << bitShift; + } + + public static boolean getIsInvalidFlag(long part1Word) { + return (part1Word & IsInvalidFlag.flagOnMask) != 0; + } + } + + private final static class KeyAndValueRefPart2 { + + // Lowest field. + private final class AbsoluteOffset { + private static final int bitLength = 40; + private static final int byteLength = (bitLength + Byte.SIZE -1) / Byte.SIZE; + private static final long allBitsOn = (1L << bitLength) - 1; + private static final long bitMask = allBitsOn; + + // Make it a power of 2. 
+ private static final long maxSize = 1L << (bitLength - 2); + } + + public static long getAbsoluteOffset(long part2Word) { + return (part2Word & KeyAndValueRefPart2.AbsoluteOffset.bitMask); + } + + private final class SmallValueLength { + private static final int bitLength = 20; + private static final int allBitsOn = (1 << bitLength) - 1; + private static final int threshold = allBitsOn; // Lower this for big value testing. + private static final int bitShift = AbsoluteOffset.bitLength; + private static final long bitMask = ((long) allBitsOn) << bitShift; + private static final long allBitsOnBitShifted = ((long) allBitsOn) << bitShift; + } + + public static int getSmallValueLength(long part2Word) { + return (int) ((part2Word & SmallValueLength.bitMask) >> SmallValueLength.bitShift); + } + + // This bit should not be on for valid value references. We use -1 for a no value marker. + private final class IsInvalidFlag { + private static final int bitShift = 63; + private static final long flagOnMask = 1L << bitShift; + } + + public static boolean getIsInvalidFlag(long part2Word) { + return (part2Word & IsInvalidFlag.flagOnMask) != 0; + } + } + + private final static class RelativeOffset { + private static final int byteLength = KeyAndValueRefPart2.AbsoluteOffset.byteLength; + + // Relative offset zero padding. + private static final byte[] zeroPadding = new byte[] { 0,0,0,0,0 }; + } + + public long addPart1Word; + public long addPart2Word; + + /** + * Two 64-bit long results will be placed in addPart1Word and addPart2Word. + * @param hashCode + * @param keyBytes + * @param keyStart + * @param keyLength + * @param valueBytes + * @param valueStart + * @param valueLength + */ + public void addFirst(int hashCode, byte[] keyBytes, int keyStart, int keyLength, byte[] valueBytes, + int valueStart, int valueLength) { + + long absoluteOffset = writeBuffers.getWritePoint(); + Preconditions.checkState(absoluteOffset >= 0); + + // Zero pad out bytes for fixed size next relative offset if more values are added later. + writeBuffers.write(RelativeOffset.zeroPadding); + + boolean isKeyLengthBig = (keyLength >= KeyAndValueRefPart1.SmallKeyLength.threshold); + if (isKeyLengthBig) { + writeBuffers.writeVInt(keyLength); + } + writeBuffers.write(keyBytes, keyStart, keyLength); + + boolean isValueLengthBig = (valueLength >= KeyAndValueRefPart2.SmallValueLength.threshold); + if (isValueLengthBig) { + writeBuffers.writeVInt(valueLength); + } + writeBuffers.write(valueBytes, valueStart, valueLength); + + /* + * Form part 1. + */ + addPart1Word = ((long) hashCode) & KeyAndValueRefPart1.HashCode.bitMask; + + addPart1Word |= ((long) 1 << KeyAndValueRefPart1.CappedCount.bitShift); + + if (isKeyLengthBig) { + addPart1Word |= KeyAndValueRefPart1.SmallKeyLength.allBitsOnBitShifted; + } else { + addPart1Word |= ((long) keyLength) << KeyAndValueRefPart1.SmallKeyLength.bitShift; + } + Preconditions.checkState(!KeyAndValueRefPart1.getIsInvalidFlag(addPart1Word)); + + /* + * Form part 2. + */ + addPart2Word = absoluteOffset; + + if (isValueLengthBig) { + addPart2Word |= KeyAndValueRefPart2.SmallValueLength.allBitsOnBitShifted; + } else { + addPart2Word |= ((long) valueLength) << KeyAndValueRefPart2.SmallValueLength.bitShift; + } + Preconditions.checkState(!KeyAndValueRefPart2.getIsInvalidFlag(addPart2Word)); + } + + /** + * The part1 64-bit long updated will be placed in addPart1Word. 
+ * @param part1Word + * @param part2Word + * @param valueBytes + * @param valueStart + * @param valueLength + */ + public void addMore(long part1Word, long part2Word, byte[] valueBytes, + int valueStart, int valueLength) { + + Preconditions.checkState(!KeyAndValueRefPart1.getIsInvalidFlag(part1Word)); + Preconditions.checkState(!KeyAndValueRefPart2.getIsInvalidFlag(part2Word)); + + /* + * Extract information from reference words. + */ + int oldCappedCount = KeyAndValueRefPart1.getCappedCount(part1Word); + + long absoluteOffset = KeyAndValueRefPart2.getAbsoluteOffset(part2Word); + + // Where the new value record will be written. + long nextAbsoluteValueOffset = writeBuffers.getWritePoint(); + + if (oldCappedCount == 1) { + // Write zeros to indicate no 3rd record. + writeBuffers.write(RelativeOffset.zeroPadding); + } else { + + // To insert next value record above count 2: + + // 1) Read next relative offset in first record (this is a positive relative offset) to + // last inserted value record. + long oldPrevRelativeValueOffset = + writeBuffers.readNByteLong( + absoluteOffset, RelativeOffset.byteLength, readPos); + + // 2) Relative offset is positive from first record to last inserted value record. + long prevAbsoluteValueOffset = absoluteOffset + oldPrevRelativeValueOffset; + + // 3) Since previous record is before the new one, subtract because we store relative offsets + // as unsigned. + long newPrevRelativeValueOffset = nextAbsoluteValueOffset - prevAbsoluteValueOffset; + Preconditions.checkState(newPrevRelativeValueOffset >= 0); + writeBuffers.writeFiveByteULong(newPrevRelativeValueOffset); + } + + writeBuffers.writeVInt(valueLength); + writeBuffers.write(valueBytes, valueStart, valueLength); + + // Overwrite relative offset in first record. + long newRelativeOffset = nextAbsoluteValueOffset - absoluteOffset; + Preconditions.checkState(newRelativeOffset >= 0); + writeBuffers.writeFiveByteULong(absoluteOffset, newRelativeOffset); + + // Update part1Word + addPart1Word = part1Word; + if (oldCappedCount < KeyAndValueRefPart1.CappedCount.limit) { + int newCappedCount = oldCappedCount + 1; + addPart1Word &= ~KeyAndValueRefPart1.CappedCount.bitMask; + addPart1Word |= ((long) newCappedCount) << KeyAndValueRefPart1.CappedCount.bitShift; + Preconditions.checkState(!KeyAndValueRefPart1.getIsInvalidFlag(addPart1Word)); + } + } + + public VectorMapJoinFastKeyAndValueStore(int writeBuffersSize) { + writeBuffers = new WriteBuffers(writeBuffersSize, KeyAndValueRefPart2.AbsoluteOffset.maxSize); + + readPos = new WriteBuffers.Position(); + } +} \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastKeyStore.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastKeyStore.java index be51693..a0fb3aa 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastKeyStore.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastKeyStore.java @@ -22,11 +22,17 @@ import org.slf4j.LoggerFactory; import org.apache.hadoop.hive.serde2.WriteBuffers; -// Optimized for sequential key lookup. +import com.google.common.base.Preconditions; +/* + * Used by VectorMapJoinFastBytesHash{MultiSet|Set} to store the key for a hash multi-set or set + * with a bytes key. Those hash tables do not store values but instead store a count or existence + * (respectively) in the hash table slot array. 
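The key store below encodes each stored key as a single reference word: a 40-bit absolute offset into the WriteBuffers plus a 20-bit "small" key length, where an all-bits-on length is a sentinel meaning the real length was written into the buffer (as a VInt) ahead of the key bytes. The following standalone sketch shows just that word encoding, with hypothetical names and without the buffer itself; it is an illustration, not part of the patch.

public class KeyRefSketch {

  static final int OFFSET_BITS = 40;
  static final long OFFSET_MASK = (1L << OFFSET_BITS) - 1;
  static final int LEN_BITS = 20;
  static final int LEN_SENTINEL = (1 << LEN_BITS) - 1;   // "big key": real length lives in the buffer
  static final long LEN_MASK = ((long) LEN_SENTINEL) << OFFSET_BITS;
  static final long NON_ZERO_FLAG = 1L << (OFFSET_BITS + LEN_BITS);

  static long makeKeyRef(long absoluteOffset, int keyLength) {
    long ref = NON_ZERO_FLAG | (absoluteOffset & OFFSET_MASK);
    if (keyLength >= LEN_SENTINEL) {
      ref |= LEN_MASK;                                    // sentinel: caller must read a VInt length from the buffer
    } else {
      ref |= ((long) keyLength) << OFFSET_BITS;
    }
    return ref;
  }

  static long offsetOf(long ref) { return ref & OFFSET_MASK; }

  // Returns the stored small length, or -1 when the sentinel says "length is in the buffer".
  static int smallLengthOf(long ref) {
    int len = (int) ((ref & LEN_MASK) >>> OFFSET_BITS);
    return len == LEN_SENTINEL ? -1 : len;
  }

  public static void main(String[] args) {
    long small = makeKeyRef(1234, 8);
    long big = makeKeyRef(5678, 2_000_000);
    System.out.println(offsetOf(small) + " " + smallLengthOf(small));  // 1234 8
    System.out.println(offsetOf(big) + " " + smallLengthOf(big));      // 5678 -1
  }
}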
+ */ public class VectorMapJoinFastKeyStore { - private static final Logger LOG = LoggerFactory.getLogger(VectorMapJoinFastKeyStore.class.getName()); + private static final String CLASS_NAME = VectorMapJoinFastKeyStore.class.getName(); + private static final Logger LOG = LoggerFactory.getLogger(CLASS_NAME); private WriteBuffers writeBuffers; @@ -58,60 +64,86 @@ * Last field: an always on bit to insure the key reference non-zero when the offset and * length are zero. */ + private final static class KeyRef { + /* + * The absolute offset to the beginning of the key within the WriteBuffers. + */ + private final class AbsoluteKeyOffset { + private static final int bitLength = 40; + private static final long allBitsOn = (((long) 1) << bitLength) - 1; + private static final long bitMask = allBitsOn; + + // Make it a power of 2 by backing down (i.e. the -2). + private static final long maxSize = ((long) 1) << (bitLength - 2); + } - /* - * The absolute offset to the beginning of the key within the WriteBuffers. - */ - private final class AbsoluteKeyOffset { - private static final int bitLength = 40; - private static final long allBitsOn = (((long) 1) << bitLength) - 1; - private static final long bitMask = allBitsOn; + public static long getAbsoluteKeyOffset(long keyRef) { + return (keyRef & AbsoluteKeyOffset.bitMask); + } - // Make it a power of 2 by backing down (i.e. the -2). - private static final long maxSize = ((long) 1) << (bitLength - 2); - } + /* + * The small key length. + * + * If the key is big (i.e. length >= allBitsOn), then the key length is stored in the + * WriteBuffers. + */ + private final class SmallKeyLength { + private static final int bitLength = 20; + private static final int allBitsOn = (1 << bitLength) - 1; + private static final int threshold = allBitsOn; // Lower this for big key testing. + private static final int bitShift = AbsoluteKeyOffset.bitLength; + private static final long bitMask = ((long) allBitsOn) << bitShift; + private static final long allBitsOnBitShifted = ((long) allBitsOn) << bitShift; + } - /* - * The small key length. - * - * If the key is big (i.e. length >= allBitsOn), then the key length is stored in the - * WriteBuffers. - */ - private final class SmallKeyLength { - private static final int bitLength = 20; - private static final int allBitsOn = (1 << bitLength) - 1; - private static final int threshold = allBitsOn; // Lower this for big key testing. - private static final int bitShift = AbsoluteKeyOffset.bitLength; - private static final long bitMask = ((long) allBitsOn) << bitShift; - private static final long allBitsOnBitShifted = ((long) allBitsOn) << bitShift; - } + public static int getSmallKeyLength(long keyRef) { + return (int) ((keyRef & SmallKeyLength.bitMask) >> SmallKeyLength.bitShift); + } - /* - * An always on bit to insure the key reference non-zero. - */ - private final class IsNonZeroFlag { - private static final int bitShift = SmallKeyLength.bitShift + SmallKeyLength.bitLength;; - private static final long flagOnMask = ((long) 1) << bitShift; + /* + * An always on bit to insure the key reference non-zero. + */ + private final class IsNonZeroFlag { + private static final int bitShift = SmallKeyLength.bitShift + SmallKeyLength.bitLength;; + private static final long flagOnMask = ((long) 1) << bitShift; + } + + public static boolean getIsNonZeroFlag(long keyRef) { + return (keyRef & IsNonZeroFlag.flagOnMask) != 0; + } + + // This bit should not be on for valid value references. We use -1 for a no value marker. 
+ private final class IsInvalidFlag { + private static final int bitShift = 63; + private static final long flagOnMask = 1L << bitShift; + } + + public static boolean getIsInvalidFlag(long keyRef) { + return (keyRef & IsInvalidFlag.flagOnMask) != 0; + } } public long add(byte[] keyBytes, int keyStart, int keyLength) { - boolean isKeyLengthBig = (keyLength >= SmallKeyLength.threshold); long absoluteKeyOffset = writeBuffers.getWritePoint(); + + boolean isKeyLengthBig = (keyLength >= KeyRef.SmallKeyLength.threshold); if (isKeyLengthBig) { writeBuffers.writeVInt(keyLength); } writeBuffers.write(keyBytes, keyStart, keyLength); - long keyRefWord = IsNonZeroFlag.flagOnMask; + long keyRefWord = KeyRef.IsNonZeroFlag.flagOnMask; if (isKeyLengthBig) { - keyRefWord |= SmallKeyLength.allBitsOnBitShifted; + keyRefWord |= KeyRef.SmallKeyLength.allBitsOnBitShifted; } else { - keyRefWord |= ((long) keyLength) << SmallKeyLength.bitShift; + keyRefWord |= ((long) keyLength) << KeyRef.SmallKeyLength.bitShift; } keyRefWord |= absoluteKeyOffset; - // LOG.debug("VectorMapJoinFastKeyStore add keyLength " + keyLength + " absoluteKeyOffset " + absoluteKeyOffset + " keyRefWord " + Long.toHexString(keyRefWord)); + Preconditions.checkState(KeyRef.getIsNonZeroFlag(keyRefWord)); + Preconditions.checkState(!KeyRef.getIsInvalidFlag(keyRefWord)); + return keyRefWord; } @@ -123,40 +155,40 @@ public boolean unsafeEqualKey(long keyRefWord, byte[] keyBytes, int keyStart, in public boolean equalKey(long keyRefWord, byte[] keyBytes, int keyStart, int keyLength, WriteBuffers.Position readPos) { - int storedKeyLengthLength = - (int) ((keyRefWord & SmallKeyLength.bitMask) >> SmallKeyLength.bitShift); - boolean isKeyLengthSmall = (storedKeyLengthLength != SmallKeyLength.allBitsOn); + Preconditions.checkState(KeyRef.getIsNonZeroFlag(keyRefWord)); + Preconditions.checkState(!KeyRef.getIsInvalidFlag(keyRefWord)); - // LOG.debug("VectorMapJoinFastKeyStore equalKey keyLength " + keyLength + " isKeyLengthSmall " + isKeyLengthSmall + " storedKeyLengthLength " + storedKeyLengthLength + " keyRefWord " + Long.toHexString(keyRefWord)); + int storedKeyLengthLength = KeyRef.getSmallKeyLength(keyRefWord); + boolean isKeyLengthSmall = (storedKeyLengthLength != KeyRef.SmallKeyLength.allBitsOn); if (isKeyLengthSmall && storedKeyLengthLength != keyLength) { return false; } - long absoluteKeyOffset = - (keyRefWord & AbsoluteKeyOffset.bitMask); + long absoluteKeyOffset = KeyRef.getAbsoluteKeyOffset(keyRefWord); writeBuffers.setReadPoint(absoluteKeyOffset, readPos); if (!isKeyLengthSmall) { // Read big value length we wrote with the value. storedKeyLengthLength = writeBuffers.readVInt(readPos); if (storedKeyLengthLength != keyLength) { - // LOG.debug("VectorMapJoinFastKeyStore equalKey no match big length"); return false; } } // Our reading is positioned to the key. 
if (!writeBuffers.isEqual(keyBytes, keyStart, readPos, keyLength)) { - // LOG.debug("VectorMapJoinFastKeyStore equalKey no match on bytes"); return false; } - - // LOG.debug("VectorMapJoinFastKeyStore equalKey match on bytes"); return true; } + public WriteBuffers writeBuffers() { + return writeBuffers; + } + public VectorMapJoinFastKeyStore(int writeBuffersSize) { - writeBuffers = new WriteBuffers(writeBuffersSize, AbsoluteKeyOffset.maxSize); + writeBuffers = new WriteBuffers(writeBuffersSize, KeyRef.AbsoluteKeyOffset.maxSize); + unsafeReadPos = new WriteBuffers.Position(); } diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastKeyValuePut.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastKeyValuePut.java new file mode 100644 index 0000000..063ebf1 --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastKeyValuePut.java @@ -0,0 +1,219 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.vector.mapjoin.fast; + +import java.io.IOException; + +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableManage.KeyValuePut; +import org.apache.hadoop.hive.ql.plan.VectorMapJoinDesc.HashTableKeyType; +import org.apache.hadoop.hive.ql.plan.VectorMapJoinDesc.HashTableKind; +import org.apache.hadoop.hive.serde2.SerDeException; +import org.apache.hadoop.hive.serde2.binarysortable.fast.BinarySortableDeserializeRead; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; +import org.apache.hadoop.io.BytesWritable; +import org.apache.hadoop.io.Writable; +import org.apache.hive.common.util.HashCodeUtil; + +/* + * Helper object for putting a new key and value into a fast Vector Map Join hash table. + * + * One motivation for this object is fast Vector Map Join needs it for these + * purposes: + * + * 1) For a (single) integer key, it deserializes so the long value can be available for min/max + * determination during hash table load. Later, the fast Vector Map Join operators will be + * able to filter out long keys using min/max. + * + * And, compute the hash code on the long primitive. + * + * 2) For a (single) string key, it deserializes the string value so it can be saved in the hash + * table as a string instead of the serialized form. This improves performance in the + * fast Vector Map Join operator by allowing it to lookup it string key without having to + * serialize it. + * + * And, compute the hash code on the string. + * + * The class implements the standard Map Join interface for key value put. 
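One reason the put helper described below deserializes single long keys is so the long hash tables can track a min/max range while the small table is loaded and later reject probe keys outside that range without doing a lookup. The following is a standalone sketch of that idea under those assumptions; the class is hypothetical (the real tracking lives in the long hash table implementations) and is not part of the patch.

public class LongKeyMinMaxSketch {

  private long min = Long.MAX_VALUE;
  private long max = Long.MIN_VALUE;

  // Called once per small-table key during hash table load.
  void addKey(long key) {
    if (key < min) { min = key; }
    if (key > max) { max = key; }
  }

  // Cheap range filter applied to big-table keys before the real hash lookup.
  boolean mightMatch(long probeKey) {
    return probeKey >= min && probeKey <= max;
  }

  public static void main(String[] args) {
    LongKeyMinMaxSketch minMax = new LongKeyMinMaxSketch();
    for (long k : new long[] { 10, 25, 17 }) {
      minMax.addKey(k);
    }
    System.out.println(minMax.mightMatch(12));   // true  (inside [10, 25])
    System.out.println(minMax.mightMatch(100));  // false (filtered out without a lookup)
  }
}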
+ * + */ +public class VectorMapJoinFastKeyValuePut implements KeyValuePut { + + private final HashTableKind hashTableKind; + private final HashTableKeyType hashTableKeyType; + private final boolean isOuterJoin; + + private final BinarySortableDeserializeRead keyBinarySortableDeserializeRead; + + private final boolean isLong; + + private BytesWritable keyBytesWritable; + private BytesWritable valueBytesWritable; + + private long longKey; + + private boolean hasHashCode; + private int hashCode; + + private boolean isNull; + + public VectorMapJoinFastKeyValuePut(HashTableKind hashTableKind, + HashTableKeyType hashTableKeyType, boolean isOuterJoin) { + this.hashTableKind = hashTableKind; + this.hashTableKeyType = hashTableKeyType; + this.isOuterJoin = isOuterJoin; + + switch (hashTableKeyType) { + case BOOLEAN: + case BYTE: + case SHORT: + case INT: + case LONG: + { + isLong = true; + TypeInfo typeInfo; + switch (hashTableKeyType) { + case BOOLEAN: + typeInfo = TypeInfoFactory.booleanTypeInfo; + break; + case BYTE: + typeInfo = TypeInfoFactory.byteTypeInfo; + break; + case SHORT: + typeInfo = TypeInfoFactory.shortTypeInfo; + break; + case INT: + typeInfo = TypeInfoFactory.intTypeInfo; + break; + case LONG: + typeInfo = TypeInfoFactory.longTypeInfo; + break; + default: + throw new RuntimeException("Unexpected vector map join hash table key type " + hashTableKeyType.name()); + } + TypeInfo[] typeInfos = { typeInfo }; + keyBinarySortableDeserializeRead = new BinarySortableDeserializeRead(typeInfos); + } + break; + case STRING: + { + isLong = false; + TypeInfo[] typeInfos = { TypeInfoFactory.stringTypeInfo }; + keyBinarySortableDeserializeRead = new BinarySortableDeserializeRead(typeInfos); + } + break; + case MULTI_KEY: + isLong = false; + keyBinarySortableDeserializeRead = null; + break; + default: + throw new RuntimeException("Unexpected vector map join hash table key type " + hashTableKeyType.name()); + } + + hasHashCode = false; + } + + @Override + public void setKeyValue(Writable keyWritable, Writable valueWritable) + throws SerDeException, IOException { + + keyBytesWritable = (BytesWritable) keyWritable; + valueBytesWritable = (BytesWritable) valueWritable; + isNull = false; // Assume. + hasHashCode = false; + + if (isLong) { + // Deserialized the single long column. + keyBinarySortableDeserializeRead.set(keyBytesWritable.getBytes(), 0, + keyBytesWritable.getLength()); + if (keyBinarySortableDeserializeRead.readCheckNull()) { + isNull = true; + return; + } + longKey = VectorMapJoinFastLongHashUtil.deserializeLongKey( + keyBinarySortableDeserializeRead, hashTableKeyType); + hashCode = HashCodeUtil.calculateLongHashCode(longKey); + } else { + switch (hashTableKeyType) { + case STRING: + { + // Deserialize the single string column. + keyBinarySortableDeserializeRead.set(keyBytesWritable.getBytes(), 0, + keyBytesWritable.getLength()); + if (keyBinarySortableDeserializeRead.readCheckNull()) { + isNull = true; + return; + } + hashCode = HashCodeUtil.murmurHash( + keyBinarySortableDeserializeRead.currentBytes, + keyBinarySortableDeserializeRead.currentBytesStart, + keyBinarySortableDeserializeRead.currentBytesLength); + } + break; + + case MULTI_KEY: + // Leave the multi-key unserialized. And, let all NULL entries into the small table. 
+ hashCode = HashCodeUtil.murmurHash(keyBytesWritable.getBytes(), 0, keyBytesWritable.getLength()); + break; + default: + throw new RuntimeException("Unexpected vector map join hash table key type " + hashTableKeyType.name()); + } + hasHashCode = true; + } + } + + @Override + public boolean hasHashCode() { + return hasHashCode; + } + + @Override + public int getKeyHashCode() throws SerDeException { + return hashCode; + } + + public boolean isNull() { + return isNull; + } + + @Override + public long getLongKey() { + return longKey; + } + + public byte[] getStringBytes() { + return keyBinarySortableDeserializeRead.currentBytes; + } + + public int getStringStart() { + return keyBinarySortableDeserializeRead.currentBytesStart; + } + + public int getStringLength() { + return keyBinarySortableDeserializeRead.currentBytesLength; + } + + public BytesWritable getKeyBytesWritable() { + return keyBytesWritable; + } + + public BytesWritable getValueBytesWritable() { + return valueBytesWritable; + } +} \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastLongHashMap.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastLongHashMap.java index cd51d0d..9f72b76 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastLongHashMap.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastLongHashMap.java @@ -22,11 +22,10 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import org.apache.hadoop.hive.ql.exec.JoinUtil; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashMapResult; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinLongHashMap; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashMapResult; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.plan.VectorMapJoinDesc.HashTableKeyType; +import org.apache.hadoop.hive.serde2.SerDeException; import org.apache.hadoop.io.BytesWritable; import org.apache.hive.common.util.HashCodeUtil; @@ -35,9 +34,7 @@ /* * An single LONG key hash map optimized for vector map join. 
*/ -public class VectorMapJoinFastLongHashMap - extends VectorMapJoinFastLongHashTable - implements VectorMapJoinLongHashMap { +public class VectorMapJoinFastLongHashMap extends VectorMapJoinFastLongHashTable { public static final Logger LOG = LoggerFactory.getLogger(VectorMapJoinFastLongHashMap.class); @@ -46,8 +43,15 @@ private BytesWritable testValueBytesWritable; @Override - public VectorMapJoinHashMapResult createHashMapResult() { - return new VectorMapJoinFastValueStore.HashMapResult(); + public void put(KeyValuePut keyValuePut) throws SerDeException { + + VectorMapJoinFastKeyValuePut internalKeyValuePut = ((VectorMapJoinFastKeyValuePut) keyValuePut); + if (internalKeyValuePut.isNull()) { + return; + } + + add(internalKeyValuePut.getLongKey(), internalKeyValuePut.getKeyHashCode(), + internalKeyValuePut.getValueBytesWritable()); } /* @@ -60,7 +64,8 @@ public void testPutRow(long currentKey, byte[] currentValue) throws HiveExceptio testValueBytesWritable = new BytesWritable(); } testValueBytesWritable.set(currentValue, 0, currentValue.length); - add(currentKey, testValueBytesWritable); + int hashCode = HashCodeUtil.calculateLongHashCode(currentKey); + add(currentKey, hashCode, testValueBytesWritable); } @Override @@ -81,35 +86,64 @@ public void assignSlot(int slot, long key, boolean isNewKey, BytesWritable curre } @Override - public JoinUtil.JoinResult lookup(long key, VectorMapJoinHashMapResult hashMapResult) { + public void hashMapLookup(long key, int hashCode, MapJoinHashMapResult hashMapResult) { - VectorMapJoinFastValueStore.HashMapResult optimizedHashMapResult = + VectorMapJoinFastValueStore.HashMapResult internalHashMapResult = (VectorMapJoinFastValueStore.HashMapResult) hashMapResult; - optimizedHashMapResult.forget(); + internalHashMapResult.forget(); - long hashCode = HashCodeUtil.calculateLongHashCode(key); // LOG.debug("VectorMapJoinFastLongHashMap lookup " + key + " hashCode " + hashCode); long valueRef = findReadSlot(key, hashCode); - JoinUtil.JoinResult joinResult; if (valueRef == -1) { - joinResult = JoinUtil.JoinResult.NOMATCH; + internalHashMapResult.setNoMatch(); } else { - optimizedHashMapResult.set(valueStore, valueRef); - - joinResult = JoinUtil.JoinResult.MATCH; + internalHashMapResult.setMatch(valueStore, valueRef); } - - optimizedHashMapResult.setJoinResult(joinResult); - - return joinResult; } public VectorMapJoinFastLongHashMap( + VectorMapJoinFastHashTableFactory mapJoinHashTableFactory, boolean minMaxEnabled, boolean isOuterJoin, HashTableKeyType hashTableKeyType, - int initialCapacity, float loadFactor, int writeBuffersSize) { - super(minMaxEnabled, isOuterJoin, hashTableKeyType, - initialCapacity, loadFactor, writeBuffersSize); + int initialCapacity, float loadFactor, int writeBuffersSize, long memUsage) { + super(mapJoinHashTableFactory, minMaxEnabled, isOuterJoin, hashTableKeyType, + initialCapacity, loadFactor, writeBuffersSize, memUsage); valueStore = new VectorMapJoinFastValueStore(writeBuffersSize); } + + @Override + public long memorySize() { + return valueStore.writeBuffers().size() + slotPairs.length * (Long.SIZE/Byte.SIZE) + 100; + } + + @Override + public void clear() { + // UNDONE + } + + @Override + public void debugDumpTable() { + StringBuilder dump = new StringBuilder(keysAssigned + " keys\n"); + + int pairIndex; + long valueRef; + int examined = 0; + for (int i = 0; i < logicalHashBucketCount; i++) { + pairIndex = i * 2; + valueRef = slotPairs[pairIndex]; + if (valueRef != 0) { + long tableKey = slotPairs[pairIndex + 1]; + examined++; + 
dump.append("slot "); + dump.append(i); + dump.append("\n"); + + // UNDONE: Dump long key and values + } + } + if (examined != keysAssigned) { + dump.append("Found " + examined + " keys!\n"); + } + LOG.info("Vector MapJoin Fast LongHashMap dump:\n " + dump.toString()); + } } diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastLongHashMultiSet.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastLongHashMultiSet.java index 032233a..210ada7 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastLongHashMultiSet.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastLongHashMultiSet.java @@ -22,9 +22,7 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import org.apache.hadoop.hive.ql.exec.JoinUtil; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashMultiSetResult; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinLongHashMultiSet; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashMultiSetResult; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.plan.VectorMapJoinDesc.HashTableKeyType; import org.apache.hadoop.hive.serde2.SerDeException; @@ -36,15 +34,20 @@ /* * An single LONG key hash multi-set optimized for vector map join. */ -public class VectorMapJoinFastLongHashMultiSet - extends VectorMapJoinFastLongHashTable - implements VectorMapJoinLongHashMultiSet { +public class VectorMapJoinFastLongHashMultiSet extends VectorMapJoinFastLongHashTable { public static final Logger LOG = LoggerFactory.getLogger(VectorMapJoinFastLongHashMultiSet.class); @Override - public VectorMapJoinHashMultiSetResult createHashMultiSetResult() { - return new VectorMapJoinFastHashMultiSet.HashMultiSetResult(); + public void put(KeyValuePut keyValuePut) throws SerDeException { + + VectorMapJoinFastKeyValuePut internalKeyValuePut = ((VectorMapJoinFastKeyValuePut) keyValuePut); + if (internalKeyValuePut.isNull()) { + return; + } + + add(internalKeyValuePut.getLongKey(), internalKeyValuePut.getKeyHashCode(), + null); } /* @@ -53,7 +56,8 @@ public VectorMapJoinHashMultiSetResult createHashMultiSetResult() { */ @VisibleForTesting public void testPutRow(long currentKey) throws HiveException, IOException { - add(currentKey, null); + int hashCode = HashCodeUtil.calculateLongHashCode(currentKey); + add(currentKey, hashCode, null); } @Override @@ -72,32 +76,59 @@ public void assignSlot(int slot, long key, boolean isNewKey, BytesWritable curre @Override - public JoinUtil.JoinResult contains(long key, VectorMapJoinHashMultiSetResult hashMultiSetResult) { + public void hashMultiSetContains(long key, int hashCode, MapJoinHashMultiSetResult hashMultiSetResult) { - VectorMapJoinFastHashMultiSet.HashMultiSetResult optimizedHashMultiSetResult = - (VectorMapJoinFastHashMultiSet.HashMultiSetResult) hashMultiSetResult; + hashMultiSetResult.forget(); - optimizedHashMultiSetResult.forget(); - - long hashCode = HashCodeUtil.calculateLongHashCode(key); long count = findReadSlot(key, hashCode); - JoinUtil.JoinResult joinResult; if (count == -1) { - joinResult = JoinUtil.JoinResult.NOMATCH; + hashMultiSetResult.setNoMatch(); } else { - optimizedHashMultiSetResult.set(count); - joinResult = JoinUtil.JoinResult.MATCH; + hashMultiSetResult.setMatch(count); } - - optimizedHashMultiSetResult.setJoinResult(joinResult); - - return joinResult; } public 
VectorMapJoinFastLongHashMultiSet( + VectorMapJoinFastHashTableFactory mapJoinHashTableFactory, boolean minMaxEnabled, boolean isOuterJoin, HashTableKeyType hashTableKeyType, - int initialCapacity, float loadFactor, int writeBuffersSize) { - super(minMaxEnabled, isOuterJoin, hashTableKeyType, - initialCapacity, loadFactor, writeBuffersSize); + int initialCapacity, float loadFactor, int writeBuffersSize, long memUsage) { + super(mapJoinHashTableFactory, minMaxEnabled, isOuterJoin, hashTableKeyType, + initialCapacity, loadFactor, writeBuffersSize, memUsage); + } + + @Override + public long memorySize() { + return slotPairs.length * (Long.SIZE/Byte.SIZE) + 100; + } + + @Override + public void clear() { + // UNDONE + } + + @Override + public void debugDumpTable() { + StringBuilder dump = new StringBuilder(keysAssigned + " keys\n"); + + int pairIndex; + long count; + int examined = 0; + for (int i = 0; i < logicalHashBucketCount; i++) { + pairIndex = i * 2; + count = slotPairs[pairIndex]; + if (count > 0) { + long tableKey = slotPairs[pairIndex + 1]; + examined++; + dump.append("slot "); + dump.append(i); + dump.append("\n"); + + // UNDONE: Dump long key and count + } + } + if (examined != keysAssigned) { + dump.append("Found " + examined + " keys!\n"); + } + LOG.info("Vector MapJoin Fast LongHashMultiSet dump:\n " + dump.toString()); } } diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastLongHashSet.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastLongHashSet.java index 21701d4..e2b42ef 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastLongHashSet.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastLongHashSet.java @@ -22,12 +22,10 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import org.apache.hadoop.hive.ql.exec.JoinUtil; -import org.apache.hadoop.hive.ql.exec.JoinUtil.JoinResult; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashSetResult; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinLongHashSet; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashSetResult; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.plan.VectorMapJoinDesc.HashTableKeyType; +import org.apache.hadoop.hive.serde2.SerDeException; import org.apache.hadoop.io.BytesWritable; import org.apache.hive.common.util.HashCodeUtil; @@ -36,15 +34,20 @@ /* * An single LONG key hash set optimized for vector map join. 
*/ -public class VectorMapJoinFastLongHashSet - extends VectorMapJoinFastLongHashTable - implements VectorMapJoinLongHashSet { +public class VectorMapJoinFastLongHashSet extends VectorMapJoinFastLongHashTable { public static final Logger LOG = LoggerFactory.getLogger(VectorMapJoinFastLongHashSet.class); @Override - public VectorMapJoinHashSetResult createHashSetResult() { - return new VectorMapJoinFastHashSet.HashSetResult(); + public void put(KeyValuePut keyValuePut) throws SerDeException { + + VectorMapJoinFastKeyValuePut internalKeyValuePut = ((VectorMapJoinFastKeyValuePut) keyValuePut); + if (internalKeyValuePut.isNull()) { + return; + } + + add(internalKeyValuePut.getLongKey(), internalKeyValuePut.getKeyHashCode(), + null); } /* @@ -53,7 +56,8 @@ public VectorMapJoinHashSetResult createHashSetResult() { */ @VisibleForTesting public void testPutRow(long currentKey) throws HiveException, IOException { - add(currentKey, null); + int hashCode = HashCodeUtil.calculateLongHashCode(currentKey); + add(currentKey, hashCode, null); } @Override @@ -68,32 +72,61 @@ public void assignSlot(int slot, long key, boolean isNewKey, BytesWritable curre } @Override - public JoinResult contains(long key, VectorMapJoinHashSetResult hashSetResult) { + public void hashSetContains(long key, int hashCode, MapJoinHashSetResult hashSetResult) { - VectorMapJoinFastHashSet.HashSetResult optimizedHashSetResult = - (VectorMapJoinFastHashSet.HashSetResult) hashSetResult; + hashSetResult.forget(); - optimizedHashSetResult.forget(); - - long hashCode = HashCodeUtil.calculateLongHashCode(key); long existance = findReadSlot(key, hashCode); - JoinUtil.JoinResult joinResult; if (existance == -1) { - joinResult = JoinUtil.JoinResult.NOMATCH; + hashSetResult.setNoMatch(); } else { - joinResult = JoinUtil.JoinResult.MATCH; + hashSetResult.setMatch(); } - optimizedHashSetResult.setJoinResult(joinResult); - - return joinResult; - } public VectorMapJoinFastLongHashSet( + VectorMapJoinFastHashTableFactory mapJoinHashTableFactory, boolean minMaxEnabled, boolean isOuterJoin, HashTableKeyType hashTableKeyType, - int initialCapacity, float loadFactor, int writeBuffersSize) { - super(minMaxEnabled, isOuterJoin, hashTableKeyType, - initialCapacity, loadFactor, writeBuffersSize); + int initialCapacity, float loadFactor, int writeBuffersSize, long memUsage) { + super(mapJoinHashTableFactory, minMaxEnabled, isOuterJoin, hashTableKeyType, + initialCapacity, loadFactor, writeBuffersSize, memUsage); + } + + @Override + public long memorySize() { + return slotPairs.length * (Long.SIZE/Byte.SIZE) + 100; } + + @Override + public void clear() { + // UNDONE + } + + @Override + public void debugDumpTable() { + StringBuilder dump = new StringBuilder(keysAssigned + " keys\n"); + + int pairIndex; + long existence; + int examined = 0; + for (int i = 0; i < logicalHashBucketCount; i++) { + pairIndex = i * 2; + existence = slotPairs[pairIndex]; + if (existence == 1) { + long tableKey = slotPairs[pairIndex + 1]; + examined++; + dump.append("slot "); + dump.append(i); + dump.append("\n"); + + // UNDONE: Dump long key + } + } + if (examined != keysAssigned) { + dump.append("Found " + examined + " keys!\n"); + } + LOG.info("Vector MapJoin Fast LongHashSet dump:\n " + dump.toString()); + } + } diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastLongHashTable.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastLongHashTable.java index ee66d5b..fa4e83d 100644 --- 
ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastLongHashTable.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastLongHashTable.java @@ -22,42 +22,32 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import org.apache.hadoop.hive.ql.exec.JoinUtil; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashMap; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinLongHashMap; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinLongHashTable; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.plan.VectorMapJoinDesc.HashTableKeyType; import org.apache.hadoop.hive.serde2.binarysortable.fast.BinarySortableDeserializeRead; import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; -import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; import org.apache.hadoop.io.BytesWritable; import org.apache.hive.common.util.HashCodeUtil; -import org.apache.tez.runtime.library.api.KeyValueReader; import com.google.common.annotations.VisibleForTesting; /* * An single long value map optimized for vector map join. */ -public abstract class VectorMapJoinFastLongHashTable - extends VectorMapJoinFastHashTable - implements VectorMapJoinLongHashTable { +public abstract class VectorMapJoinFastLongHashTable extends VectorMapJoinFastHashTable { public static final Logger LOG = LoggerFactory.getLogger(VectorMapJoinFastLongHashTable.class); private transient final boolean isLogDebugEnabled = LOG.isDebugEnabled(); - private final HashTableKeyType hashTableKeyType; + private BinarySortableDeserializeRead keyBinarySortableDeserializeRead; - private final boolean isOuterJoin; - - private final BinarySortableDeserializeRead keyBinarySortableDeserializeRead; - - private final boolean useMinMax; + private boolean useMinMax; private long min; private long max; + private BytesWritable testValueBytesWritable; + @Override public boolean useMinMax() { return useMinMax; @@ -73,39 +63,25 @@ public long max() { return max; } - @Override - public void putRow(BytesWritable currentKey, BytesWritable currentValue) throws HiveException, IOException { - byte[] keyBytes = currentKey.getBytes(); - int keyLength = currentKey.getLength(); - keyBinarySortableDeserializeRead.set(keyBytes, 0, keyLength); - try { - if (keyBinarySortableDeserializeRead.readCheckNull()) { - return; - } - } catch (Exception e) { - throw new HiveException( - "\nDeserializeRead details: " + - keyBinarySortableDeserializeRead.getDetailedReadPositionString() + - "\nException: " + e.toString()); + @VisibleForTesting + public void testPutRow(long currentKey, byte[] currentValue) throws HiveException, IOException { + if (testValueBytesWritable == null) { + testValueBytesWritable = new BytesWritable(); } - - long key = VectorMapJoinFastLongHashUtil.deserializeLongKey( - keyBinarySortableDeserializeRead, hashTableKeyType); - - add(key, currentValue); + testValueBytesWritable.set(currentValue, 0, currentValue.length); + int hashCode = HashCodeUtil.calculateLongHashCode(currentKey); + add(currentKey, hashCode, testValueBytesWritable); } protected abstract void assignSlot(int slot, long key, boolean isNewKey, BytesWritable currentValue); - public void add(long key, BytesWritable currentValue) { + public void add(long key, int hashCode, BytesWritable currentValue) { if (resizeThreshold <= keysAssigned) { expandAndRehash(); } - long hashCode = 
HashCodeUtil.calculateLongHashCode(key); - int intHashCode = (int) hashCode; - int slot = (intHashCode & logicalHashBucketMask); + int slot = (hashCode & logicalHashBucketMask); long probeSlot = slot; int i = 0; boolean isNewKey; @@ -113,18 +89,17 @@ public void add(long key, BytesWritable currentValue) { int pairIndex = 2 * slot; long valueRef = slotPairs[pairIndex]; if (valueRef == 0) { - // LOG.debug("VectorMapJoinFastLongHashTable add key " + key + " slot " + slot + " pairIndex " + pairIndex + " empty slot (i = " + i + ")"); isNewKey = true; break; } long tableKey = slotPairs[pairIndex + 1]; if (key == tableKey) { - // LOG.debug("VectorMapJoinFastLongHashTable add key " + key + " slot " + slot + " pairIndex " + pairIndex + " found key (i = " + i + ")"); isNewKey = false; break; } - ++metricPutConflict; + // Some other key (collision) - keep probing. + metricPutConflict++; probeSlot += (++i); slot = (int)(probeSlot & logicalHashBucketMask); } @@ -154,8 +129,10 @@ public void add(long key, BytesWritable currentValue) { } } - private void expandAndRehash() { + @Override + protected void expandAndRehashImpl(int capacity) { + long expandTime = System.currentTimeMillis(); int newLogicalHashBucketCount = logicalHashBucketCount * 2; int newLogicalHashBucketMask = newLogicalHashBucketCount - 1; int newMetricPutConflict = 0; @@ -171,9 +148,9 @@ private void expandAndRehash() { long tableKey = slotPairs[pairIndex + 1]; // Copy to new slot table. - long hashCode = HashCodeUtil.calculateLongHashCode(tableKey); - int intHashCode = (int) hashCode; - int newSlot = intHashCode & newLogicalHashBucketMask; + int hashCode = HashCodeUtil.calculateLongHashCode(tableKey); + + int newSlot = hashCode & newLogicalHashBucketMask; long newProbeSlot = newSlot; int newPairIndex; int i = 0; @@ -194,11 +171,9 @@ private void expandAndRehash() { LOG.debug("Probed " + i + " slots (the longest so far) to find space"); } newLargestNumberOfSteps = i; - // debugDumpKeyProbe(keyOffset, keyLength, hashCode, slot); } // Use old value reference word. - // LOG.debug("VectorMapJoinFastLongHashTable expandAndRehash key " + tableKey + " slot " + newSlot + " newPairIndex " + newPairIndex + " empty slot (i = " + i + ")"); newSlotPairs[newPairIndex] = valueRef; newSlotPairs[newPairIndex + 1] = tableKey; @@ -211,14 +186,13 @@ private void expandAndRehash() { metricPutConflict = newMetricPutConflict; largestNumberOfSteps = newLargestNumberOfSteps; resizeThreshold = (int)(logicalHashBucketCount * loadFactor); + metricExpandsMs += (System.currentTimeMillis() - expandTime); metricExpands++; - // LOG.debug("VectorMapJoinFastLongHashTable expandAndRehash new logicalHashBucketCount " + logicalHashBucketCount + " resizeThreshold " + resizeThreshold + " metricExpands " + metricExpands); } - protected long findReadSlot(long key, long hashCode) { + protected long findReadSlot(long key, int hashCode) { - int intHashCode = (int) hashCode; - int slot = intHashCode & logicalHashBucketMask; + int slot = hashCode & logicalHashBucketMask; long probeSlot = slot; int i = 0; @@ -227,7 +201,6 @@ protected long findReadSlot(long key, long hashCode) { long valueRef = slotPairs[pairIndex]; if (valueRef == 0) { // Given that we do not delete, an empty slot means no match. 
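The rewritten add()/findReadSlot path in VectorMapJoinFastLongHashTable now takes a caller-supplied int hash code and probes with a growing (triangular) step, rather than recomputing the hash inside the table. Below is a minimal, self-contained sketch of that probing scheme; the field and method names are simplified assumptions for illustration, not the actual Hive class.

public class LongSlotProbeSketch {

  // Each logical slot occupies two longs: [valueRef, key]; valueRef == 0 means empty.
  private final long[] slotPairs;
  private final int logicalHashBucketMask;   // bucket count is assumed to be a power of two
  private int largestNumberOfSteps;          // longest probe chain observed during inserts

  public LongSlotProbeSketch(int logicalHashBucketCount) {
    this.slotPairs = new long[2 * logicalHashBucketCount];
    this.logicalHashBucketMask = logicalHashBucketCount - 1;
  }

  // Probe with a triangular step (offsets 1, 3, 6, 10, ...) starting from the
  // caller-supplied hash code, mirroring the shape of the patched add()/findReadSlot.
  public long findReadSlot(long key, int hashCode) {
    int slot = hashCode & logicalHashBucketMask;
    long probeSlot = slot;
    int i = 0;
    while (true) {
      int pairIndex = 2 * slot;
      if (slotPairs[pairIndex] == 0) {
        // Entries are never deleted, so an empty slot means no match.
        return -1;
      }
      if (slotPairs[pairIndex + 1] == key) {
        return pairIndex;
      }
      probeSlot += (++i);
      if (i > largestNumberOfSteps) {
        // Inserts never probed further than this, so the key cannot be present.
        return -1;
      }
      slot = (int) (probeSlot & logicalHashBucketMask);
    }
  }
}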
- // LOG.debug("VectorMapJoinFastLongHashTable findReadSlot key " + key + " slot " + slot + " pairIndex " + pairIndex + " empty slot (i = " + i + ")"); return -1; } long tableKey = slotPairs[pairIndex + 1]; @@ -240,13 +213,16 @@ protected long findReadSlot(long key, long hashCode) { if (i > largestNumberOfSteps) { // LOG.debug("VectorMapJoinFastLongHashTable findReadSlot returning not found"); // We know we never went that far when we were inserting. - // LOG.debug("VectorMapJoinFastLongHashTable findReadSlot key " + key + " slot " + slot + " pairIndex " + pairIndex + " largestNumberOfSteps " + largestNumberOfSteps + " (i = " + i + ")"); return -1; } slot = (int)(probeSlot & logicalHashBucketMask); } } + protected int getLongsPerSlot() { + return 2; + } + /* * The hash table slots. For a long key hash table, each slot is 2 longs and the array is * 2X sized. @@ -255,17 +231,16 @@ protected long findReadSlot(long key, long hashCode) { */ protected long[] slotPairs; - private void allocateBucketArray() { + protected void allocateBucketArray() { int slotPairArraySize = 2 * logicalHashBucketCount; slotPairs = new long[slotPairArraySize]; } public VectorMapJoinFastLongHashTable( - boolean minMaxEnabled, boolean isOuterJoin, HashTableKeyType hashTableKeyType, - int initialCapacity, float loadFactor, int writeBuffersSize) { - super(initialCapacity, loadFactor, writeBuffersSize); - this.isOuterJoin = isOuterJoin; - this.hashTableKeyType = hashTableKeyType; + VectorMapJoinFastHashTableFactory mapJoinHashTableFactory, + boolean minMaxEnabled, boolean isOuterJoin, HashTableKeyType hashTableKeyType, + int initialCapacity, float loadFactor, int writeBuffersSize, long memUsage) { + super(mapJoinHashTableFactory, initialCapacity, loadFactor, writeBuffersSize, memUsage); PrimitiveTypeInfo[] primitiveTypeInfos = { hashTableKeyType.getPrimitiveTypeInfo() }; keyBinarySortableDeserializeRead = new BinarySortableDeserializeRead(primitiveTypeInfos); allocateBucketArray(); @@ -273,4 +248,9 @@ public VectorMapJoinFastLongHashTable( min = Long.MAX_VALUE; max = Long.MIN_VALUE; } + + @Override + public void clear() { + // UNDONE + } } diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastMultiKeyHashMap.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastMultiKeyHashMap.java index cee3b3b..af13cbc 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastMultiKeyHashMap.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastMultiKeyHashMap.java @@ -21,7 +21,9 @@ import java.io.IOException; import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.serde2.SerDeException; import org.apache.hadoop.io.BytesWritable; +import org.apache.hive.common.util.HashCodeUtil; import com.google.common.annotations.VisibleForTesting; @@ -33,24 +35,38 @@ public class VectorMapJoinFastMultiKeyHashMap extends VectorMapJoinFastBytesHashMap { + @Override + public void put(KeyValuePut keyValuePut) throws SerDeException { + + VectorMapJoinFastKeyValuePut internalKeyValuePut = ((VectorMapJoinFastKeyValuePut) keyValuePut); + if (internalKeyValuePut.isNull()) { + return; + } + + BytesWritable keyBytesWritable = internalKeyValuePut.getKeyBytesWritable(); + add(keyBytesWritable.getBytes(), 0, keyBytesWritable.getLength(), + internalKeyValuePut.getKeyHashCode(), + internalKeyValuePut.getValueBytesWritable()); + } + /* * A Unit Test convenience method for putting key and value into the 
hash table using the * actual types. */ @VisibleForTesting public void testPutRow(byte[] currentKey, byte[] currentValue) throws HiveException, IOException { - if (testKeyBytesWritable == null) { - testKeyBytesWritable = new BytesWritable(); + if (testValueBytesWritable == null) { testValueBytesWritable = new BytesWritable(); } - testKeyBytesWritable.set(currentKey, 0, currentKey.length); testValueBytesWritable.set(currentValue, 0, currentValue.length); - putRow(testKeyBytesWritable, testValueBytesWritable); + int hashCode = HashCodeUtil.murmurHash(currentKey, 0, currentKey.length); + add(currentKey, 0, currentKey.length, hashCode, testValueBytesWritable); } public VectorMapJoinFastMultiKeyHashMap( - boolean isOuterJoin, - int initialCapacity, float loadFactor, int writeBuffersSize) { - super(initialCapacity, loadFactor, writeBuffersSize); + VectorMapJoinFastHashTableFactory mapJoinHashTableFactory, + boolean isOuterJoin, + int initialCapacity, float loadFactor, int writeBuffersSize, long memUsage) { + super(mapJoinHashTableFactory, initialCapacity, loadFactor, writeBuffersSize, memUsage); } } \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastMultiKeyHashMultiSet.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastMultiKeyHashMultiSet.java index ff82ac4..a1a20a1 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastMultiKeyHashMultiSet.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastMultiKeyHashMultiSet.java @@ -20,8 +20,11 @@ import java.io.IOException; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableManage.KeyValuePut; import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.serde2.SerDeException; import org.apache.hadoop.io.BytesWritable; +import org.apache.hive.common.util.HashCodeUtil; import com.google.common.annotations.VisibleForTesting; @@ -33,23 +36,35 @@ public class VectorMapJoinFastMultiKeyHashMultiSet extends VectorMapJoinFastBytesHashMultiSet { + @Override + public void put(KeyValuePut keyValuePut) throws SerDeException { + + VectorMapJoinFastKeyValuePut internalKeyValuePut = ((VectorMapJoinFastKeyValuePut) keyValuePut); + if (internalKeyValuePut.isNull()) { + return; + } + + BytesWritable keyBytesWritable = internalKeyValuePut.getKeyBytesWritable(); + add(keyBytesWritable.getBytes(), 0, keyBytesWritable.getLength(), + internalKeyValuePut.getKeyHashCode(), + null); + } + /* * A Unit Test convenience method for putting the key into the hash table using the * actual type. 
*/ @VisibleForTesting public void testPutRow(byte[] currentKey) throws HiveException, IOException { - if (testKeyBytesWritable == null) { - testKeyBytesWritable = new BytesWritable(); - } - testKeyBytesWritable.set(currentKey, 0, currentKey.length); - putRow(testKeyBytesWritable, null); + int hashCode = HashCodeUtil.murmurHash(currentKey, 0, currentKey.length); + add(currentKey, 0, currentKey.length, hashCode, null); } public VectorMapJoinFastMultiKeyHashMultiSet( - boolean isOuterJoin, - int initialCapacity, float loadFactor, int writeBuffersSize) { - super(initialCapacity, loadFactor, writeBuffersSize); + VectorMapJoinFastHashTableFactory mapJoinHashTableFactory, + boolean isOuterJoin, + int initialCapacity, float loadFactor, int writeBuffersSize, long memUsage) { + super(mapJoinHashTableFactory, initialCapacity, loadFactor, writeBuffersSize, memUsage); } } \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastMultiKeyHashSet.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastMultiKeyHashSet.java index de0666d..fa850fe 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastMultiKeyHashSet.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastMultiKeyHashSet.java @@ -20,8 +20,11 @@ import java.io.IOException; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableManage.KeyValuePut; import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.serde2.SerDeException; import org.apache.hadoop.io.BytesWritable; +import org.apache.hive.common.util.HashCodeUtil; import com.google.common.annotations.VisibleForTesting; @@ -33,24 +36,34 @@ public class VectorMapJoinFastMultiKeyHashSet extends VectorMapJoinFastBytesHashSet { + @Override + public void put(KeyValuePut keyValuePut) throws SerDeException { + + VectorMapJoinFastKeyValuePut internalKeyValuePut = ((VectorMapJoinFastKeyValuePut) keyValuePut); + if (internalKeyValuePut.isNull()) { + return; + } + + BytesWritable keyBytesWritable = internalKeyValuePut.getKeyBytesWritable(); + add(keyBytesWritable.getBytes(), 0, keyBytesWritable.getLength(), + internalKeyValuePut.getKeyHashCode(), + null); + } + /* * A Unit Test convenience method for putting the key into the hash table using the * actual type. 
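Each byte-array-keyed table (the multi-key and string variants) now overrides put(KeyValuePut) with the same shape: cast to the implementation-specific VectorMapJoinFastKeyValuePut, skip null keys, and forward the key bytes plus the precomputed hash code to add(). A minimal sketch of that dispatch pattern follows, using a simplified stand-in KeyValuePut rather than the Hive types.

public class BytesPutDispatchSketch {

  // Hypothetical carrier for an already-deserialized key/value pair and its hash code;
  // the real class in the patch is VectorMapJoinFastKeyValuePut.
  public static class KeyValuePut {
    byte[] keyBytes;
    int keyHashCode;
    byte[] valueBytes;

    boolean isNull() {
      return keyBytes == null;
    }
  }

  // Mirrors the pattern used by the multi-key and string tables: skip null keys, then
  // forward the key bytes and the precomputed hash code straight to add(), so the key
  // is deserialized and hashed once by the caller rather than once per table method.
  public void put(KeyValuePut keyValuePut) {
    if (keyValuePut.isNull()) {
      return;
    }
    add(keyValuePut.keyBytes, 0, keyValuePut.keyBytes.length,
        keyValuePut.keyHashCode, keyValuePut.valueBytes);
  }

  private void add(byte[] keyBytes, int keyStart, int keyLength, int hashCode, byte[] value) {
    // Placeholder body: the real tables write the key and value into their slot arrays
    // and write buffers here.
  }
}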
*/ @VisibleForTesting public void testPutRow(byte[] currentKey) throws HiveException, IOException { - if (testKeyBytesWritable == null) { - testKeyBytesWritable = new BytesWritable(); - } - testKeyBytesWritable.set(currentKey, 0, currentKey.length); - putRow(testKeyBytesWritable, null); + int hashCode = HashCodeUtil.murmurHash(currentKey, 0, currentKey.length); + add(currentKey, 0, currentKey.length, hashCode, null); } public VectorMapJoinFastMultiKeyHashSet( - boolean isOuterJoin, - int initialCapacity, float loadFactor, int writeBuffersSize) { - super(initialCapacity, loadFactor, writeBuffersSize); + VectorMapJoinFastHashTableFactory mapJoinHashTableFactory, + boolean isOuterJoin, + int initialCapacity, float loadFactor, int writeBuffersSize, long memUsage) { + super(mapJoinHashTableFactory, initialCapacity, loadFactor, writeBuffersSize, memUsage); } - - } \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastStringCommon.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastStringCommon.java deleted file mode 100644 index bf378ac..0000000 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastStringCommon.java +++ /dev/null @@ -1,72 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.hive.ql.exec.vector.mapjoin.fast; - -import java.io.IOException; - -import org.apache.hadoop.hive.ql.metadata.HiveException; -import org.apache.hadoop.hive.serde2.binarysortable.fast.BinarySortableDeserializeRead; -import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; -import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; -import org.apache.hadoop.io.BytesWritable; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/* - * An single byte array value hash map optimized for vector map join. 
- */ -public class VectorMapJoinFastStringCommon { - - public static final Logger LOG = LoggerFactory.getLogger(VectorMapJoinFastStringCommon.class); - - private boolean isOuterJoin; - - private BinarySortableDeserializeRead keyBinarySortableDeserializeRead; - - public void adaptPutRow(VectorMapJoinFastBytesHashTable hashTable, - BytesWritable currentKey, BytesWritable currentValue) throws HiveException, IOException { - - byte[] keyBytes = currentKey.getBytes(); - int keyLength = currentKey.getLength(); - keyBinarySortableDeserializeRead.set(keyBytes, 0, keyLength); - try { - if (keyBinarySortableDeserializeRead.readCheckNull()) { - return; - } - } catch (Exception e) { - throw new HiveException( - "\nDeserializeRead details: " + - keyBinarySortableDeserializeRead.getDetailedReadPositionString() + - "\nException: " + e.toString()); - } - - hashTable.add( - keyBinarySortableDeserializeRead.currentBytes, - keyBinarySortableDeserializeRead.currentBytesStart, - keyBinarySortableDeserializeRead.currentBytesLength, - currentValue); - } - - public VectorMapJoinFastStringCommon(boolean isOuterJoin) { - this.isOuterJoin = isOuterJoin; - PrimitiveTypeInfo[] primitiveTypeInfos = { TypeInfoFactory.stringTypeInfo }; - keyBinarySortableDeserializeRead = - new BinarySortableDeserializeRead(primitiveTypeInfos); - } -} \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastStringHashMap.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastStringHashMap.java index 35af1d1..50ac358 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastStringHashMap.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastStringHashMap.java @@ -18,10 +18,7 @@ package org.apache.hadoop.hive.ql.exec.vector.mapjoin.fast; -import java.io.IOException; - -import org.apache.hadoop.hive.ql.metadata.HiveException; -import org.apache.hadoop.io.BytesWritable; +import org.apache.hadoop.hive.serde2.SerDeException; /* * An single STRING key hash map optimized for vector map join. 
@@ -30,17 +27,25 @@ */ public class VectorMapJoinFastStringHashMap extends VectorMapJoinFastBytesHashMap { - private VectorMapJoinFastStringCommon stringCommon; - @Override - public void putRow(BytesWritable currentKey, BytesWritable currentValue) throws HiveException, IOException { - stringCommon.adaptPutRow(this, currentKey, currentValue); + public void put(KeyValuePut keyValuePut) throws SerDeException { + + VectorMapJoinFastKeyValuePut internalKeyValuePut = ((VectorMapJoinFastKeyValuePut) keyValuePut); + if (internalKeyValuePut.isNull()) { + return; + } + + add(internalKeyValuePut.getStringBytes(), + internalKeyValuePut.getStringStart(), + internalKeyValuePut.getStringLength(), + internalKeyValuePut.getKeyHashCode(), + internalKeyValuePut.getValueBytesWritable()); } public VectorMapJoinFastStringHashMap( + VectorMapJoinFastHashTableFactory mapJoinHashTableFactory, boolean isOuterJoin, - int initialCapacity, float loadFactor, int writeBuffersSize) { - super(initialCapacity, loadFactor, writeBuffersSize); - stringCommon = new VectorMapJoinFastStringCommon(isOuterJoin); + int initialCapacity, float loadFactor, int writeBuffersSize, long memUsage) { + super(mapJoinHashTableFactory, initialCapacity, loadFactor, writeBuffersSize, memUsage); } } \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastStringHashMultiSet.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastStringHashMultiSet.java index 36120b7..1c711eb 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastStringHashMultiSet.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastStringHashMultiSet.java @@ -18,10 +18,7 @@ package org.apache.hadoop.hive.ql.exec.vector.mapjoin.fast; -import java.io.IOException; - -import org.apache.hadoop.hive.ql.metadata.HiveException; -import org.apache.hadoop.io.BytesWritable; +import org.apache.hadoop.hive.serde2.SerDeException; /* * An single STRING key hash multi-set optimized for vector map join. 
@@ -30,17 +27,25 @@ */ public class VectorMapJoinFastStringHashMultiSet extends VectorMapJoinFastBytesHashMultiSet { - private VectorMapJoinFastStringCommon stringCommon; - @Override - public void putRow(BytesWritable currentKey, BytesWritable currentValue) throws HiveException, IOException { - stringCommon.adaptPutRow(this, currentKey, currentValue); + public void put(KeyValuePut keyValuePut) throws SerDeException { + + VectorMapJoinFastKeyValuePut internalKeyValuePut = ((VectorMapJoinFastKeyValuePut) keyValuePut); + if (internalKeyValuePut.isNull()) { + return; + } + + add(internalKeyValuePut.getStringBytes(), + internalKeyValuePut.getStringStart(), + internalKeyValuePut.getStringLength(), + internalKeyValuePut.getKeyHashCode(), + internalKeyValuePut.getValueBytesWritable()); } public VectorMapJoinFastStringHashMultiSet( + VectorMapJoinFastHashTableFactory mapJoinHashTableFactory, boolean isOuterJoin, - int initialCapacity, float loadFactor, int writeBuffersSize) { - super(initialCapacity, loadFactor, writeBuffersSize); - stringCommon = new VectorMapJoinFastStringCommon(isOuterJoin); + int initialCapacity, float loadFactor, int writeBuffersSize, long memUsage) { + super(mapJoinHashTableFactory, initialCapacity, loadFactor, writeBuffersSize, memUsage); } } \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastStringHashSet.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastStringHashSet.java index 2ed6ab3..887b05b 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastStringHashSet.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastStringHashSet.java @@ -18,10 +18,7 @@ package org.apache.hadoop.hive.ql.exec.vector.mapjoin.fast; -import java.io.IOException; - -import org.apache.hadoop.hive.ql.metadata.HiveException; -import org.apache.hadoop.io.BytesWritable; +import org.apache.hadoop.hive.serde2.SerDeException; /* * An single STRING key hash set optimized for vector map join. 
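With VectorMapJoinFastStringCommon removed, the string tables no longer deserialize the BinarySortable key inside putRow; the key bytes, offset, length, and hash code arrive already extracted on the KeyValuePut. The testPutRow helpers follow the same convention by hashing the raw key once with HashCodeUtil.murmurHash. A small sketch of that precompute-once usage is below; HashCodeUtil is the utility already used in the patch, while the key literal and the commented add() call are illustrative only.

import org.apache.hive.common.util.HashCodeUtil;

public class PrecomputedHashSketch {
  public static void main(String[] args) {
    byte[] key = "customer_42".getBytes();

    // One murmur hash per key, computed by the caller...
    int hashCode = HashCodeUtil.murmurHash(key, 0, key.length);

    // ...then passed to every table operation instead of being recomputed inside
    // add()/contains(), e.g. table.add(key, 0, key.length, hashCode, null);
    System.out.println("key hash = " + hashCode);
  }
}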
@@ -30,17 +27,25 @@ */ public class VectorMapJoinFastStringHashSet extends VectorMapJoinFastBytesHashSet { - private VectorMapJoinFastStringCommon stringCommon; - @Override - public void putRow(BytesWritable currentKey, BytesWritable currentValue) throws HiveException, IOException { - stringCommon.adaptPutRow(this, currentKey, currentValue); + public void put(KeyValuePut keyValuePut) throws SerDeException { + + VectorMapJoinFastKeyValuePut internalKeyValuePut = ((VectorMapJoinFastKeyValuePut) keyValuePut); + if (internalKeyValuePut.isNull()) { + return; + } + + add(internalKeyValuePut.getStringBytes(), + internalKeyValuePut.getStringStart(), + internalKeyValuePut.getStringLength(), + internalKeyValuePut.getKeyHashCode(), + internalKeyValuePut.getValueBytesWritable()); } public VectorMapJoinFastStringHashSet( + VectorMapJoinFastHashTableFactory mapJoinHashTableFactory, boolean isOuterJoin, - int initialCapacity, float loadFactor, int writeBuffersSize) { - super(initialCapacity, loadFactor, writeBuffersSize); - stringCommon = new VectorMapJoinFastStringCommon(isOuterJoin); + int initialCapacity, float loadFactor, int writeBuffersSize, long memUsage) { + super(mapJoinHashTableFactory, initialCapacity, loadFactor, writeBuffersSize, memUsage); } } \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastTableContainer.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastTableContainer.java deleted file mode 100644 index 9f3b107..0000000 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastTableContainer.java +++ /dev/null @@ -1,233 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.hadoop.hive.ql.exec.vector.mapjoin.fast; - -import java.io.IOException; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.hive.conf.HiveConf; -import org.apache.hadoop.hive.ql.exec.persistence.HashMapWrapper; -import org.apache.hadoop.hive.ql.exec.persistence.MapJoinKey; -import org.apache.hadoop.hive.ql.exec.persistence.MapJoinObjectSerDeContext; -import org.apache.hadoop.hive.ql.exec.tez.HashTableLoader; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashTable; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinTableContainer; -import org.apache.hadoop.hive.ql.metadata.HiveException; -import org.apache.hadoop.hive.ql.plan.MapJoinDesc; -import org.apache.hadoop.hive.ql.plan.VectorMapJoinDesc; -import org.apache.hadoop.hive.ql.plan.VectorMapJoinDesc.HashTableImplementationType; -import org.apache.hadoop.hive.ql.plan.VectorMapJoinDesc.HashTableKeyType; -import org.apache.hadoop.hive.ql.plan.VectorMapJoinDesc.HashTableKind; -import org.apache.hadoop.hive.serde2.SerDeException; -import org.apache.hadoop.io.BytesWritable; -import org.apache.hadoop.io.Writable; -import org.apache.tez.runtime.library.api.KeyValueReader; - -/** - * HashTableLoader for Tez constructs the hashtable from records read from - * a broadcast edge. - */ -public class VectorMapJoinFastTableContainer implements VectorMapJoinTableContainer { - - private static final Logger LOG = LoggerFactory.getLogger(HashTableLoader.class.getName()); - - private final MapJoinDesc desc; - private final Configuration hconf; - - private final float keyCountAdj; - private final int threshold; - private final float loadFactor; - private final int wbSize; - private final long keyCount; - - - private final VectorMapJoinFastHashTable vectorMapJoinFastHashTable; - - public VectorMapJoinFastTableContainer(MapJoinDesc desc, Configuration hconf, - long keyCount) throws SerDeException { - - this.desc = desc; - this.hconf = hconf; - - keyCountAdj = HiveConf.getFloatVar(hconf, HiveConf.ConfVars.HIVEHASHTABLEKEYCOUNTADJUSTMENT); - threshold = HiveConf.getIntVar(hconf, HiveConf.ConfVars.HIVEHASHTABLETHRESHOLD); - loadFactor = HiveConf.getFloatVar(hconf, HiveConf.ConfVars.HIVEHASHTABLELOADFACTOR); - wbSize = HiveConf.getIntVar(hconf, HiveConf.ConfVars.HIVEHASHTABLEWBSIZE); - - this.keyCount = keyCount; - - // LOG.info("VectorMapJoinFastTableContainer load keyCountAdj " + keyCountAdj); - // LOG.info("VectorMapJoinFastTableContainer load threshold " + threshold); - // LOG.info("VectorMapJoinFastTableContainer load loadFactor " + loadFactor); - // LOG.info("VectorMapJoinFastTableContainer load wbSize " + wbSize); - - int newThreshold = HashMapWrapper.calculateTableSize( - keyCountAdj, threshold, loadFactor, keyCount); - - // LOG.debug("VectorMapJoinFastTableContainer load newThreshold " + newThreshold); - - vectorMapJoinFastHashTable = createHashTable(newThreshold); - } - - @Override - public VectorMapJoinHashTable vectorMapJoinHashTable() { - return vectorMapJoinFastHashTable; - } - - private VectorMapJoinFastHashTable createHashTable(int newThreshold) { - - boolean isOuterJoin = !desc.isNoOuterJoin(); - VectorMapJoinDesc vectorDesc = desc.getVectorDesc(); - HashTableImplementationType hashTableImplementationType = vectorDesc.hashTableImplementationType(); - HashTableKind hashTableKind = vectorDesc.hashTableKind(); - HashTableKeyType hashTableKeyType = vectorDesc.hashTableKeyType(); - boolean 
minMaxEnabled = vectorDesc.minMaxEnabled(); - - int writeBufferSize = HiveConf.getIntVar(hconf, HiveConf.ConfVars.HIVEHASHTABLEWBSIZE); - - VectorMapJoinFastHashTable hashTable = null; - - switch (hashTableKeyType) { - case BOOLEAN: - case BYTE: - case SHORT: - case INT: - case LONG: - switch (hashTableKind) { - case HASH_MAP: - hashTable = new VectorMapJoinFastLongHashMap( - minMaxEnabled, isOuterJoin, hashTableKeyType, - newThreshold, loadFactor, writeBufferSize); - break; - case HASH_MULTISET: - hashTable = new VectorMapJoinFastLongHashMultiSet( - minMaxEnabled, isOuterJoin, hashTableKeyType, - newThreshold, loadFactor, writeBufferSize); - break; - case HASH_SET: - hashTable = new VectorMapJoinFastLongHashSet( - minMaxEnabled, isOuterJoin, hashTableKeyType, - newThreshold, loadFactor, writeBufferSize); - break; - } - break; - - case STRING: - switch (hashTableKind) { - case HASH_MAP: - hashTable = new VectorMapJoinFastStringHashMap( - isOuterJoin, - newThreshold, loadFactor, writeBufferSize); - break; - case HASH_MULTISET: - hashTable = new VectorMapJoinFastStringHashMultiSet( - isOuterJoin, - newThreshold, loadFactor, writeBufferSize); - break; - case HASH_SET: - hashTable = new VectorMapJoinFastStringHashSet( - isOuterJoin, - newThreshold, loadFactor, writeBufferSize); - break; - } - break; - - case MULTI_KEY: - switch (hashTableKind) { - case HASH_MAP: - hashTable = new VectorMapJoinFastMultiKeyHashMap( - isOuterJoin, - newThreshold, loadFactor, writeBufferSize); - break; - case HASH_MULTISET: - hashTable = new VectorMapJoinFastMultiKeyHashMultiSet( - isOuterJoin, - newThreshold, loadFactor, writeBufferSize); - break; - case HASH_SET: - hashTable = new VectorMapJoinFastMultiKeyHashSet( - isOuterJoin, - newThreshold, loadFactor, writeBufferSize); - break; - } - break; - } - - return hashTable; - } - - @Override - public MapJoinKey putRow(Writable currentKey, Writable currentValue) - throws SerDeException, HiveException, IOException { - - // We are not using the key and value contexts, nor do we support a MapJoinKey. - vectorMapJoinFastHashTable.putRow((BytesWritable) currentKey, (BytesWritable) currentValue); - return null; - } - - @Override - public void seal() { - // Do nothing - } - - @Override - public ReusableGetAdaptor createGetter(MapJoinKey keyTypeFromLoader) { - throw new RuntimeException("Not applicable"); - } - - @Override - public void clear() { - // Do nothing - } - - @Override - public MapJoinKey getAnyKey() { - throw new RuntimeException("Not applicable"); - } - - @Override - public void dumpMetrics() { - // TODO - } - - @Override - public boolean hasSpill() { - return false; - } - - @Override - public int size() { - return vectorMapJoinFastHashTable.size(); - } - - @Override - public void setSerde(MapJoinObjectSerDeContext keyCtx, MapJoinObjectSerDeContext valCtx) - throws SerDeException { - // Do nothing in this case. 
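The deleted VectorMapJoinFastTableContainer.createHashTable selected the concrete table with nested switches over HashTableKeyType and HashTableKind; in the patch that responsibility appears to move behind VectorMapJoinFastHashTableFactory, whose implementation is not shown in these hunks. A condensed sketch of the selection logic the old container performed, using placeholder enum and class names rather than the Hive types:

public class HashTableSelectionSketch {

  enum KeyType { LONG, STRING, MULTI_KEY }
  enum Kind { HASH_MAP, HASH_MULTISET, HASH_SET }

  interface FastHashTable { }
  static class LongHashMap implements FastHashTable { }
  static class LongHashSet implements FastHashTable { }
  static class BytesHashMap implements FastHashTable { }
  static class BytesHashSet implements FastHashTable { }

  // Pick the concrete table from the key type and the table kind, as the deleted
  // createHashTable() did; string and multi-key keys share the byte-array tables.
  static FastHashTable create(KeyType keyType, Kind kind) {
    switch (keyType) {
      case LONG:
        return (kind == Kind.HASH_MAP) ? new LongHashMap() : new LongHashSet();
      case STRING:
      case MULTI_KEY:
      default:
        return (kind == Kind.HASH_MAP) ? new BytesHashMap() : new BytesHashSet();
    }
  }
}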
- - } - - /* - @Override - public com.esotericsoftware.kryo.io.Output getHybridBigTableSpillOutput(int partitionId) { - throw new RuntimeException("Not applicable"); - } - */ -} diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastValueStore.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastValueStore.java index f9c5b34..05f694b 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastValueStore.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastValueStore.java @@ -20,7 +20,9 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashMapResult; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashMapResult; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableResultImpl; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableResult.MapJoinResult; import org.apache.hadoop.hive.serde2.WriteBuffers; import org.apache.hadoop.hive.serde2.WriteBuffers.ByteSegmentRef; import org.apache.hadoop.hive.serde2.WriteBuffers.Position; @@ -113,7 +115,8 @@ public WriteBuffers writeBuffers() { return writeBuffers; } - public static class HashMapResult extends VectorMapJoinHashMapResult { + public static class HashMapResult extends MapJoinHashTableResultImpl + implements MapJoinHashMapResult { private VectorMapJoinFastValueStore valueStore; @@ -141,7 +144,7 @@ public HashMapResult() { readPos = new Position(); } - public void set(VectorMapJoinFastValueStore valueStore, long valueRefWord) { + public void setMatch(VectorMapJoinFastValueStore valueStore, long valueRefWord) { this.valueStore = valueStore; this.valueRefWord = valueRefWord; @@ -152,6 +155,8 @@ public void set(VectorMapJoinFastValueStore valueStore, long valueRefWord) { (int) ((valueRefWord & CappedCount.bitMask) >> CappedCount.bitShift); // Position to beginning. readIndex = 0; + + mapJoinResult = MapJoinResult.MATCH; } /** @@ -408,6 +413,16 @@ public String toString() { sb.append("cappedCount " + cappedCount() + ")"); return sb.toString(); } + + @Override + public boolean isAliasFilterAvailable() { + return false; + } + + @Override + public byte aliasFilter() { + return 0; + } } /** diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/hashtable/VectorMapJoinBytesHashMap.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/hashtable/VectorMapJoinBytesHashMap.java deleted file mode 100644 index 512db1b..0000000 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/hashtable/VectorMapJoinBytesHashMap.java +++ /dev/null @@ -1,51 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable; - -import java.io.IOException; - -import org.apache.hadoop.hive.ql.exec.JoinUtil; - -/* - * The interface for a single byte array key hash map lookup method. - */ -public interface VectorMapJoinBytesHashMap - extends VectorMapJoinBytesHashTable, VectorMapJoinHashMap { - - /* - * Lookup a byte array key in the hash map. - * - * @param keyBytes - * A byte array containing the key within a range. - * @param keyStart - * The offset the beginning of the key. - * @param keyLength - * The length of the key. - * @param hashMapResult - * The object to receive small table value(s) information on a MATCH. - * Or, for SPILL, it has information on where to spill the big table row. - * - * @return - * Whether the lookup was a match, no match, or spill (the partition with the key - * is currently spilled). - */ - JoinUtil.JoinResult lookup(byte[] keyBytes, int keyStart, int keyLength, - VectorMapJoinHashMapResult hashMapResult) throws IOException; - -} \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/hashtable/VectorMapJoinBytesHashMultiSet.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/hashtable/VectorMapJoinBytesHashMultiSet.java deleted file mode 100644 index 196403d..0000000 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/hashtable/VectorMapJoinBytesHashMultiSet.java +++ /dev/null @@ -1,51 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable; - -import java.io.IOException; - -import org.apache.hadoop.hive.ql.exec.JoinUtil; - -/* - * The interface for a single byte array key hash multi-set contains method. - */ -public interface VectorMapJoinBytesHashMultiSet - extends VectorMapJoinBytesHashTable, VectorMapJoinHashMultiSet { - - /* - * Lookup an byte array key in the hash multi-set. - * - * @param keyBytes - * A byte array containing the key within a range. - * @param keyStart - * The offset the beginning of the key. - * @param keyLength - * The length of the key. - * @param hashMultiSetResult - * The object to receive small table value(s) information on a MATCH. - * Or, for SPILL, it has information on where to spill the big table row. - * - * @return - * Whether the lookup was a match, no match, or spilled (the partition with the key - * is currently spilled). 
- */ - JoinUtil.JoinResult contains(byte[] keyBytes, int keyStart, int keyLength, - VectorMapJoinHashMultiSetResult hashMultiSetResult) throws IOException; - -} \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/hashtable/VectorMapJoinBytesHashSet.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/hashtable/VectorMapJoinBytesHashSet.java deleted file mode 100644 index a0c93e5..0000000 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/hashtable/VectorMapJoinBytesHashSet.java +++ /dev/null @@ -1,51 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable; - -import java.io.IOException; - -import org.apache.hadoop.hive.ql.exec.JoinUtil; - -/* - * The interface for a single byte array key hash multi-set contains method. - */ -public interface VectorMapJoinBytesHashSet - extends VectorMapJoinBytesHashTable, VectorMapJoinHashSet { - - /* - * Lookup an byte array key in the hash set. - * - * @param keyBytes - * A byte array containing the key within a range. - * @param keyStart - * The offset the beginning of the key. - * @param keyLength - * The length of the key. - * @param hashSetResult - * The object to receive small table value(s) information on a MATCH. - * Or, for SPILL, it has information on where to spill the big table row. - * - * @return - * Whether the lookup was a match, no match, or spilled (the partition with the key - * is currently spilled). - */ - JoinUtil.JoinResult contains(byte[] keyBytes, int keyStart, int keyLength, - VectorMapJoinHashSetResult hashSetResult) throws IOException; - -} \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/hashtable/VectorMapJoinBytesHashTable.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/hashtable/VectorMapJoinBytesHashTable.java deleted file mode 100644 index 7494e1d..0000000 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/hashtable/VectorMapJoinBytesHashTable.java +++ /dev/null @@ -1,26 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable; - -/* - * Interface for a vector map join hash table (which could be a hash map, hash multi-set, or - * hash set) for a single byte array key. - */ -public interface VectorMapJoinBytesHashTable extends VectorMapJoinHashTable { -} \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/hashtable/VectorMapJoinHashMap.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/hashtable/VectorMapJoinHashMap.java deleted file mode 100644 index 7abe2c8..0000000 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/hashtable/VectorMapJoinHashMap.java +++ /dev/null @@ -1,34 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable; - -/* - * The root interface for a vector map join hash map. - */ -public interface VectorMapJoinHashMap extends VectorMapJoinHashTable { - - /* - * @return A new hash map result implementation specific object. - * - * The object can be used to access the values when there is a match, or - * access spill information when the partition with the key is currently spilled. - */ - VectorMapJoinHashMapResult createHashMapResult(); - -} diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/hashtable/VectorMapJoinHashMapResult.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/hashtable/VectorMapJoinHashMapResult.java deleted file mode 100644 index a5dfba8..0000000 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/hashtable/VectorMapJoinHashMapResult.java +++ /dev/null @@ -1,63 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable; - -import org.apache.hadoop.hive.serde2.WriteBuffers.ByteSegmentRef; - -/* - * Abstract class for a hash map result. For reading the values, one-by-one. - */ -public abstract class VectorMapJoinHashMapResult extends VectorMapJoinHashTableResult { - - /** - * @return Whether there are any rows (i.e. true for match). - */ - public abstract boolean hasRows(); - - /** - * @return Whether there is 1 value row. - */ - public abstract boolean isSingleRow(); - - /** - * @return Whether there is a capped count available from cappedCount. - */ - public abstract boolean isCappedCountAvailable(); - - /** - * @return The count of values, up to a arbitrary cap limit. When available, the capped - * count can be used to make decisions on how to optimally generate join results. - */ - public abstract int cappedCount(); - - /** - * @return A reference to the first value, or null if there are no values. - */ - public abstract ByteSegmentRef first(); - - /** - * @return The next value, or null if there are no more values to be read. - */ - public abstract ByteSegmentRef next(); - - /** - * Get detailed HashMap result position information to help diagnose exceptions. - */ - public abstract String getDetailedHashMapResultPositionString(); -} diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/hashtable/VectorMapJoinHashMultiSet.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/hashtable/VectorMapJoinHashMultiSet.java deleted file mode 100644 index 210597d..0000000 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/hashtable/VectorMapJoinHashMultiSet.java +++ /dev/null @@ -1,31 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable; - -public interface VectorMapJoinHashMultiSet extends VectorMapJoinHashTable { - - /* - * @return A new hash multi-set result implementation specific object. - * - * The object can be used to access the *count* of values when the key is contained in the - * multi-set, or access spill information when the partition with the key is currently spilled. 
- */ - VectorMapJoinHashMultiSetResult createHashMultiSetResult(); - -} diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/hashtable/VectorMapJoinHashMultiSetResult.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/hashtable/VectorMapJoinHashMultiSetResult.java deleted file mode 100644 index 0728f78..0000000 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/hashtable/VectorMapJoinHashMultiSetResult.java +++ /dev/null @@ -1,34 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable; - -/* - * Abstract class for a hash multi-set result. - */ -public abstract class VectorMapJoinHashMultiSetResult extends VectorMapJoinHashTableResult { - - protected long count; - - /* - * @return The multi-set count for the lookup key. - */ - public long count() { - return count; - } -} diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/hashtable/VectorMapJoinHashSet.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/hashtable/VectorMapJoinHashSet.java deleted file mode 100644 index a26f997..0000000 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/hashtable/VectorMapJoinHashSet.java +++ /dev/null @@ -1,34 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable; - -/* - * The root interface for a vector map join hash set. - */ -public interface VectorMapJoinHashSet extends VectorMapJoinHashTable { - - /* - * @return A new hash set result implementation specific object. - * - * The object can be used to access access spill information when the partition with the key - * is currently spilled. 
- */ - VectorMapJoinHashSetResult createHashSetResult(); - -} \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/hashtable/VectorMapJoinHashSetResult.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/hashtable/VectorMapJoinHashSetResult.java deleted file mode 100644 index 467c4c1..0000000 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/hashtable/VectorMapJoinHashSetResult.java +++ /dev/null @@ -1,28 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable; - -/* - * Abstract class for a hash set result. - */ -public abstract class VectorMapJoinHashSetResult extends VectorMapJoinHashTableResult { - - // Nothing currently available for hash sets. - -} diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/hashtable/VectorMapJoinHashTable.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/hashtable/VectorMapJoinHashTable.java deleted file mode 100644 index c7e585c..0000000 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/hashtable/VectorMapJoinHashTable.java +++ /dev/null @@ -1,47 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable; - -import java.io.IOException; - -import org.apache.hadoop.hive.ql.metadata.HiveException; -import org.apache.hadoop.hive.serde2.SerDeException; -import org.apache.hadoop.io.BytesWritable; - -/* - * Root interface for a vector map join hash table (which could be a hash map, hash multi-set, or - * hash set). - */ -public interface VectorMapJoinHashTable { - - - /* - * @param currentKey - * The current key. - * @param currentValue - * The current value. 
- */ - void putRow(BytesWritable currentKey, BytesWritable currentValue) - throws SerDeException, HiveException, IOException; - - /** - * Get hash table size - */ - int size(); -} diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/hashtable/VectorMapJoinHashTableResult.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/hashtable/VectorMapJoinHashTableResult.java deleted file mode 100644 index b51d6fe..0000000 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/hashtable/VectorMapJoinHashTableResult.java +++ /dev/null @@ -1,89 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable; - -import org.apache.hadoop.hive.ql.exec.JoinUtil; -import org.apache.hadoop.hive.serde2.WriteBuffers; - -/* - * Root abstract class for a hash table result. - */ -public abstract class VectorMapJoinHashTableResult { - - private JoinUtil.JoinResult joinResult; - - private int spillPartitionId; - - private final WriteBuffers.Position readPos; - - public VectorMapJoinHashTableResult() { - joinResult = JoinUtil.JoinResult.NOMATCH; - spillPartitionId = -1; - readPos = new WriteBuffers.Position(); - } - - /** - * @return The join result from the most recent hash map match, or hash multi-set / set contains - * call. - */ - public JoinUtil.JoinResult joinResult() { - return joinResult; - } - - /** - * Set the current join result. - * @param joinResult - * The new join result. - */ - public void setJoinResult(JoinUtil.JoinResult joinResult) { - this.joinResult = joinResult; - } - - /** - * Forget about the most recent hash table lookup or contains call. - */ - public void forget() { - joinResult = JoinUtil.JoinResult.NOMATCH; - } - - /** - * Set the spill partition id. - */ - public void setSpillPartitionId(int spillPartitionId) { - this.spillPartitionId = spillPartitionId; - } - - /** - * @return The Hybrid Grace spill partition id. - */ - public int spillPartitionId() { - return spillPartitionId; - } - - @Override - public String toString() { - StringBuilder sb = new StringBuilder(); - sb.append("joinResult " + joinResult.name()); - return sb.toString(); - } - - public WriteBuffers.Position getReadPos() { - return readPos; - } -} diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/hashtable/VectorMapJoinLongHashMap.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/hashtable/VectorMapJoinLongHashMap.java deleted file mode 100644 index f180d02..0000000 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/hashtable/VectorMapJoinLongHashMap.java +++ /dev/null @@ -1,46 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. 
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable; - -import java.io.IOException; - -import org.apache.hadoop.hive.ql.exec.JoinUtil; - -/* - * The interface for a single long key hash map lookup method. - */ -public interface VectorMapJoinLongHashMap - extends VectorMapJoinLongHashTable, VectorMapJoinHashMap { - - /* - * Lookup an long in the hash map. - * - * @param key - * The long key. - * @param hashMapResult - * The object to receive small table value(s) information on a MATCH. - * Or, for SPILL, it has information on where to spill the big table row. - * - * @return - * Whether the lookup was a match, no match, or spilled (the partition with the key - * is currently spilled). - */ - JoinUtil.JoinResult lookup(long key, VectorMapJoinHashMapResult hashMapResult) throws IOException; - -} \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/hashtable/VectorMapJoinLongHashMultiSet.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/hashtable/VectorMapJoinLongHashMultiSet.java deleted file mode 100644 index 7477584..0000000 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/hashtable/VectorMapJoinLongHashMultiSet.java +++ /dev/null @@ -1,46 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable; - -import java.io.IOException; - -import org.apache.hadoop.hive.ql.exec.JoinUtil; - -/* - * The interface for a single long key hash multi-set contains method. - */ -public interface VectorMapJoinLongHashMultiSet - extends VectorMapJoinLongHashTable, VectorMapJoinHashMultiSet { - - /* - * Lookup an long in the hash multi-set. - * - * @param key - * The long key. - * @param hashMultiSetResult - * The object to receive small table value(s) information on a MATCH. - * Or, for SPILL, it has information on where to spill the big table row. - * - * @return - * Whether the lookup was a match, no match, or spilled (the partition with the key - * is currently spilled). 
- */ - JoinUtil.JoinResult contains(long key, VectorMapJoinHashMultiSetResult hashMultiSetResult) throws IOException; - -} diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/hashtable/VectorMapJoinLongHashSet.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/hashtable/VectorMapJoinLongHashSet.java deleted file mode 100644 index 8c28bff..0000000 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/hashtable/VectorMapJoinLongHashSet.java +++ /dev/null @@ -1,46 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable; - -import java.io.IOException; - -import org.apache.hadoop.hive.ql.exec.JoinUtil; - -/* - * The interface adds the single long key hash multi-set contains method. - */ -public interface VectorMapJoinLongHashSet - extends VectorMapJoinLongHashTable, VectorMapJoinHashSet { - - /* - * Lookup an long in the hash set. - * - * @param key - * The long key. - * @param hashSetResult - * The object to receive small table value(s) information on a MATCH. - * Or, for SPILL, it has information on where to spill the big table row. - * - * @return - * Whether the lookup was a match, no match, or spilled (the partition with the key - * is currently spilled). - */ - JoinUtil.JoinResult contains(long key, VectorMapJoinHashSetResult hashSetResult) throws IOException; - -} diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/hashtable/VectorMapJoinLongHashTable.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/hashtable/VectorMapJoinLongHashTable.java deleted file mode 100644 index 046a403..0000000 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/hashtable/VectorMapJoinLongHashTable.java +++ /dev/null @@ -1,31 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable; - -/* - * Interface for a vector map join hash table (which could be a hash map, hash multi-set, or - * hash set) for a single long. - */ -public interface VectorMapJoinLongHashTable extends VectorMapJoinHashTable { - - boolean useMinMax(); - long min(); - long max(); - -} \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/hashtable/VectorMapJoinTableContainer.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/hashtable/VectorMapJoinTableContainer.java deleted file mode 100644 index 09631e4..0000000 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/hashtable/VectorMapJoinTableContainer.java +++ /dev/null @@ -1,28 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable; - -import org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainer; - -public interface VectorMapJoinTableContainer extends MapJoinTableContainer { - - VectorMapJoinHashTable vectorMapJoinHashTable(); - - // com.esotericsoftware.kryo.io.Output getHybridBigTableSpillOutput(int partitionId); -} diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/optimized/VectorMapJoinOptimizedCreateHashTable.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/optimized/VectorMapJoinOptimizedCreateHashTable.java deleted file mode 100644 index f34b1cd..0000000 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/optimized/VectorMapJoinOptimizedCreateHashTable.java +++ /dev/null @@ -1,129 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.hadoop.hive.ql.exec.vector.mapjoin.optimized; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import org.apache.hadoop.hive.ql.exec.persistence.MapJoinKey; -import org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainer; -import org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainer.ReusableGetAdaptor; -import org.apache.hadoop.hive.ql.plan.MapJoinDesc; -import org.apache.hadoop.hive.ql.plan.VectorMapJoinDesc; -import org.apache.hadoop.hive.ql.plan.VectorMapJoinDesc.HashTableKeyType; -import org.apache.hadoop.hive.ql.plan.VectorMapJoinDesc.HashTableKind; - -/** - */ -public class VectorMapJoinOptimizedCreateHashTable { - - private static final Logger LOG = LoggerFactory.getLogger(VectorMapJoinOptimizedCreateHashTable.class.getName()); - - public static VectorMapJoinOptimizedHashTable createHashTable(MapJoinDesc desc, - MapJoinTableContainer mapJoinTableContainer) { - - MapJoinKey refKey = mapJoinTableContainer.getAnyKey(); - ReusableGetAdaptor hashMapRowGetter = mapJoinTableContainer.createGetter(refKey); - - boolean isOuterJoin = !desc.isNoOuterJoin(); - VectorMapJoinDesc vectorDesc = desc.getVectorDesc(); - HashTableKind hashTableKind = vectorDesc.hashTableKind(); - HashTableKeyType hashTableKeyType = vectorDesc.hashTableKeyType(); - boolean minMaxEnabled = vectorDesc.minMaxEnabled(); - - VectorMapJoinOptimizedHashTable hashTable = null; - - switch (hashTableKeyType) { - case BOOLEAN: - case BYTE: - case SHORT: - case INT: - case LONG: - switch (hashTableKind) { - case HASH_MAP: - hashTable = new VectorMapJoinOptimizedLongHashMap( - minMaxEnabled, isOuterJoin, hashTableKeyType, - mapJoinTableContainer, hashMapRowGetter); - break; - case HASH_MULTISET: - hashTable = new VectorMapJoinOptimizedLongHashMultiSet( - minMaxEnabled, isOuterJoin, hashTableKeyType, - mapJoinTableContainer, hashMapRowGetter); - break; - case HASH_SET: - hashTable = new VectorMapJoinOptimizedLongHashSet( - minMaxEnabled, isOuterJoin, hashTableKeyType, - mapJoinTableContainer, hashMapRowGetter); - break; - } - break; - - case STRING: - switch (hashTableKind) { - case HASH_MAP: - hashTable = new VectorMapJoinOptimizedStringHashMap( - isOuterJoin, - mapJoinTableContainer, hashMapRowGetter); - break; - case HASH_MULTISET: - hashTable = new VectorMapJoinOptimizedStringHashMultiSet( - isOuterJoin, - mapJoinTableContainer, hashMapRowGetter); - break; - case HASH_SET: - hashTable = new VectorMapJoinOptimizedStringHashSet( - isOuterJoin, - mapJoinTableContainer, hashMapRowGetter); - break; - } - break; - - case MULTI_KEY: - switch (hashTableKind) { - case HASH_MAP: - hashTable = new VectorMapJoinOptimizedMultiKeyHashMap( - isOuterJoin, - mapJoinTableContainer, hashMapRowGetter); - break; - case HASH_MULTISET: - hashTable = new VectorMapJoinOptimizedMultiKeyHashMultiSet( - isOuterJoin, - mapJoinTableContainer, hashMapRowGetter); - break; - case HASH_SET: - hashTable = new VectorMapJoinOptimizedMultiKeyHashSet( - isOuterJoin, - mapJoinTableContainer, hashMapRowGetter); - break; - } - break; - } - return hashTable; - } - - /* - @Override - public com.esotericsoftware.kryo.io.Output getHybridBigTableSpillOutput(int partitionId) { - - HybridHashTableContainer ht = (HybridHashTableContainer) mapJoinTableContainer; - - HashPartition hp = ht.getHashPartitions()[partitionId]; - - return hp.getMatchfileOutput(); - } - */ -} diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/optimized/VectorMapJoinOptimizedHashMap.java 
ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/optimized/VectorMapJoinOptimizedHashMap.java deleted file mode 100644 index eada694..0000000 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/optimized/VectorMapJoinOptimizedHashMap.java +++ /dev/null @@ -1,129 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.hive.ql.exec.vector.mapjoin.optimized; - -import java.io.IOException; - -import org.apache.hadoop.hive.ql.exec.JoinUtil; -import org.apache.hadoop.hive.ql.exec.persistence.BytesBytesMultiHashMap; -import org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainer; -import org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainer.ReusableGetAdaptor; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinBytesHashMap; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashMapResult; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashTableResult; -import org.apache.hadoop.hive.serde2.WriteBuffers.ByteSegmentRef; - -public class VectorMapJoinOptimizedHashMap - extends VectorMapJoinOptimizedHashTable - implements VectorMapJoinBytesHashMap { - - @Override - public VectorMapJoinHashMapResult createHashMapResult() { - return new HashMapResult(); - } - - public static class HashMapResult extends VectorMapJoinHashMapResult { - - private BytesBytesMultiHashMap.Result bytesBytesMultiHashMapResult; - - public HashMapResult() { - super(); - bytesBytesMultiHashMapResult = new BytesBytesMultiHashMap.Result(); - } - - public BytesBytesMultiHashMap.Result bytesBytesMultiHashMapResult() { - return bytesBytesMultiHashMapResult; - } - - @Override - public boolean hasRows() { - return (joinResult() == JoinUtil.JoinResult.MATCH); - } - - @Override - public boolean isSingleRow() { - if (joinResult() != JoinUtil.JoinResult.MATCH) { - throw new RuntimeException("HashMapResult is not a match"); - } - return bytesBytesMultiHashMapResult.isSingleRow(); - } - - @Override - public boolean isCappedCountAvailable() { - return true; - } - - @Override - public int cappedCount() { - // the return values are capped to return ==0, ==1 and >= 2 - return hasRows() ? (isSingleRow() ? 
1 : 2) : 0; - } - - @Override - public ByteSegmentRef first() { - if (joinResult() != JoinUtil.JoinResult.MATCH) { - throw new RuntimeException("HashMapResult is not a match"); - } - return bytesBytesMultiHashMapResult.first(); - } - - @Override - public ByteSegmentRef next() { - return bytesBytesMultiHashMapResult.next(); - } - - @Override - public void forget() { - bytesBytesMultiHashMapResult.forget(); - super.forget(); - } - - @Override - public String toString() { - StringBuilder sb = new StringBuilder(); - sb.append("(" + super.toString() + ", "); - sb.append("isSingleRow " + (joinResult() == JoinUtil.JoinResult.MATCH ? isSingleRow() : "") + ")"); - return sb.toString(); - } - - @Override - public String getDetailedHashMapResultPositionString() { - return "(Not supported yet)"; - } - } - - @Override - public JoinUtil.JoinResult lookup(byte[] keyBytes, int keyOffset, int keyLength, - VectorMapJoinHashMapResult hashMapResult) throws IOException { - - HashMapResult implementationHashMapResult = (HashMapResult) hashMapResult; - - JoinUtil.JoinResult joinResult = - doLookup(keyBytes, keyOffset, keyLength, - implementationHashMapResult.bytesBytesMultiHashMapResult(), - (VectorMapJoinHashTableResult) hashMapResult); - - return joinResult; - } - - public VectorMapJoinOptimizedHashMap( - MapJoinTableContainer originalTableContainer, ReusableGetAdaptor hashMapRowGetter) { - super(originalTableContainer, hashMapRowGetter); - } -} \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/optimized/VectorMapJoinOptimizedHashMultiSet.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/optimized/VectorMapJoinOptimizedHashMultiSet.java deleted file mode 100644 index 34de7e1..0000000 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/optimized/VectorMapJoinOptimizedHashMultiSet.java +++ /dev/null @@ -1,103 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.hadoop.hive.ql.exec.vector.mapjoin.optimized; - -import java.io.IOException; - -import org.apache.hadoop.hive.ql.exec.JoinUtil; -import org.apache.hadoop.hive.ql.exec.persistence.BytesBytesMultiHashMap; -import org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainer; -import org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainer.ReusableGetAdaptor; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinBytesHashMultiSet; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashMultiSetResult; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashTableResult; -import org.apache.hadoop.hive.serde2.WriteBuffers.ByteSegmentRef; - -public class VectorMapJoinOptimizedHashMultiSet - extends VectorMapJoinOptimizedHashTable - implements VectorMapJoinBytesHashMultiSet { - - @Override - public VectorMapJoinHashMultiSetResult createHashMultiSetResult() { - return new HashMultiSetResult(); - } - - public static class HashMultiSetResult extends VectorMapJoinHashMultiSetResult { - - private BytesBytesMultiHashMap.Result bytesBytesMultiHashMapResult; - - private boolean haveCount; - - public HashMultiSetResult() { - super(); - bytesBytesMultiHashMapResult = new BytesBytesMultiHashMap.Result(); - } - - public BytesBytesMultiHashMap.Result bytesBytesMultiHashMapResult() { - return bytesBytesMultiHashMapResult; - } - - /* - * @return The multi-set count for the lookup key. - */ - @Override - public long count() { - if (!haveCount) { - if (bytesBytesMultiHashMapResult.isSingleRow()) { - count = 1; - } else { - count = 0; - ByteSegmentRef byteSegmentRef = bytesBytesMultiHashMapResult.first(); - while (byteSegmentRef != null) { - count++; - byteSegmentRef = bytesBytesMultiHashMapResult.next(); - } - } - haveCount = true; - } - return count; - } - - @Override - public void forget() { - haveCount = false; - bytesBytesMultiHashMapResult.forget(); - super.forget(); - } - } - - @Override - public JoinUtil.JoinResult contains(byte[] keyBytes, int keyOffset, int keyLength, - VectorMapJoinHashMultiSetResult hashMultiSetResult) throws IOException { - - HashMultiSetResult implementationHashMultiSetResult = (HashMultiSetResult) hashMultiSetResult; - - JoinUtil.JoinResult joinResult = - doLookup(keyBytes, keyOffset, keyLength, - implementationHashMultiSetResult.bytesBytesMultiHashMapResult(), - (VectorMapJoinHashTableResult) hashMultiSetResult); - - return joinResult; - } - - public VectorMapJoinOptimizedHashMultiSet( - MapJoinTableContainer originalTableContainer, ReusableGetAdaptor hashMapRowGetter) { - super(originalTableContainer, hashMapRowGetter); - } -} \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/optimized/VectorMapJoinOptimizedHashSet.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/optimized/VectorMapJoinOptimizedHashSet.java deleted file mode 100644 index 93a89d7..0000000 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/optimized/VectorMapJoinOptimizedHashSet.java +++ /dev/null @@ -1,78 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.hive.ql.exec.vector.mapjoin.optimized; - -import java.io.IOException; - -import org.apache.hadoop.hive.ql.exec.JoinUtil; -import org.apache.hadoop.hive.ql.exec.persistence.BytesBytesMultiHashMap; -import org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainer; -import org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainer.ReusableGetAdaptor; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinBytesHashSet; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashSetResult; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashTableResult; - -public class VectorMapJoinOptimizedHashSet - extends VectorMapJoinOptimizedHashTable - implements VectorMapJoinBytesHashSet { - - @Override - public VectorMapJoinHashSetResult createHashSetResult() { - return new HashSetResult(); - } - - public static class HashSetResult extends VectorMapJoinHashSetResult { - - private BytesBytesMultiHashMap.Result bytesBytesMultiHashMapResult; - - public HashSetResult() { - super(); - bytesBytesMultiHashMapResult = new BytesBytesMultiHashMap.Result(); - } - - public BytesBytesMultiHashMap.Result bytesBytesMultiHashMapResult() { - return bytesBytesMultiHashMapResult; - } - - @Override - public void forget() { - bytesBytesMultiHashMapResult.forget(); - super.forget(); - } - } - - @Override - public JoinUtil.JoinResult contains(byte[] keyBytes, int keyOffset, int keyLength, - VectorMapJoinHashSetResult hashSetResult) throws IOException { - - HashSetResult implementationHashSetResult = (HashSetResult) hashSetResult; - - JoinUtil.JoinResult joinResult = - doLookup(keyBytes, keyOffset, keyLength, - implementationHashSetResult.bytesBytesMultiHashMapResult(), - (VectorMapJoinHashTableResult) hashSetResult); - - return joinResult; - } - - public VectorMapJoinOptimizedHashSet( - MapJoinTableContainer originalTableContainer, ReusableGetAdaptor hashMapRowGetter) { - super(originalTableContainer, hashMapRowGetter); - } -} \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/optimized/VectorMapJoinOptimizedHashTable.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/optimized/VectorMapJoinOptimizedHashTable.java deleted file mode 100644 index 5fe7861..0000000 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/optimized/VectorMapJoinOptimizedHashTable.java +++ /dev/null @@ -1,99 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.hive.ql.exec.vector.mapjoin.optimized; - -import java.io.IOException; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import org.apache.hadoop.hive.ql.exec.JoinUtil; -import org.apache.hadoop.hive.ql.exec.persistence.BytesBytesMultiHashMap; -import org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainer; -import org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainerDirectAccess; -import org.apache.hadoop.hive.ql.exec.persistence.ReusableGetAdaptorDirectAccess; -import org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainer.ReusableGetAdaptor; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashTable; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashTableResult; -import org.apache.hadoop.hive.ql.metadata.HiveException; -import org.apache.hadoop.hive.serde2.SerDeException; -import org.apache.hadoop.io.BytesWritable; -import org.apache.hadoop.io.Writable; - -/* - * Root interface for a vector map join hash table (which could be a hash map, hash multi-set, or - * hash set). - */ -public abstract class VectorMapJoinOptimizedHashTable implements VectorMapJoinHashTable { - - private static final Logger LOG = LoggerFactory.getLogger(VectorMapJoinOptimizedMultiKeyHashMap.class.getName()); - - protected final MapJoinTableContainer originalTableContainer; - protected final MapJoinTableContainerDirectAccess containerDirectAccess; - protected final ReusableGetAdaptorDirectAccess adapatorDirectAccess; - - public static class SerializedBytes { - byte[] bytes; - int offset; - int length; - } - - @Override - public void putRow(BytesWritable currentKey, BytesWritable currentValue) - throws SerDeException, HiveException, IOException { - - putRowInternal(currentKey, currentValue); - } - - protected void putRowInternal(BytesWritable key, BytesWritable value) - throws SerDeException, HiveException, IOException { - - containerDirectAccess.put((Writable) key, (Writable) value); - } - - public JoinUtil.JoinResult doLookup(byte[] keyBytes, int keyOffset, int keyLength, - BytesBytesMultiHashMap.Result bytesBytesMultiHashMapResult, - VectorMapJoinHashTableResult hashTableResult) { - - hashTableResult.forget(); - - JoinUtil.JoinResult joinResult = - adapatorDirectAccess.setDirect(keyBytes, keyOffset, keyLength, - bytesBytesMultiHashMapResult); - if (joinResult == JoinUtil.JoinResult.SPILL) { - hashTableResult.setSpillPartitionId(adapatorDirectAccess.directSpillPartitionId()); - } - - hashTableResult.setJoinResult(joinResult); - - return joinResult; - } - - public VectorMapJoinOptimizedHashTable( - MapJoinTableContainer originalTableContainer, ReusableGetAdaptor hashMapRowGetter) { - - this.originalTableContainer = originalTableContainer; - containerDirectAccess = (MapJoinTableContainerDirectAccess) originalTableContainer; - adapatorDirectAccess = (ReusableGetAdaptorDirectAccess) hashMapRowGetter; - } - - @Override - public int size() { - return originalTableContainer.size(); - } -} diff --git 
ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/optimized/VectorMapJoinOptimizedLongCommon.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/optimized/VectorMapJoinOptimizedLongCommon.java deleted file mode 100644 index 0eabc44..0000000 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/optimized/VectorMapJoinOptimizedLongCommon.java +++ /dev/null @@ -1,171 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.hive.ql.exec.vector.mapjoin.optimized; - -import java.io.IOException; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.optimized.VectorMapJoinOptimizedHashTable.SerializedBytes; -import org.apache.hadoop.hive.ql.plan.VectorMapJoinDesc.HashTableKeyType; -import org.apache.hadoop.hive.serde2.ByteStream.Output; -import org.apache.hadoop.hive.serde2.binarysortable.fast.BinarySortableSerializeWrite; - -/* - * An single long value hash map based on the BytesBytesMultiHashMap. - * - * We serialize the long key into BinarySortable format into an output buffer accepted by - * BytesBytesMultiHashMap. - */ -public class VectorMapJoinOptimizedLongCommon { - - private static final Logger LOG = LoggerFactory.getLogger(VectorMapJoinOptimizedLongCommon.class.getName()); - - private boolean isOuterJoin; - - private HashTableKeyType hashTableKeyType; - - // private BinarySortableDeserializeRead keyBinarySortableDeserializeRead; - - private BinarySortableSerializeWrite keyBinarySortableSerializeWrite; - - private transient Output output; - - private transient SerializedBytes serializedBytes; - - // protected boolean useMinMax; - protected long min; - protected long max; - - public boolean useMinMax() { - return false; - } - - public long min() { - return min; - } - - public long max() { - return max; - } - - /* - * For now, just use MapJoinBytesTableContainer / HybridHashTableContainer directly. - - public void adaptPutRow(VectorMapJoinOptimizedHashTable hashTable, - BytesWritable currentKey, BytesWritable currentValue) - throws SerDeException, HiveException, IOException { - - if (useMinMax) { - // Peek at the BinarySortable key to extract the long so we can determine min and max. - byte[] keyBytes = currentKey.getBytes(); - int keyLength = currentKey.getLength(); - keyBinarySortableDeserializeRead.set(keyBytes, 0, keyLength); - if (keyBinarySortableDeserializeRead.readCheckNull()) { - if (isOuterJoin) { - return; - } else { - // For inner join, we expect all NULL values to have been filtered out before now. - throw new HiveException("Unexpected NULL"); - } - } - long key = 0; - switch (hashTableKeyType) { - case BOOLEAN: - key = (keyBinarySortableDeserializeRead.readBoolean() ? 
1 : 0); - break; - case BYTE: - key = (long) keyBinarySortableDeserializeRead.readByte(); - break; - case SHORT: - key = (long) keyBinarySortableDeserializeRead.readShort(); - break; - case INT: - key = (long) keyBinarySortableDeserializeRead.readInt(); - break; - case LONG: - key = keyBinarySortableDeserializeRead.readLong(); - break; - default: - throw new RuntimeException("Unexpected hash table key type " + hashTableKeyType.name()); - } - if (key < min) { - min = key; - } - if (key > max) { - max = key; - } - - // byte[] bytes = Arrays.copyOf(currentKey.get(), currentKey.getLength()); - // LOG.debug("VectorMapJoinOptimizedLongCommon adaptPutRow key " + key + " min " + min + " max " + max + " hashTableKeyType " + hashTableKeyType.name() + " hex " + Hex.encodeHexString(bytes)); - - } - - hashTable.putRowInternal(currentKey, currentValue); - } - */ - - public SerializedBytes serialize(long key) throws IOException { - keyBinarySortableSerializeWrite.reset(); - - switch (hashTableKeyType) { - case BOOLEAN: - keyBinarySortableSerializeWrite.writeBoolean(key == 1); - break; - case BYTE: - keyBinarySortableSerializeWrite.writeByte((byte) key); - break; - case SHORT: - keyBinarySortableSerializeWrite.writeShort((short) key); - break; - case INT: - keyBinarySortableSerializeWrite.writeInt((int) key); - break; - case LONG: - keyBinarySortableSerializeWrite.writeLong(key); - break; - default: - throw new RuntimeException("Unexpected hash table key type " + hashTableKeyType.name()); - } - - // byte[] bytes = Arrays.copyOf(output.getData(), output.getLength()); - // LOG.debug("VectorMapJoinOptimizedLongCommon serialize key " + key + " hashTableKeyType " + hashTableKeyType.name() + " hex " + Hex.encodeHexString(bytes)); - - serializedBytes.bytes = output.getData(); - serializedBytes.offset = 0; - serializedBytes.length = output.getLength(); - - return serializedBytes; - } - - public VectorMapJoinOptimizedLongCommon( - boolean minMaxEnabled, boolean isOuterJoin, HashTableKeyType hashTableKeyType) { - this.isOuterJoin = isOuterJoin; - // useMinMax = minMaxEnabled; - min = Long.MAX_VALUE; - max = Long.MIN_VALUE; - this.hashTableKeyType = hashTableKeyType; - // PrimitiveTypeInfo[] primitiveTypeInfos = { hashTableKeyType.getPrimitiveTypeInfo() }; - // keyBinarySortableDeserializeRead = new BinarySortableDeserializeRead(primitiveTypeInfos); - keyBinarySortableSerializeWrite = new BinarySortableSerializeWrite(1); - output = new Output(); - keyBinarySortableSerializeWrite.set(output); - serializedBytes = new SerializedBytes(); - } -} diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/optimized/VectorMapJoinOptimizedLongHashMap.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/optimized/VectorMapJoinOptimizedLongHashMap.java deleted file mode 100644 index 403d265..0000000 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/optimized/VectorMapJoinOptimizedLongHashMap.java +++ /dev/null @@ -1,82 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.hive.ql.exec.vector.mapjoin.optimized; - -import java.io.IOException; - -import org.apache.hadoop.hive.ql.exec.JoinUtil.JoinResult; -import org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainer; -import org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainer.ReusableGetAdaptor; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashMapResult; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinLongHashMap; -import org.apache.hadoop.hive.ql.plan.VectorMapJoinDesc.HashTableKeyType; - -/* - * An single long value hash map based on the BytesBytesMultiHashMap. - * - * We serialize the long key into BinarySortable format into an output buffer accepted by - * BytesBytesMultiHashMap. - */ -public class VectorMapJoinOptimizedLongHashMap - extends VectorMapJoinOptimizedHashMap - implements VectorMapJoinLongHashMap { - - private VectorMapJoinOptimizedLongCommon longCommon; - - @Override - public boolean useMinMax() { - return longCommon.useMinMax(); - } - - @Override - public long min() { - return longCommon.min(); - } - - @Override - public long max() { - return longCommon.max(); - } - - /* - @Override - public void putRow(BytesWritable currentKey, BytesWritable currentValue) - throws SerDeException, HiveException, IOException { - - longCommon.adaptPutRow((VectorMapJoinOptimizedHashTable) this, currentKey, currentValue); - } - */ - - @Override - public JoinResult lookup(long key, - VectorMapJoinHashMapResult hashMapResult) throws IOException { - - SerializedBytes serializedBytes = longCommon.serialize(key); - - return super.lookup(serializedBytes.bytes, serializedBytes.offset, serializedBytes.length, - hashMapResult); - } - - public VectorMapJoinOptimizedLongHashMap( - boolean minMaxEnabled, boolean isOuterJoin, HashTableKeyType hashTableKeyType, - MapJoinTableContainer originalTableContainer, ReusableGetAdaptor hashMapRowGetter) { - super(originalTableContainer, hashMapRowGetter); - longCommon = new VectorMapJoinOptimizedLongCommon(minMaxEnabled, isOuterJoin, hashTableKeyType); - } -} \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/optimized/VectorMapJoinOptimizedLongHashMultiSet.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/optimized/VectorMapJoinOptimizedLongHashMultiSet.java deleted file mode 100644 index 5fb8c3a..0000000 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/optimized/VectorMapJoinOptimizedLongHashMultiSet.java +++ /dev/null @@ -1,83 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.hive.ql.exec.vector.mapjoin.optimized; - -import java.io.IOException; - -import org.apache.hadoop.hive.ql.exec.JoinUtil.JoinResult; -import org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainer; -import org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainer.ReusableGetAdaptor; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashMultiSetResult; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinLongHashMultiSet; -import org.apache.hadoop.hive.ql.plan.VectorMapJoinDesc.HashTableKeyType; - -/* - * An single long value hash map based on the BytesBytesMultiHashMultiSet. - * - * We serialize the long key into BinarySortable format into an output buffer accepted by - * BytesBytesMultiHashMultiSet. - */ -public class VectorMapJoinOptimizedLongHashMultiSet - extends VectorMapJoinOptimizedHashMultiSet - implements VectorMapJoinLongHashMultiSet { - - private VectorMapJoinOptimizedLongCommon longCommon; - - @Override - public boolean useMinMax() { - return longCommon.useMinMax(); - } - - @Override - public long min() { - return longCommon.min(); - } - - @Override - public long max() { - return longCommon.max(); - } - - /* - @Override - public void putRow(BytesWritable currentKey, BytesWritable currentValue) - throws SerDeException, HiveException, IOException { - - longCommon.adaptPutRow((VectorMapJoinOptimizedHashTable) this, currentKey, currentValue); - } - */ - - @Override - public JoinResult contains(long key, - VectorMapJoinHashMultiSetResult hashMultiSetResult) throws IOException { - - SerializedBytes serializedBytes = longCommon.serialize(key); - - return super.contains(serializedBytes.bytes, serializedBytes.offset, serializedBytes.length, - hashMultiSetResult); - - } - - public VectorMapJoinOptimizedLongHashMultiSet( - boolean minMaxEnabled, boolean isOuterJoin, HashTableKeyType hashTableKeyType, - MapJoinTableContainer originalTableContainer, ReusableGetAdaptor hashMapRowGetter) { - super(originalTableContainer, hashMapRowGetter); - longCommon = new VectorMapJoinOptimizedLongCommon(minMaxEnabled, isOuterJoin, hashTableKeyType); - } -} \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/optimized/VectorMapJoinOptimizedLongHashSet.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/optimized/VectorMapJoinOptimizedLongHashSet.java deleted file mode 100644 index c41505a..0000000 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/optimized/VectorMapJoinOptimizedLongHashSet.java +++ /dev/null @@ -1,83 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.hive.ql.exec.vector.mapjoin.optimized; - -import java.io.IOException; - -import org.apache.hadoop.hive.ql.exec.JoinUtil.JoinResult; -import org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainer; -import org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainer.ReusableGetAdaptor; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashSetResult; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinLongHashSet; -import org.apache.hadoop.hive.ql.plan.VectorMapJoinDesc.HashTableKeyType; - -/* - * An single long value hash map based on the BytesBytesMultiHashSet. - * - * We serialize the long key into BinarySortable format into an output buffer accepted by - * BytesBytesMultiHashSet. - */ -public class VectorMapJoinOptimizedLongHashSet - extends VectorMapJoinOptimizedHashSet - implements VectorMapJoinLongHashSet { - - private VectorMapJoinOptimizedLongCommon longCommon; - - @Override - public boolean useMinMax() { - return longCommon.useMinMax(); - } - - @Override - public long min() { - return longCommon.min(); - } - - @Override - public long max() { - return longCommon.max(); - } - - /* - @Override - public void putRow(BytesWritable currentKey, BytesWritable currentValue) - throws SerDeException, HiveException, IOException { - - longCommon.adaptPutRow((VectorMapJoinOptimizedHashTable) this, currentKey, currentValue); - } - */ - - @Override - public JoinResult contains(long key, - VectorMapJoinHashSetResult hashSetResult) throws IOException { - - SerializedBytes serializedBytes = longCommon.serialize(key); - - return super.contains(serializedBytes.bytes, serializedBytes.offset, serializedBytes.length, - hashSetResult); - - } - - public VectorMapJoinOptimizedLongHashSet( - boolean minMaxEnabled, boolean isOuterJoin, HashTableKeyType hashTableKeyType, - MapJoinTableContainer originalTableContainer, ReusableGetAdaptor hashMapRowGetter) { - super(originalTableContainer, hashMapRowGetter); - longCommon = new VectorMapJoinOptimizedLongCommon(minMaxEnabled, isOuterJoin, hashTableKeyType); - } -} \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/optimized/VectorMapJoinOptimizedMultiKeyHashMap.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/optimized/VectorMapJoinOptimizedMultiKeyHashMap.java deleted file mode 100644 index 4f3e20e..0000000 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/optimized/VectorMapJoinOptimizedMultiKeyHashMap.java +++ /dev/null @@ -1,36 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.hive.ql.exec.vector.mapjoin.optimized; - -import org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainer; -import org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainer.ReusableGetAdaptor; - -/* - * An multi-key hash map based on the BytesBytesMultiHashMap. - */ -public class VectorMapJoinOptimizedMultiKeyHashMap - extends VectorMapJoinOptimizedHashMap { - - // UNDONE: How to look for all NULLs in a multi-key????? Let nulls through for now. - - public VectorMapJoinOptimizedMultiKeyHashMap(boolean isOuterJoin, - MapJoinTableContainer originalTableContainer, ReusableGetAdaptor hashMapRowGetter) { - super(originalTableContainer, hashMapRowGetter); - } -} \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/optimized/VectorMapJoinOptimizedMultiKeyHashMultiSet.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/optimized/VectorMapJoinOptimizedMultiKeyHashMultiSet.java deleted file mode 100644 index b95a2dd..0000000 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/optimized/VectorMapJoinOptimizedMultiKeyHashMultiSet.java +++ /dev/null @@ -1,36 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.hive.ql.exec.vector.mapjoin.optimized; - -import org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainer; -import org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainer.ReusableGetAdaptor; - -/* - * An multi-key hash map based on the BytesBytesMultiHashMultiSet. - */ -public class VectorMapJoinOptimizedMultiKeyHashMultiSet - extends VectorMapJoinOptimizedHashMultiSet { - - // UNDONE: How to look for all NULLs in a multi-key????? Let nulls through for now. 
- - public VectorMapJoinOptimizedMultiKeyHashMultiSet(boolean isOuterJoin, - MapJoinTableContainer originalTableContainer, ReusableGetAdaptor hashMapRowGetter) { - super(originalTableContainer, hashMapRowGetter); - } -} \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/optimized/VectorMapJoinOptimizedMultiKeyHashSet.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/optimized/VectorMapJoinOptimizedMultiKeyHashSet.java deleted file mode 100644 index 35ecc2a..0000000 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/optimized/VectorMapJoinOptimizedMultiKeyHashSet.java +++ /dev/null @@ -1,36 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.hive.ql.exec.vector.mapjoin.optimized; - -import org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainer; -import org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainer.ReusableGetAdaptor; - -/* - * An multi-key hash map based on the BytesBytesMultiHashSet. - */ -public class VectorMapJoinOptimizedMultiKeyHashSet - extends VectorMapJoinOptimizedHashSet { - - // UNDONE: How to look for all NULLs in a multi-key????? Let nulls through for now. - - public VectorMapJoinOptimizedMultiKeyHashSet(boolean isOuterJoin, - MapJoinTableContainer originalTableContainer, ReusableGetAdaptor hashMapRowGetter) { - super(originalTableContainer, hashMapRowGetter); - } -} \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/optimized/VectorMapJoinOptimizedStringCommon.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/optimized/VectorMapJoinOptimizedStringCommon.java deleted file mode 100644 index 39c2d49..0000000 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/optimized/VectorMapJoinOptimizedStringCommon.java +++ /dev/null @@ -1,98 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.hadoop.hive.ql.exec.vector.mapjoin.optimized; - -import java.io.IOException; - -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.optimized.VectorMapJoinOptimizedHashTable.SerializedBytes; -import org.apache.hadoop.hive.serde2.ByteStream.Output; -import org.apache.hadoop.hive.serde2.binarysortable.fast.BinarySortableSerializeWrite; - -/* - * An single byte array value hash map based on the BytesBytesMultiHashMap. - * - * Since BytesBytesMultiHashMap does not interpret the key as BinarySortable we optimize - * this case and just reference the byte array key directly for the lookup instead of serializing - * the byte array into BinarySortable. We rely on it just doing byte array equality comparisons. - */ -public class VectorMapJoinOptimizedStringCommon { - - // private boolean isOuterJoin; - - // private BinarySortableDeserializeRead keyBinarySortableDeserializeRead; - - // private ReadStringResults readStringResults; - - private BinarySortableSerializeWrite keyBinarySortableSerializeWrite; - - private transient Output output; - - private transient SerializedBytes serializedBytes; - - /* - private BytesWritable bytesWritable; - - public void adaptPutRow(VectorMapJoinOptimizedHashTable hashTable, - BytesWritable currentKey, BytesWritable currentValue) - throws SerDeException, HiveException, IOException { - - byte[] keyBytes = currentKey.getBytes(); - int keyLength = currentKey.getLength(); - keyBinarySortableDeserializeRead.set(keyBytes, 0, keyLength); - if (keyBinarySortableDeserializeRead.readCheckNull()) { - if (isOuterJoin) { - return; - } else { - // For inner join, we expect all NULL values to have been filtered out before now. - throw new HiveException("Unexpected NULL"); - } - } - keyBinarySortableDeserializeRead.readString(readStringResults); - - bytesWritable.set(readStringResults.bytes, readStringResults.start, readStringResults.length); - - hashTable.putRowInternal(bytesWritable, currentValue); - } - */ - - public SerializedBytes serialize(byte[] keyBytes, int keyStart, int keyLength) throws IOException { - - keyBinarySortableSerializeWrite.reset(); - keyBinarySortableSerializeWrite.writeString(keyBytes, keyStart, keyLength); - - serializedBytes.bytes = output.getData(); - serializedBytes.offset = 0; - serializedBytes.length = output.getLength(); - - return serializedBytes; - - } - - public VectorMapJoinOptimizedStringCommon(boolean isOuterJoin) { - // this.isOuterJoin = isOuterJoin; - // PrimitiveTypeInfo[] primitiveTypeInfos = { TypeInfoFactory.stringTypeInfo }; - // keyBinarySortableDeserializeRead = new BinarySortableDeserializeRead(primitiveTypeInfos); - // readStringResults = keyBinarySortableDeserializeRead.createReadStringResults(); - // bytesWritable = new BytesWritable(); - keyBinarySortableSerializeWrite = new BinarySortableSerializeWrite(1); - output = new Output(); - keyBinarySortableSerializeWrite.set(output); - serializedBytes = new SerializedBytes(); - } -} \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/optimized/VectorMapJoinOptimizedStringHashMap.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/optimized/VectorMapJoinOptimizedStringHashMap.java deleted file mode 100644 index 220c05e..0000000 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/optimized/VectorMapJoinOptimizedStringHashMap.java +++ /dev/null @@ -1,63 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. 
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.hive.ql.exec.vector.mapjoin.optimized; - -import java.io.IOException; - -import org.apache.hadoop.hive.ql.exec.JoinUtil.JoinResult; -import org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainer; -import org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainer.ReusableGetAdaptor; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinBytesHashMap; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashMapResult; - -/* - * An multi-key hash map based on the BytesBytesMultiHashMap. - */ -public class VectorMapJoinOptimizedStringHashMap - extends VectorMapJoinOptimizedHashMap - implements VectorMapJoinBytesHashMap { - - private VectorMapJoinOptimizedStringCommon stringCommon; - - /* - @Override - public void putRow(BytesWritable currentKey, BytesWritable currentValue) - throws SerDeException, HiveException, IOException { - - stringCommon.adaptPutRow((VectorMapJoinOptimizedHashTable) this, currentKey, currentValue); - } - */ - - @Override - public JoinResult lookup(byte[] keyBytes, int keyStart, int keyLength, - VectorMapJoinHashMapResult hashMapResult) throws IOException { - - SerializedBytes serializedBytes = stringCommon.serialize(keyBytes, keyStart, keyLength); - - return super.lookup(serializedBytes.bytes, serializedBytes.offset, serializedBytes.length, - hashMapResult); - - } - - public VectorMapJoinOptimizedStringHashMap(boolean isOuterJoin, - MapJoinTableContainer originalTableContainer, ReusableGetAdaptor hashMapRowGetter) { - super(originalTableContainer, hashMapRowGetter); - stringCommon = new VectorMapJoinOptimizedStringCommon(isOuterJoin); - } -} \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/optimized/VectorMapJoinOptimizedStringHashMultiSet.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/optimized/VectorMapJoinOptimizedStringHashMultiSet.java deleted file mode 100644 index b6c6958..0000000 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/optimized/VectorMapJoinOptimizedStringHashMultiSet.java +++ /dev/null @@ -1,64 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.hive.ql.exec.vector.mapjoin.optimized; - -import java.io.IOException; - -import org.apache.hadoop.hive.ql.exec.JoinUtil.JoinResult; -import org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainer; -import org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainer.ReusableGetAdaptor; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinBytesHashMultiSet; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashMultiSetResult; - -/* - * An multi-key hash map based on the BytesBytesMultiHashMultiSet. - */ -public class VectorMapJoinOptimizedStringHashMultiSet - extends VectorMapJoinOptimizedHashMultiSet - implements VectorMapJoinBytesHashMultiSet { - - private VectorMapJoinOptimizedStringCommon stringCommon; - - /* - @Override - public void putRow(BytesWritable currentKey, BytesWritable currentValue) - throws SerDeException, HiveException, IOException { - - stringCommon.adaptPutRow((VectorMapJoinOptimizedHashTable) this, currentKey, currentValue); - } - */ - - @Override - public JoinResult contains(byte[] keyBytes, int keyStart, int keyLength, - VectorMapJoinHashMultiSetResult hashMultiSetResult) throws IOException { - - SerializedBytes serializedBytes = stringCommon.serialize(keyBytes, keyStart, keyLength); - - return super.contains(serializedBytes.bytes, serializedBytes.offset, serializedBytes.length, - hashMultiSetResult); - - - } - - public VectorMapJoinOptimizedStringHashMultiSet(boolean isOuterJoin, - MapJoinTableContainer originalTableContainer, ReusableGetAdaptor hashMapRowGetter) { - super(originalTableContainer, hashMapRowGetter); - stringCommon = new VectorMapJoinOptimizedStringCommon(isOuterJoin); - } -} \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/optimized/VectorMapJoinOptimizedStringHashSet.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/optimized/VectorMapJoinOptimizedStringHashSet.java deleted file mode 100644 index f921b9c..0000000 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/optimized/VectorMapJoinOptimizedStringHashSet.java +++ /dev/null @@ -1,63 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.hadoop.hive.ql.exec.vector.mapjoin.optimized; - -import java.io.IOException; - -import org.apache.hadoop.hive.ql.exec.JoinUtil.JoinResult; -import org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainer; -import org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainer.ReusableGetAdaptor; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinBytesHashSet; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashSetResult; - -/* - * An multi-key hash map based on the BytesBytesMultiHashSet. - */ -public class VectorMapJoinOptimizedStringHashSet - extends VectorMapJoinOptimizedHashSet - implements VectorMapJoinBytesHashSet { - - private VectorMapJoinOptimizedStringCommon stringCommon; - - /* - @Override - public void putRow(BytesWritable currentKey, BytesWritable currentValue) - throws SerDeException, HiveException, IOException { - - stringCommon.adaptPutRow((VectorMapJoinOptimizedHashTable) this, currentKey, currentValue); - } - */ - - @Override - public JoinResult contains(byte[] keyBytes, int keyStart, int keyLength, - VectorMapJoinHashSetResult hashSetResult) throws IOException { - - SerializedBytes serializedBytes = stringCommon.serialize(keyBytes, keyStart, keyLength); - - return super.contains(serializedBytes.bytes, serializedBytes.offset, serializedBytes.length, - hashSetResult); - - } - - public VectorMapJoinOptimizedStringHashSet(boolean isOuterJoin, - MapJoinTableContainer originalTableContainer, ReusableGetAdaptor hashMapRowGetter) { - super(originalTableContainer, hashMapRowGetter); - stringCommon = new VectorMapJoinOptimizedStringCommon(isOuterJoin); - } -} \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/reducesink/VectorReduceSinkCommonOperator.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/reducesink/VectorReduceSinkCommonOperator.java index 8133aef..b559e0a 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/reducesink/VectorReduceSinkCommonOperator.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/reducesink/VectorReduceSinkCommonOperator.java @@ -33,9 +33,9 @@ import org.apache.hadoop.hive.ql.exec.vector.VectorSerializeRow; import org.apache.hadoop.hive.ql.exec.vector.VectorizationContext; import org.apache.hadoop.hive.ql.exec.vector.VectorizationContextRegion; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedBatchUtil; import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpression; -import org.apache.hadoop.hive.ql.exec.vector.keyseries.VectorKeySeriesSerialized; import org.apache.hadoop.hive.ql.io.HiveKey; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.plan.BaseWork; @@ -67,6 +67,32 @@ private static final String CLASS_NAME = VectorReduceSinkCommonOperator.class.getName(); private static final Log LOG = LogFactory.getLog(CLASS_NAME); + protected abstract String getLoggingPrefix(); + + // For debug tracing: information about the map or reduce task, operator, operator class, etc. + protected transient String loggingPrefix; + + protected String getLoggingPrefix(String className) { + if (loggingPrefix == null) { + initLoggingPrefix(className); + } + return loggingPrefix; + } + + protected void initLoggingPrefix(String className) { + if (hconf == null) { + // Constructor time... + loggingPrefix = className; + } else { + // Determine the name of our map or reduce task for debug tracing. 
+ BaseWork work = Utilities.getMapWork(hconf); + if (work == null) { + work = Utilities.getReduceWork(hconf); + } + loggingPrefix = className + " " + work.getName() + " " + getOperatorId(); + } + } + protected VectorReduceSinkDesc vectorDesc; /** @@ -84,6 +110,7 @@ // This is map of which vectorized row batch columns are the key columns. // And, their types. protected int[] reduceSinkKeyColumnMap; + protected String[] reduceSinkKeyColumnNames; protected TypeInfo[] reduceSinkKeyTypeInfos; // Optional vectorized key expressions that need to be run on each batch. @@ -92,6 +119,7 @@ // This is map of which vectorized row batch columns are the value columns. // And, their types. protected int[] reduceSinkValueColumnMap; + protected String[] reduceSinkValueColumnNames; protected TypeInfo[] reduceSinkValueTypeInfos; // Optional vectorized value expressions that need to be run on each batch. @@ -101,6 +129,8 @@ // transient. //--------------------------------------------------------------------------- + protected transient Configuration hconf; + // Whether there is to be a tag added to the end of each key and the tag value. private transient boolean reduceSkipTag; private transient byte reduceTagByte; @@ -129,9 +159,6 @@ // Where to write our key and value pairs. private transient OutputCollector out; - // The object that determines equal key series. - protected transient VectorKeySeriesSerialized serializedKeySeries; - private transient long numRows = 0; private transient long cntr = 1; private transient long logEveryNRows = 0; @@ -167,12 +194,26 @@ public VectorReduceSinkCommonOperator(CompilationOpContext ctx, // Since a key expression can be a calculation and the key will go into a scratch column, // we need the mapping and type information. reduceSinkKeyColumnMap = vectorReduceSinkInfo.getReduceSinkKeyColumnMap(); + reduceSinkKeyColumnNames = vectorReduceSinkInfo.getReduceSinkKeyColumnNames(); reduceSinkKeyTypeInfos = vectorReduceSinkInfo.getReduceSinkKeyTypeInfos(); reduceSinkKeyExpressions = vectorReduceSinkInfo.getReduceSinkKeyExpressions(); reduceSinkValueColumnMap = vectorReduceSinkInfo.getReduceSinkValueColumnMap(); + reduceSinkValueColumnNames = vectorReduceSinkInfo.getReduceSinkValueColumnNames(); reduceSinkValueTypeInfos = vectorReduceSinkInfo.getReduceSinkValueTypeInfos(); reduceSinkValueExpressions = vectorReduceSinkInfo.getReduceSinkValueExpressions(); + + if (isLogDebugEnabled) { + LOG.debug(getLoggingPrefix() + " VectorReduceSinkCommonOperator constructor reduceSinkKeyColumnMap " + Arrays.toString(reduceSinkKeyColumnMap)); + LOG.debug(getLoggingPrefix() + " VectorReduceSinkCommonOperator constructor reduceSinkKeyColumnNames " + Arrays.toString(reduceSinkKeyColumnNames)); + LOG.debug(getLoggingPrefix() + " VectorReduceSinkCommonOperator constructor reduceSinkKeyTypeInfos " + Arrays.toString(reduceSinkKeyTypeInfos)); + LOG.debug(getLoggingPrefix() + " VectorReduceSinkCommonOperator constructor reduceSinkKeyExpressions " + Arrays.toString(reduceSinkKeyExpressions)); + + LOG.debug(getLoggingPrefix() + " VectorReduceSinkCommonOperator constructor reduceSinkValueColumnMap " + Arrays.toString(reduceSinkValueColumnMap)); + LOG.debug(getLoggingPrefix() + " VectorReduceSinkCommonOperator constructor reduceSinkValueColumnNames " + Arrays.toString(reduceSinkValueColumnNames)); + LOG.debug(getLoggingPrefix() + " VectorReduceSinkCommonOperator constructor reduceSinkValueTypeInfos " + Arrays.toString(reduceSinkValueTypeInfos)); + LOG.debug(getLoggingPrefix() + " VectorReduceSinkCommonOperator 
constructor reduceSinkValueExpressions " + Arrays.toString(reduceSinkValueExpressions)); + } } // Get the sort order @@ -245,6 +286,7 @@ public VectorReduceSinkCommonOperator(CompilationOpContext ctx, @Override protected void initializeOp(Configuration hconf) throws HiveException { + this.hconf = hconf; super.initializeOp(hconf); if (LOG.isDebugEnabled()) { @@ -295,7 +337,8 @@ protected void initializeOp(Configuration hconf) throws HiveException { throw new HiveException(e); } - valueLazyBinarySerializeWrite = new LazyBinarySerializeWrite(reduceSinkValueColumnMap.length); + valueLazyBinarySerializeWrite = + new LazyBinarySerializeWrite(reduceSinkValueColumnMap.length); valueVectorSerializeRow = new VectorSerializeRow( @@ -312,6 +355,62 @@ protected void initializeOp(Configuration hconf) throws HiveException { batchCounter = 0; } + public abstract void processBatch(VectorizedRowBatch batch, int tag) throws IOException; + + protected void reduceSinkNull(VectorizedRowBatch batch, int tag, int logical, + int duplicateCount) throws IOException { + + // Use the same logic as ReduceSinkOperator.toHiveKey. + // + if (tag == -1 || reduceSkipTag) { + keyWritable.set(nullBytes, 0, nullBytes.length); + } else { + keyWritable.setSize(nullBytes.length + 1); + System.arraycopy(nullBytes, 0, keyWritable.get(), 0, nullBytes.length); + keyWritable.get()[nullBytes.length] = reduceTagByte; + } + keyWritable.setDistKeyLength(nullBytes.length); + keyWritable.setHashCode(nullKeyHashCode); + + reduceSinkKeyValue(batch, logical, duplicateCount); + } + + protected void reduceSinkKey(VectorizedRowBatch batch, int tag, + byte[] keyBytes, int keyStart, int keyLength, int hashCode, + int logical, int duplicateCount) throws IOException { + + // One serialized key for 1 or more rows for the duplicate keys. + if (tag == -1 || reduceSkipTag) { + keyWritable.set(keyBytes, keyStart, keyLength); + } else { + keyWritable.setSize(keyLength + 1); + System.arraycopy(keyBytes, keyStart, keyWritable.get(), 0, keyLength); + keyWritable.get()[keyLength] = reduceTagByte; + } + keyWritable.setDistKeyLength(keyLength); + keyWritable.setHashCode(hashCode); + + reduceSinkKeyValue(batch, logical, duplicateCount); + } + + private void reduceSinkKeyValue(VectorizedRowBatch batch, int logical, int duplicateCount) + throws IOException { + boolean selectedInUse = batch.selectedInUse; + int[] selected = batch.selected; + int batchIndex; + final int end = logical + duplicateCount; + do { + batchIndex = (selectedInUse ? selected[logical] : logical); + + valueLazyBinarySerializeWrite.reset(); + valueVectorSerializeRow.serializeWrite(batch, batchIndex); + + valueBytesWritable.set(valueOutput.getData(), 0, valueOutput.getLength()); + + collect(keyWritable, valueBytesWritable); + } while (++logical < end); + } + @Override public void process(Object row, int tag) throws HiveException { @@ -322,7 +421,7 @@ public void process(Object row, int tag) throws HiveException { if (batch.size == 0) { if (LOG.isDebugEnabled()) { - LOG.debug(CLASS_NAME + " batch #" + batchCounter + " empty"); + LOG.debug(getLoggingPrefix() + " batch #" + batchCounter + " empty"); } return; } @@ -341,66 +440,7 @@ public void process(Object row, int tag) throws HiveException { } } - serializedKeySeries.processBatch(batch); - - boolean selectedInUse = batch.selectedInUse; - int[] selected = batch.selected; - - int keyLength; - int logical; - int end; - int batchIndex; - do { - if (serializedKeySeries.getCurrentIsAllNull()) { - - // Use the same logic as ReduceSinkOperator.toHiveKey. 
- // - if (tag == -1 || reduceSkipTag) { - keyWritable.set(nullBytes, 0, nullBytes.length); - } else { - keyWritable.setSize(nullBytes.length + 1); - System.arraycopy(nullBytes, 0, keyWritable.get(), 0, nullBytes.length); - keyWritable.get()[nullBytes.length] = reduceTagByte; - } - keyWritable.setDistKeyLength(nullBytes.length); - keyWritable.setHashCode(nullKeyHashCode); - - } else { - - // One serialized key for 1 or more rows for the duplicate keys. - // LOG.info("reduceSkipTag " + reduceSkipTag + " tag " + tag + " reduceTagByte " + (int) reduceTagByte + " keyLength " + serializedKeySeries.getSerializedLength()); - // LOG.info("process offset " + serializedKeySeries.getSerializedStart() + " length " + serializedKeySeries.getSerializedLength()); - keyLength = serializedKeySeries.getSerializedLength(); - if (tag == -1 || reduceSkipTag) { - keyWritable.set(serializedKeySeries.getSerializedBytes(), - serializedKeySeries.getSerializedStart(), keyLength); - } else { - keyWritable.setSize(keyLength + 1); - System.arraycopy(serializedKeySeries.getSerializedBytes(), - serializedKeySeries.getSerializedStart(), keyWritable.get(), 0, keyLength); - keyWritable.get()[keyLength] = reduceTagByte; - } - keyWritable.setDistKeyLength(keyLength); - keyWritable.setHashCode(serializedKeySeries.getCurrentHashCode()); - } - - logical = serializedKeySeries.getCurrentLogical(); - end = logical + serializedKeySeries.getCurrentDuplicateCount(); - do { - batchIndex = (selectedInUse ? selected[logical] : logical); - - valueLazyBinarySerializeWrite.reset(); - valueVectorSerializeRow.serializeWrite(batch, batchIndex); - - valueBytesWritable.set(valueOutput.getData(), 0, valueOutput.getLength()); - - collect(keyWritable, valueBytesWritable); - } while (++logical < end); - - if (!serializedKeySeries.next()) { - break; - } - } while (true); + processBatch(batch, tag); } catch (Exception e) { throw new HiveException(e); @@ -424,7 +464,7 @@ protected void collect(BytesWritable keyWritable, Writable valueWritable) throws } // BytesWritable valueBytesWritable = (BytesWritable) valueWritable; - // LOG.info("VectorReduceSinkCommonOperator collect keyWritable " + keyWritable.getLength() + " " + + // LOG.info(getLoggingPrefix() + " collect keyWritable " + keyWritable.getLength() + " " + // VectorizedBatchUtil.displayBytes(keyWritable.getBytes(), 0, keyWritable.getLength()) + // " valueWritable " + valueBytesWritable.getLength() + // VectorizedBatchUtil.displayBytes(valueBytesWritable.getBytes(), 0, valueBytesWritable.getLength())); diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/reducesink/VectorReduceSinkLongOperator.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/reducesink/VectorReduceSinkLongOperator.java index 325f773..2b6b86c 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/reducesink/VectorReduceSinkLongOperator.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/reducesink/VectorReduceSinkLongOperator.java @@ -18,12 +18,15 @@ package org.apache.hadoop.hive.ql.exec.vector.reducesink; +import java.io.IOException; + import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hive.ql.CompilationOpContext; import org.apache.hadoop.hive.ql.exec.vector.VectorizationContext; -import org.apache.hadoop.hive.ql.exec.vector.keyseries.VectorKeySeriesLongSerialized; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; +import 
org.apache.hadoop.hive.ql.exec.vector.keyseries.fast.VectorKeySeriesLongFast; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.plan.OperatorDesc; import org.apache.hadoop.hive.serde2.binarysortable.fast.BinarySortableSerializeWrite; @@ -38,10 +41,17 @@ private static final String CLASS_NAME = VectorReduceSinkLongOperator.class.getName(); private static final Log LOG = LogFactory.getLog(CLASS_NAME); + protected String getLoggingPrefix() { + return super.getLoggingPrefix(CLASS_NAME); + } + // The column number and type information for this one column long reduce key. private transient int singleKeyColumn; private transient PrimitiveTypeInfo singleKeyColumnPrimitiveTypeInfo; + // The object that determines equal key series. + protected transient VectorKeySeriesLongFast serializedKeySeriesLongFast; + // The above members are initialized by the constructor and must not be // transient. //--------------------------------------------------------------------------- @@ -71,8 +81,36 @@ protected void initializeOp(Configuration hconf) throws HiveException { singleKeyColumn = reduceSinkKeyColumnMap[0]; singleKeyColumnPrimitiveTypeInfo = (PrimitiveTypeInfo) reduceSinkKeyTypeInfos[0]; - serializedKeySeries = - new VectorKeySeriesLongSerialized( + serializedKeySeriesLongFast = + new VectorKeySeriesLongFast( singleKeyColumn, singleKeyColumnPrimitiveTypeInfo, keyBinarySortableSerializeWrite); } + + @Override + public void processBatch(VectorizedRowBatch batch, int tag) throws IOException { + + serializedKeySeriesLongFast.processBatch(batch); + + do { + if (serializedKeySeriesLongFast.currentKeyIsNull) { + + reduceSinkNull(batch, tag, serializedKeySeriesLongFast.currentLogical, + serializedKeySeriesLongFast.currentDuplicateCount); + + } else { + + reduceSinkKey(batch, tag, + serializedKeySeriesLongFast.serializedBytes, + serializedKeySeriesLongFast.serializedStart, + serializedKeySeriesLongFast.serializedLength, + serializedKeySeriesLongFast.currentHashCode, + serializedKeySeriesLongFast.currentLogical, + serializedKeySeriesLongFast.currentDuplicateCount); + } + + if (!serializedKeySeriesLongFast.next()) { + break; + } + } while (true); + } } \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/reducesink/VectorReduceSinkMultiKeyOperator.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/reducesink/VectorReduceSinkMultiKeyOperator.java index 2027187..f400fea 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/reducesink/VectorReduceSinkMultiKeyOperator.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/reducesink/VectorReduceSinkMultiKeyOperator.java @@ -18,12 +18,16 @@ package org.apache.hadoop.hive.ql.exec.vector.reducesink; +import java.io.IOException; + import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hive.ql.CompilationOpContext; import org.apache.hadoop.hive.ql.exec.vector.VectorizationContext; -import org.apache.hadoop.hive.ql.exec.vector.keyseries.VectorKeySeriesMultiSerialized; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; +import org.apache.hadoop.hive.ql.exec.vector.keyseries.fast.VectorKeySeriesLongFast; +import org.apache.hadoop.hive.ql.exec.vector.keyseries.fast.VectorKeySeriesMultiFast; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.plan.OperatorDesc; import 
org.apache.hadoop.hive.serde2.binarysortable.fast.BinarySortableSerializeWrite; @@ -38,6 +42,13 @@ private static final String CLASS_NAME = VectorReduceSinkMultiKeyOperator.class.getName(); private static final Log LOG = LogFactory.getLog(CLASS_NAME); + protected String getLoggingPrefix() { + return super.getLoggingPrefix(CLASS_NAME); + } + + // The object that determines equal key series. + protected transient VectorKeySeriesMultiFast serializedKeySeriesMultiFast; + // The above members are initialized by the constructor and must not be // transient. //--------------------------------------------------------------------------- @@ -64,11 +75,37 @@ public VectorReduceSinkMultiKeyOperator(CompilationOpContext ctx, protected void initializeOp(Configuration hconf) throws HiveException { super.initializeOp(hconf); - VectorKeySeriesMultiSerialized serializedMultiKeySeries = - new VectorKeySeriesMultiSerialized( + serializedKeySeriesMultiFast = + new VectorKeySeriesMultiFast( keyBinarySortableSerializeWrite); - serializedMultiKeySeries.init(reduceSinkKeyTypeInfos, reduceSinkKeyColumnMap); + serializedKeySeriesMultiFast.init(reduceSinkKeyTypeInfos, reduceSinkKeyColumnMap); + } + + @Override + public void processBatch(VectorizedRowBatch batch, int tag) throws IOException { + + serializedKeySeriesMultiFast.processBatch(batch); + + do { + if (serializedKeySeriesMultiFast.currentKeyIsNull) { + + reduceSinkNull(batch, tag, serializedKeySeriesMultiFast.currentLogical, + serializedKeySeriesMultiFast.currentDuplicateCount); + + } else { + + reduceSinkKey(batch, tag, + serializedKeySeriesMultiFast.serializedBytes, + serializedKeySeriesMultiFast.serializedStart, + serializedKeySeriesMultiFast.serializedLength, + serializedKeySeriesMultiFast.currentHashCode, + serializedKeySeriesMultiFast.currentLogical, + serializedKeySeriesMultiFast.currentDuplicateCount); + } - serializedKeySeries = serializedMultiKeySeries; + if (!serializedKeySeriesMultiFast.next()) { + break; + } + } while (true); } } \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/reducesink/VectorReduceSinkStringOperator.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/reducesink/VectorReduceSinkStringOperator.java index b655e6e..39fff0f 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/reducesink/VectorReduceSinkStringOperator.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/reducesink/VectorReduceSinkStringOperator.java @@ -18,16 +18,19 @@ package org.apache.hadoop.hive.ql.exec.vector.reducesink; +import java.io.IOException; + import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hive.ql.CompilationOpContext; import org.apache.hadoop.hive.ql.exec.vector.VectorizationContext; -import org.apache.hadoop.hive.ql.exec.vector.keyseries.VectorKeySeriesBytesSerialized; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; +import org.apache.hadoop.hive.ql.exec.vector.keyseries.fast.VectorKeySeriesBytesFast; +import org.apache.hadoop.hive.ql.exec.vector.keyseries.fast.VectorKeySeriesLongFast; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.plan.OperatorDesc; import org.apache.hadoop.hive.serde2.binarysortable.fast.BinarySortableSerializeWrite; -import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; /* * Specialized class for native vectorized reduce sink that is reducing on a single long key column. 
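[Editor's note] The three specialized reduce-sink operators in this patch (Long, MultiKey, and String) all follow the same control flow: the "fast" key series walks a vectorized batch and reports runs of identical serialized keys, and the operator writes one key (or the null key) per run via reduceSinkNull/reduceSinkKey, followed by one value per duplicate row via reduceSinkKeyValue. The stand-alone sketch below only models that control flow under assumed, illustrative names (KeyRun, logicalStart, rowValues); it is not the Hive API and does not reproduce the patch's serialization or collector plumbing.

    import java.util.Arrays;
    import java.util.List;

    public class KeySeriesSketch {

      // One run of adjacent batch rows that share the same (already serialized) key.
      static final class KeyRun {
        final boolean keyIsNull;     // plays the role of currentKeyIsNull in the patch
        final byte[] serializedKey;  // plays the role of serializedBytes/Start/Length
        final int hashCode;          // plays the role of currentHashCode
        final int logicalStart;      // plays the role of currentLogical
        final int duplicateCount;    // plays the role of currentDuplicateCount

        KeyRun(boolean keyIsNull, byte[] serializedKey, int hashCode,
            int logicalStart, int duplicateCount) {
          this.keyIsNull = keyIsNull;
          this.serializedKey = serializedKey;
          this.hashCode = hashCode;
          this.logicalStart = logicalStart;
          this.duplicateCount = duplicateCount;
        }
      }

      // Emit one key per run, then one value per duplicate row covered by the run.
      static void processBatch(List<KeyRun> runs, String[] rowValues) {
        for (KeyRun run : runs) {
          if (run.keyIsNull) {
            System.out.println("key=<null>");                     // reduceSinkNull path
          } else {
            System.out.println("key=" + Arrays.toString(run.serializedKey)
                + " hash=" + run.hashCode);                       // reduceSinkKey path
          }
          int end = run.logicalStart + run.duplicateCount;
          for (int logical = run.logicalStart; logical < end; logical++) {
            System.out.println("  value=" + rowValues[logical]);  // reduceSinkKeyValue loop
          }
        }
      }

      public static void main(String[] args) {
        // Rows 0-1 share one key, row 2 has a null key, row 3 has its own key.
        List<KeyRun> runs = Arrays.asList(
            new KeyRun(false, new byte[] {1}, 17, 0, 2),
            new KeyRun(true, null, 0, 2, 1),
            new KeyRun(false, new byte[] {2}, 23, 3, 1));
        processBatch(runs, new String[] {"v0", "v1", "v2", "v3"});
      }
    }

The point of the run-based loop, as the patch's own comment puts it, is "one serialized key for 1 or more rows for the duplicate keys": key serialization and hash-code computation happen once per run rather than once per row, while values are still serialized and collected per row.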
@@ -38,9 +41,16 @@ private static final String CLASS_NAME = VectorReduceSinkStringOperator.class.getName(); private static final Log LOG = LogFactory.getLog(CLASS_NAME); + protected String getLoggingPrefix() { + return super.getLoggingPrefix(CLASS_NAME); + } + // The column number and type information for this one column string reduce key. private transient int singleKeyColumn; + // The object that determines equal key series. + protected transient VectorKeySeriesBytesFast serializedKeySeriesBytesFast; + // The above members are initialized by the constructor and must not be // transient. //--------------------------------------------------------------------------- @@ -69,8 +79,36 @@ protected void initializeOp(Configuration hconf) throws HiveException { singleKeyColumn = reduceSinkKeyColumnMap[0]; - serializedKeySeries = - new VectorKeySeriesBytesSerialized( + serializedKeySeriesBytesFast = + new VectorKeySeriesBytesFast( singleKeyColumn, keyBinarySortableSerializeWrite); } + + @Override + public void processBatch(VectorizedRowBatch batch, int tag) throws IOException { + + serializedKeySeriesBytesFast.processBatch(batch); + + do { + if (serializedKeySeriesBytesFast.currentKeyIsNull) { + + reduceSinkNull(batch, tag, serializedKeySeriesBytesFast.currentLogical, + serializedKeySeriesBytesFast.currentDuplicateCount); + + } else { + + reduceSinkKey(batch, tag, + serializedKeySeriesBytesFast.serializedBytes, + serializedKeySeriesBytesFast.serializedStart, + serializedKeySeriesBytesFast.serializedLength, + serializedKeySeriesBytesFast.currentHashCode, + serializedKeySeriesBytesFast.currentLogical, + serializedKeySeriesBytesFast.currentDuplicateCount); + } + + if (!serializedKeySeriesBytesFast.next()) { + break; + } + } while (true); + } } \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java index b760988..8ccba51 100644 --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java @@ -42,6 +42,7 @@ import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.ql.exec.*; import org.apache.hadoop.hive.ql.exec.mr.MapRedTask; +import org.apache.hadoop.hive.ql.exec.persistence.MapJoinBytesTableContainer; import org.apache.hadoop.hive.ql.exec.persistence.MapJoinKey; import org.apache.hadoop.hive.ql.exec.spark.SparkTask; import org.apache.hadoop.hive.ql.exec.tez.TezTask; @@ -62,6 +63,8 @@ import org.apache.hadoop.hive.ql.exec.vector.reducesink.VectorReduceSinkMultiKeyOperator; import org.apache.hadoop.hive.ql.exec.vector.reducesink.VectorReduceSinkStringOperator; import org.apache.hadoop.hive.ql.exec.vector.ColumnVector.Type; +import org.apache.hadoop.hive.ql.exec.vector.VectorColumnOutputMapping; +import org.apache.hadoop.hive.ql.exec.vector.VectorColumnSourceMapping; import org.apache.hadoop.hive.ql.exec.vector.VectorMapJoinOperator; import org.apache.hadoop.hive.ql.exec.vector.VectorMapJoinOuterFilteredOperator; import org.apache.hadoop.hive.ql.exec.vector.VectorSMBMapJoinOperator; @@ -94,14 +97,12 @@ import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc; import org.apache.hadoop.hive.ql.plan.ExprNodeDesc; import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc; -import org.apache.hadoop.hive.ql.plan.FileSinkDesc; import org.apache.hadoop.hive.ql.plan.GroupByDesc; import org.apache.hadoop.hive.ql.plan.JoinDesc; import 
org.apache.hadoop.hive.ql.plan.MapJoinDesc; import org.apache.hadoop.hive.ql.plan.MapWork; import org.apache.hadoop.hive.ql.plan.OperatorDesc; import org.apache.hadoop.hive.ql.plan.VectorGroupByDesc.ProcessingMode; -import org.apache.hadoop.hive.ql.plan.VectorPartitionConversion; import org.apache.hadoop.hive.ql.plan.PartitionDesc; import org.apache.hadoop.hive.ql.plan.ReduceSinkDesc; import org.apache.hadoop.hive.ql.plan.ReduceWork; @@ -116,7 +117,9 @@ import org.apache.hadoop.hive.ql.plan.VectorMapJoinDesc.HashTableImplementationType; import org.apache.hadoop.hive.ql.plan.VectorMapJoinDesc.HashTableKeyType; import org.apache.hadoop.hive.ql.plan.VectorMapJoinDesc.HashTableKind; +import org.apache.hadoop.hive.ql.plan.VectorMapJoinDesc.OperatorVariation; import org.apache.hadoop.hive.ql.plan.VectorPartitionDesc.VectorDeserializeType; +import org.apache.hadoop.hive.ql.plan.VectorMapJoinInfo; import org.apache.hadoop.hive.ql.plan.VectorReduceSinkDesc; import org.apache.hadoop.hive.ql.plan.VectorReduceSinkInfo; import org.apache.hadoop.hive.ql.plan.VectorPartitionDesc; @@ -163,11 +166,13 @@ import org.apache.hadoop.hive.ql.udf.UDFYear; import org.apache.hadoop.hive.ql.udf.generic.*; import org.apache.hadoop.hive.serde.serdeConstants; -import org.apache.hadoop.hive.serde2.ColumnProjectionUtils; import org.apache.hadoop.hive.serde2.Deserializer; import org.apache.hadoop.hive.serde2.NullStructSerDe; import org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe; import org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe; +import org.apache.hadoop.hive.serde2.SerDe; +import org.apache.hadoop.hive.serde2.SerDeException; +import org.apache.hadoop.hive.serde2.SerDeUtils; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils; @@ -180,6 +185,7 @@ import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils; import org.apache.hadoop.mapred.SequenceFileInputFormat; import org.apache.hadoop.mapred.TextInputFormat; +import org.apache.hive.common.util.ReflectionUtil; import com.google.common.base.Preconditions; @@ -2007,22 +2013,17 @@ private boolean isBigTableOnlyResults(MapJoinDesc desc) { } Operator specializeMapJoinOperator(Operator op, - VectorizationContext vContext, MapJoinDesc desc) throws HiveException { + VectorizationContext vContext, MapJoinDesc desc, VectorMapJoinInfo vectorMapJoinInfo) + throws HiveException { Operator vectorOp = null; Class> opClass = null; - VectorMapJoinDesc.HashTableImplementationType hashTableImplementationType = HashTableImplementationType.NONE; - VectorMapJoinDesc.HashTableKind hashTableKind = HashTableKind.NONE; - VectorMapJoinDesc.HashTableKeyType hashTableKeyType = HashTableKeyType.NONE; + HashTableImplementationType hashTableImplementationType = HashTableImplementationType.NONE; + HashTableKind hashTableKind = HashTableKind.NONE; + HashTableKeyType hashTableKeyType = HashTableKeyType.NONE; + OperatorVariation operatorVariation = OperatorVariation.NONE; - if (HiveConf.getBoolVar(hiveConf, - HiveConf.ConfVars.HIVE_VECTORIZATION_MAPJOIN_NATIVE_FAST_HASHTABLE_ENABLED)) { - hashTableImplementationType = HashTableImplementationType.FAST; - } else { - // Restrict to using BytesBytesMultiHashMap via MapJoinBytesTableContainer or - // HybridHashTableContainer. 
- hashTableImplementationType = HashTableImplementationType.OPTIMIZED; - } + hashTableImplementationType = HashTableImplementationType.FAST; int joinType = desc.getConds()[0].getType(); @@ -2042,20 +2043,31 @@ private boolean isBigTableOnlyResults(MapJoinDesc desc) { Map> keyExprs = desc.getKeys(); List bigTableKeyExprs = keyExprs.get(posBigTable); if (bigTableKeyExprs.size() == 1) { - String typeName = bigTableKeyExprs.get(0).getTypeString(); - LOG.info("Vectorizer vectorizeOperator map join typeName " + typeName); - if (typeName.equals("boolean")) { + TypeInfo typeInfo = bigTableKeyExprs.get(0).getTypeInfo(); + LOG.info("Vectorizer vectorizeOperator map join typeName " + typeInfo.getTypeName()); + switch (((PrimitiveTypeInfo) typeInfo).getPrimitiveCategory()) { + case BOOLEAN: hashTableKeyType = HashTableKeyType.BOOLEAN; - } else if (typeName.equals("tinyint")) { + break; + case BYTE: hashTableKeyType = HashTableKeyType.BYTE; - } else if (typeName.equals("smallint")) { + break; + case SHORT: hashTableKeyType = HashTableKeyType.SHORT; - } else if (typeName.equals("int")) { + break; + case INT: hashTableKeyType = HashTableKeyType.INT; - } else if (typeName.equals("bigint") || typeName.equals("long")) { + break; + case LONG: hashTableKeyType = HashTableKeyType.LONG; - } else if (VectorizationContext.isStringFamily(typeName)) { + break; + case STRING: + case CHAR: + case VARCHAR: + case BINARY: hashTableKeyType = HashTableKeyType.STRING; + default: + // Stay with multi-key. } } } @@ -2063,16 +2075,20 @@ private boolean isBigTableOnlyResults(MapJoinDesc desc) { switch (joinType) { case JoinDesc.INNER_JOIN: if (!isInnerBigOnly) { + operatorVariation = OperatorVariation.INNER; hashTableKind = HashTableKind.HASH_MAP; } else { + operatorVariation = OperatorVariation.INNER_BIG_ONLY; hashTableKind = HashTableKind.HASH_MULTISET; } break; case JoinDesc.LEFT_OUTER_JOIN: case JoinDesc.RIGHT_OUTER_JOIN: + operatorVariation = OperatorVariation.OUTER; hashTableKind = HashTableKind.HASH_MAP; break; case JoinDesc.LEFT_SEMI_JOIN: + operatorVariation = OperatorVariation.LEFT_SEMI; hashTableKind = HashTableKind.HASH_SET; break; default: @@ -2087,71 +2103,63 @@ private boolean isBigTableOnlyResults(MapJoinDesc desc) { case SHORT: case INT: case LONG: - switch (joinType) { - case JoinDesc.INNER_JOIN: - if (!isInnerBigOnly) { - opClass = VectorMapJoinInnerLongOperator.class; - } else { - opClass = VectorMapJoinInnerBigOnlyLongOperator.class; - } + switch (operatorVariation) { + case INNER: + opClass = VectorMapJoinInnerLongOperator.class; break; - case JoinDesc.LEFT_OUTER_JOIN: - case JoinDesc.RIGHT_OUTER_JOIN: - opClass = VectorMapJoinOuterLongOperator.class; + case INNER_BIG_ONLY: + opClass = VectorMapJoinInnerBigOnlyLongOperator.class; break; - case JoinDesc.LEFT_SEMI_JOIN: + case LEFT_SEMI: opClass = VectorMapJoinLeftSemiLongOperator.class; break; + case OUTER: + opClass = VectorMapJoinOuterLongOperator.class; + break; default: - throw new HiveException("Unknown join type " + joinType); + throw new HiveException("Unknown operator variation " + operatorVariation); } break; case STRING: - switch (joinType) { - case JoinDesc.INNER_JOIN: - if (!isInnerBigOnly) { - opClass = VectorMapJoinInnerStringOperator.class; - } else { - opClass = VectorMapJoinInnerBigOnlyStringOperator.class; - } + switch (operatorVariation) { + case INNER: + opClass = VectorMapJoinInnerStringOperator.class; break; - case JoinDesc.LEFT_OUTER_JOIN: - case JoinDesc.RIGHT_OUTER_JOIN: - opClass = VectorMapJoinOuterStringOperator.class; + case 
INNER_BIG_ONLY: + opClass = VectorMapJoinInnerBigOnlyStringOperator.class; break; - case JoinDesc.LEFT_SEMI_JOIN: + case LEFT_SEMI: opClass = VectorMapJoinLeftSemiStringOperator.class; break; + case OUTER: + opClass = VectorMapJoinOuterStringOperator.class; + break; default: - throw new HiveException("Unknown join type " + joinType); + throw new HiveException("Unknown operator variation " + operatorVariation); } break; case MULTI_KEY: - switch (joinType) { - case JoinDesc.INNER_JOIN: - if (!isInnerBigOnly) { - opClass = VectorMapJoinInnerMultiKeyOperator.class; - } else { - opClass = VectorMapJoinInnerBigOnlyMultiKeyOperator.class; - } + switch (operatorVariation) { + case INNER: + opClass = VectorMapJoinInnerMultiKeyOperator.class; break; - case JoinDesc.LEFT_OUTER_JOIN: - case JoinDesc.RIGHT_OUTER_JOIN: - opClass = VectorMapJoinOuterMultiKeyOperator.class; + case INNER_BIG_ONLY: + opClass = VectorMapJoinInnerBigOnlyMultiKeyOperator.class; break; - case JoinDesc.LEFT_SEMI_JOIN: + case LEFT_SEMI: opClass = VectorMapJoinLeftSemiMultiKeyOperator.class; break; + case OUTER: + opClass = VectorMapJoinOuterMultiKeyOperator.class; + break; default: - throw new HiveException("Unknown join type " + joinType); + throw new HiveException("Unknown operator variation " + operatorVariation); } break; + default: + throw new RuntimeException("Unexpected hash table key type " + hashTableKeyType.name()); } - vectorOp = OperatorFactory.getVectorOperator( - opClass, op.getCompilationOpContext(), op.getConf(), vContext); - LOG.info("Vectorizer vectorizeOperator map join class " + vectorOp.getClass().getSimpleName()); - boolean minMaxEnabled = HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVE_VECTORIZATION_MAPJOIN_NATIVE_MINMAX_ENABLED); @@ -2159,14 +2167,21 @@ private boolean isBigTableOnlyResults(MapJoinDesc desc) { vectorDesc.setHashTableImplementationType(hashTableImplementationType); vectorDesc.setHashTableKind(hashTableKind); vectorDesc.setHashTableKeyType(hashTableKeyType); + vectorDesc.setOperatorVariation(operatorVariation); vectorDesc.setMinMaxEnabled(minMaxEnabled); + vectorDesc.setVectorMapJoinInfo(vectorMapJoinInfo); + + vectorOp = OperatorFactory.getVectorOperator( + opClass, op.getCompilationOpContext(), op.getConf(), vContext); + LOG.info("Vectorizer vectorizeOperator map join class " + vectorOp.getClass().getSimpleName()); + return vectorOp; } private boolean onExpressionHasNullSafes(MapJoinDesc desc) { boolean[] nullSafes = desc.getNullSafes(); if (nullSafes == null) { - return false; + return false; } for (boolean nullSafe : nullSafes) { if (nullSafe) { @@ -2177,53 +2192,322 @@ private boolean onExpressionHasNullSafes(MapJoinDesc desc) { } private boolean canSpecializeMapJoin(Operator op, MapJoinDesc desc, - boolean isTez) { - - boolean specialize = false; + boolean isTez, VectorizationContext vContext, VectorMapJoinInfo vectorMapJoinInfo) + throws HiveException { - if (op instanceof MapJoinOperator && - HiveConf.getBoolVar(hiveConf, + if (!(op instanceof MapJoinOperator) || + !HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVE_VECTORIZATION_MAPJOIN_NATIVE_ENABLED)) { + return false; + } - // Currently, only under Tez and non-N-way joins. - if (isTez && desc.getConds().length == 1 && !onExpressionHasNullSafes(desc)) { + // Currently, only under Tez and non-N-way joins. + if (!isTez || desc.getConds().length != 1 || onExpressionHasNullSafes(desc)) { + return false; + } - // Ok, all basic restrictions satisfied so far... 
- specialize = true; + // Verify we handle the key column types for an optimized table. This is the effectively the + // same check used in HashTableLoader. + try { + TableDesc keyTableDesc = desc.getKeyTblDesc(); + SerDe keySerializer = (SerDe) ReflectionUtil.newInstance( + keyTableDesc.getDeserializerClass(), null); + SerDeUtils.initializeSerDe(keySerializer, null, keyTableDesc.getProperties(), null); + ObjectInspector keyOi = keySerializer.getObjectInspector(); + if (!MapJoinBytesTableContainer.isSupportedKey(keyOi)) { + return false; + } + } catch (SerDeException e) { + throw new HiveException(e); + } - if (!HiveConf.getBoolVar(hiveConf, - HiveConf.ConfVars.HIVE_VECTORIZATION_MAPJOIN_NATIVE_FAST_HASHTABLE_ENABLED)) { + /* + * Populate vectorMapJoininfo. + */ - // We are using the optimized hash table we have further - // restrictions (using optimized and key type). + byte posBigTable = (byte) desc.getPosBigTable(); - if (!HiveConf.getBoolVar(hiveConf, - HiveConf.ConfVars.HIVEMAPJOINUSEOPTIMIZEDTABLE)) { - specialize = false; - } else { - byte posBigTable = (byte) desc.getPosBigTable(); - Map> keyExprs = desc.getKeys(); - List bigTableKeyExprs = keyExprs.get(posBigTable); - for (ExprNodeDesc exprNodeDesc : bigTableKeyExprs) { - String typeName = exprNodeDesc.getTypeString(); - if (!MapJoinKey.isSupportedField(typeName)) { - specialize = false; - break; - } + List keyDesc = desc.getKeys().get(posBigTable); + VectorExpression[] allBigTableKeyExpressions = vContext.getVectorExpressions(keyDesc); + final int allBigTableKeyExpressionsLength = allBigTableKeyExpressions.length; + if (allBigTableKeyExpressionsLength == 0) { + return false; + } + // Since a key expression can be a calculation and the key will go into a scratch column, + // we need the mapping and type information. + int[] bigTableKeyColumnMap = new int[allBigTableKeyExpressionsLength]; + String[] bigTableKeyColumnNames = new String[allBigTableKeyExpressionsLength]; + TypeInfo[] bigTableKeyTypeInfos = new TypeInfo[allBigTableKeyExpressionsLength]; + ArrayList bigTableKeyExpressionsList = new ArrayList(); + VectorExpression[] bigTableKeyExpressions; + for (int i = 0; i < allBigTableKeyExpressionsLength; i++) { + VectorExpression ve = allBigTableKeyExpressions[i]; + if (!IdentityExpression.isColumnOnly(ve)) { + bigTableKeyExpressionsList.add(ve); + } + bigTableKeyColumnMap[i] = ve.getOutputColumn(); + + ExprNodeDesc exprNode = keyDesc.get(i); + bigTableKeyColumnNames[i] = exprNode.toString(); + + TypeInfo typeInfo = exprNode.getTypeInfo(); + // Verify we handle the key column types for an optimized table. This is the effectively the + // same check used in HashTableLoader. + if (!MapJoinKey.isSupportedField(typeInfo)) { + return false; + } + bigTableKeyTypeInfos[i] = typeInfo; + } + if (bigTableKeyExpressionsList.size() == 0) { + bigTableKeyExpressions = null; + } else { + bigTableKeyExpressions = bigTableKeyExpressionsList.toArray(new VectorExpression[0]); + } + + List bigTableExprs = desc.getExprs().get(posBigTable); + VectorExpression[] allBigTableValueExpressions = vContext.getVectorExpressions(bigTableExprs); + + /* + * Similarly, we need a mapping since a value expression can be a calculation and the value + * will go into a scratch column. 
+ */ + int[] bigTableValueColumnMap = new int[allBigTableValueExpressions.length]; + String[] bigTableValueColumnNames = new String[allBigTableValueExpressions.length]; + TypeInfo[] bigTableValueTypeInfos = new TypeInfo[allBigTableValueExpressions.length]; + ArrayList bigTableValueExpressionsList = new ArrayList(); + VectorExpression[] bigTableValueExpressions; + for (int i = 0; i < bigTableValueColumnMap.length; i++) { + VectorExpression ve = allBigTableValueExpressions[i]; + if (!IdentityExpression.isColumnOnly(ve)) { + bigTableValueExpressionsList.add(ve); + } + bigTableValueColumnMap[i] = ve.getOutputColumn(); + + ExprNodeDesc exprNode = bigTableExprs.get(i); + bigTableValueColumnNames[i] = exprNode.toString(); + bigTableValueTypeInfos[i] = exprNode.getTypeInfo(); + } + if (bigTableValueExpressionsList.size() == 0) { + bigTableValueExpressions = null; + } else { + bigTableValueExpressions = bigTableValueExpressionsList.toArray(new VectorExpression[0]); + } + + vectorMapJoinInfo.setBigTableKeyColumnMap(bigTableKeyColumnMap); + vectorMapJoinInfo.setBigTableKeyColumnNames(bigTableKeyColumnNames); + vectorMapJoinInfo.setBigTableKeyTypeInfos(bigTableKeyTypeInfos); + vectorMapJoinInfo.setBigTableKeyExpressions(bigTableKeyExpressions); + + vectorMapJoinInfo.setBigTableValueColumnMap(bigTableValueColumnMap); + vectorMapJoinInfo.setBigTableValueColumnNames(bigTableValueColumnNames); + vectorMapJoinInfo.setBigTableValueTypeInfos(bigTableValueTypeInfos); + vectorMapJoinInfo.setBigTableValueExpressions(bigTableValueExpressions); + + /* + * Small table information. + */ + VectorColumnOutputMapping bigTableRetainedMapping = + new VectorColumnOutputMapping("Big Table Retained Mapping"); + + VectorColumnOutputMapping bigTableOuterKeyMapping = + new VectorColumnOutputMapping("Big Table Outer Key Mapping"); + + // The order of the fields in the LazyBinary small table value must be used, so + // we use the source ordering flavor for the mapping. + VectorColumnSourceMapping smallTableMapping = + new VectorColumnSourceMapping("Small Table Mapping"); + + Byte[] order = desc.getTagOrder(); + Byte posSingleVectorMapJoinSmallTable = (order[0] == posBigTable ? order[1] : order[0]); + boolean isOuterJoin = !desc.getNoOuterJoin(); + + /* + * Gather up big and small table output result information from the MapJoinDesc. + */ + List bigTableRetainList = desc.getRetainList().get(posBigTable); + int bigTableRetainSize = bigTableRetainList.size(); + + int[] smallTableIndices; + int smallTableIndicesSize; + List smallTableExprs = desc.getExprs().get(posSingleVectorMapJoinSmallTable); + if (desc.getValueIndices() != null && desc.getValueIndices().get(posSingleVectorMapJoinSmallTable) != null) { + smallTableIndices = desc.getValueIndices().get(posSingleVectorMapJoinSmallTable); + smallTableIndicesSize = smallTableIndices.length; + } else { + smallTableIndices = null; + smallTableIndicesSize = 0; + } + + List smallTableRetainList = desc.getRetainList().get(posSingleVectorMapJoinSmallTable); + int smallTableRetainSize = smallTableRetainList.size(); + + int smallTableResultSize = 0; + if (smallTableIndicesSize > 0) { + smallTableResultSize = smallTableIndicesSize; + } else if (smallTableRetainSize > 0) { + smallTableResultSize = smallTableRetainSize; + } + + /* + * Determine the big table retained mapping first so we can optimize out (with + * projection) copying inner join big table keys in the subsequent small table results section. 
+ */ + + // We use a mapping object here so we can build the projection in any order and + // get the ordered by 0 to n-1 output columns at the end. + // + // Also, to avoid copying a big table key into the small table result area for inner joins, + // we reference it with the projection so there can be duplicate output columns + // in the projection. + VectorColumnSourceMapping projectionMapping = new VectorColumnSourceMapping("Projection Mapping"); + + int nextOutputColumn = (order[0] == posBigTable ? 0 : smallTableResultSize); + for (int i = 0; i < bigTableRetainSize; i++) { + + // Since bigTableValueExpressions may do a calculation and produce a scratch column, we + // need to map to the right batch column. + + int retainColumn = bigTableRetainList.get(i); + int batchColumnIndex = bigTableValueColumnMap[retainColumn]; + TypeInfo typeInfo = bigTableValueTypeInfos[i]; + + // With this map we project the big table batch to make it look like an output batch. + projectionMapping.add(nextOutputColumn, batchColumnIndex, typeInfo); + + // Collect columns we copy from the big table batch to the overflow batch. + if (!bigTableRetainedMapping.containsOutputColumn(batchColumnIndex)) { + // Tolerate repeated use of a big table column. + bigTableRetainedMapping.add(batchColumnIndex, batchColumnIndex, typeInfo); + } + + nextOutputColumn++; + } + + /* + * Now determine the small table results. + */ + int firstSmallTableOutputColumn; + firstSmallTableOutputColumn = (order[0] == posBigTable ? bigTableRetainSize : 0); + int smallTableOutputCount = 0; + nextOutputColumn = firstSmallTableOutputColumn; + + // Small table indices has more information (i.e. keys) than retain, so use it if it exists... + String[] bigTableRetainedNames; + if (smallTableIndicesSize > 0) { + smallTableOutputCount = smallTableIndicesSize; + bigTableRetainedNames = new String[smallTableOutputCount]; + + for (int i = 0; i < smallTableIndicesSize; i++) { + if (smallTableIndices[i] >= 0) { + + // Zero and above numbers indicate a big table key is needed for + // small table result "area". + + int keyIndex = smallTableIndices[i]; + + // Since bigTableKeyExpressions may do a calculation and produce a scratch column, we + // need to map the right column. + int batchKeyColumn = bigTableKeyColumnMap[keyIndex]; + bigTableRetainedNames[i] = bigTableKeyColumnNames[keyIndex]; + TypeInfo typeInfo = bigTableKeyTypeInfos[keyIndex]; + + if (!isOuterJoin) { + + // Optimize inner join keys of small table results. + + // Project the big table key into the small table result "area". + projectionMapping.add(nextOutputColumn, batchKeyColumn, typeInfo); + + if (!bigTableRetainedMapping.containsOutputColumn(batchKeyColumn)) { + // If necessary, copy the big table key into the overflow batch's small table + // result "area". + bigTableRetainedMapping.add(batchKeyColumn, batchKeyColumn, typeInfo); } + } else { + + // For outer joins, since the small table key can be null when there is no match, + // we must have a physical (scratch) column for those keys. We cannot use the + // projection optimization used by inner joins above. + + int scratchColumn = vContext.allocateScratchColumn(typeInfo.getTypeName()); + projectionMapping.add(nextOutputColumn, scratchColumn, typeInfo); + + bigTableRetainedMapping.add(batchKeyColumn, scratchColumn, typeInfo); + + bigTableOuterKeyMapping.add(batchKeyColumn, scratchColumn, typeInfo); } } else { - // With the fast hash table implementation, we currently do not support - // Hybrid Grace Hash Join. 
+ // Negative numbers indicate a column to be (deserialize) read from the small table's + // LazyBinary value row. + int smallTableValueIndex = -smallTableIndices[i] - 1; - if (desc.isHybridHashJoin()) { - specialize = false; + ExprNodeDesc smallTableExprNode = smallTableExprs.get(i); + if (!validateExprNodeDesc(smallTableExprNode)) { + return false; } + + bigTableRetainedNames[i] = smallTableExprNode.toString(); + + TypeInfo typeInfo = smallTableExprNode.getTypeInfo(); + + // Make a new big table scratch column for the small table value. + int scratchColumn = vContext.allocateScratchColumn(typeInfo.getTypeName()); + projectionMapping.add(nextOutputColumn, scratchColumn, typeInfo); + + smallTableMapping.add(smallTableValueIndex, scratchColumn, typeInfo); } + nextOutputColumn++; } + } else if (smallTableRetainSize > 0) { + smallTableOutputCount = smallTableRetainSize; + bigTableRetainedNames = new String[smallTableOutputCount]; + + // Only small table values appear in join output result. + + for (int i = 0; i < smallTableRetainSize; i++) { + int smallTableValueIndex = smallTableRetainList.get(i); + + ExprNodeDesc smallTableExprNode = smallTableExprs.get(i); + if (!validateExprNodeDesc(smallTableExprNode)) { + return false; + } + + bigTableRetainedNames[i] = smallTableExprNode.toString(); + + // Make a new big table scratch column for the small table value. + TypeInfo typeInfo = smallTableExprNode.getTypeInfo(); + int scratchColumn = vContext.allocateScratchColumn(typeInfo.getTypeName()); + + projectionMapping.add(nextOutputColumn, scratchColumn, typeInfo); + + smallTableMapping.add(smallTableValueIndex, scratchColumn, typeInfo); + nextOutputColumn++; + } + } else { + bigTableRetainedNames = new String[0]; } - return specialize; + + // Convert dynamic arrays and maps to simple arrays. + + bigTableRetainedMapping.finalize(); + + bigTableOuterKeyMapping.finalize(); + + smallTableMapping.finalize(); + + vectorMapJoinInfo.setBigTableRetainedMapping(bigTableRetainedMapping); + vectorMapJoinInfo.setBigTableOuterKeyMapping(bigTableOuterKeyMapping); + vectorMapJoinInfo.setSmallTableMapping(smallTableMapping); + + projectionMapping.finalize(); + + // Verify we added an entry for each output. + assert projectionMapping.isSourceSequenceGood(); + + vectorMapJoinInfo.setProjectionMapping(projectionMapping); + + return true; } private Operator specializeReduceSinkOperator( @@ -2353,6 +2637,7 @@ private boolean canSpecializeReduceSink(ReduceSinkDesc desc, // Since a key expression can be a calculation and the key will go into a scratch column, // we need the mapping and type information. 
int[] reduceSinkKeyColumnMap = new int[allKeyExpressions.length]; + String[] reduceSinkKeyColumnNames = new String[allKeyExpressions.length]; TypeInfo[] reduceSinkKeyTypeInfos = new TypeInfo[allKeyExpressions.length]; Type[] reduceSinkKeyColumnVectorTypes = new Type[allKeyExpressions.length]; ArrayList groupByKeyExpressionsList = new ArrayList(); @@ -2360,12 +2645,15 @@ private boolean canSpecializeReduceSink(ReduceSinkDesc desc, for (int i = 0; i < reduceSinkKeyColumnMap.length; i++) { VectorExpression ve = allKeyExpressions[i]; reduceSinkKeyColumnMap[i] = ve.getOutputColumn(); - reduceSinkKeyTypeInfos[i] = keysDescs.get(i).getTypeInfo(); - reduceSinkKeyColumnVectorTypes[i] = - VectorizationContext.getColumnVectorTypeFromTypeInfo(reduceSinkKeyTypeInfos[i]); if (!IdentityExpression.isColumnOnly(ve)) { groupByKeyExpressionsList.add(ve); } + + ExprNodeDesc exprNode = keysDescs.get(i); + reduceSinkKeyColumnNames[i] = exprNode.toString(); + reduceSinkKeyTypeInfos[i] = exprNode.getTypeInfo(); + reduceSinkKeyColumnVectorTypes[i] = + VectorizationContext.getColumnVectorTypeFromTypeInfo(reduceSinkKeyTypeInfos[i]); } if (groupByKeyExpressionsList.size() == 0) { reduceSinkKeyExpressions = null; @@ -2376,20 +2664,24 @@ private boolean canSpecializeReduceSink(ReduceSinkDesc desc, ArrayList valueDescs = desc.getValueCols(); VectorExpression[] allValueExpressions = vContext.getVectorExpressions(valueDescs); - int[] reduceSinkValueColumnMap = new int[valueDescs.size()]; - TypeInfo[] reduceSinkValueTypeInfos = new TypeInfo[valueDescs.size()]; - Type[] reduceSinkValueColumnVectorTypes = new Type[valueDescs.size()]; + int[] reduceSinkValueColumnMap = new int[allValueExpressions.length]; + String[] reduceSinkValueColumnNames = new String[allValueExpressions.length]; + TypeInfo[] reduceSinkValueTypeInfos = new TypeInfo[allValueExpressions.length]; + Type[] reduceSinkValueColumnVectorTypes = new Type[allValueExpressions.length]; ArrayList reduceSinkValueExpressionsList = new ArrayList(); VectorExpression[] reduceSinkValueExpressions; for (int i = 0; i < valueDescs.size(); ++i) { VectorExpression ve = allValueExpressions[i]; reduceSinkValueColumnMap[i] = ve.getOutputColumn(); - reduceSinkValueTypeInfos[i] = valueDescs.get(i).getTypeInfo(); - reduceSinkValueColumnVectorTypes[i] = - VectorizationContext.getColumnVectorTypeFromTypeInfo(reduceSinkValueTypeInfos[i]); if (!IdentityExpression.isColumnOnly(ve)) { reduceSinkValueExpressionsList.add(ve); } + + ExprNodeDesc exprNode = valueDescs.get(i); + reduceSinkValueColumnNames[i] = exprNode.toString(); + reduceSinkValueTypeInfos[i] = exprNode.getTypeInfo(); + reduceSinkValueColumnVectorTypes[i] = + VectorizationContext.getColumnVectorTypeFromTypeInfo(reduceSinkValueTypeInfos[i]); } if (reduceSinkValueExpressionsList.size() == 0) { reduceSinkValueExpressions = null; @@ -2398,11 +2690,13 @@ private boolean canSpecializeReduceSink(ReduceSinkDesc desc, } vectorReduceSinkInfo.setReduceSinkKeyColumnMap(reduceSinkKeyColumnMap); + vectorReduceSinkInfo.setReduceSinkKeyColumnNames(reduceSinkKeyColumnNames); vectorReduceSinkInfo.setReduceSinkKeyTypeInfos(reduceSinkKeyTypeInfos); vectorReduceSinkInfo.setReduceSinkKeyColumnVectorTypes(reduceSinkKeyColumnVectorTypes); vectorReduceSinkInfo.setReduceSinkKeyExpressions(reduceSinkKeyExpressions); vectorReduceSinkInfo.setReduceSinkValueColumnMap(reduceSinkValueColumnMap); + vectorReduceSinkInfo.setReduceSinkValueColumnNames(reduceSinkValueColumnNames); vectorReduceSinkInfo.setReduceSinkValueTypeInfos(reduceSinkValueTypeInfos); 
vectorReduceSinkInfo.setReduceSinkValueColumnVectorTypes(reduceSinkValueColumnVectorTypes); vectorReduceSinkInfo.setReduceSinkValueExpressions(reduceSinkValueExpressions); @@ -2417,8 +2711,9 @@ private boolean canSpecializeReduceSink(ReduceSinkDesc desc, switch (op.getType()) { case MAPJOIN: { + VectorMapJoinInfo vectorMapJoinInfo = new VectorMapJoinInfo(); MapJoinDesc desc = (MapJoinDesc) op.getConf(); - boolean specialize = canSpecializeMapJoin(op, desc, isTez || isSpark); + boolean specialize = canSpecializeMapJoin(op, desc, isTez || isSpark, vContext, vectorMapJoinInfo); if (!specialize) { @@ -2443,11 +2738,7 @@ private boolean canSpecializeReduceSink(ReduceSinkDesc desc, } else { - // TEMPORARY Until Native Vector Map Join with Hybrid passes tests... - // HiveConf.setBoolVar(physicalContext.getConf(), - // HiveConf.ConfVars.HIVEUSEHYBRIDGRACEHASHJOIN, false); - - vectorOp = specializeMapJoinOperator(op, vContext, desc); + vectorOp = specializeMapJoinOperator(op, vContext, desc, vectorMapJoinInfo); } } break; diff --git ql/src/java/org/apache/hadoop/hive/ql/plan/VectorMapJoinDesc.java ql/src/java/org/apache/hadoop/hive/ql/plan/VectorMapJoinDesc.java index 8ea230f..256ab65 100644 --- ql/src/java/org/apache/hadoop/hive/ql/plan/VectorMapJoinDesc.java +++ ql/src/java/org/apache/hadoop/hive/ql/plan/VectorMapJoinDesc.java @@ -20,6 +20,7 @@ import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; +import com.google.common.base.Preconditions; /** * VectorGroupByDesc. @@ -35,7 +36,6 @@ public static enum HashTableImplementationType { NONE, - OPTIMIZED, FAST } @@ -79,23 +79,38 @@ public PrimitiveTypeInfo getPrimitiveTypeInfo() { } } + public static enum OperatorVariation { + NONE, + INNER_BIG_ONLY, + INNER, + LEFT_SEMI, + OUTER + } + private HashTableImplementationType hashTableImplementationType; private HashTableKind hashTableKind; private HashTableKeyType hashTableKeyType; + private OperatorVariation operatorVariation; private boolean minMaxEnabled; + private VectorMapJoinInfo vectorMapJoinInfo; + public VectorMapJoinDesc() { hashTableImplementationType = HashTableImplementationType.NONE; hashTableKind = HashTableKind.NONE; hashTableKeyType = HashTableKeyType.NONE; + operatorVariation = OperatorVariation.NONE; minMaxEnabled = false; + vectorMapJoinInfo = null; } public VectorMapJoinDesc(VectorMapJoinDesc clone) { this.hashTableImplementationType = clone.hashTableImplementationType; this.hashTableKind = clone.hashTableKind; this.hashTableKeyType = clone.hashTableKeyType; + this.operatorVariation = clone.operatorVariation; this.minMaxEnabled = clone.minMaxEnabled; + this.vectorMapJoinInfo = clone.vectorMapJoinInfo; } public HashTableImplementationType hashTableImplementationType() { @@ -122,6 +137,14 @@ public void setHashTableKeyType(HashTableKeyType hashTableKeyType) { this.hashTableKeyType = hashTableKeyType; } + public OperatorVariation operatorVariation() { + return operatorVariation; + } + + public void setOperatorVariation(OperatorVariation operatorVariation) { + this.operatorVariation = operatorVariation; + } + public boolean minMaxEnabled() { return minMaxEnabled; } @@ -129,4 +152,13 @@ public boolean minMaxEnabled() { public void setMinMaxEnabled(boolean minMaxEnabled) { this.minMaxEnabled = minMaxEnabled; } + + public void setVectorMapJoinInfo(VectorMapJoinInfo vectorMapJoinInfo) { + Preconditions.checkState(vectorMapJoinInfo != null); + this.vectorMapJoinInfo = vectorMapJoinInfo; + } + + public VectorMapJoinInfo 
getVectorMapJoinInfo() { + return vectorMapJoinInfo; + } } diff --git ql/src/java/org/apache/hadoop/hive/ql/plan/VectorMapJoinInfo.java ql/src/java/org/apache/hadoop/hive/ql/plan/VectorMapJoinInfo.java new file mode 100644 index 0000000..2cf2e72 --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/plan/VectorMapJoinInfo.java @@ -0,0 +1,169 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.plan; + +import org.apache.hadoop.hive.ql.exec.vector.VectorColumnOutputMapping; +import org.apache.hadoop.hive.ql.exec.vector.VectorColumnSourceMapping; +import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpression; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; + +/** + * VectorMapJoinInfo. + * + * A convenience data structure that has information needed to vectorize map join. + * + * It is created by the Vectorizer when it is determining whether it can specialize so the + * information doesn't have to be recreated again and again by the VectorMapJoinOperator's + * constructors and later during execution. 
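+ * + * An illustrative sketch of how the Vectorizer might populate and attach this information + * (the local variable names below are placeholders, not identifiers from this patch): + * <pre> + * VectorMapJoinInfo info = new VectorMapJoinInfo(); + * info.setBigTableKeyColumnMap(bigTableKeyColumnMap); + * info.setBigTableKeyTypeInfos(bigTableKeyTypeInfos); + * info.setSmallTableMapping(smallTableMapping); + * vectorMapJoinDesc.setVectorMapJoinInfo(info); + * </pre>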
+ */ +public class VectorMapJoinInfo { + + private static long serialVersionUID = 1L; + + private int[] bigTableKeyColumnMap; + private String[] bigTableKeyColumnNames; + private TypeInfo[] bigTableKeyTypeInfos; + private VectorExpression[] bigTableKeyExpressions; + + private int[] bigTableValueColumnMap; + private String[] bigTableValueColumnNames; + private TypeInfo[] bigTableValueTypeInfos; + private VectorExpression[] bigTableValueExpressions; + + private VectorColumnOutputMapping bigTableRetainedMapping; + private VectorColumnOutputMapping bigTableOuterKeyMapping; + private VectorColumnSourceMapping smallTableMapping; + + private VectorColumnSourceMapping projectionMapping; + + public VectorMapJoinInfo() { + bigTableKeyColumnMap = null; + bigTableKeyColumnNames = null; + bigTableKeyTypeInfos = null; + bigTableKeyExpressions = null; + + bigTableValueColumnMap = null; + bigTableValueColumnNames = null; + bigTableValueTypeInfos = null; + bigTableValueExpressions = null; + + bigTableRetainedMapping = null; + bigTableOuterKeyMapping = null; + smallTableMapping = null; + + projectionMapping = null; + } + + public int[] getBigTableKeyColumnMap() { + return bigTableKeyColumnMap; + } + + public void setBigTableKeyColumnMap(int[] bigTableKeyColumnMap) { + this.bigTableKeyColumnMap = bigTableKeyColumnMap; + } + + public String[] getBigTableKeyColumnNames() { + return bigTableKeyColumnNames; + } + + public void setBigTableKeyColumnNames(String[] bigTableKeyColumnNames) { + this.bigTableKeyColumnNames = bigTableKeyColumnNames; + } + + public TypeInfo[] getBigTableKeyTypeInfos() { + return bigTableKeyTypeInfos; + } + + public void setBigTableKeyTypeInfos(TypeInfo[] bigTableKeyTypeInfos) { + this.bigTableKeyTypeInfos = bigTableKeyTypeInfos; + } + + public VectorExpression[] getBigTableKeyExpressions() { + return bigTableKeyExpressions; + } + + public void setBigTableKeyExpressions(VectorExpression[] bigTableKeyExpressions) { + this.bigTableKeyExpressions = bigTableKeyExpressions; + } + + + public int[] getBigTableValueColumnMap() { + return bigTableValueColumnMap; + } + + public void setBigTableValueColumnMap(int[] bigTableValueColumnMap) { + this.bigTableValueColumnMap = bigTableValueColumnMap; + } + + public String[] getBigTableValueColumnNames() { + return bigTableValueColumnNames; + } + + public void setBigTableValueColumnNames(String[] bigTableValueColumnNames) { + this.bigTableValueColumnNames = bigTableValueColumnNames; + } + + public TypeInfo[] getBigTableValueTypeInfos() { + return bigTableValueTypeInfos; + } + + public void setBigTableValueTypeInfos(TypeInfo[] bigTableValueTypeInfos) { + this.bigTableValueTypeInfos = bigTableValueTypeInfos; + } + + public VectorExpression[] getBigTableValueExpressions() { + return bigTableValueExpressions; + } + + public void setBigTableValueExpressions(VectorExpression[] bigTableValueExpressions) { + this.bigTableValueExpressions = bigTableValueExpressions; + } + + public void setBigTableRetainedMapping(VectorColumnOutputMapping bigTableRetainedMapping) { + this.bigTableRetainedMapping = bigTableRetainedMapping; + } + + public VectorColumnOutputMapping getBigTableRetainedMapping() { + return bigTableRetainedMapping; + } + + public void setBigTableOuterKeyMapping(VectorColumnOutputMapping bigTableOuterKeyMapping) { + this.bigTableOuterKeyMapping = bigTableOuterKeyMapping; + } + + public VectorColumnOutputMapping getBigTableOuterKeyMapping() { + return bigTableOuterKeyMapping; + } + + public void setSmallTableMapping(VectorColumnSourceMapping 
smallTableMapping) { + this.smallTableMapping = smallTableMapping; + } + + public VectorColumnSourceMapping getSmallTableMapping() { + return smallTableMapping; + } + + public void setProjectionMapping(VectorColumnSourceMapping projectionMapping) { + this.projectionMapping = projectionMapping; + } + + public VectorColumnSourceMapping getProjectionMapping() { + return projectionMapping; + } +} \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/plan/VectorReduceSinkDesc.java ql/src/java/org/apache/hadoop/hive/ql/plan/VectorReduceSinkDesc.java index c56bff6..487dd1c 100644 --- ql/src/java/org/apache/hadoop/hive/ql/plan/VectorReduceSinkDesc.java +++ ql/src/java/org/apache/hadoop/hive/ql/plan/VectorReduceSinkDesc.java @@ -31,10 +31,16 @@ private static long serialVersionUID = 1L; public static enum ReduceSinkKeyType { - NONE, - LONG, - STRING, - MULTI_KEY + NONE("none"), + LONG("long"), + STRING("string"), + MULTI_KEY("multi-key"); + + final String displayName; + + ReduceSinkKeyType(String displayName) { + this.displayName = displayName; + } } private ReduceSinkKeyType reduceSinkKeyType; @@ -61,4 +67,12 @@ public void setVectorReduceSinkInfo(VectorReduceSinkInfo vectorReduceSinkInfo) { public VectorReduceSinkInfo getVectorReduceSinkInfo() { return vectorReduceSinkInfo; } + + @Override + public String toString() { + StringBuilder sb = new StringBuilder(); + sb.append("key type: "); + sb.append(reduceSinkKeyType.displayName); + return sb.toString(); + } } diff --git ql/src/java/org/apache/hadoop/hive/ql/plan/VectorReduceSinkInfo.java ql/src/java/org/apache/hadoop/hive/ql/plan/VectorReduceSinkInfo.java index 8c35415..807c26d 100644 --- ql/src/java/org/apache/hadoop/hive/ql/plan/VectorReduceSinkInfo.java +++ ql/src/java/org/apache/hadoop/hive/ql/plan/VectorReduceSinkInfo.java @@ -23,12 +23,12 @@ import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; /** - * VectorGroupByAggregrationInfo. + * VectorReduceSinkInfo. * * A convenience data structure that has information needed to vectorize reduce sink. * * It is created by the Vectorizer when it is determining whether it can specialize so the - * information doesn't have to be recreated again and agains by the VectorReduceSinkOperator's + * information doesn't have to be recreated again and again by the VectorReduceSinkOperator's * constructors and later during execution. 
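+ * + * The key and value column maps, names, type infos, column vector types, and expressions are + * kept as parallel arrays.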
*/ public class VectorReduceSinkInfo { @@ -36,22 +36,26 @@ private static long serialVersionUID = 1L; private int[] reduceSinkKeyColumnMap; + private String[] reduceSinkKeyColumnNames; private TypeInfo[] reduceSinkKeyTypeInfos; private Type[] reduceSinkKeyColumnVectorTypes; private VectorExpression[] reduceSinkKeyExpressions; private int[] reduceSinkValueColumnMap; + private String[] reduceSinkValueColumnNames; private TypeInfo[] reduceSinkValueTypeInfos; private Type[] reduceSinkValueColumnVectorTypes; private VectorExpression[] reduceSinkValueExpressions; public VectorReduceSinkInfo() { reduceSinkKeyColumnMap = null; + reduceSinkKeyColumnNames = null; reduceSinkKeyTypeInfos = null; reduceSinkKeyColumnVectorTypes = null; reduceSinkKeyExpressions = null; reduceSinkValueColumnMap = null; + reduceSinkValueColumnNames = null; reduceSinkValueTypeInfos = null; reduceSinkValueColumnVectorTypes = null; reduceSinkValueExpressions = null; @@ -65,6 +69,14 @@ public void setReduceSinkKeyColumnMap(int[] reduceSinkKeyColumnMap) { this.reduceSinkKeyColumnMap = reduceSinkKeyColumnMap; } + public String[] getReduceSinkKeyColumnNames() { + return reduceSinkKeyColumnNames; + } + + public void setReduceSinkKeyColumnNames(String[] reduceSinkKeyColumnNames) { + this.reduceSinkKeyColumnNames = reduceSinkKeyColumnNames; + } + public TypeInfo[] getReduceSinkKeyTypeInfos() { return reduceSinkKeyTypeInfos; } @@ -97,6 +109,14 @@ public void setReduceSinkValueColumnMap(int[] reduceSinkValueColumnMap) { this.reduceSinkValueColumnMap = reduceSinkValueColumnMap; } + public String[] getReduceSinkValueColumnNames() { + return reduceSinkValueColumnNames; + } + + public void setReduceSinkValueColumnNames(String[] reduceSinkValueColumnNames) { + this.reduceSinkValueColumnNames = reduceSinkValueColumnNames; + } + public TypeInfo[] getReduceSinkValueTypeInfos() { return reduceSinkValueTypeInfos; } diff --git ql/src/test/org/apache/hadoop/hive/ql/exec/persistence/TestBytesBytesMultiHashMap.java ql/src/test/org/apache/hadoop/hive/ql/exec/persistence/TestBytesBytesMultiHashMap.java index c1d7c72..ae4ca2b 100644 --- ql/src/test/org/apache/hadoop/hive/ql/exec/persistence/TestBytesBytesMultiHashMap.java +++ ql/src/test/org/apache/hadoop/hive/ql/exec/persistence/TestBytesBytesMultiHashMap.java @@ -25,11 +25,16 @@ import java.util.List; import java.util.Random; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashMapResult; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableManage.KeyValuePutWriter; import org.apache.hadoop.hive.serde2.ByteStream.RandomAccessOutput; import org.apache.hadoop.hive.serde2.lazybinary.LazyBinaryUtils; import org.apache.hadoop.hive.serde2.SerDeException; import org.apache.hadoop.hive.serde2.WriteBuffers; +import org.apache.hadoop.io.Writable; +import org.apache.hive.common.util.HashCodeUtil; import org.junit.Test; + import static org.junit.Assert.*; public class TestBytesBytesMultiHashMap { @@ -52,10 +57,10 @@ public void testCapacityValidation() { public void testPutGetOne() throws Exception { BytesBytesMultiHashMap map = new BytesBytesMultiHashMap(CAPACITY, LOAD_FACTOR, WB_SIZE); RandomKvSource kv = new RandomKvSource(0, 0); - map.put(kv, -1); + map.put(kv); verifyHashMapResult(map, kv.getLastKey(), kv.getLastValue()); kv = new RandomKvSource(10, 100); - map.put(kv, -1); + map.put(kv); verifyHashMapResult(map, kv.getLastKey(), kv.getLastValue()); } @@ -63,12 +68,12 @@ public void testPutGetOne() throws Exception { public void testPutGetMultiple() 
throws Exception { BytesBytesMultiHashMap map = new BytesBytesMultiHashMap(CAPACITY, LOAD_FACTOR, WB_SIZE); RandomKvSource kv = new RandomKvSource(0, 100); - map.put(kv, -1); + map.put(kv); verifyHashMapResult(map, kv.getLastKey(), kv.getLastValue()); FixedKeyKvSource kv2 = new FixedKeyKvSource(kv.getLastKey(), 0, 100); kv2.values.add(kv.getLastValue()); for (int i = 0; i < 3; ++i) { - map.put(kv2, -1); + map.put(kv2); verifyHashMapResult(map, kv2.key, kv2.values.toArray(new byte[kv2.values.size()][])); } } @@ -76,17 +81,21 @@ public void testPutGetMultiple() throws Exception { @Test public void testGetNonExistent() throws Exception { BytesBytesMultiHashMap map = new BytesBytesMultiHashMap(CAPACITY, LOAD_FACTOR, WB_SIZE); + BytesBytesMultiHashMapFactory factory = new BytesBytesMultiHashMapFactory(); RandomKvSource kv = new RandomKvSource(1, 100); - map.put(kv, -1); + map.put(kv); byte[] key = kv.getLastKey(); key[0] = (byte)(key[0] + 1); FixedKeyKvSource kv2 = new FixedKeyKvSource(kv.getLastKey(), 0, 100); - map.put(kv2, -1); + map.put(kv2); key[0] = (byte)(key[0] + 1); - BytesBytesMultiHashMap.Result hashMapResult = new BytesBytesMultiHashMap.Result(); - map.getValueResult(key, 0, key.length, hashMapResult); + MapJoinHashMapResult hashMapResult = factory.createHashMapResult(); + int hashCode = HashCodeUtil.murmurHash(key, 0, key.length); + map.hashMapLookup(key, 0, key.length, hashCode, hashMapResult); assertTrue(!hashMapResult.hasRows()); - map.getValueResult(key, 0, 0, hashMapResult); + + hashCode = HashCodeUtil.murmurHash(key, 0, 0); + map.hashMapLookup(key, 0, 0, hashCode, hashMapResult); assertTrue(!hashMapResult.hasRows()); } @@ -94,17 +103,20 @@ public void testGetNonExistent() throws Exception { public void testPutWithFullMap() throws Exception { // Make sure the map does not expand; should be able to find space. BytesBytesMultiHashMap map = new BytesBytesMultiHashMap(CAPACITY, 1f, WB_SIZE); + BytesBytesMultiHashMapFactory factory = new BytesBytesMultiHashMapFactory(); UniqueKeysKvSource kv = new UniqueKeysKvSource(); for (int i = 0; i < CAPACITY; ++i) { - map.put(kv, -1); + map.put(kv); } for (int i = 0; i < kv.keys.size(); ++i) { verifyHashMapResult(map, kv.keys.get(i), kv.values.get(i)); } assertEquals(CAPACITY, map.getCapacity()); // Get of non-existent key should terminate.. - BytesBytesMultiHashMap.Result hashMapResult = new BytesBytesMultiHashMap.Result(); - map.getValueResult(new byte[0], 0, 0, hashMapResult); + MapJoinHashMapResult hashMapResult = factory.createHashMapResult(); + byte[] key = new byte[0]; + int hashCode = HashCodeUtil.murmurHash(key, 0, 0); + map.hashMapLookup(key, 0, 0, hashCode, hashMapResult); } @Test @@ -113,7 +125,7 @@ public void testExpand() throws Exception { BytesBytesMultiHashMap map = new BytesBytesMultiHashMap(1, 0.0000001f, WB_SIZE); UniqueKeysKvSource kv = new UniqueKeysKvSource(); for (int i = 0; i < 18; ++i) { - map.put(kv, -1); + map.put(kv); for (int j = 0; j <= i; ++j) { verifyHashMapResult(map, kv.keys.get(j), kv.values.get(j)); } @@ -121,9 +133,14 @@ public void testExpand() throws Exception { assertEquals(1 << 18, map.getCapacity()); } - private void verifyHashMapResult(BytesBytesMultiHashMap map, byte[] key, byte[]... values) { - BytesBytesMultiHashMap.Result hashMapResult = new BytesBytesMultiHashMap.Result(); - byte state = map.getValueResult(key, 0, key.length, hashMapResult); + private void verifyHashMapResult(BytesBytesMultiHashMap map, byte[] key, byte[]... 
values) + throws IOException { + BytesBytesMultiHashMapFactory factory = new BytesBytesMultiHashMapFactory(); + MapJoinHashMapResult hashMapResult = factory.createHashMapResult(); + + int hashCode = HashCodeUtil.murmurHash(key, 0, key.length); + map.hashMapLookup(key, 0, key.length, hashCode, hashMapResult); + byte state = hashMapResult.aliasFilter(); HashSet hs = new HashSet(); int count = 0; if (hashMapResult.hasRows()) { @@ -195,7 +212,7 @@ public void writeValue(RandomAccessOutput dest) throws SerDeException { } } - private static class RandomKvSource implements BytesBytesMultiHashMap.KvSource { + private static class RandomKvSource implements KeyValuePutWriter { private int minLength, maxLength; private final Random rdm = new Random(43); public List keys = new ArrayList(), values = new ArrayList(); @@ -239,5 +256,26 @@ public void writeValue(RandomAccessOutput dest) throws SerDeException { public byte updateStateByte(Byte previousValue) { return (byte)(previousValue == null ? 1 : previousValue + 1); } + + @Override + public void setKeyValue(Writable key, Writable value) + throws SerDeException, IOException { + throw new RuntimeException("Not used"); + } + + @Override + public boolean hasHashCode() { + return false; + } + + @Override + public int getKeyHashCode() throws SerDeException { + return 0; + } + + @Override + public long getLongKey() { + throw new RuntimeException("Not used"); + } } } diff --git ql/src/test/org/apache/hadoop/hive/ql/exec/persistence/TestHashPartition.java ql/src/test/org/apache/hadoop/hive/ql/exec/persistence/TestHashPartition.java index efabd2b..66976e9 100644 --- ql/src/test/org/apache/hadoop/hive/ql/exec/persistence/TestHashPartition.java +++ ql/src/test/org/apache/hadoop/hive/ql/exec/persistence/TestHashPartition.java @@ -24,7 +24,8 @@ @Test public void testHashPartition() throws Exception { - // TODO: wtf? - HashPartition hashPartition = new HashPartition(1024, (float) 0.75, 524288, 1, true, null); + // TODO + HashPartition hashPartition = new HashPartition(1024, (float) 0.75, 524288, 1, true, + null, new BytesBytesMultiHashMapFactory()); } } diff --git ql/src/test/org/apache/hadoop/hive/ql/exec/vector/RandomRowUtil.java ql/src/test/org/apache/hadoop/hive/ql/exec/vector/RandomRowUtil.java new file mode 100644 index 0000000..fa4faeb --- /dev/null +++ ql/src/test/org/apache/hadoop/hive/ql/exec/vector/RandomRowUtil.java @@ -0,0 +1,242 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.hive.ql.exec.vector; + +import java.io.IOException; +import java.sql.Date; +import java.sql.Timestamp; +import java.util.Random; + +import org.apache.hadoop.hive.common.type.HiveChar; +import org.apache.hadoop.hive.common.type.HiveDecimal; +import org.apache.hadoop.hive.common.type.HiveIntervalDayTime; +import org.apache.hadoop.hive.common.type.HiveIntervalYearMonth; +import org.apache.hadoop.hive.common.type.HiveVarchar; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.serde2.ByteStream.Output; +import org.apache.hadoop.hive.serde2.fast.SerializeWrite; +import org.apache.hadoop.hive.serde2.io.ByteWritable; +import org.apache.hadoop.hive.serde2.io.DateWritable; +import org.apache.hadoop.hive.serde2.io.DoubleWritable; +import org.apache.hadoop.hive.serde2.io.HiveCharWritable; +import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable; +import org.apache.hadoop.hive.serde2.io.HiveIntervalDayTimeWritable; +import org.apache.hadoop.hive.serde2.io.HiveIntervalYearMonthWritable; +import org.apache.hadoop.hive.serde2.io.HiveVarcharWritable; +import org.apache.hadoop.hive.serde2.io.ShortWritable; +import org.apache.hadoop.hive.serde2.io.TimestampWritable; +import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory; +import org.apache.hadoop.hive.serde2.typeinfo.DecimalTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; +import org.apache.hadoop.io.BooleanWritable; +import org.apache.hadoop.io.BytesWritable; +import org.apache.hadoop.io.FloatWritable; +import org.apache.hadoop.io.IntWritable; +import org.apache.hadoop.io.LongWritable; +import org.apache.hadoop.io.Text; + +/** + * Generate object inspector and random row object[]. 
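+ * + * Provides helpers to serialize a row with a given SerializeWrite implementation and to inject + * random nulls, duplicate value runs, selection vectors, and per-row null flags into test batches.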
+ */ +public class RandomRowUtil { + + public static Output serializeRow(Object[] row, VectorRandomRowSource source, + SerializeWrite serializeWrite) throws HiveException, IOException { + Output output = new Output(); + serializeWrite.set(output); + PrimitiveTypeInfo[] primitiveTypeInfos = source.primitiveTypeInfos(); + for (int i = 0; i < primitiveTypeInfos.length; i++) { + Object object = row[i]; + if (object == null) { + serializeWrite.writeNull(); + continue; + } + PrimitiveCategory primitiveCategory = primitiveTypeInfos[i].getPrimitiveCategory(); + switch (primitiveCategory) { + case BOOLEAN: + { + BooleanWritable expectedWritable = (BooleanWritable) object; + boolean value = expectedWritable.get(); + serializeWrite.writeBoolean(value); + } + break; + case BYTE: + { + ByteWritable expectedWritable = (ByteWritable) object; + byte value = expectedWritable.get(); + serializeWrite.writeByte(value); + } + break; + case SHORT: + { + ShortWritable expectedWritable = (ShortWritable) object; + short value = expectedWritable.get(); + serializeWrite.writeShort(value); + } + break; + case INT: + { + IntWritable expectedWritable = (IntWritable) object; + int value = expectedWritable.get(); + serializeWrite.writeInt(value); + } + break; + case LONG: + { + LongWritable expectedWritable = (LongWritable) object; + long value = expectedWritable.get(); + serializeWrite.writeLong(value); + } + break; + case DATE: + { + DateWritable expectedWritable = (DateWritable) object; + Date value = expectedWritable.get(); + serializeWrite.writeDate(value); + } + break; + case FLOAT: + { + FloatWritable expectedWritable = (FloatWritable) object; + float value = expectedWritable.get(); + serializeWrite.writeFloat(value); + } + break; + case DOUBLE: + { + DoubleWritable expectedWritable = (DoubleWritable) object; + double value = expectedWritable.get(); + serializeWrite.writeDouble(value); + } + break; + case STRING: + { + Text text = (Text) object; + serializeWrite.writeString(text.getBytes(), 0, text.getLength()); + } + break; + case CHAR: + { + HiveCharWritable expectedWritable = (HiveCharWritable) object; + HiveChar value = expectedWritable.getHiveChar(); + serializeWrite.writeHiveChar(value); + } + break; + case VARCHAR: + { + HiveVarcharWritable expectedWritable = (HiveVarcharWritable) object; + HiveVarchar value = expectedWritable.getHiveVarchar(); + serializeWrite.writeHiveVarchar(value); + } + break; + case BINARY: + { + BytesWritable expectedWritable = (BytesWritable) object; + byte[] bytes = expectedWritable.getBytes(); + int length = expectedWritable.getLength(); + serializeWrite.writeBinary(bytes, 0, length); + } + break; + case TIMESTAMP: + { + TimestampWritable expectedWritable = (TimestampWritable) object; + Timestamp value = expectedWritable.getTimestamp(); + serializeWrite.writeTimestamp(value); + } + break; + case INTERVAL_YEAR_MONTH: + { + HiveIntervalYearMonthWritable expectedWritable = (HiveIntervalYearMonthWritable) object; + HiveIntervalYearMonth value = expectedWritable.getHiveIntervalYearMonth(); + serializeWrite.writeHiveIntervalYearMonth(value); + } + break; + case INTERVAL_DAY_TIME: + { + HiveIntervalDayTimeWritable expectedWritable = (HiveIntervalDayTimeWritable) object; + HiveIntervalDayTime value = expectedWritable.getHiveIntervalDayTime(); + serializeWrite.writeHiveIntervalDayTime(value); + } + break; + case DECIMAL: + { + HiveDecimalWritable expectedWritable = (HiveDecimalWritable) object; + HiveDecimal value = expectedWritable.getHiveDecimal(); + 
serializeWrite.writeHiveDecimal(value, ((DecimalTypeInfo)primitiveTypeInfos[i]).scale()); + } + break; + default: + throw new HiveException("Unexpected primitive category " + primitiveCategory); + } + } + return output; + } + + public static void addRandomNulls(Random rand, Object[][] randomRows) { + int randomRowsLength = randomRows.length; + int columnCount = randomRows[0].length; + int randomPasses = (1 + rand.nextInt(randomRowsLength)) * (1 + rand.nextInt(columnCount)); + for (int p = 0; p < randomPasses; p++) { + int column = rand.nextInt(columnCount); + int row = rand.nextInt(randomRowsLength); + randomRows[row][column] = null; + } + } + + public static void addRandomDuplicates(Random rand, Object[][] randomRows) { + int randomRowsLength = randomRows.length; + int columnCount = randomRows[0].length; + int randomPasses = (1 + rand.nextInt(randomRowsLength)) * (1 + rand.nextInt(columnCount)); + for (int p = 0; p < randomPasses; p++) { + int column = rand.nextInt(columnCount); + int row = rand.nextInt(randomRowsLength); + int duplicateCount = 1 + rand.nextInt(20); + Object duplicateObject = randomRows[row][column]; + for (int d = 1; d < duplicateCount; d++) { + int duplicateRow = row + d; + if (duplicateRow >= randomRowsLength) { + break; + } + randomRows[duplicateRow][column] = duplicateObject; + } + } + } + + public static void addRandomSelectToBatch(Random rand, VectorizedRowBatch batch) { + int factor = rand.nextInt(4); + batch.selectedInUse = true; + int numSel = 0; + for (int i = 0; i < batch.size; i++) { + if (rand.nextInt(4) >= factor) { + batch.selected[numSel++] = i; + } + } + batch.size = numSel; + } + + public static void addRandomNullsToColumnVector(Random rand, ColumnVector colVector, int batchSize) { + int factor = rand.nextInt(4); + for (int i = 0; i < batchSize; i++) { + if (rand.nextInt(4) >= factor) { + colVector.noNulls = false; + colVector.isNull[i] = true; + } + } + } +} diff --git ql/src/test/org/apache/hadoop/hive/ql/exec/vector/TestVectorSerDeRow.java ql/src/test/org/apache/hadoop/hive/ql/exec/vector/TestVectorSerDeRow.java index c6704f9..82e75bf 100644 --- ql/src/test/org/apache/hadoop/hive/ql/exec/vector/TestVectorSerDeRow.java +++ ql/src/test/org/apache/hadoop/hive/ql/exec/vector/TestVectorSerDeRow.java @@ -84,7 +84,7 @@ } void deserializeAndVerify(Output output, DeserializeRead deserializeRead, - VectorRandomRowSource source, Object[] expectedRow) + VectorRandomRowSource source, Object[] expectedRow) throws HiveException, IOException { deserializeRead.set(output.getData(), 0, output.getLength()); PrimitiveCategory[] primitiveCategories = source.primitiveCategories(); @@ -595,7 +595,7 @@ void testVectorDeserializeRow(int caseNum, Random r, SerializationType serializa for (int i = 0; i < randomRows.length; i++) { Object[] row = randomRows[i]; - Output output = serializeRow(row, source, serializeWrite); + Output output = RandomRowUtil.serializeRow(row, source, serializeWrite); vectorDeserializeRow.setBytes(output.getData(), 0, output.getLength()); vectorDeserializeRow.deserialize(batch, batch.size); batch.size++; diff --git ql/src/test/org/apache/hadoop/hive/ql/exec/vector/batchgen/TestVectorBatchGenerate.java ql/src/test/org/apache/hadoop/hive/ql/exec/vector/batchgen/TestVectorBatchGenerate.java new file mode 100644 index 0000000..35199d4 --- /dev/null +++ ql/src/test/org/apache/hadoop/hive/ql/exec/vector/batchgen/TestVectorBatchGenerate.java @@ -0,0 +1,57 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license 
agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.vector.batchgen; + +import org.apache.hadoop.hive.ql.exec.vector.VectorBatchDebug; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; +import org.apache.hadoop.hive.ql.exec.vector.batchgen.VectorBatchGenerator; +import org.apache.hadoop.hive.ql.exec.vector.batchgen.VectorBatchGenerator.GenerateType; +import org.apache.hadoop.hive.ql.exec.vector.batchgen.VectorBatchGenerator.GenerateType.Category; +import org.junit.Test; + +import java.util.Random; + +public class TestVectorBatchGenerate { + + @Test + public void testTryIt() throws Exception { + GenerateType[] generateTypes = + new GenerateType[] {new GenerateType(Category.INT), new GenerateType(Category.BYTE)}; + VectorBatchGenerator generator = new VectorBatchGenerator(generateTypes); + + VectorizedRowBatch batch = generator.createBatch(); + + Random random = new Random(); + generator.generateBatch(batch, random, VectorizedRowBatch.DEFAULT_SIZE); + VectorBatchDebug.debugDisplayBatch(batch, "testTryIt"); + } + + @Test + public void testTryIt2() throws Exception { + GenerateType[] generateTypes = + new GenerateType[] {new GenerateType(Category.BOOLEAN), new GenerateType(Category.LONG), new GenerateType(Category.DOUBLE)}; + VectorBatchGenerator generator = new VectorBatchGenerator(generateTypes); + + VectorizedRowBatch batch = generator.createBatch(); + + Random random = new Random(); + generator.generateBatch(batch, random, VectorizedRowBatch.DEFAULT_SIZE); + VectorBatchDebug.debugDisplayBatch(batch, "testTryIt2"); + } +} \ No newline at end of file diff --git ql/src/test/org/apache/hadoop/hive/ql/exec/vector/batchgen/VectorBatchGenerator.java ql/src/test/org/apache/hadoop/hive/ql/exec/vector/batchgen/VectorBatchGenerator.java new file mode 100644 index 0000000..63e36c1 --- /dev/null +++ ql/src/test/org/apache/hadoop/hive/ql/exec/vector/batchgen/VectorBatchGenerator.java @@ -0,0 +1,269 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.hive.ql.exec.vector.batchgen; + +import java.util.Arrays; +import java.util.Random; + +import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.ColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; + +import com.google.common.base.Preconditions; + +public class VectorBatchGenerator { + + public static class GenerateType { + + // UNDONE: Missing date/time interval data types + public enum Category { + BOOLEAN("boolean", true), + BYTE("tinyint", true), + SHORT("smallint", true), + INT("int", true), + LONG("bigint", true), + FLOAT("float", true), + DOUBLE("double", true), + STRING("string", true), + DATE("date", true), + TIMESTAMP("timestamp", true), + BINARY("binary", true), + DECIMAL("decimal", true), + VARCHAR("varchar", true), + CHAR("char", true), + LIST("array", false), + MAP("map", false), + STRUCT("struct", false), + UNION("uniontype", false); + + Category(String name, boolean isPrimitive) { + this.name = name; + this.isPrimitive = isPrimitive; + } + + final boolean isPrimitive; + final String name; + + public boolean isPrimitive() { + return isPrimitive; + } + + public String getName() { + return name; + } + } + + private Category category; + + public GenerateType(Category category) { + this.category = category; + } + + public Category getCategory() { + return category; + } + + /* + * BOOLEAN .. LONG: Min and max. + */ + private long integerMin; + private long integerMax; + + /* + * FLOAT: Min and max. + */ + private float floatMin; + private float floatMax; + + /* + * DOUBLE: Min and max. + */ + private double doubleMin; + private double doubleMax; + + /* + * STRING: + * Range, values, empty strings. + */ + + /* + * CHAR: strategic blanks, string length beyond max + */ + + /* + * VARCHAR: string length beyond max + */ + } + + private VectorColumnGroupGenerator[] columnGroups; + private boolean[] isGenerateSeries; + + public VectorBatchGenerator(GenerateType[] generateTypes) { + final int size = generateTypes.length; + columnGroups = new VectorColumnGroupGenerator[size]; + for (int i = 0; i < size; i++) { + columnGroups[i] = new VectorColumnGroupGenerator(i, generateTypes[i]); + } + isGenerateSeries = new boolean[size]; + // UNDONE: For now, all... + Arrays.fill(isGenerateSeries, true); + } + + public VectorBatchGenerator(VectorColumnGroupGenerator[] columnGroups) { + this.columnGroups = columnGroups; + } + + public void assignColumnVectors(VectorizedRowBatch batch, int columnNum, + VectorColumnGroupGenerator columnGroup) { + // UNDONE: Multiple types... 
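+ // Only the first GenerateType of the column group determines which ColumnVector class is + // allocated: BOOLEAN through LONG use LongColumnVector, FLOAT and DOUBLE use DoubleColumnVector, + // and STRING uses BytesColumnVector.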
+ GenerateType[] generateTypes = columnGroup.generateTypes(); + GenerateType generateType = generateTypes[0]; + ColumnVector colVector; + switch (generateType.getCategory()) { + case BOOLEAN: + case BYTE: + case SHORT: + case INT: + case LONG: + colVector = new LongColumnVector(); + break; + + case FLOAT: + case DOUBLE: + colVector = new DoubleColumnVector(); + break; + + case STRING: + colVector = new BytesColumnVector(); + break; + + // UNDONE + case DATE: + case TIMESTAMP: + case BINARY: + case DECIMAL: + case VARCHAR: + case CHAR: + case LIST: + case MAP: + case STRUCT: + case UNION: + default: + throw new RuntimeException("Unsupported catagory " + generateType.getCategory()); + } + colVector.init(); + batch.cols[columnNum] = colVector; + } + + public VectorizedRowBatch createBatch() { + final int size = columnGroups.length; + VectorizedRowBatch batch = new VectorizedRowBatch(size); + for (int i = 0; i < size; i++) { + assignColumnVectors(batch, i, columnGroups[i]); + } + return batch; + } + + public void generateBatch(VectorizedRowBatch batch, Random random, + int size) { + + // Clear value arrays. + for (int c = 0; c < columnGroups.length; c++) { + columnGroups[c].clearColumnValueArrays(); + } + + // Generate row values. + int i = 0; + while (true) { + for (int c = 0; c < columnGroups.length; c++) { + columnGroups[c].generateRowValues(i, random); + } + if (i + 1 >= size) { + break; + } + + // Null out some row column entries. + // UNDONE + + // Consider generating a column group equal value series? + if (i < size - 1) { + for (int c = 0; c < columnGroups.length; c++) { + if (isGenerateSeries[c]) { + int seriesCount = getSeriesCount(random); + if (seriesCount == 1) { + continue; + } + seriesCount = Math.min(seriesCount, size - i); + Preconditions.checkState(seriesCount > 1); + + // Fill values down for equal value series. + VectorColumnGroupGenerator columnGroup = columnGroups[c]; + columnGroup.fillDownRowValues(i, seriesCount, random); + + // For all the other column groups, generate new values down. + for (int other = 0; other < columnGroups.length; other++) { + if (other != c) { + VectorColumnGroupGenerator otherColumnGroup = columnGroups[other]; + otherColumnGroup.generateDownRowValues(i, seriesCount, random); + + // Also, null down. + // UNDONE + } + } + + // Fill down null flags. + // UNDONE + + i += (seriesCount - 1); + break; + } + } + } + // Recheck. + i++; + if (i >= size) { + break; + } + } + + // Optionally, do some filtering of rows... + // UNDONE + + // From the value arrays and our isRepeated, selected, isNull arrays, generate the batch! + for (int c = 0; c < columnGroups.length; c++) { + VectorColumnGroupGenerator columnGroup = columnGroups[c]; + + // UNDONE: Provide isRepeated, selected, isNull + columnGroup.populateBatch(batch, size, false); + } + + batch.size = size; + } + + private int getSeriesCount(Random random) { + // UNDONE: For now... + if (random.nextBoolean()) { + return 1; + } else { + return 1 + random.nextInt(10); + } + } +} \ No newline at end of file diff --git ql/src/test/org/apache/hadoop/hive/ql/exec/vector/batchgen/VectorColumnGroupGenerator.java ql/src/test/org/apache/hadoop/hive/ql/exec/vector/batchgen/VectorColumnGroupGenerator.java new file mode 100644 index 0000000..9421e8d --- /dev/null +++ ql/src/test/org/apache/hadoop/hive/ql/exec/vector/batchgen/VectorColumnGroupGenerator.java @@ -0,0 +1,492 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.vector.batchgen; + +import java.sql.Timestamp; +import java.util.Arrays; +import java.util.Random; + +import org.apache.hadoop.hive.common.type.RandomTypeUtil; +import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.ColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; +import org.apache.hadoop.hive.ql.exec.vector.batchgen.VectorBatchGenerator.GenerateType; +import org.apache.hadoop.hive.ql.exec.vector.batchgen.VectorBatchGenerator.GenerateType.Category; +import org.apache.hadoop.io.BooleanWritable; +import org.apache.hadoop.io.Text; + +public class VectorColumnGroupGenerator { + + private GenerateType[] generateTypes; + private int[] columnNums; + private Object[] arrays; + + public VectorColumnGroupGenerator(int columnNum, GenerateType generateType) { + columnNums = new int[] {columnNum}; + generateTypes = new GenerateType[] {generateType}; + allocateArrays(VectorizedRowBatch.DEFAULT_SIZE); + } + + public VectorColumnGroupGenerator(int startColumnNum, GenerateType[] generateTypes) { + columnNums = new int[generateTypes.length]; + for (int i = 0; i < generateTypes.length; i++) { + columnNums[i] = startColumnNum + i; + } + this.generateTypes = generateTypes; + allocateArrays(VectorizedRowBatch.DEFAULT_SIZE); + } + + public GenerateType[] generateTypes() { + return generateTypes; + } + + private void allocateArrays(int size) { + arrays = new Object[generateTypes.length]; + for (int i = 0; i < generateTypes.length; i++) { + GenerateType generateType = generateTypes[i]; + Category category = generateType.getCategory(); + Object array = null; + switch (category) { + case BOOLEAN: + array = new boolean[size]; + break; + case BYTE: + array = new byte[size]; + break; + case SHORT: + array = new short[size]; + break; + case INT: + array = new int[size]; + break; + case LONG: + array = new long[size]; + break; + case FLOAT: + array = new float[size]; + break; + case DOUBLE: + array = new double[size]; + break; + case STRING: + array = new String[size]; + break; + case TIMESTAMP: + array = new Timestamp[size]; + break; + + // UNDONE + case DATE: + case BINARY: + case DECIMAL: + case VARCHAR: + case CHAR: + + case LIST: + case MAP: + case STRUCT: + case UNION: + default: + } + arrays[i] = array; + } + } + + public void clearColumnValueArrays() { + for (int i = 0; i < generateTypes.length; i++) { + GenerateType generateType = generateTypes[i]; + Category category = generateType.getCategory(); + Object array = arrays[i]; + switch (category) { + case BOOLEAN: + Arrays.fill(((boolean[]) array), 
false); + break; + case BYTE: + Arrays.fill(((byte[]) array), (byte) 0); + break; + case SHORT: + Arrays.fill(((short[]) array), (short) 0); + break; + case INT: + Arrays.fill(((int[]) array), 0); + break; + case LONG: + Arrays.fill(((long[]) array), 0); + break; + case FLOAT: + Arrays.fill(((float[]) array), 0); + break; + case DOUBLE: + Arrays.fill(((double[]) array), 0); + break; + case STRING: + Arrays.fill(((String[]) array), null); + break; + case TIMESTAMP: + Arrays.fill(((Timestamp[]) array), null); + break; + + // UNDONE + case DATE: + case BINARY: + case DECIMAL: + case VARCHAR: + case CHAR: + + case LIST: + case MAP: + case STRUCT: + case UNION: + default: + } + } + } + + public void generateRowValues(int rowIndex, Random random) { + for (int i = 0; i < generateTypes.length; i++) { + generateRowColumnValue(rowIndex, i, random); + } + } + + private void generateRowColumnValue(int rowIndex, int columnIndex, Random random) { + GenerateType generateType = generateTypes[columnIndex]; + Category category = generateType.getCategory(); + Object array = arrays[columnIndex]; + switch (category) { + case BOOLEAN: + { + boolean value = random.nextBoolean(); + ((boolean[]) array)[rowIndex] = value; + } + break; + case BYTE: + { + byte value = + (byte) + (random.nextBoolean() ? + -random.nextInt(-((int) Byte.MIN_VALUE) + 1) : + random.nextInt((int) Byte.MAX_VALUE + 1)); + ((byte[]) array)[rowIndex] = value; + } + break; + case SHORT: + { + short value = + (short) + (random.nextBoolean() ? + -random.nextInt(-((int) Short.MIN_VALUE) + 1) : + random.nextInt((int) Short.MAX_VALUE + 1)); + ((short[]) array)[rowIndex] = value; + } + break; + case INT: + { + int value = random.nextInt(); + ((int[]) array)[rowIndex] = value; + } + break; + case LONG: + { + long value = random.nextLong(); + ((long[]) array)[rowIndex] = value; + } + break; + case FLOAT: + { + float value = random.nextLong(); + ((float[]) array)[rowIndex] = value; + } + break; + case DOUBLE: + { + double value = random.nextLong(); + ((double[]) array)[rowIndex] = value; + } + break; + + case STRING: + { + String value = RandomTypeUtil.getRandString(random); + ((String[]) array)[rowIndex] = value; + } + break; + + case TIMESTAMP: + { + Timestamp value = RandomTypeUtil.getRandTimestamp(random); + ((Timestamp[]) array)[rowIndex] = value; + } + break; + + // UNDONE + case DATE: + case BINARY: + case DECIMAL: + case VARCHAR: + case CHAR: + + case LIST: + case MAP: + case STRUCT: + case UNION: + default: + } + } + + public void fillDownRowValues(int rowIndex, int seriesCount, Random random) { + for (int i = 0; i < generateTypes.length; i++) { + fillDownRowColumnValue(rowIndex, i, seriesCount, random); + } + } + + private void fillDownRowColumnValue(int rowIndex, int columnIndex, int seriesCount, Random random) { + GenerateType generateType = generateTypes[columnIndex]; + Category category = generateType.getCategory(); + Object array = arrays[columnIndex]; + switch (category) { + case BOOLEAN: + { + boolean[] booleanArray = ((boolean[]) array); + boolean value = booleanArray[rowIndex]; + for (int i = 1; i < seriesCount; i++) { + booleanArray[rowIndex + i] = value; + } + } + break; + case BYTE: + { + byte[] byteArray = ((byte[]) array); + byte value = byteArray[rowIndex]; + for (int i = 1; i < seriesCount; i++) { + byteArray[rowIndex + i] = value; + } + } + break; + case SHORT: + { + short[] shortArray = ((short[]) array); + short value = shortArray[rowIndex]; + for (int i = 1; i < seriesCount; i++) { + shortArray[rowIndex + i] = value; + } + } 
+ break; + case INT: + { + int[] intArray = ((int[]) array); + int value = intArray[rowIndex]; + for (int i = 1; i < seriesCount; i++) { + intArray[rowIndex + i] = value; + } + } + break; + case LONG: + { + long[] longArray = ((long[]) array); + long value = longArray[rowIndex]; + for (int i = 1; i < seriesCount; i++) { + longArray[rowIndex + i] = value; + } + } + break; + case FLOAT: + { + float[] floatArray = ((float[]) array); + float value = floatArray[rowIndex]; + for (int i = 1; i < seriesCount; i++) { + floatArray[rowIndex + i] = value; + } + } + break; + case DOUBLE: + { + double[] doubleArray = ((double[]) array); + double value = doubleArray[rowIndex]; + for (int i = 1; i < seriesCount; i++) { + doubleArray[rowIndex + i] = value; + } + } + break; + case STRING: + { + String[] stringArray = ((String[]) array); + String value = stringArray[rowIndex]; + for (int i = 1; i < seriesCount; i++) { + stringArray[rowIndex + i] = value; + } + } + break; + case TIMESTAMP: + { + Timestamp[] timestampArray = ((Timestamp[]) array); + Timestamp value = timestampArray[rowIndex]; + for (int i = 1; i < seriesCount; i++) { + timestampArray[rowIndex + i] = value; + } + } + break; + + // UNDONE + case DATE: + + case BINARY: + case DECIMAL: + case VARCHAR: + case CHAR: + + case LIST: + case MAP: + case STRUCT: + case UNION: + default: + } + } + + public void generateDownRowValues(int rowIndex, int seriesCount, Random random) { + for (int i = 0; i < generateTypes.length; i++) { + for (int g = 1; g < seriesCount; g++) { + generateRowColumnValue(rowIndex + g, i, random); + } + } + } + + public void populateBatch(VectorizedRowBatch batch, int size, boolean isRepeated) { + + // UNDONE: Haven't finished isRepeated + assert !isRepeated; + + for (int i = 0; i < size; i++) { + for (int g = 0; g < generateTypes.length; g++) { + populateBatchColumn(batch, g, size); + } + } + } + + private void populateBatchColumn(VectorizedRowBatch batch, int logicalColumnIndex, int size) { + int columnNum = columnNums[logicalColumnIndex]; + ColumnVector colVector = batch.cols[columnNum]; + + GenerateType generateType = generateTypes[logicalColumnIndex]; + Category category = generateType.getCategory(); + Object array = arrays[logicalColumnIndex]; + switch (category) { + case BOOLEAN: + { + boolean[] booleanArray = ((boolean[]) array); + long[] vector = ((LongColumnVector) colVector).vector; + for (int i = 0; i < size; i++) { + vector[i] = (booleanArray[i] ? 
1 : 0); + } + } + break; + case BYTE: + { + byte[] byteArray = ((byte[]) array); + long[] vector = ((LongColumnVector) colVector).vector; + for (int i = 0; i < size; i++) { + vector[i] = byteArray[i]; + } + } + break; + case SHORT: + { + short[] shortArray = ((short[]) array); + long[] vector = ((LongColumnVector) colVector).vector; + for (int i = 0; i < size; i++) { + vector[i] = shortArray[i]; + } + } + break; + case INT: + { + int[] intArray = ((int[]) array); + long[] vector = ((LongColumnVector) colVector).vector; + for (int i = 0; i < size; i++) { + vector[i] = intArray[i]; + } + } + break; + case LONG: + { + long[] longArray = ((long[]) array); + long[] vector = ((LongColumnVector) colVector).vector; + for (int i = 0; i < size; i++) { + vector[i] = longArray[i]; + } + } + break; + case FLOAT: + { + float[] floatArray = ((float[]) array); + double[] vector = ((DoubleColumnVector) colVector).vector; + for (int i = 0; i < size; i++) { + vector[i] = floatArray[i]; + } + } + break; + case DOUBLE: + { + double[] doubleArray = ((double[]) array); + double[] vector = ((DoubleColumnVector) colVector).vector; + for (int i = 0; i < size; i++) { + vector[i] = doubleArray[i]; + } + } + break; + case STRING: + { + String[] stringArray = ((String[]) array); + BytesColumnVector bytesColVec = ((BytesColumnVector) colVector); + for (int i = 0; i < size; i++) { + byte[] bytes = stringArray[i].getBytes(); + bytesColVec.setVal(i, bytes); + } + } + break; + case TIMESTAMP: + { + Timestamp[] timestampArray = ((Timestamp[]) array); + TimestampColumnVector timestampColVec = ((TimestampColumnVector) colVector); + for (int i = 0; i < size; i++) { + Timestamp timestamp = timestampArray[i]; + timestampColVec.set(i, timestamp); + } + } + break; + + // UNDONE + + case DATE: + + case BINARY: + case DECIMAL: + case VARCHAR: + case CHAR: + + case LIST: + case MAP: + case STRUCT: + case UNION: + default: + } + } +} \ No newline at end of file diff --git ql/src/test/org/apache/hadoop/hive/ql/exec/vector/keyseries/TestVectorKeySeries.java ql/src/test/org/apache/hadoop/hive/ql/exec/vector/keyseries/TestVectorKeySeries.java new file mode 100644 index 0000000..162fd1b --- /dev/null +++ ql/src/test/org/apache/hadoop/hive/ql/exec/vector/keyseries/TestVectorKeySeries.java @@ -0,0 +1,303 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.hive.ql.exec.vector.keyseries; + +import java.io.IOException; +import java.util.Arrays; +import java.util.Random; + +import org.apache.hadoop.hive.common.type.RandomTypeUtil; +import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.RandomRowUtil; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; +import org.apache.hadoop.hive.ql.exec.vector.expressions.StringExpr; +import org.apache.hadoop.hive.ql.exec.vector.keyseries.VectorKeySeriesLong; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; + +import junit.framework.TestCase; + +/** + * Unit test for the vectorized key series. + */ +public class TestVectorKeySeries extends TestCase { + + private void validateKeySeriesLongArray(VectorizedRowBatch batch, long[] test) throws IOException { + VectorKeySeriesLong longKeySeries = + new VectorKeySeriesLong(0, TypeInfoFactory.longTypeInfo); + longKeySeries.processBatch(batch); + int logicalIndex = 0; + for (int k = 0; k < longKeySeries.keyCount; k++) { + assertTrue(logicalIndex == longKeySeries.currentLogical); + int duplicateCount = longKeySeries.currentDuplicateCount; + if (!longKeySeries.currentKeyIsNull) { + long currentKey = longKeySeries.getCurrentKey(); + + for (int d = 0; d < duplicateCount; d++) { + int batchIndex = batch.selectedInUse ? batch.selected[logicalIndex + d] : logicalIndex + d; + assertTrue(batch.cols[0].noNulls || !batch.cols[0].isNull[batchIndex]); + assertTrue(test[batchIndex] == currentKey); + } + } + logicalIndex += duplicateCount; + boolean isNext = longKeySeries.next(); + if (k + 1 < longKeySeries.keyCount) { + assertTrue(isNext); + } else { + assertTrue(!isNext); + } + } + assertEquals(logicalIndex, batch.size); + } + + private void validateLongArray(Random rand, long[] test) throws IOException { + VectorizedRowBatch batch = new VectorizedRowBatch(1); + + LongColumnVector longColVector = new LongColumnVector(VectorizedRowBatch.DEFAULT_SIZE); + batch.cols[0] = longColVector; + System.arraycopy(test, 0, longColVector.vector, 0, test.length); + batch.size = test.length; + + validateKeySeriesLongArray(batch, test); + int saveBatchSize = batch.size; + RandomRowUtil.addRandomSelectToBatch(rand, batch); + if (batch.size > 0) { + validateKeySeriesLongArray(batch, test); + } + batch.selectedInUse = false; + batch.size = saveBatchSize; + RandomRowUtil.addRandomNullsToColumnVector(rand, batch.cols[0], batch.size); + validateKeySeriesLongArray(batch, test); + RandomRowUtil.addRandomSelectToBatch(rand, batch); + if (batch.size > 0) { + validateKeySeriesLongArray(batch, test); + } + } + + private void validateLongRepeating(Random rand, int count, int value) throws IOException { + VectorizedRowBatch batch = new VectorizedRowBatch(1); + + LongColumnVector longColVector = new LongColumnVector(VectorizedRowBatch.DEFAULT_SIZE); + batch.cols[0] = longColVector; + longColVector.isRepeating = true; + longColVector.vector[0] = value; + batch.size = count; + long[] test = new long[count]; + Arrays.fill(test, value); + + validateKeySeriesLongArray(batch, test); + RandomRowUtil.addRandomSelectToBatch(rand, batch); + if (batch.size > 0) { + validateKeySeriesLongArray(batch, test); + } + } + public void testVectorKeySeriesLong() throws Throwable { + + Random rand = new Random(812); + + validateLongArray(rand, TestVectorKeySeriesUtil.longValues); + + 
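// Also exercise the isRepeating column vector path with single-row (1), small (20), and + // full-size (1024) batches. + 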
validateLongRepeating(rand, 1, 5); + validateLongRepeating(rand, 20, 0); + validateLongRepeating(rand, 1024, 9383844); + } + + private void validateKeySeriesDoubleArray(VectorizedRowBatch batch, double[] test) throws IOException { + VectorKeySeriesDouble doubleKeySeries = new VectorKeySeriesDouble(0, TypeInfoFactory.doubleTypeInfo); + doubleKeySeries.processBatch(batch); + int logicalIndex = 0; + for (int k = 0; k < doubleKeySeries.keyCount; k++) { + assertTrue(logicalIndex == doubleKeySeries.currentLogical); + int duplicateCount = doubleKeySeries.currentDuplicateCount; + if (!doubleKeySeries.currentKeyIsNull) { + double currentKey = doubleKeySeries.getCurrentDoubleKey(); + + for (int d = 0; d < duplicateCount; d++) { + int batchIndex = batch.selectedInUse ? batch.selected[logicalIndex + d] : logicalIndex + d; + assertTrue(batch.cols[0].noNulls || !batch.cols[0].isNull[batchIndex]); + assertTrue(test[batchIndex] == currentKey); + } + } + logicalIndex += duplicateCount; + boolean isNext = doubleKeySeries.next(); + if (k + 1 < doubleKeySeries.keyCount) { + assertTrue(isNext); + } else { + assertTrue(!isNext); + } + } + assertEquals(logicalIndex, batch.size); + } + + private void validateDoubleArray(Random rand, double[] test) throws IOException { + VectorizedRowBatch batch = new VectorizedRowBatch(1); + + DoubleColumnVector doubleColVector = new DoubleColumnVector(VectorizedRowBatch.DEFAULT_SIZE); + batch.cols[0] = doubleColVector; + System.arraycopy(test, 0, doubleColVector.vector, 0, test.length); + batch.size = test.length; + + validateKeySeriesDoubleArray(batch, test); + int saveBatchSize = batch.size; + RandomRowUtil.addRandomSelectToBatch(rand, batch); + if (batch.size > 0) { + validateKeySeriesDoubleArray(batch, test); + } + batch.selectedInUse = false; + batch.size = saveBatchSize; + RandomRowUtil.addRandomNullsToColumnVector(rand, batch.cols[0], batch.size); + validateKeySeriesDoubleArray(batch, test); + RandomRowUtil.addRandomSelectToBatch(rand, batch); + if (batch.size > 0) { + validateKeySeriesDoubleArray(batch, test); + } + } + + private void validateDoubleRepeating(Random rand, int count, int value) throws IOException { + VectorizedRowBatch batch = new VectorizedRowBatch(1); + + DoubleColumnVector doubleColVector = new DoubleColumnVector(VectorizedRowBatch.DEFAULT_SIZE); + batch.cols[0] = doubleColVector; + doubleColVector.isRepeating = true; + doubleColVector.vector[0] = value; + batch.size = count; + double[] test = new double[count]; + Arrays.fill(test, value); + + validateKeySeriesDoubleArray(batch, test); + RandomRowUtil.addRandomSelectToBatch(rand, batch); + if (batch.size > 0) { + validateKeySeriesDoubleArray(batch, test); + } + } + + public void testVectorKeySeriesDouble() throws Throwable { + + Random rand = new Random(3452); + + validateDoubleArray(rand, TestVectorKeySeriesUtil.doubleValues); + + validateDoubleRepeating(rand, 1, 5); + validateDoubleRepeating(rand, 20, 0); + validateDoubleRepeating(rand, 1024, 9383844); + } + + private void validateKeySeriesBytesArray(VectorizedRowBatch batch, byte[][] test) throws IOException { + VectorKeySeriesBytes bytesKeySeries = new VectorKeySeriesBytes(0); + bytesKeySeries.processBatch(batch); + int logicalIndex = 0; + for (int k = 0; k < bytesKeySeries.keyCount; k++) { + assertTrue(logicalIndex == bytesKeySeries.currentLogical); + int duplicateCount = bytesKeySeries.currentDuplicateCount; + if (!bytesKeySeries.currentKeyIsNull) { + byte[] currentKey = bytesKeySeries.currentBytes; + int start = bytesKeySeries.currentStart; + int 
length = bytesKeySeries.currentLength; + + for (int d = 0; d < duplicateCount; d++) { + int batchIndex = batch.selectedInUse ? batch.selected[logicalIndex + d] : logicalIndex + d; + assertTrue(batch.cols[0].noNulls || !batch.cols[0].isNull[batchIndex]); + byte[] testBytes = test[batchIndex]; + if (!StringExpr.equal(testBytes, 0, testBytes.length, currentKey, start, length)) { + assertTrue(false); + } + } + } + logicalIndex += duplicateCount; + boolean isNext = bytesKeySeries.next(); + if (k + 1 < bytesKeySeries.keyCount) { + assertTrue(isNext); + } else { + assertTrue(!isNext); + } + } + assertEquals(logicalIndex, batch.size); + } + + private void validateBytesArray(Random rand, byte[][] test) throws IOException { + VectorizedRowBatch batch = new VectorizedRowBatch(1); + + BytesColumnVector bytesColVector = new BytesColumnVector(VectorizedRowBatch.DEFAULT_SIZE); + batch.cols[0] = bytesColVector; + for (int i = 0; i < test.length; i++) { + bytesColVector.start[i] = 0; + bytesColVector.length[i] = test[i].length; + bytesColVector.vector[i] = test[i]; + } + System.arraycopy(test, 0, bytesColVector.vector, 0, test.length); + batch.size = test.length; + + validateKeySeriesBytesArray(batch, test); + int saveBatchSize = batch.size; + RandomRowUtil.addRandomSelectToBatch(rand, batch); + if (batch.size > 0) { + validateKeySeriesBytesArray(batch, test); + } + batch.selectedInUse = false; + batch.size = saveBatchSize; + RandomRowUtil.addRandomNullsToColumnVector(rand, batch.cols[0], batch.size); + validateKeySeriesBytesArray(batch, test); + RandomRowUtil.addRandomSelectToBatch(rand, batch); + if (batch.size > 0) { + validateKeySeriesBytesArray(batch, test); + } + } + + private void validateBytesRepeating(Random rand, int count, byte[] value) throws IOException { + VectorizedRowBatch batch = new VectorizedRowBatch(1); + + BytesColumnVector bytesColVector = new BytesColumnVector(VectorizedRowBatch.DEFAULT_SIZE); + batch.cols[0] = bytesColVector; + bytesColVector.isRepeating = true; + bytesColVector.vector[0] = value; + bytesColVector.start[0] = 0; + bytesColVector.length[0] = value.length; + batch.size = count; + byte[][] test = new byte[count][]; + for (int i = 0; i < count; i++) { + test[i] = value; + } + + validateKeySeriesBytesArray(batch, test); + RandomRowUtil.addRandomSelectToBatch(rand, batch); + if (batch.size > 0) { + validateKeySeriesBytesArray(batch, test); + } + } + + public void testVectorKeySeriesBytes() throws Throwable { + Random rand = new Random(933); + + byte[][] test = new byte[200][]; + for (int i = 0; i < test.length; i++) { + if (i > 0 && rand.nextInt(10) == 2) { + test[i] = test[i - 1]; + continue; + } + test[i] = RandomTypeUtil.getRandString(rand, null, rand.nextInt(100)).getBytes(); + } + + validateBytesArray(rand, test); + + validateBytesRepeating(rand, 1, RandomTypeUtil.getRandString(rand, null, rand.nextInt(100)).getBytes()); + validateBytesRepeating(rand, 20, RandomTypeUtil.getRandString(rand, null, rand.nextInt(100)).getBytes()); + validateBytesRepeating(rand, 1024, RandomTypeUtil.getRandString(rand, null, rand.nextInt(100)).getBytes()); + } +} \ No newline at end of file diff --git ql/src/test/org/apache/hadoop/hive/ql/exec/vector/keyseries/TestVectorKeySeriesFast.java ql/src/test/org/apache/hadoop/hive/ql/exec/vector/keyseries/TestVectorKeySeriesFast.java new file mode 100644 index 0000000..016e7d0 --- /dev/null +++ ql/src/test/org/apache/hadoop/hive/ql/exec/vector/keyseries/TestVectorKeySeriesFast.java @@ -0,0 +1,431 @@ +/** + * Licensed to the Apache Software 
Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.vector.keyseries; + +import java.io.IOException; +import java.util.Arrays; +import java.util.Random; + +import org.apache.hadoop.hive.common.type.RandomTypeUtil; +import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.ColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.RandomRowUtil; +import org.apache.hadoop.hive.ql.exec.vector.VectorAssignRow; +import org.apache.hadoop.hive.ql.exec.vector.VectorRandomRowSource; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatchCtx; +import org.apache.hadoop.hive.ql.exec.vector.expressions.StringExpr; +import org.apache.hadoop.hive.ql.exec.vector.keyseries.fast.VectorKeySeriesBytesFast; +import org.apache.hadoop.hive.ql.exec.vector.keyseries.fast.VectorKeySeriesLongFast; +import org.apache.hadoop.hive.ql.exec.vector.keyseries.fast.VectorKeySeriesMultiFast; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.serde2.ByteStream.Output; +import org.apache.hadoop.hive.serde2.lazybinary.fast.LazyBinaryDeserializeRead; +import org.apache.hadoop.hive.serde2.lazybinary.fast.LazyBinarySerializeWrite; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; + +import junit.framework.TestCase; + +/** + * Unit test for the vectorized key series. + */ +public class TestVectorKeySeriesFast extends TestCase { + + private void validateKeySeriesLongArray(VectorizedRowBatch batch, long[] test) throws IOException { + // Lazy binary key serializer. 
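+ // Each key emitted by the series is deserialized below with LazyBinaryDeserializeRead and compared against the expected long value for every duplicate row.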
+ LazyBinarySerializeWrite keyLazyBinarySerializeWrite = + new LazyBinarySerializeWrite(1); + LazyBinaryDeserializeRead keyLazyBinarySerializeRead = + new LazyBinaryDeserializeRead(new TypeInfo[] {TypeInfoFactory.longTypeInfo}); + + VectorKeySeriesLongFast longKeySeriesFast = + new VectorKeySeriesLongFast( + 0, TypeInfoFactory.longTypeInfo, keyLazyBinarySerializeWrite); + + longKeySeriesFast.processBatch(batch); + int logicalIndex = 0; + for (int k = 0; k < longKeySeriesFast.keyCount; k++) { + assertTrue(logicalIndex == longKeySeriesFast.currentLogical); + int duplicateCount = longKeySeriesFast.currentDuplicateCount; + + if (!longKeySeriesFast.currentKeyIsNull) { + byte[] serializedBytes = longKeySeriesFast.serializedBytes; + int serializedStart = longKeySeriesFast.serializedStart; + int serializedLength = longKeySeriesFast.serializedLength; + + keyLazyBinarySerializeRead.set(serializedBytes, serializedStart, serializedLength); + assertTrue(!keyLazyBinarySerializeRead.readCheckNull()); + + long currentKey = keyLazyBinarySerializeRead.currentLong; + + for (int d = 0; d < duplicateCount; d++) { + int batchIndex = batch.selectedInUse ? batch.selected[logicalIndex + d] : logicalIndex + d; + assertTrue(batch.cols[0].noNulls || !batch.cols[0].isNull[batchIndex]); + assertTrue(test[batchIndex] == currentKey); + } + } + logicalIndex += duplicateCount; + boolean isNext = longKeySeriesFast.next(); + if (k + 1 < longKeySeriesFast.keyCount) { + assertTrue(isNext); + } else { + assertTrue(!isNext); + } + } + assertEquals(logicalIndex, batch.size); + } + + private void validateLongArray(Random rand, long[] test) throws IOException { + VectorizedRowBatch batch = new VectorizedRowBatch(1); + + LongColumnVector longColVector = new LongColumnVector(VectorizedRowBatch.DEFAULT_SIZE); + batch.cols[0] = longColVector; + System.arraycopy(test, 0, longColVector.vector, 0, test.length); + batch.size = test.length; + + validateKeySeriesLongArray(batch, test); + int saveBatchSize = batch.size; + RandomRowUtil.addRandomSelectToBatch(rand, batch); + if (batch.size > 0) { + validateKeySeriesLongArray(batch, test); + } + batch.selectedInUse = false; + batch.size = saveBatchSize; + RandomRowUtil.addRandomNullsToColumnVector(rand, batch.cols[0], batch.size); + validateKeySeriesLongArray(batch, test); + RandomRowUtil.addRandomSelectToBatch(rand, batch); + if (batch.size > 0) { + validateKeySeriesLongArray(batch, test); + } + } + + private void validateLongRepeating(Random rand, int count, int value) throws IOException { + VectorizedRowBatch batch = new VectorizedRowBatch(1); + + LongColumnVector longColVector = new LongColumnVector(VectorizedRowBatch.DEFAULT_SIZE); + batch.cols[0] = longColVector; + longColVector.isRepeating = true; + longColVector.vector[0] = value; + batch.size = count; + long[] test = new long[count]; + Arrays.fill(test, value); + + validateKeySeriesLongArray(batch, test); + RandomRowUtil.addRandomSelectToBatch(rand, batch); + if (batch.size > 0) { + validateKeySeriesLongArray(batch, test); + } + } + + public void testVectorKeySeriesLongFast() throws Throwable { + + Random rand = new Random(36777); + + validateLongArray(rand, TestVectorKeySeriesUtil.longValues); + + validateLongRepeating(rand, 1, 5); + validateLongRepeating(rand, 20, 0); + validateLongRepeating(rand, 1024, 9383844); + } + + private void validateKeySeriesBytesArray(VectorizedRowBatch batch, byte[][] test) throws IOException { + // Lazy binary key serializer. 
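+ // Same round-trip check as the long variant: deserialize the LazyBinary-serialized key and compare its bytes against the expected value for each duplicate row.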
+ LazyBinarySerializeWrite keyLazyBinarySerializeWrite = + new LazyBinarySerializeWrite(1); + LazyBinaryDeserializeRead keyLazyBinarySerializeRead = + new LazyBinaryDeserializeRead(new TypeInfo[] {TypeInfoFactory.stringTypeInfo}); + + VectorKeySeriesBytesFast bytesKeySeriesFast = + new VectorKeySeriesBytesFast( + 0, keyLazyBinarySerializeWrite); + bytesKeySeriesFast.processBatch(batch); + int logicalIndex = 0; + for (int k = 0; k < bytesKeySeriesFast.keyCount; k++) { + assertTrue(logicalIndex == bytesKeySeriesFast.currentLogical); + int duplicateCount = bytesKeySeriesFast.currentDuplicateCount; + + if (!bytesKeySeriesFast.currentKeyIsNull) { + + byte[] serializedBytes = bytesKeySeriesFast.serializedBytes; + int serializedStart = bytesKeySeriesFast.serializedStart; + int serializedLength = bytesKeySeriesFast.serializedLength; + + keyLazyBinarySerializeRead.set(serializedBytes, serializedStart, serializedLength); + assertTrue(!keyLazyBinarySerializeRead.readCheckNull()); + + byte[] currentKey = keyLazyBinarySerializeRead.currentBytes; + int start = keyLazyBinarySerializeRead.currentBytesStart; + int length = keyLazyBinarySerializeRead.currentBytesLength; + + for (int d = 0; d < duplicateCount; d++) { + int batchIndex = batch.selectedInUse ? batch.selected[logicalIndex + d] : logicalIndex + d; + assertTrue(batch.cols[0].noNulls || !batch.cols[0].isNull[batchIndex]); + byte[] testBytes = test[batchIndex]; + if (!StringExpr.equal(testBytes, 0, testBytes.length, currentKey, start, length)) { + assertTrue(false); + } + } + } + logicalIndex += duplicateCount; + boolean isNext = bytesKeySeriesFast.next(); + if (k + 1 < bytesKeySeriesFast.keyCount) { + assertTrue(isNext); + } else { + assertTrue(!isNext); + } + } + assertEquals(logicalIndex, batch.size); + } + + private void validateBytesArray(Random rand, byte[][] test) throws IOException { + VectorizedRowBatch batch = new VectorizedRowBatch(1); + + BytesColumnVector bytesColVector = new BytesColumnVector(VectorizedRowBatch.DEFAULT_SIZE); + batch.cols[0] = bytesColVector; + for (int i = 0; i < test.length; i++) { + bytesColVector.start[i] = 0; + bytesColVector.length[i] = test[i].length; + bytesColVector.vector[i] = test[i]; + } + System.arraycopy(test, 0, bytesColVector.vector, 0, test.length); + batch.size = test.length; + + validateKeySeriesBytesArray(batch, test); + int saveBatchSize = batch.size; + RandomRowUtil.addRandomSelectToBatch(rand, batch); + if (batch.size > 0) { + validateKeySeriesBytesArray(batch, test); + } + batch.selectedInUse = false; + batch.size = saveBatchSize; + RandomRowUtil.addRandomNullsToColumnVector(rand, batch.cols[0], batch.size); + validateKeySeriesBytesArray(batch, test); + RandomRowUtil.addRandomSelectToBatch(rand, batch); + if (batch.size > 0) { + validateKeySeriesBytesArray(batch, test); + } + } + + private void validateBytesRepeating(Random rand, int count, byte[] value) throws IOException { + VectorizedRowBatch batch = new VectorizedRowBatch(1); + + BytesColumnVector bytesColVector = new BytesColumnVector(VectorizedRowBatch.DEFAULT_SIZE); + batch.cols[0] = bytesColVector; + bytesColVector.isRepeating = true; + bytesColVector.vector[0] = value; + bytesColVector.start[0] = 0; + bytesColVector.length[0] = value.length; + batch.size = count; + byte[][] test = new byte[count][]; + for (int i = 0; i < count; i++) { + test[i] = value; + } + + validateKeySeriesBytesArray(batch, test); + RandomRowUtil.addRandomSelectToBatch(rand, batch); + if (batch.size > 0) { + validateKeySeriesBytesArray(batch, test); + } + } + + 
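// Covers 200 random strings with occasional back-to-back duplicates, validated over plain, selected, nulled, and repeating batches. +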
public void testVectorKeySeriesBytesFast() throws Throwable { + Random rand = new Random(933); + + byte[][] test = new byte[200][]; + for (int i = 0; i < test.length; i++) { + if (i > 0 && rand.nextInt(10) == 2) { + test[i] = test[i - 1]; + continue; + } + test[i] = RandomTypeUtil.getRandString(rand, null, rand.nextInt(100)).getBytes(); + } + + validateBytesArray(rand, test); + + validateBytesRepeating(rand, 1, RandomTypeUtil.getRandString(rand, null, rand.nextInt(100)).getBytes()); + validateBytesRepeating(rand, 20, RandomTypeUtil.getRandString(rand, null, rand.nextInt(100)).getBytes()); + validateBytesRepeating(rand, 1024, RandomTypeUtil.getRandString(rand, null, rand.nextInt(100)).getBytes()); + } + + private void validateMulti(VectorizedRowBatch batch, + VectorKeySeriesMultiFast multiKeySeriesFast, + LazyBinaryDeserializeRead keyLazyBinarySerializeRead, + LazyBinarySerializeWrite keyLazyBinarySerializeWrite, + VectorRandomRowSource source, Object[][] randomRows, + int firstRandomRowIndex) throws IOException, HiveException { + + multiKeySeriesFast.processBatch(batch); + int logicalIndex = 0; + for (int k = 0; k < multiKeySeriesFast.keyCount; k++) { + assertTrue(logicalIndex == multiKeySeriesFast.currentLogical); + + int batchIndex = batch.selectedInUse ? batch.selected[logicalIndex] : logicalIndex; + + Object[] row = randomRows[firstRandomRowIndex + batchIndex]; + Output output = RandomRowUtil.serializeRow(row, source, keyLazyBinarySerializeWrite); + byte[] testBytes = output.getData(); + int length = output.getLength(); + + int nullCount = 0; + for (int c = 0; c < row.length; c++) { + if (row[c] == null) { + nullCount++; + } + } + int duplicateCount = multiKeySeriesFast.currentDuplicateCount; + boolean keyAllNulls = multiKeySeriesFast.currentKeyIsNull; + if (keyAllNulls) { + assertEquals(nullCount, row.length); + } else { + boolean keyHasAnyNulls = multiKeySeriesFast.currentKeyIsNull; + if (keyHasAnyNulls) { + assertTrue(nullCount > 0); + } + + byte[] serializedBytes = multiKeySeriesFast.serializedBytes; + int serializedStart = multiKeySeriesFast.serializedStart; + int serializedLength = multiKeySeriesFast.serializedLength; + + for (int d = 0; d < duplicateCount; d++) { + if (!StringExpr.equal(testBytes, 0, length, serializedBytes, serializedStart, serializedLength)) { + assertTrue(false); + } + } + } + logicalIndex += duplicateCount; + boolean isNext = multiKeySeriesFast.next(); + if (k + 1 < multiKeySeriesFast.keyCount) { + assertTrue(isNext); + } else { + assertTrue(!isNext); + } + } + assertEquals(logicalIndex, batch.size); + } + + public void testVectorKeySeriesMultiFastOne(Random rand, boolean addRandomNulls, + boolean addRandomDuplicates, boolean addRandomSelectToBatch) throws Throwable { + + String[] emptyScratchTypeNames = new String[0]; + + VectorRandomRowSource source = new VectorRandomRowSource(); + source.init(rand); + + VectorizedRowBatchCtx batchContext = new VectorizedRowBatchCtx(); + batchContext.init(source.rowStructObjectInspector(), emptyScratchTypeNames); + VectorizedRowBatch batch = batchContext.createVectorizedRowBatch(); + + // junk the destination for the 1st pass + for (ColumnVector cv : batch.cols) { + Arrays.fill(cv.isNull, true); + } + + int fieldCount = source.typeNames().size(); + LazyBinaryDeserializeRead deserializeRead = new LazyBinaryDeserializeRead(source.primitiveTypeInfos()); + LazyBinarySerializeWrite serializeWrite = new LazyBinarySerializeWrite(fieldCount); + + // junk the destination for the 1st pass + for (ColumnVector cv : batch.cols) { + 
Arrays.fill(cv.isNull, true); + cv.noNulls = false; + } + + VectorAssignRow vectorAssignRow = new VectorAssignRow(); + vectorAssignRow.init(source.typeNames()); + + VectorKeySeriesMultiFast multiKeySeriesFast = + new VectorKeySeriesMultiFast( + serializeWrite); + int[] columnNums = new int[source.typeNames().size()]; + for (int i = 0; i < columnNums.length; i++) { + columnNums[i] = i; + } + multiKeySeriesFast.init(source.primitiveTypeInfos(), columnNums); + + Object[][] randomRows = source.randomRows(100000); + if (addRandomNulls) { + RandomRowUtil.addRandomNulls(rand, randomRows); + } + if (addRandomDuplicates) { + RandomRowUtil.addRandomDuplicates(rand, randomRows); + } + int firstRandomRowIndex = 0; + for (int i = 0; i < randomRows.length; i++) { + Object[] row = randomRows[i]; + + vectorAssignRow.assignRow(batch, batch.size, row); + batch.size++; + if (batch.size == batch.DEFAULT_SIZE) { + if (addRandomSelectToBatch) { + RandomRowUtil.addRandomSelectToBatch(rand, batch); + } + validateMulti(batch, multiKeySeriesFast, deserializeRead, serializeWrite, + source, randomRows, firstRandomRowIndex); + firstRandomRowIndex = i + 1; + batch.reset(); + } + } + if (batch.size > 0) { + if (addRandomSelectToBatch) { + RandomRowUtil.addRandomSelectToBatch(rand, batch); + } + validateMulti(batch, multiKeySeriesFast, deserializeRead, serializeWrite, + source, randomRows, firstRandomRowIndex); + } + } + + public void testVectorKeySeriesMultiFast() throws Throwable { + Random rand = new Random(933); + for (int i = 0; i < 10; i++) { + testVectorKeySeriesMultiFastOne( + rand, /* addRandomNulls */ false, /* addRandomDuplicates */ false, /* addRandomSelectToBatch */ false); + } + for (int i = 0; i < 10; i++) { + testVectorKeySeriesMultiFastOne( + rand, /* addRandomNulls */ true, /* addRandomDuplicates */ false, /* addRandomSelectToBatch */ false); + } + for (int i = 0; i < 10; i++) { + testVectorKeySeriesMultiFastOne( + rand, /* addRandomNulls */ false, /* addRandomDuplicates */ true, /* addRandomSelectToBatch */ false); + } + for (int i = 0; i < 10; i++) { + testVectorKeySeriesMultiFastOne( + rand, /* addRandomNulls */ true, /* addRandomDuplicates */ true, /* addRandomSelectToBatch */ false); + } + + for (int i = 0; i < 10; i++) { + testVectorKeySeriesMultiFastOne( + rand, /* addRandomNulls */ false, /* addRandomDuplicates */ false, /* addRandomSelectToBatch */ true); + } + for (int i = 0; i < 10; i++) { + testVectorKeySeriesMultiFastOne( + rand, /* addRandomNulls */ true, /* addRandomDuplicates */ false, /* addRandomSelectToBatch */ true); + } + for (int i = 0; i < 10; i++) { + testVectorKeySeriesMultiFastOne( + rand, /* addRandomNulls */ false, /* addRandomDuplicates */ true, /* addRandomSelectToBatch */ true); + } + for (int i = 0; i < 10; i++) { + testVectorKeySeriesMultiFastOne( + rand, /* addRandomNulls */ true, /* addRandomDuplicates */ true, /* addRandomSelectToBatch */ true); + } + } +} \ No newline at end of file diff --git ql/src/test/org/apache/hadoop/hive/ql/exec/vector/keyseries/TestVectorKeySeriesUtil.java ql/src/test/org/apache/hadoop/hive/ql/exec/vector/keyseries/TestVectorKeySeriesUtil.java new file mode 100644 index 0000000..b6c2c43 --- /dev/null +++ ql/src/test/org/apache/hadoop/hive/ql/exec/vector/keyseries/TestVectorKeySeriesUtil.java @@ -0,0 +1,29 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.vector.keyseries; + +/** + * Utility definitions for testing the vectorized key series. + */ +public class TestVectorKeySeriesUtil { + + public static long[] longValues = {1906L, -7838598833900584960L, 1165L, -7456869587112255488L, 2013L, 7333512171174223872L, -7571293705217687552L, -8523434203900674048L, 8099215208813903872L, 9040958359122640896L, -7356685674003021824L, 2072L, 2073L, 871L, -7551394356730339328L, -8172827216441573376L, -8082793390939193344L, 8854495099223375872L, -8358130693961195520L, 9050032047355125760L, -7162299524557471744L, 809L, -8946656952763777024L, 1053L, 482L, -6968892545529896960L, 203L, 1614L, -8593419958317056000L, 8190967051000659968L, 808L, 1L, 412L, 8656571350884048896L, 8769199243315814400L, -8546758906409312256L, 8829545979081744384L, 7545689659010949120L, 618L, 8573305425181941760L, 94L, -7266719102957125632L, 2772L, 379L, 8302473563519950848L, -7802538500225777664L, -7273694358642851840L, 8987827141270880256L, 914L, 723L, -7104310188119834624L, 154L, -8494118409594650624L, 3781L, 1466L, 724L, -7270034223527993344L, 913L, 1704L, 1L, -7883252982752665600L, 412L, 4024L, 7226360892091416576L, -8244657976255889408L, 2862L, 1521L, 1L, 7386087924003676160L, 3907L, 7818464507324121088L, -8293833565967810560L, -7892780594910871552L, 1509L, 7592440105065308160L, -7600138468036386816L, 9064847977742032896L, -8379964450833367040L, 7217123582035116032L, -7879864376629567488L, 2878L, 2412L, 524L, 784L, -7046180371529351168L, 471L, 612L, 8368012468775608320L, -7547245548870025216L, 3841L, 8752150411997356032L, -8623965248051789824L, 7637152193832886272L, 9191943992860327936L, 2700L, 9180098147855769600L, 1775L, 797L, -7773957003968675840L, -8660149447361404928L, 8641221723991433216L, 392L, 1L, 8489735221193138176L, 7944741547145502720L, 6933731240564056064L, 9083704659251798016L, -9084940280061485056L, 8222714144797368320L, 8817665768680906752L, 1995L, 1561L, 2485L, 1826L, 845L, 8376440110255243264L, 9075404705968840704L, -8379109122834997248L, -6938706403992854528L, 961L, 1422L, 9149216169284091904L, 2752L, 2255L, -9080568167841226752L, 1046L, 7926898770090491904L, 7784489776013295616L, 6991316084916879360L, 1566L, 1671L, -8543982423727128576L, -8832750849949892608L, 6963217546192322560L, 236L, 7086206629592252416L, 9053187076403060736L, -8067243114610532352L, 1751L, 2502L, 294L, 7892281003266408448L, 8577096957495025664L, -8665764757143658496L, 2855L, 2811L, 8785153741735616512L, 1726L, 7186401810812059648L, -7603569103205916672L, 4018L, 3566L, 2725L, 1234L, 346L, 7961515985722605568L, 7274777328897802240L, -6933565857643814912L, -8330233444291084288L, 34L, 7080269176324218880L, 2941L, 9117063974299148288L, -6917607783359897600L, -8566940231897874432L, -8710298418608619520L, 1520L, 3728L, -8835408234247168000L, 7705445437881278464L, 6926925215281774592L, 835L, 1L, 3232L, -7840338174858199040L, 7748799008146366464L, 7410096605330227200L, 188L, 
1L, -7709958788604936192L, -6920172215209426944L, -9109392978217484288L, 3608L, -8214462866994339840L, 2306L, -7759238919361888256L, -8922409715403112448L, 3664L, -9203942396257984512L, 8116738401948377088L, 1791L, -7419068456205385728L, 8795069490394882048L, 3043L, 3174L, 7625728883085025280L, -8585134536083660800L, 8558000156325707776L, -8572949572756774912L, 661L, 2393L, -7800879252150779904L, 7534549597202194432L, -7642381493746483200L, -7330413050756235264L, 7596563216912211968L, 3307L, 2971L, 2285L, 1880L, 4088L, 743L, -8317591428117274624L, 8854715632851345408L, 7768984605670604800L, 2900L, 7062605127422894080L, 7394967727502467072L, 1781L, 7238339720750948352L, 1638L, 1L, -8522878384019169280L, -8051587217208967168L, -7425160895830573056L, 7344029858387820544L, -8013397854633648128L, 8808467247666241536L, -8768744394742235136L, 9185458640237641728L, -7686220526274502656L, -8203075743525806080L, 3462L, 6964585306125008896L, 3418L, 3366L, -7867219225874571264L, 8367680396909404160L, 7524958388842078208L, 2897L, 8391785334471589888L, -8581979259158929408L, 587L, 130L, 1030L, 8362046808797306880L, 3691L, 7454632396542074880L, 7125231541858205696L, 2580L, 2512L, 7061498706968428544L, -7255686273677328384L, 9048002942653710336L, 8868529429494071296L, 8815398225009967104L, 7128222874437238784L, 8371939471056470016L, -8335810316927213568L, -7144791190333546496L, 1L, 1L, -7572962089372991488L, 8850055384477401088L, 2626L, 3599L, 213L, 2232L, -8297230235506343936L, 3430L, 391L, -7395343938785738752L, 9038087402564657152L, -9013952631912325120L, 3446L, -8703026916864802816L, -7833618000492109824L, 1541L, 8759184090543857664L, -7042183597114081280L, -7147490721376591872L, 3725L, 7961909238130270208L, -8930307926221807616L, 2719L, -6988970700649168896L, -7155539549555105792L, 3625L, 8113585123802529792L, 9207927479837319168L, -8387347109404286976L, 1L, 8896237972875370496L, 8372408423196270592L, 922L, 7255302164215013376L, -8585966098173870080L, 8424515140664360960L, -6997233584896229376L, 8087737899452432384L, 1493L, 8779711700787298304L, 2533L, 1L, 8017403886247927808L, 1282L, 2177L, -8632237187473088512L, 8109381965028548608L, 1157L, 7378993334503694336L, 1L, 2560L, 4037L, -8562524688907485184L, 2325L, 6962726713896484864L, 8120593157178228736L, 6924820982050758656L, -7366430883634929664L, -7209060152494817280L, -8689606130068611072L, 3190L, 3725L, -8581765103969312768L, 1L, 3542L, 8553195689344991232L, 1789L, 8698055291501543424L, 296L, -9095689235523264512L, 7998687089080467456L, 8160569434550403072L, 489L, -9175038118837149696L, 8571268359622172672L, -7916510129632296960L, 8323460620425330688L, 346L, 3980L, -7707242953271500800L, 1811L, 2803L, 7370078518278397952L, 7497276415392407552L, 2323L, 8467976965865799680L, 691L, 1914L, 6982145326341423104L, -9203804401302323200L, 7823874904139849728L, 7534145866886782976L, 9085434340468473856L, 8579974641030365184L, 8536948829863198720L, 341L, -9102482277760983040L, 658L, 1L, 2843L, 7584007864107778048L, 590L, 8899122608190930944L, 3588L, 3609L, 3824L, 7690986322714066944L, 7765456790394871808L, -8649711322250362880L, 1948L, -9101953184875757568L, 2463L, 1813L, 7054271419461812224L, 7548958830580563968L, -9206329156028112896L, 2637L, -7661250850555633664L, 664L, 2487L, 8221561626658881536L, 8169878743136043008L, 6927260280037097472L, 342L, -7501803640821456896L, 2745L, 677L, 8435912708683087872L, 7412924364686458880L, 3563L, 1L, 7153922334283776000L, 8849475396952514560L, 2977L, -7910019233726242816L, 2835L, 2335L, 1L, 2515L, 
-7617860842651017216L, -7637755520917741568L, 2647L, 707L, 8856674723376668672L, 7857878068300898304L, -8887058200926093312L, 108L, 2762L, 3622L, 868L, 138L, 1786L, 9116137265342169088L, 7955126053367119872L, 491L, -7778829032042790912L, -7753051494275432448L, 8962097525980225536L, 8163948965373386752L, 1145L, -8438554249514491904L, 522L, 1785L, 1545L, 999L, 1941L, 1L, 7454442625055145984L, 3510L, 2373L, -8127494999848919040L, 1643L, -7819437864839495680L, -7822452149325094912L, 7411793502161182720L, 2274L, 8783241818558193664L, 8316336224427483136L, -7669169138124275712L, 2984L, -7772064021830574080L, 3397L, 1L, 8523972434954510336L, -7127548949860818944L, 8286706213485297664L, 3147L, -7536330682873937920L, -7115054815375073280L, -7319315187617587200L, 1099L, -8989473881707921408L, 2816L, -6986178228432322560L, -7759425383684849664L, -7893577088764174336L, 8091421389575282688L, -7409653086454030336L, 7348598907182800896L, -7362189611124563968L, 1L, 2465L, 350L, 2619L, 3722L, 898L, 782L, 1780L, 2186L, -6921654334727036928L, 4020L, 8325227661920133120L, -7036607470351654912L, 7304839835188609024L, 8792059919353348096L, -8856821118526734336L, 8720504651219001344L, 1055L, 1368L, 8736061027343859712L, 7919597361814577152L, 7381659098423926784L, 8731960288562044928L, -7594824008626372608L, -9178166810751909888L, 3083L, -8948335470186373120L, 2569L, 823L, 259L, 8461498293348065280L, -8961059046745669632L, -8607195685207408640L, -8754966081778565120L, -8418913260807217152L, -8877053610728161280L, -6935548339131138048L, -8219876839318716416L, 1132L, 1337L, 1341L, 976L, -7557017910095650816L, 1L, -8683802826440105984L, 1845L, 1965L, -8104684579106914304L, 1835L, 7345991518378442752L, 3212L, -7081500255163727872L, 1074L, 8372588378498777088L, -7593363318079610880L, -7451660755269853184L, 1983L, 8514851182589771776L, 1864L, 8463868417649524736L, 3094L, -8858063395050110976L, 1981L, -8140349174954893312L, -7041362811802148864L, 8972161729142095872L, 7989119273552158720L, 2469L, 1481L, -8566856504746352640L, 8272001752345690112L, -7094827141662539776L, 8396433451610652672L, -7679894005808693248L, 8613562211893919744L, 3407L, 7686992843032010752L, 1048L, 3507L, 7784169796350730240L, 8551446856960942080L, 3467L, 1458L, 213L, 735L, 9190466190353661952L, -8280276629934981120L, -7895991410072928256L, -9145593811310010368L, 8059284960252731392L, 367L, 7614435638888210432L, 9174894805640142848L, -8941201923743703040L, 1075L, 7492436934952574976L, -8714995808835444736L, 7782245855193874432L, 8525894870444638208L, -7661192563533062144L, 1L, 8995562121346260992L, 7626715182847090688L, 8146492373537660928L, 7682327310082531328L, 2968L, 7309156463509061632L, 1955L, 1L, 7022349041913978880L, 7045967493826387968L, 3006L, 65L, 8168742078705262592L, 7212016545671348224L, 8079573715140485120L, 3965L, 8555933456197828608L, 2903L, 7648729477297987584L, 1L, 8223732800007864320L, -7412431471807283200L, 2560L, 2988L, 1243L, 1837L, 7014537632150224896L, 3747L, 2682L, 8073733016154431488L, 2938L, 1312L, 7006803044329021440L, 7701723309715685376L, 7528074274555305984L, -7532751268425261056L, 8000440057238052864L, -7964801953178091520L, 2846L, 8723248113030782976L, 7440265908266827776L, 927L, -7063777488249085952L, 9194388393453060096L, 7720187583697502208L, 8557218322962644992L, 950L, 2189L, 1371L, 7370803940448305152L, -8914039133569400832L, 3663L, 2341L, -8877431933441327104L, 8171188598958407680L, 8525336514806317056L, 1608L, -7094189393339678720L, 1752L, 3084L, 3673L, 9169248521377374208L, -7866079955473989632L, 
-9004892183139811328L, 1892L, 6928080429732536320L, -7623047151287754752L, 2492L, -7695491171376291840L, -7797151404935618560L, 8208354137450766336L, -7395553021620731904L, -8453491903284994048L, -7140008543769042944L, 2724L, 3443L, -7512297136103800832L, 9136234417125007360L, 8192304692696383488L, 8199513544090730496L, 311L, -8488247955875618816L, 1L, 2540L, 586L, -7444070205513138176L, 1141L, -8076479329071955968L, 3103L, -7629401308029976576L, -7507424948896415744L, 2821L, 2017L, 1134L, 347L, -7246123871306244096L, 2020L, 1693L, 2020L, 8570983266408103936L, 2919L, 2283L, 7534042483076857856L, 1L, 8991442360387584000L, -7240213957902663680L, 3365L, 1899L, 7199539820886958080L, 7165364563962191872L, 8407869317250220032L, 1489L, 2400L, -7037375807670501376L, 7235109456886816768L, 8569030475428511744L, 2067L, 8332670681629106176L, 168L, 1L, -83}; + + public static double[] doubleValues = {-1234567890.12350D,-4400.00000D,-4400.00000D,-1255.49000D,-1255.49000D,-1.12200D,-1.12200D,-1.12000D,-1.12000D,-0.33300D,-0.33300D,-0.30000D,-0.30000D,0.00000D,0.00000D,0.00000D,0.00000D,0.33300D,0.33300D,1.00000D,1.00000D,1.00000D,1.00000D,1.12000D,1.12000D,1.12200D,1.12200D,2.00000D,2.00000D,3.14000D,3.14000D,3.14000D,3.14000D,3.14000D,3.14000D,10.00000D,10.00000D,10.73430D,10.73433D,124.00000D,124.00000D,125.20000D,125.20000D,23232.23435D,23232.23440D,2389432.23750D,2389432.23750D,1234567890.12350D}; +} \ No newline at end of file diff --git ql/src/test/org/apache/hadoop/hive/ql/exec/vector/mapjoin/TestVectorMapJoinOperator.java ql/src/test/org/apache/hadoop/hive/ql/exec/vector/mapjoin/TestVectorMapJoinOperator.java new file mode 100644 index 0000000..3d32bc5 --- /dev/null +++ ql/src/test/org/apache/hadoop/hive/ql/exec/vector/mapjoin/TestVectorMapJoinOperator.java @@ -0,0 +1,1482 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.hive.ql.exec.vector.mapjoin; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +import org.apache.commons.lang.ArrayUtils; +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.ql.CompilationOpContext; +import org.apache.hadoop.hive.ql.exec.MapJoinOperator; +import org.apache.hadoop.hive.ql.exec.Operator; +import org.apache.hadoop.hive.ql.exec.persistence.BytesBytesMultiHashMapFactory; +import org.apache.hadoop.hive.ql.exec.persistence.MapJoinBytesTableContainer; +import org.apache.hadoop.hive.ql.exec.persistence.MapJoinObjectSerDeContext; +import org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainerSerDe; +import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.ColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.VectorBatchDebug; +import org.apache.hadoop.hive.ql.exec.vector.VectorColumnOutputMapping; +import org.apache.hadoop.hive.ql.exec.vector.VectorColumnSourceMapping; +import org.apache.hadoop.hive.ql.exec.vector.VectorExtractRow; +import org.apache.hadoop.hive.ql.exec.vector.VectorMapJoinOperator; +import org.apache.hadoop.hive.ql.exec.vector.VectorMapJoinOuterFilteredOperator; +import org.apache.hadoop.hive.ql.exec.vector.VectorRandomRowSource; +import org.apache.hadoop.hive.ql.exec.vector.VectorizationContext; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedBatchUtil; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatchCtx; +import org.apache.hadoop.hive.ql.exec.vector.batchgen.VectorBatchGenerator; +import org.apache.hadoop.hive.ql.exec.vector.batchgen.VectorBatchGenerator.GenerateType; +import org.apache.hadoop.hive.ql.exec.vector.batchgen.VectorBatchGenerator.GenerateType.Category; +import org.apache.hadoop.hive.ql.exec.vector.mapjoin.fast.VectorMapJoinFastHashTableFactory; +import org.apache.hadoop.hive.ql.exec.vector.mapjoin.fast.VectorMapJoinFastMultiKeyHashMap; +import org.apache.hadoop.hive.ql.exec.vector.mapjoin.fast.VerifyFastRow; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc; +import org.apache.hadoop.hive.ql.plan.ExprNodeDesc; +import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc; +import org.apache.hadoop.hive.ql.plan.JoinCondDesc; +import org.apache.hadoop.hive.ql.plan.JoinDesc; +import org.apache.hadoop.hive.ql.plan.MapJoinDesc; +import org.apache.hadoop.hive.ql.plan.OperatorDesc; +import org.apache.hadoop.hive.ql.plan.PlanUtils; +import org.apache.hadoop.hive.ql.plan.TableDesc; +import org.apache.hadoop.hive.ql.plan.VectorMapJoinDesc; +import org.apache.hadoop.hive.ql.plan.VectorMapJoinDesc.HashTableImplementationType; +import org.apache.hadoop.hive.ql.plan.VectorMapJoinDesc.HashTableKeyType; +import org.apache.hadoop.hive.ql.plan.VectorMapJoinDesc.HashTableKind; +import org.apache.hadoop.hive.ql.plan.VectorMapJoinDesc.OperatorVariation; +import org.apache.hadoop.hive.ql.plan.VectorMapJoinInfo; +import org.apache.hadoop.hive.ql.plan.api.OperatorType; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPEqual; +import org.apache.hadoop.hive.serde2.ByteStream.Output; +import org.apache.hadoop.hive.serde2.SerDe; +import org.apache.hadoop.hive.serde2.SerDeException; +import 
org.apache.hadoop.hive.serde2.SerDeUtils; +import org.apache.hadoop.hive.serde2.binarysortable.fast.BinarySortableSerializeWrite; +import org.apache.hadoop.hive.serde2.lazybinary.fast.LazyBinarySerializeWrite; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils; +import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory; +import org.apache.hadoop.hive.serde2.objectinspector.StandardStructObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; +import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; +import org.apache.hadoop.io.BytesWritable; +import org.apache.hadoop.io.Writable; +import org.apache.hive.common.util.HashCodeUtil; +import org.apache.hive.common.util.ReflectionUtil; +import org.junit.Test; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Map.Entry; +import java.util.Random; +import java.util.SortedMap; +import java.util.TreeMap; + +import junit.framework.Assert; + +public class TestVectorMapJoinOperator { + + public static final class TestRow implements Comparable{ + + private final Object[] row; + + // Not included in equals. + private int index; + + public TestRow(Object[] row) { + this.row = row; + index = -1; // Not used value. + } + + public Object[] getRow() { + return row; + } + + public void setIndex(int index) { + this.index = index; + } + + public int getIndex() { + return index; + } + + @Override + public int hashCode() { + int hashCode = Arrays.hashCode(row); + return hashCode; + } + + @Override + public Object clone() { + return new TestRow(row); + } + + @Override + public boolean equals(Object obj) { + if (obj == null) { + return false; + } + if (!(obj instanceof TestRow)) { + return false; + } + final TestRow other = (TestRow) obj; + return Arrays.equals(this.row, other.row); + } + + @Override + public String toString() { + return Arrays.toString(row); + } + + @Override + public int compareTo(TestRow obj) { + final TestRow other = (TestRow) obj; + int thisLength = this.row.length; + int otherLength = other.row.length; + if (thisLength != otherLength) { + return (thisLength < otherLength ? -1 : 1); + } + for (int i = 0; i < thisLength; i++) { + Object thisObject = this.row[i]; + Object otherObject = other.row[i]; + if (thisObject == null || otherObject == null) { + if (thisObject == null && otherObject == null) { + continue; + } + // Does this make sense? + return (thisObject == null ? 
-1 : 1); + int compareTo = ((Comparable) thisObject).compareTo((Comparable) otherObject); + if (compareTo != 0) { + return compareTo; + } + } + return 0; + } + } + + public class TestRowMultiSet { + private SortedMap<TestRow, Integer> sortedMap; + + public TestRowMultiSet() { + sortedMap = new TreeMap<TestRow, Integer>(); + } + + public void add(TestRow testRow) { + if (sortedMap.containsKey(testRow)) { + Integer count = sortedMap.get(testRow); + sortedMap.put(testRow, count + 1); + } else { + sortedMap.put(testRow, 1); + } + } + + @Override + public boolean equals(Object obj) { + if (obj == null) { + return false; + } + if (!(obj instanceof TestRowMultiSet)) { + return false; + } + final TestRowMultiSet other = (TestRowMultiSet) obj; + final int thisSize = this.sortedMap.size(); + final int otherSize = other.sortedMap.size(); + if (thisSize != otherSize) { + return false; + } + Iterator<Entry<TestRow, Integer>> thisIterator = this.sortedMap.entrySet().iterator(); + Iterator<Entry<TestRow, Integer>> otherIterator = other.sortedMap.entrySet().iterator(); + for (int i = 0; i < thisSize; i++) { + Entry<TestRow, Integer> thisEntry = thisIterator.next(); + Entry<TestRow, Integer> otherEntry = otherIterator.next(); + if (!thisEntry.getKey().equals(otherEntry.getKey())) { + return false; + } + // Check multi-set count. + if (!thisEntry.getValue().equals(otherEntry.getValue())) { + return false; + } + } + return true; + } + + @Override + public String toString() { + return sortedMap.toString(); + } + } + + public VectorizedRowBatch[] createBigTableBatches(VectorBatchGenerator generator, int bigTableBatchesLength) { + + VectorizedRowBatch[] batches = new VectorizedRowBatch[bigTableBatchesLength]; + // UNDONE: Only up to rowCount + for (int i = 0; i < bigTableBatchesLength; i++) { + batches[i] = generator.createBatch(); + } + return batches; + } + + private TestRow[] getTestKeys(VectorizedRowBatch[] batches, VectorExtractRow vectorExtractRow, + int columnCount, ObjectInspector[] objectInspectors) { + TestRow[] testKeys = new TestRow[batches.length * VectorizedRowBatch.DEFAULT_SIZE]; + int index = 0; + for (int b = 0; b < batches.length; b++) { + VectorizedRowBatch batch = batches[b]; + for (int i = 0; i < batch.size; i++) { + Object[] rowObjects = new Object[columnCount]; + vectorExtractRow.extractRow(batch, i, rowObjects); + for (int c = 0; c < rowObjects.length; c++) { + rowObjects[c] = ((PrimitiveObjectInspector) objectInspectors[c]).copyObject(rowObjects[c]); + } + testKeys[index++] = new TestRow(rowObjects); + } + } + return testKeys; + } + + private class TestCollectorOperator extends Operator { + + private int rowCount; + private final TestRowMultiSet testRowMultiSet; + private final ObjectInspector[] outputObjectInspectors; + + public TestCollectorOperator(ObjectInspector[] outputObjectInspectors) { + super(); + rowCount = 0; + testRowMultiSet = new TestRowMultiSet(); + this.outputObjectInspectors = outputObjectInspectors; + } + + public int getRowCount() { + return rowCount; + } + + public TestRowMultiSet getTestRowMultiSet() { + return testRowMultiSet; + } + + @Override + public void process(Object row, int tag) throws HiveException { + rowCount++; + Object[] rowObjects = (Object[]) row; + for (int c = 0; c < rowObjects.length; c++) { + rowObjects[c] = ((PrimitiveObjectInspector) outputObjectInspectors[c]).copyObject(rowObjects[c]); + } + testRowMultiSet.add(new TestRow(rowObjects)); + } + + @Override + public String getName() { + return "TEST"; + } + + @Override + public OperatorType getType() { + // TODO Auto-generated method stub + return null; + } + + } + + private class TestVectorCollectorOperator extends Operator { + 
private int rowCount; + private final TestRowMultiSet testRowMultiSet; + private final ObjectInspector[] outputObjectInspectors; + private final VectorExtractRow vectorExtractRow; + + public int getRowCount() { + return rowCount; + } + + public TestRowMultiSet getTestRowMultiSet() { + return testRowMultiSet; + } + + public TestVectorCollectorOperator(TypeInfo[] outputTypeInfos, + ObjectInspector[] outputObjectInspectors) throws HiveException { + super(); + rowCount = 0; + testRowMultiSet = new TestRowMultiSet(); + this.outputObjectInspectors = outputObjectInspectors; + vectorExtractRow = new VectorExtractRow(); + vectorExtractRow.init(outputTypeInfos); + } + + @Override + public void process(Object row, int tag) throws HiveException { + VectorizedRowBatch batch = (VectorizedRowBatch) row; + rowCount += batch.size; + boolean selectedInUse = batch.selectedInUse; + int[] selected = batch.selected; + for (int logical = 0; logical < batch.size; logical++) { + int batchIndex = (selectedInUse ? selected[logical] : logical); + Object[] rowObjects = new Object[outputObjectInspectors.length]; + vectorExtractRow.extractRow(batch, batchIndex, rowObjects); + for (int c = 0; c < rowObjects.length; c++) { + rowObjects[c] = ((PrimitiveObjectInspector) outputObjectInspectors[c]).copyObject(rowObjects[c]); + } + testRowMultiSet.add(new TestRow(rowObjects)); + } + } + + @Override + public String getName() { + return "TEST"; + } + + @Override + public OperatorType getType() { + // TODO Auto-generated method stub + return null; + } + + } + + private Category generateCategoryFromPrimitiveCategory(PrimitiveCategory primitiveCategory) { + switch (primitiveCategory) { + case BOOLEAN: + return Category.BOOLEAN; + case BYTE: + return Category.BYTE; + case SHORT: + return Category.SHORT; + case INT: + return Category.INT; + case LONG: + return Category.LONG; + case FLOAT: + return Category.FLOAT; + case DOUBLE: + return Category.DOUBLE; + case STRING: + return Category.STRING; + case DATE: + return Category.DATE; + case TIMESTAMP: + return Category.TIMESTAMP; + case BINARY: + return Category.BINARY; + case DECIMAL: + return Category.DECIMAL; + case VARCHAR: + return Category.VARCHAR; + case CHAR: + return Category.CHAR; + default: + return null; + } + } + + private GenerateType[] generateTypesFromTypeInfos(TypeInfo[] typeInfos) { + final int size = typeInfos.length; + GenerateType[] generateTypes = new GenerateType[size]; + for (int i = 0; i < size; i++) { + PrimitiveTypeInfo primitiveTypeInfo = (PrimitiveTypeInfo) typeInfos[i]; + Category category = + generateCategoryFromPrimitiveCategory(primitiveTypeInfo.getPrimitiveCategory()); + generateTypes[i] = new GenerateType(category); + } + return generateTypes; + } + + private List intArrayToList(int[] intArray) { + List intList = new ArrayList(intArray.length); + for (int i = 0; i < intArray.length; i++) { + intList.add(intArray[i]); + } + return intList; + } + + @Test + public void testLong() throws Exception { + + Random random = new Random(82303); + + final int batchCount = 10; + + HiveConf hiveConf = new HiveConf(); + + String[] bigTableColumnNames = new String[] {"b1"}; + TypeInfo[] bigTableTypeInfos = + new TypeInfo[] { + TypeInfoFactory.longTypeInfo}; + int[] bigTableKeyColumnNums = new int[] {0}; + + String[] smallTableValueColumnNames = new String[] {"sv1", "sv2"}; + TypeInfo[] smallTableValueTypeInfos = + new TypeInfo[] {TypeInfoFactory.dateTypeInfo, TypeInfoFactory.stringTypeInfo}; + + int[] bigTableRetainColumnNums = new int[] {0}; + + int[] 
smallTableRetainKeyColumnNums = new int[] {}; + int[] smallTableRetainValueColumnNums = new int[] {0, 1}; + + for (OperatorVariation operatorVariation : OperatorVariation.values()) { + if (operatorVariation != OperatorVariation.OUTER) { + continue; + } + TestMapJoinDescription testMapJoinDescription = new TestMapJoinDescription( + batchCount, hiveConf, operatorVariation, + bigTableColumnNames, bigTableTypeInfos, + bigTableKeyColumnNums, + smallTableValueColumnNames, smallTableValueTypeInfos, + bigTableRetainColumnNums, + smallTableRetainKeyColumnNums, smallTableRetainValueColumnNums); + execute(testMapJoinDescription, random); + } + + } + + @Test + public void testMultiKey() throws Exception { + + Random random = new Random(4459); + + final int batchCount = 1; + + HiveConf hiveConf = new HiveConf(); + + String[] bigTableColumnNames = new String[] {"b1", "b2", "b3"}; + TypeInfo[] bigTableTypeInfos = + new TypeInfo[] { + TypeInfoFactory.booleanTypeInfo, + TypeInfoFactory.longTypeInfo, + TypeInfoFactory.doubleTypeInfo}; + int[] bigTableKeyColumnNums = new int[] {0, 1, 2}; + + String[] smallTableValueColumnNames = new String[] {"sv1"}; + TypeInfo[] smallTableValueTypeInfos = + new TypeInfo[] {TypeInfoFactory.stringTypeInfo}; + + int[] bigTableRetainColumnNums = new int[] {0, 1, 2}; + + int[] smallTableRetainKeyColumnNums = new int[] {}; + int[] smallTableRetainValueColumnNums = new int[] {0}; + + for (OperatorVariation operatorVariation : OperatorVariation.values()) { + if (operatorVariation != OperatorVariation.OUTER) { + continue; + } + TestMapJoinDescription testMapJoinDescription = new TestMapJoinDescription( + batchCount, hiveConf, operatorVariation, + bigTableColumnNames, bigTableTypeInfos, + bigTableKeyColumnNums, + smallTableValueColumnNames, smallTableValueTypeInfos, + bigTableRetainColumnNums, + smallTableRetainKeyColumnNums, smallTableRetainValueColumnNums); + execute(testMapJoinDescription, random); + } + + } + + @Test + public void testString() throws Exception { + + Random random = new Random(82303); + + final int batchCount = 10; + + HiveConf hiveConf = new HiveConf(); + + String[] bigTableColumnNames = new String[] {"b1"}; + TypeInfo[] bigTableTypeInfos = + new TypeInfo[] { + TypeInfoFactory.stringTypeInfo}; + int[] bigTableKeyColumnNums = new int[] {0}; + + String[] smallTableValueColumnNames = new String[] {"sv1", "sv2"}; + TypeInfo[] smallTableValueTypeInfos = + new TypeInfo[] {TypeInfoFactory.dateTypeInfo, TypeInfoFactory.timestampTypeInfo}; + + int[] bigTableRetainColumnNums = new int[] {0}; + + int[] smallTableRetainKeyColumnNums = new int[] {}; + int[] smallTableRetainValueColumnNums = new int[] {0, 1}; + + for (OperatorVariation operatorVariation : OperatorVariation.values()) { + if (operatorVariation != OperatorVariation.OUTER) { + continue; + } + TestMapJoinDescription testMapJoinDescription = new TestMapJoinDescription( + batchCount, hiveConf, operatorVariation, + bigTableColumnNames, bigTableTypeInfos, + bigTableKeyColumnNums, + smallTableValueColumnNames, smallTableValueTypeInfos, + bigTableRetainColumnNums, + smallTableRetainKeyColumnNums, smallTableRetainValueColumnNums); + execute(testMapJoinDescription, random); + } + + } + + public class TestMapJoinDescription { + final int batchCount; + final HiveConf hiveConf; + final OperatorVariation operatorVariation; + + // Adjustable. 
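+ // Inputs supplied by each test case; the fields under "Derived" are computed from them in computeDerived().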
+ String[] bigTableColumnNames; + TypeInfo[] bigTableTypeInfos; + int[] bigTableKeyColumnNums; + String[] smallTableValueColumnNames; + TypeInfo[] smallTableValueTypeInfos; + int[] bigTableRetainColumnNums; + int[] smallTableRetainKeyColumnNums; + int[] smallTableRetainValueColumnNums; + + // Derived. + List bigTableColumnNamesList; + String[] bigTableKeyColumnNames; + TypeInfo[] bigTableKeyTypeInfos; + List smallTableValueColumnNamesList; + ObjectInspector[] bigTableObjectInspectors; + List bigTableObjectInspectorsList; + StandardStructObjectInspector bigTableStandardObjectInspector; + PrimitiveTypeInfo[] smallTableValuePrimitiveTypeInfos; + ObjectInspector[] smallTableObjectInspectors; + PrimitiveCategory[] smallTablePrimitiveCategories; + List smallTableObjectInspectorsList; + StandardStructObjectInspector smallTableStandardObjectInspector; + ObjectInspector[] inputObjectInspectors; + String[] outputColumnNames; + TypeInfo[] outputTypeInfos; + ObjectInspector[] outputObjectInspectors; + + public TestMapJoinDescription ( + int batchCount, + HiveConf hiveConf, + OperatorVariation operatorVariation, + String[] bigTableColumnNames, TypeInfo[] bigTableTypeInfos, + int[] bigTableKeyColumnNums, + String[] smallTableValueColumnNames, TypeInfo[] smallTableValueTypeInfos, + int[] bigTableRetainColumnNums, + int[] smallTableRetainKeyColumnNums, int[] smallTableRetainValueColumnNums) { + + this.batchCount = batchCount; + this.hiveConf = hiveConf; + this.operatorVariation = operatorVariation; + + this.bigTableColumnNames = bigTableColumnNames; + this.bigTableTypeInfos = bigTableTypeInfos; + this.bigTableKeyColumnNums = bigTableKeyColumnNums; + this.smallTableValueColumnNames = smallTableValueColumnNames; + this.smallTableValueTypeInfos = smallTableValueTypeInfos; + this.bigTableRetainColumnNums = bigTableRetainColumnNums; + this.smallTableRetainKeyColumnNums = smallTableRetainKeyColumnNums; + this.smallTableRetainValueColumnNums = smallTableRetainValueColumnNums; + } + + public void computeDerived() { + bigTableColumnNamesList = Arrays.asList(bigTableColumnNames); + + bigTableKeyColumnNames = new String[bigTableKeyColumnNums.length]; + bigTableKeyTypeInfos = new TypeInfo[bigTableKeyColumnNums.length]; + for (int i = 0; i < bigTableKeyColumnNums.length; i++) { + bigTableKeyColumnNames[i] = bigTableColumnNames[bigTableKeyColumnNums[i]]; + bigTableKeyTypeInfos[i] = bigTableTypeInfos[bigTableKeyColumnNums[i]]; + } + + smallTableValueColumnNamesList = Arrays.asList(smallTableValueColumnNames); + + bigTableObjectInspectors = new ObjectInspector[bigTableTypeInfos.length]; + for (int i = 0; i < bigTableTypeInfos.length; i++) { + bigTableObjectInspectors[i] = + PrimitiveObjectInspectorFactory.getPrimitiveWritableObjectInspector((PrimitiveTypeInfo) bigTableTypeInfos[i]); + } + bigTableObjectInspectorsList = Arrays.asList(bigTableObjectInspectors); + + smallTableObjectInspectors = new ObjectInspector[smallTableValueTypeInfos.length]; + smallTablePrimitiveCategories = new PrimitiveCategory[smallTableValueTypeInfos.length]; + smallTableValuePrimitiveTypeInfos = new PrimitiveTypeInfo[smallTableValueTypeInfos.length]; + for (int i = 0; i < smallTableValueTypeInfos.length; i++) { + PrimitiveTypeInfo primitiveTypeInfo = (PrimitiveTypeInfo) smallTableValueTypeInfos[i]; + smallTableObjectInspectors[i] = + PrimitiveObjectInspectorFactory.getPrimitiveWritableObjectInspector(primitiveTypeInfo); + smallTablePrimitiveCategories[i] = primitiveTypeInfo.getPrimitiveCategory(); + smallTableValuePrimitiveTypeInfos[i] = 
primitiveTypeInfo; + } + smallTableObjectInspectorsList = Arrays.asList(smallTableObjectInspectors); + + bigTableStandardObjectInspector = + ObjectInspectorFactory.getStandardStructObjectInspector( + bigTableColumnNamesList, Arrays.asList((ObjectInspector[]) bigTableObjectInspectors)); + smallTableStandardObjectInspector = + ObjectInspectorFactory.getStandardStructObjectInspector( + smallTableValueColumnNamesList, Arrays.asList((ObjectInspector[]) smallTableObjectInspectors)); + + inputObjectInspectors = + new ObjectInspector[] { bigTableStandardObjectInspector, smallTableStandardObjectInspector }; + + int outputLength = + bigTableRetainColumnNums.length + + smallTableRetainKeyColumnNums.length + + smallTableRetainValueColumnNums.length; + outputColumnNames = createOutputColumnNames(outputLength); + + outputTypeInfos = new TypeInfo[outputLength]; + int outputIndex = 0; + for (int i = 0; i < bigTableRetainColumnNums.length; i++) { + outputTypeInfos[outputIndex++] = bigTableTypeInfos[bigTableRetainColumnNums[i]]; + } + // for (int i = 0; i < smallTableRetainKeyColumnNums.length; i++) { + // outputTypeInfos[outputIndex++] = smallTableTypeInfos[smallTableRetainKeyColumnNums[i]]; + // } + for (int i = 0; i < smallTableRetainValueColumnNums.length; i++) { + outputTypeInfos[outputIndex++] = smallTableValueTypeInfos[smallTableRetainValueColumnNums[i]]; + } + + outputObjectInspectors = new ObjectInspector[outputLength]; + for (int i = 0; i < outputLength; i++) { + PrimitiveTypeInfo primitiveTypeInfo = (PrimitiveTypeInfo) outputTypeInfos[i]; + outputObjectInspectors[i] = + PrimitiveObjectInspectorFactory.getPrimitiveWritableObjectInspector(primitiveTypeInfo); + } + } + + public void trimAwaySmallTableValueInfo() { + smallTableValueColumnNames = new String[] {}; + smallTableValueTypeInfos = new TypeInfo[] {}; + smallTableRetainKeyColumnNums = new int[] {}; + smallTableRetainValueColumnNums = new int[] {}; + } + } + + private String[] createOutputColumnNames(int outputColumnCount) { + String[] outputColumnNames = new String[outputColumnCount]; + int counter = 1; + for (int i = 0; i < outputColumnCount; i++) { + outputColumnNames[i] = "out" + counter++; + } + return outputColumnNames; + } + + private MapJoinDesc createMapJoinDesc(TestMapJoinDescription testDesc) { + + MapJoinDesc mapJoinDesc = new MapJoinDesc(); + mapJoinDesc.setPosBigTable(0); + List keyExpr = new ArrayList(); + for (int i = 0; i < testDesc.bigTableKeyColumnNums.length; i++) { + keyExpr.add(new ExprNodeColumnDesc(testDesc.bigTableKeyTypeInfos[i], testDesc.bigTableKeyColumnNames[i], "B", false)); + } + + Map> keyMap = new HashMap>(); + keyMap.put((byte)0, keyExpr); + + List smallTableExpr = new ArrayList(); + for (int i = 0; i < testDesc.smallTableValueColumnNames.length; i++) { + smallTableExpr.add(new ExprNodeColumnDesc(testDesc.smallTableValueTypeInfos[i], testDesc.smallTableValueColumnNames[i], "S", false)); + } + keyMap.put((byte)1, smallTableExpr); + + mapJoinDesc.setKeys(keyMap); + mapJoinDesc.setExprs(keyMap); + + Byte[] order = new Byte[] {(byte) 0, (byte) 1}; + mapJoinDesc.setTagOrder(order); + mapJoinDesc.setNoOuterJoin(testDesc.operatorVariation != OperatorVariation.OUTER); + + Map> filterMap = new HashMap>(); + filterMap.put((byte) 0, new ArrayList()); // None. + mapJoinDesc.setFilters(filterMap); + + List bigTableRetainColumnNumsList = intArrayToList(testDesc.bigTableRetainColumnNums); + + // For now, just small table values... 
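+ // The small-table key columns are not retained in these tests, so only the value column numbers go into the retain list.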
+ List smallTableRetainColumnNumsList = intArrayToList(testDesc.smallTableRetainValueColumnNums); + + Map> retainListMap = new HashMap>(); + retainListMap.put((byte) 0, bigTableRetainColumnNumsList); + retainListMap.put((byte) 1, smallTableRetainColumnNumsList); + mapJoinDesc.setRetainList(retainListMap); + + int joinDescType; + switch (testDesc.operatorVariation) { + case INNER: + case INNER_BIG_ONLY: + joinDescType = JoinDesc.INNER_JOIN; + break; + case LEFT_SEMI: + joinDescType = JoinDesc.LEFT_SEMI_JOIN; + break; + case OUTER: + joinDescType = JoinDesc.LEFT_OUTER_JOIN; + break; + default: + throw new RuntimeException("unknown operator variation " + testDesc.operatorVariation); + } + JoinCondDesc[] conds = new JoinCondDesc[1]; + conds[0] = new JoinCondDesc(0, 1, joinDescType); + mapJoinDesc.setConds(conds); + + TableDesc keyTableDesc = PlanUtils.getMapJoinKeyTableDesc(testDesc.hiveConf, PlanUtils + .getFieldSchemasFromColumnList(keyExpr, "")); + mapJoinDesc.setKeyTblDesc(keyTableDesc); + + TableDesc valueTableDesc = PlanUtils.getMapJoinValueTableDesc( + PlanUtils.getFieldSchemasFromColumnList(smallTableExpr, "")); + ArrayList valueTableDescsList = new ArrayList(); + valueTableDescsList.add(null); + valueTableDescsList.add(valueTableDesc); + mapJoinDesc.setValueTblDescs(valueTableDescsList); + mapJoinDesc.setValueFilteredTblDescs(valueTableDescsList); + + mapJoinDesc.setOutputColumnNames(Arrays.asList(testDesc.outputColumnNames)); + + return mapJoinDesc; + } + + public VectorMapJoinDesc createVectorMapJoinDesc(TestMapJoinDescription testDesc) { + VectorMapJoinDesc vectorDesc = new VectorMapJoinDesc(); + vectorDesc.setHashTableImplementationType(HashTableImplementationType.FAST); + HashTableKind hashTableKind; + switch (testDesc.operatorVariation) { + case INNER: + hashTableKind = HashTableKind.HASH_MAP; + break; + case INNER_BIG_ONLY: + hashTableKind = HashTableKind.HASH_MULTISET; + break; + case LEFT_SEMI: + hashTableKind = HashTableKind.HASH_SET; + break; + case OUTER: + hashTableKind = HashTableKind.HASH_MAP; + break; + default: + throw new RuntimeException("unknown operator variation " + testDesc.operatorVariation); + } + vectorDesc.setHashTableKind(hashTableKind); + HashTableKeyType hashTableKeyType = HashTableKeyType.MULTI_KEY; // Assume. 
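+ // A single integer-family key column specializes to LONG and a single string column to STRING; all other key shapes keep the serialized MULTI_KEY form.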
+ if (testDesc.bigTableKeyTypeInfos.length == 1) { + switch (((PrimitiveTypeInfo) testDesc.bigTableKeyTypeInfos[0]).getPrimitiveCategory()) { + case BOOLEAN: + case BYTE: + case SHORT: + case INT: + case LONG: + hashTableKeyType = HashTableKeyType.LONG; + break; + case STRING: + hashTableKeyType = HashTableKeyType.STRING; + break; + default: + // Stay with MULTI_KEY + } + } + vectorDesc.setHashTableKeyType(hashTableKeyType); + vectorDesc.setOperatorVariation(testDesc.operatorVariation); + vectorDesc.setMinMaxEnabled(false); + + VectorMapJoinInfo vectorMapJoinInfo = new VectorMapJoinInfo(); + + vectorMapJoinInfo.setBigTableKeyColumnMap(testDesc.bigTableKeyColumnNums); + vectorMapJoinInfo.setBigTableKeyColumnNames(testDesc.bigTableKeyColumnNames); + vectorMapJoinInfo.setBigTableKeyTypeInfos(testDesc.bigTableKeyTypeInfos); + vectorMapJoinInfo.setBigTableKeyExpressions(null); + + vectorMapJoinInfo.setBigTableValueColumnMap(new int[0]); + vectorMapJoinInfo.setBigTableValueColumnNames(new String[0]); + vectorMapJoinInfo.setBigTableValueTypeInfos(new TypeInfo[0]); + vectorMapJoinInfo.setBigTableValueExpressions(null); + + VectorColumnSourceMapping projectionMapping = + new VectorColumnSourceMapping("Projection Mapping"); + + + VectorColumnOutputMapping bigTableRetainedMapping = + new VectorColumnOutputMapping("Big Table Retained Mapping"); + for (int i = 0; i < testDesc.bigTableTypeInfos.length; i++) { + bigTableRetainedMapping.add(i, i, testDesc.bigTableTypeInfos[i]); + projectionMapping.add(i, i, testDesc.bigTableKeyTypeInfos[i]); + } + + VectorColumnOutputMapping bigTableOuterKeyMapping = + new VectorColumnOutputMapping("Big Table Outer Key Mapping"); + + // The order of the fields in the LazyBinary small table value must be used, so + // we use the source ordering flavor for the mapping. + VectorColumnSourceMapping smallTableMapping = + new VectorColumnSourceMapping("Small Table Mapping"); + int outputColumn = testDesc.bigTableTypeInfos.length; + for (int i = 0; i < testDesc.smallTableValueTypeInfos.length; i++) { + smallTableMapping.add(i, outputColumn, testDesc.smallTableValueTypeInfos[i]); + projectionMapping.add(outputColumn, outputColumn, testDesc.smallTableValueTypeInfos[i]); + outputColumn++; + } + + // Convert dynamic arrays and maps to simple arrays. + + bigTableRetainedMapping.finalize(); + + bigTableOuterKeyMapping.finalize(); + + smallTableMapping.finalize(); + + vectorMapJoinInfo.setBigTableRetainedMapping(bigTableRetainedMapping); + vectorMapJoinInfo.setBigTableOuterKeyMapping(bigTableOuterKeyMapping); + vectorMapJoinInfo.setSmallTableMapping(smallTableMapping); + + projectionMapping.finalize(); + + // Verify we added an entry for each output. 
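+ // That is, the projection's source column numbers must form the contiguous sequence 0..n-1.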
+ assert projectionMapping.isSourceSequenceGood(); + + vectorMapJoinInfo.setProjectionMapping(projectionMapping); + + assert projectionMapping.getCount() == testDesc.outputColumnNames.length; + + vectorDesc.setVectorMapJoinInfo(vectorMapJoinInfo); + + return vectorDesc; + } + + public class TestData { + + final int bigTableBatchesLength; + final int bigTableBatchesSize; + final VectorizedRowBatch[] bigTableBatches; + TestRow[] bigTableTestKeys; + HashMap<TestRow, Integer> smallTableKeyHashMap; + ArrayList<Integer> smallTableValueCounts; + ArrayList<ArrayList<TestRow>> smallTableValues; + + public TestData(int batchCount, TypeInfo[] bigTableTypeInfos, + ObjectInspector[] bigTableObjectInspectors, TypeInfo[] bigTableKeyTypeInfos, + Random random) + throws HiveException { + GenerateType[] generateTypes = generateTypesFromTypeInfos(bigTableTypeInfos); + VectorBatchGenerator generator = new VectorBatchGenerator(generateTypes); + + final int size = VectorizedRowBatch.DEFAULT_SIZE; + + bigTableBatchesLength = batchCount; + bigTableBatchesSize = bigTableBatchesLength * size; + + bigTableBatches = createBigTableBatches(generator, bigTableBatchesLength); + for (int i = 0; i < bigTableBatchesLength; i++) { + generator.generateBatch(bigTableBatches[i], random, size); + bigTableBatches[i].cols[0].isRepeating = true; + } + + VectorExtractRow vectorExtractRow = new VectorExtractRow(); + vectorExtractRow.init(bigTableKeyTypeInfos); + + bigTableTestKeys = + getTestKeys(bigTableBatches, vectorExtractRow, bigTableKeyTypeInfos.length, + bigTableObjectInspectors); + + smallTableKeyHashMap = new HashMap<TestRow, Integer>(); + final int keyProbes = bigTableBatchesSize / 2; + for (int i = 0; i < keyProbes; i++) { + int index = random.nextInt(bigTableBatchesSize); + TestRow bigTableTestKey = bigTableTestKeys[index]; + smallTableKeyHashMap.put((TestRow) bigTableTestKey.clone(), -1); + } + + // Add more small table keys that are not in Big Table batches. + final int smallTableAdditionalLength = 1 + random.nextInt(4); + final int smallTableAdditionalSize = smallTableAdditionalLength * size; + VectorizedRowBatch[] smallTableAdditionalBatches = createBigTableBatches(generator, smallTableAdditionalLength); + for (int i = 0; i < smallTableAdditionalLength; i++) { + generator.generateBatch(smallTableAdditionalBatches[i], random, size); + } + TestRow[] additionalTestKeys = getTestKeys(smallTableAdditionalBatches, vectorExtractRow, bigTableKeyTypeInfos.length, bigTableObjectInspectors); + final int smallTableAdditionKeyProbes = smallTableAdditionalSize / 2; + for (int i = 0; i < smallTableAdditionKeyProbes; i++) { + int index = random.nextInt(smallTableAdditionalSize); + TestRow additionalTestKey = additionalTestKeys[index]; + smallTableKeyHashMap.put((TestRow) additionalTestKey.clone(), -1); + } + + // Number the test rows with collection order.
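+ // Each distinct small table key gets a sequential index; the index is used later to pair the key with its generated value count and value list.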
+ int addCount = 0; + for (Entry<TestRow, Integer> testRowEntry : smallTableKeyHashMap.entrySet()) { + testRowEntry.setValue(addCount++); + } + } + + private TestRow generateRandomSmallTableValueRow(TestMapJoinDescription testDesc, Random random) { + final int columnCount = testDesc.smallTableValueTypeInfos.length; + Object[] smallTableValueRow = VectorRandomRowSource.randomRow(columnCount, random, + testDesc.smallTableObjectInspectorsList, testDesc.smallTablePrimitiveCategories, + testDesc.smallTableValuePrimitiveTypeInfos); + for (int c = 0; c < smallTableValueRow.length; c++) { + smallTableValueRow[c] = ((PrimitiveObjectInspector) testDesc.smallTableObjectInspectors[c]).copyObject(smallTableValueRow[c]); + } + return new TestRow(smallTableValueRow); + } + + public void generateRandomSmallTableCounts(TestMapJoinDescription testDesc, Random random) { + smallTableValueCounts = new ArrayList<Integer>(); + for (Entry<TestRow, Integer> testKeyEntry : smallTableKeyHashMap.entrySet()) { + final int valueCount = 1 + random.nextInt(19); + smallTableValueCounts.add(valueCount); + } + } + + public void generateRandomSmallTableValues(TestMapJoinDescription testDesc, Random random) { + smallTableValues = new ArrayList<ArrayList<TestRow>>(); + for (Entry<TestRow, Integer> testKeyEntry : smallTableKeyHashMap.entrySet()) { + ArrayList<TestRow> valueList = new ArrayList<TestRow>(); + smallTableValues.add(valueList); + final int valueCount = smallTableValueCounts.get(testKeyEntry.getValue()); + for (int v = 0; v < valueCount; v++) { + valueList.add(generateRandomSmallTableValueRow(testDesc, random)); + } + } + } + } + + private void loadTableContainerData(TestMapJoinDescription testDesc, TestData testData, + MapJoinBytesTableContainer mapJoinTableContainer ) throws IOException, SerDeException { + + LazyBinarySerializeWrite valueSerializeWrite = null; + Output valueOutput = null; + if (testData.smallTableValues != null) { + valueSerializeWrite = new LazyBinarySerializeWrite(testDesc.smallTableValueTypeInfos.length); + valueOutput = new Output(); + } + BytesWritable valueBytesWritable = new BytesWritable(); + + BytesWritable keyBytesWritable = new BytesWritable(); + BinarySortableSerializeWrite keySerializeWrite = + new BinarySortableSerializeWrite(testDesc.bigTableKeyTypeInfos.length); + Output keyOutput = new Output(); + int round = 0; + boolean atLeastOneValueAdded = false; + while (true) { + for (Entry<TestRow, Integer> testRowEntry : testData.smallTableKeyHashMap.entrySet()) { + final int smallTableKeyIndex = testRowEntry.getValue(); + final int valueCount = testData.smallTableValueCounts.get(smallTableKeyIndex); + boolean addEntry = round + 1 <= valueCount; + + if (addEntry) { + atLeastOneValueAdded = true; + + TestRow valueRow = null; + if (testData.smallTableValues != null) { + ArrayList<TestRow> valueList = testData.smallTableValues.get(smallTableKeyIndex); + valueRow = valueList.get(round); + } + + Object[] smallTableKey = testRowEntry.getKey().getRow(); + keyOutput.reset(); + keySerializeWrite.set(keyOutput); + + for (int index = 0; index < testDesc.bigTableKeyTypeInfos.length; index++) { + + Writable keyWritable = (Writable) smallTableKey[index]; + + VerifyFastRow.serializeWrite( + keySerializeWrite, (PrimitiveTypeInfo) testDesc.bigTableKeyTypeInfos[index], keyWritable); + } + + keyBytesWritable.set(keyOutput.getData(), 0, keyOutput.getLength()); + + if (valueRow == null) { + // Empty value.
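+ // (INNER_BIG_ONLY and LEFT_SEMI variations generate no value rows, so an empty BytesWritable is stored for the key.)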
+ mapJoinTableContainer.putRow(keyBytesWritable, valueBytesWritable); + } else { + Object[] smallTableValue = valueRow.getRow(); + valueOutput.reset(); + valueSerializeWrite.set(valueOutput); + for (int index = 0; index < testDesc.smallTableValueTypeInfos.length; index++) { + + Writable valueWritable = (Writable) smallTableValue[index]; + + VerifyFastRow.serializeWrite( + valueSerializeWrite, (PrimitiveTypeInfo) testDesc.smallTableValueTypeInfos[index], valueWritable); + } + valueBytesWritable.set(valueOutput.getData(), 0, valueOutput.getLength()); + mapJoinTableContainer.putRow(keyBytesWritable, valueBytesWritable); + } + } + } + if (testData.smallTableValues == null || !atLeastOneValueAdded) { + break; + } + round++; + atLeastOneValueAdded = false; + } + mapJoinTableContainer.seal(); + } + + protected boolean hasFilter(MapJoinDesc mapJoinDesc, int alias) { + int[][] filterMaps = mapJoinDesc.getFilterMap(); + return filterMaps != null && filterMaps[alias] != null; + } + + protected MapJoinTableContainerSerDe createMapJoinTableContainerSerDe(MapJoinDesc mapJoinDesc) + throws SerDeException { + TableDesc keyTableDesc = mapJoinDesc.getKeyTblDesc(); + SerDe keySerializer = (SerDe) ReflectionUtil.newInstance( + keyTableDesc.getDeserializerClass(), null); + SerDeUtils.initializeSerDe(keySerializer, null, keyTableDesc.getProperties(), null); + MapJoinObjectSerDeContext keyContext = new MapJoinObjectSerDeContext(keySerializer, false); + + TableDesc valueTableDesc; + if (mapJoinDesc.getNoOuterJoin()) { + valueTableDesc = mapJoinDesc.getValueTblDescs().get(1); + } else { + valueTableDesc = mapJoinDesc.getValueFilteredTblDescs().get(1); + } + SerDe valueSerDe = (SerDe) ReflectionUtil.newInstance( + valueTableDesc.getDeserializerClass(), null); + SerDeUtils.initializeSerDe(valueSerDe, null, valueTableDesc.getProperties(), null); + MapJoinObjectSerDeContext valueContext = + new MapJoinObjectSerDeContext(valueSerDe, hasFilter(mapJoinDesc, 1)); + MapJoinTableContainerSerDe mapJoinTableContainerSerDe = + new MapJoinTableContainerSerDe(keyContext, valueContext); + return mapJoinTableContainerSerDe; + } + + private void connectOperators(TestMapJoinDescription testDesc, + Operator<? extends OperatorDesc> operator, + Operator<? extends OperatorDesc> testCollectorOperator) throws HiveException { + Operator[] parents = new Operator[] {operator}; + testCollectorOperator.setParentOperators(Arrays.asList(parents)); + Operator[] childOperators = new Operator[] {testCollectorOperator}; + operator.setChildOperators(Arrays.asList(childOperators)); + HiveConf.setBoolVar(testDesc.hiveConf, + HiveConf.ConfVars.HIVE_MAPJOIN_TESTING_NO_HASH_TABLE_LOAD, true); + operator.initialize(testDesc.hiveConf, testDesc.inputObjectInspectors); + } + + private void driveVectorBigTableData(TestMapJoinDescription testDesc, TestData testData, + MapJoinOperator operator) throws HiveException { + for (int i = 0; i < testData.bigTableBatches.length; i++) { + VectorizedRowBatch bigTableBatch = testData.bigTableBatches[i]; + ColumnVector[] newCols = new ColumnVector[bigTableBatch.cols.length + testDesc.smallTableValueTypeInfos.length]; + System.arraycopy(bigTableBatch.cols, 0, newCols, 0, bigTableBatch.cols.length); + for (int s = 0; s < testDesc.smallTableValueTypeInfos.length; s++) { + newCols[bigTableBatch.cols.length + s] = + VectorizedBatchUtil.createColumnVector(testDesc.smallTableValueTypeInfos[s]); + } + bigTableBatch.cols = newCols; + bigTableBatch.numCols = newCols.length; + + operator.process(bigTableBatch, 0); + } + operator.closeOp(false); + } + + private void
testMapJoin(TestMapJoinDescription testDesc, TestData testData, + MapJoinDesc mapJoinDesc, boolean isVectorMapJoin, TestRowMultiSet expectedTestRowMultiSet) + throws SerDeException, IOException, HiveException { + + BytesBytesMultiHashMapFactory mapJoinHashTableFactory = + new BytesBytesMultiHashMapFactory(); + + MapJoinBytesTableContainer mapJoinTableContainer = + new MapJoinBytesTableContainer(testDesc.hiveConf, null, testData.smallTableKeyHashMap.size(), 0, + mapJoinHashTableFactory); + + MapJoinTableContainerSerDe mapJoinTableContainerSerDe = + createMapJoinTableContainerSerDe(mapJoinDesc); + + mapJoinTableContainer.setSerde( + mapJoinTableContainerSerDe.getKeyContext(), + mapJoinTableContainerSerDe.getValueContext()); + + loadTableContainerData(testDesc, testData, mapJoinTableContainer); + + MapJoinOperator operator; + Operator testCollectorOperator; + if (!isVectorMapJoin) { + operator = new MapJoinOperator(new CompilationOpContext()); + operator.setConf(mapJoinDesc); + testCollectorOperator = new TestCollectorOperator(testDesc.outputObjectInspectors); + } else { + VectorizationContext vContext = new VectorizationContext("test", testDesc.bigTableColumnNamesList); + // Create scratch columns to hold small table results. + for (int i = 0; i < testDesc.smallTableValueTypeInfos.length; i++) { + vContext.allocateScratchColumn(testDesc.smallTableValueTypeInfos[i].getTypeName()); + } + + // This is what the Vectorizer class does. + List bigTableFilters = mapJoinDesc.getFilters().get((byte) 0); + boolean isOuterAndFiltered = (!mapJoinDesc.isNoOuterJoin() && bigTableFilters.size() > 0); + if (!isOuterAndFiltered) { + operator = new VectorMapJoinOperator(new CompilationOpContext(), vContext, mapJoinDesc); + } else { + operator = new VectorMapJoinOuterFilteredOperator(new CompilationOpContext(), vContext, mapJoinDesc); + } + testCollectorOperator = new TestVectorCollectorOperator(testDesc.outputTypeInfos, testDesc.outputObjectInspectors); + } + + connectOperators(testDesc, operator, testCollectorOperator); + + operator.setTestMapJoinTableContainer(1, mapJoinTableContainer, mapJoinTableContainerSerDe); + + if (!isVectorMapJoin) { + VectorExtractRow vectorExtractRow = new VectorExtractRow(); + vectorExtractRow.init(testDesc.bigTableKeyTypeInfos); + + for (int i = 0; i < testData.bigTableBatches.length; i++) { + VectorizedRowBatch bigTableBatch = testData.bigTableBatches[i]; + + // Extract rows and call process per row + final int columnCount = bigTableBatch.cols.length; + for (int r = 0; r < bigTableBatch.size; r++) { + Object[] rowObjects = new Object[columnCount]; + vectorExtractRow.extractRow(bigTableBatch, r, rowObjects); + for (int c = 0; c < rowObjects.length; c++) { + rowObjects[c] = ((PrimitiveObjectInspector) testDesc.bigTableObjectInspectors[c]).copyObject(rowObjects[c]); + } + operator.process(rowObjects, 0); + } + } + operator.closeOp(false); + } else { + driveVectorBigTableData(testDesc, testData, operator); + } + + if (!isVectorMapJoin) { + TestRowMultiSet actual = ((TestCollectorOperator) testCollectorOperator).getTestRowMultiSet(); + if (!expectedTestRowMultiSet.equals(actual)) { + failures++; + } + } else { + TestRowMultiSet actual = ((TestVectorCollectorOperator) testCollectorOperator).getTestRowMultiSet(); + if (!expectedTestRowMultiSet.equals(actual)) { + failures++; + } + } + } + + private void testNativeVectorMapJoin(TestMapJoinDescription testDesc, TestData testData, + MapJoinDesc mapJoinDesc, TestRowMultiSet expectedTestRowMultiSet) + throws SerDeException, IOException, 
HiveException { + + VectorMapJoinDesc vectorDesc = createVectorMapJoinDesc(testDesc); + mapJoinDesc.setVectorDesc(vectorDesc); + + VectorMapJoinFastHashTableFactory mapJoinHashTableFactory = + new VectorMapJoinFastHashTableFactory(mapJoinDesc); + + MapJoinBytesTableContainer mapJoinTableContainer = + new MapJoinBytesTableContainer(testDesc.hiveConf, null, testData.smallTableKeyHashMap.size(), 0, + mapJoinHashTableFactory); + + // No key or value context needed for native vector map join. + mapJoinTableContainer.setSerde(null, null); + + loadTableContainerData(testDesc, testData, mapJoinTableContainer); + + VectorizationContext vContext = new VectorizationContext("test", testDesc.bigTableColumnNamesList); + // Create scratch columns to hold small table results. + for (int i = 0; i < testDesc.smallTableValueTypeInfos.length; i++) { + vContext.allocateScratchColumn(testDesc.smallTableValueTypeInfos[i].getTypeName()); + } + + VectorMapJoinCommonOperator operator; + switch (vectorDesc.hashTableKeyType()) { + case BOOLEAN: + case BYTE: + case SHORT: + case INT: + case LONG: + switch (testDesc.operatorVariation) { + case INNER: + operator = + new VectorMapJoinInnerLongOperator(new CompilationOpContext(), + vContext, mapJoinDesc); + break; + case INNER_BIG_ONLY: + operator = + new VectorMapJoinInnerBigOnlyLongOperator(new CompilationOpContext(), + vContext, mapJoinDesc); + break; + case LEFT_SEMI: + operator = + new VectorMapJoinLeftSemiLongOperator(new CompilationOpContext(), + vContext, mapJoinDesc); + break; + case OUTER: + operator = + new VectorMapJoinOuterLongOperator(new CompilationOpContext(), + vContext, mapJoinDesc); + break; + default: + throw new RuntimeException("unknown operator variation " + testDesc.operatorVariation); + } + break; + case STRING: + switch (testDesc.operatorVariation) { + case INNER: + operator = + new VectorMapJoinInnerStringOperator(new CompilationOpContext(), + vContext, mapJoinDesc); + break; + case INNER_BIG_ONLY: + operator = + new VectorMapJoinInnerBigOnlyStringOperator(new CompilationOpContext(), + vContext, mapJoinDesc); + break; + case LEFT_SEMI: + operator = + new VectorMapJoinLeftSemiStringOperator(new CompilationOpContext(), + vContext, mapJoinDesc); + break; + case OUTER: + operator = + new VectorMapJoinOuterStringOperator(new CompilationOpContext(), + vContext, mapJoinDesc); + break; + default: + throw new RuntimeException("unknown operator variation " + testDesc.operatorVariation); + } + break; + case MULTI_KEY: + switch (testDesc.operatorVariation) { + case INNER: + operator = + new VectorMapJoinInnerMultiKeyOperator(new CompilationOpContext(), + vContext, mapJoinDesc); + break; + case INNER_BIG_ONLY: + operator = + new VectorMapJoinInnerBigOnlyMultiKeyOperator(new CompilationOpContext(), + vContext, mapJoinDesc); + break; + case LEFT_SEMI: + operator = + new VectorMapJoinLeftSemiMultiKeyOperator(new CompilationOpContext(), + vContext, mapJoinDesc); + break; + case OUTER: + operator = + new VectorMapJoinOuterMultiKeyOperator(new CompilationOpContext(), + vContext, mapJoinDesc); + break; + default: + throw new RuntimeException("unknown operator variation " + testDesc.operatorVariation); + } + break; + default: + throw new RuntimeException("Unknown hash table key type " + vectorDesc.hashTableKeyType()); + } + + Operator testVectorCollectorOperator = + new TestVectorCollectorOperator(testDesc.outputTypeInfos, testDesc.outputObjectInspectors); + + connectOperators(testDesc, operator, testVectorCollectorOperator); + + 
operator.setTestMapJoinTableContainer(1, mapJoinTableContainer, null); + + driveVectorBigTableData(testDesc, testData, operator); + + TestRowMultiSet actual = ((TestVectorCollectorOperator) testVectorCollectorOperator).getTestRowMultiSet(); + if (!expectedTestRowMultiSet.equals(actual)) { + // failures++; + assertTrue(false); + } + } + + static int failures = 0; + + private void addBigTableRetained(TestMapJoinDescription testDesc, Object[] bigTableRowObjects, + Object[] outputObjects) { + final int bigTableRetainColumnNumsLength = testDesc.bigTableRetainColumnNums.length; + for (int o = 0; o < bigTableRetainColumnNumsLength; o++) { + outputObjects[o] = bigTableRowObjects[testDesc.bigTableRetainColumnNums[o]]; + } + } + + private void addToOutput(TestMapJoinDescription testDesc, TestRowMultiSet expectedTestRowMultiSet, + Object[] outputObjects) { + for (int c = 0; c < outputObjects.length; c++) { + PrimitiveObjectInspector primitiveObjInsp = ((PrimitiveObjectInspector) testDesc.outputObjectInspectors[c]); + Object outputObject = outputObjects[c]; + outputObjects[c] = primitiveObjInsp.copyObject(outputObject); + } + expectedTestRowMultiSet.add(new TestRow(outputObjects)); + } + + private TestRowMultiSet createExpectedTestRowMultiSet(TestMapJoinDescription testDesc, + TestData testData) throws HiveException { + + TestRowMultiSet expectedTestRowMultiSet = new TestRowMultiSet(); + + VectorExtractRow vectorExtractRow = new VectorExtractRow(); + vectorExtractRow.init(testDesc.bigTableKeyTypeInfos); + + final int bigTableColumnCount = testDesc.bigTableTypeInfos.length; + Object[] bigTableRowObjects = new Object[bigTableColumnCount]; + + final int bigTableKeyColumnCount = testDesc.bigTableKeyTypeInfos.length; + Object[] bigTableKeyObjects = new Object[bigTableKeyColumnCount]; + + for (int i = 0; i < testData.bigTableBatches.length; i++) { + VectorizedRowBatch bigTableBatch = testData.bigTableBatches[i]; + + for (int r = 0; r < bigTableBatch.size; r++) { + vectorExtractRow.extractRow(bigTableBatch, r, bigTableRowObjects); + + // Form key object array + for (int k = 0; k < bigTableKeyColumnCount; k++) { + int keyColumnNum = testDesc.bigTableKeyColumnNums[k]; + bigTableKeyObjects[k] = bigTableRowObjects[keyColumnNum]; + bigTableKeyObjects[k] = ((PrimitiveObjectInspector) testDesc.bigTableObjectInspectors[keyColumnNum]).copyObject(bigTableKeyObjects[k]); + } + TestRow testKey = new TestRow(bigTableKeyObjects); + + if (testData.smallTableKeyHashMap.containsKey(testKey)) { + + int smallTableKeyIndex = testData.smallTableKeyHashMap.get(testKey); + + switch (testDesc.operatorVariation) { + case INNER: + case OUTER: + { + // One row per value. + ArrayList valueList = testData.smallTableValues.get(smallTableKeyIndex); + final int valueCount = valueList.size(); + for (int v = 0; v < valueCount; v++) { + Object[] outputObjects = new Object[testDesc.outputColumnNames.length]; + + addBigTableRetained(testDesc, bigTableRowObjects, outputObjects); + + Object[] valueRow = valueList.get(v).getRow(); + final int bigTableRetainColumnNumsLength = testDesc.bigTableRetainColumnNums.length; + final int smallTableRetainValueColumnNumsLength = testDesc.smallTableRetainValueColumnNums.length; + for (int o = 0; o < smallTableRetainValueColumnNumsLength; o++) { + outputObjects[bigTableRetainColumnNumsLength + o] = valueRow[testDesc.smallTableRetainValueColumnNums[o]]; + } + + addToOutput(testDesc, expectedTestRowMultiSet, outputObjects); + } + } + break; + case INNER_BIG_ONLY: + { + // Value count rows. 
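+ // INNER_BIG_ONLY repeats the big-table row once per matching small-table row; only the match count matters, not the value contents.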
+ final int valueCount = testData.smallTableValueCounts.get(smallTableKeyIndex); + for (int v = 0; v < valueCount; v++) { + Object[] outputObjects = new Object[testDesc.outputColumnNames.length]; + + addBigTableRetained(testDesc, bigTableRowObjects, outputObjects); + addToOutput(testDesc, expectedTestRowMultiSet, outputObjects); + } + } + break; + case LEFT_SEMI: + { + // One row (existence). + Object[] outputObjects = new Object[testDesc.outputColumnNames.length]; + + addBigTableRetained(testDesc, bigTableRowObjects, outputObjects); + addToOutput(testDesc, expectedTestRowMultiSet, outputObjects); + } + break; + default: + throw new RuntimeException("Unknown operator variation " + testDesc.operatorVariation); + } + + } else { + + // No match. + + if (testDesc.operatorVariation == OperatorVariation.OUTER) { + + // We need to add a non-match row with nulls for small table values. + + Object[] outputObjects = new Object[testDesc.outputColumnNames.length]; + + addBigTableRetained(testDesc, bigTableRowObjects, outputObjects); + + final int bigTableRetainColumnNumsLength = testDesc.bigTableRetainColumnNums.length; + final int smallTableRetainValueColumnNumsLength = testDesc.smallTableRetainValueColumnNums.length; + for (int o = 0; o < smallTableRetainValueColumnNumsLength; o++) { + outputObjects[bigTableRetainColumnNumsLength + o] = null; + } + + addToOutput(testDesc, expectedTestRowMultiSet, outputObjects); + } + } + } + } + + return expectedTestRowMultiSet; + } + + private void execute(TestMapJoinDescription testDesc, Random random) throws Exception { + + switch (testDesc.operatorVariation) { + case INNER_BIG_ONLY: + case LEFT_SEMI: + testDesc.trimAwaySmallTableValueInfo(); + break; + case INNER: + case OUTER: + break; + default: + throw new RuntimeException("Unknown operator variation " + testDesc.operatorVariation); + } + + testDesc.computeDerived(); + + TestData testData = new TestData(testDesc.batchCount, testDesc.bigTableTypeInfos, + testDesc.bigTableObjectInspectors, testDesc.bigTableKeyTypeInfos, random); + + switch (testDesc.operatorVariation) { + case INNER_BIG_ONLY: + case LEFT_SEMI: + testData.generateRandomSmallTableCounts(testDesc, random); + break; + case INNER: + case OUTER: + testData.generateRandomSmallTableCounts(testDesc, random); + testData.generateRandomSmallTableValues(testDesc, random); + break; + default: + throw new RuntimeException("Unknown operator variation " + testDesc.operatorVariation); + } + + TestRowMultiSet expectedTestRowMultiSet = + createExpectedTestRowMultiSet(testDesc, testData); + + MapJoinDesc mapJoinDesc = createMapJoinDesc(testDesc); + + // testMapJoin(testDesc, testData, mapJoinDesc, /* isVectorMapJoin */ false, expectedTestRowMultiSet); + + // testMapJoin(testDesc, testData, mapJoinDesc, /* isVectorMapJoin */ true, expectedTestRowMultiSet); + + testNativeVectorMapJoin(testDesc, testData, mapJoinDesc, expectedTestRowMultiSet); + } +} \ No newline at end of file diff --git ql/src/test/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/CheckFastHashTable.java ql/src/test/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/CheckFastHashTable.java index 28a4dc6..713d70a 100644 --- ql/src/test/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/CheckFastHashTable.java +++ ql/src/test/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/CheckFastHashTable.java @@ -30,13 +30,15 @@ import junit.framework.TestCase; import org.apache.hadoop.hive.ql.exec.JoinUtil; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashMapResult; +import 
org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashMultiSetResult; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashSetResult; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableResult.MapJoinResult; import org.apache.hadoop.hive.ql.exec.vector.expressions.StringExpr; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashMapResult; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashMultiSetResult; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashSetResult; import org.apache.hadoop.hive.serde2.WriteBuffers; import org.apache.hadoop.io.BytesWritable; import org.apache.hadoop.io.WritableComparator; +import org.apache.hive.common.util.HashCodeUtil; import com.google.common.base.Preconditions; @@ -59,7 +61,7 @@ public static boolean findMatch(int valueIndex, byte[] valueBytes, List return false; } - public static int[] verifyHashMapValues(VectorMapJoinHashMapResult hashMapResult, + public static int[] verifyHashMapValues(MapJoinHashMapResult hashMapResult, List values) { int valueCount = values.size(); @@ -197,7 +199,7 @@ public long getKey(int index) { return array[index].getValues(); } - public void verify(VectorMapJoinFastLongHashMap map) { + public void verify(VectorMapJoinFastHashTableFactory factory, VectorMapJoinFastLongHashMap map) { int mapSize = map.size(); if (mapSize != count) { TestCase.fail("map.size() does not match expected count"); @@ -208,9 +210,10 @@ public void verify(VectorMapJoinFastLongHashMap map) { long key = element.getKey(); List values = element.getValues(); - VectorMapJoinHashMapResult hashMapResult = map.createHashMapResult(); - JoinUtil.JoinResult joinResult = map.lookup(key, hashMapResult); - if (joinResult != JoinUtil.JoinResult.MATCH) { + MapJoinHashMapResult hashMapResult = factory.createHashMapResult(); + int hashCode = HashCodeUtil.calculateLongHashCode(key); + map.hashMapLookup(key, hashCode, hashMapResult); + if (hashMapResult.getMapJoinResult() != MapJoinResult.MATCH) { assertTrue(false); } @@ -310,7 +313,7 @@ public void add(byte[] key, byte[] value) { return array[index].getValues(); } - public void verify(VectorMapJoinFastBytesHashMap map) { + public void verify(VectorMapJoinFastHashTableFactory factory, VectorMapJoinFastBytesHashMap map) { int mapSize = map.size(); if (mapSize != count) { TestCase.fail("map.size() does not match expected count"); @@ -321,9 +324,10 @@ public void verify(VectorMapJoinFastBytesHashMap map) { byte[] key = element.getKey(); List values = element.getValues(); - VectorMapJoinHashMapResult hashMapResult = map.createHashMapResult(); - JoinUtil.JoinResult joinResult = map.lookup(key, 0, key.length, hashMapResult); - if (joinResult != JoinUtil.JoinResult.MATCH) { + MapJoinHashMapResult hashMapResult = factory.createHashMapResult(); + int hashCode = HashCodeUtil.murmurHash(key, 0, key.length); + map.hashMapLookup(key, 0, key.length, hashCode, hashMapResult); + if (hashMapResult.getMapJoinResult() != MapJoinResult.MATCH) { assertTrue(false); } @@ -414,7 +418,7 @@ public int getMultiSetCount(int index) { return array[index].getMultiSetCount(); } - public void verify(VectorMapJoinFastLongHashMultiSet map) { + public void verify(VectorMapJoinFastHashTableFactory factory, VectorMapJoinFastLongHashMultiSet map) { int mapSize = map.size(); if (mapSize != count) { TestCase.fail("map.size() does not match expected count"); @@ -425,9 +429,10 @@ public void 
verify(VectorMapJoinFastLongHashMultiSet map) { long key = element.getKey(); int multiSetCount = element.getMultiSetCount(); - VectorMapJoinHashMultiSetResult hashMultiSetResult = map.createHashMultiSetResult(); - JoinUtil.JoinResult joinResult = map.contains(key, hashMultiSetResult); - if (joinResult != JoinUtil.JoinResult.MATCH) { + MapJoinHashMultiSetResult hashMultiSetResult = factory.createHashMultiSetResult(); + int hashCode = HashCodeUtil.calculateLongHashCode(key); + map.hashMultiSetContains(key, hashCode, hashMultiSetResult); + if (hashMultiSetResult.getMapJoinResult() != MapJoinResult.MATCH) { assertTrue(false); } @@ -522,7 +527,7 @@ public int getMultiSetCount(int index) { return array[index].getMultiSetCount(); } - public void verify(VectorMapJoinFastBytesHashMultiSet map) { + public void verify(VectorMapJoinFastHashTableFactory factory, VectorMapJoinFastBytesHashMultiSet map) { int mapSize = map.size(); if (mapSize != count) { TestCase.fail("map.size() does not match expected count"); @@ -533,9 +538,10 @@ public void verify(VectorMapJoinFastBytesHashMultiSet map) { byte[] key = element.getKey(); int multiSetCount = element.getMultiSetCount(); - VectorMapJoinHashMultiSetResult hashMultiSetResult = map.createHashMultiSetResult(); - JoinUtil.JoinResult joinResult = map.contains(key, 0, key.length, hashMultiSetResult); - if (joinResult != JoinUtil.JoinResult.MATCH) { + MapJoinHashMultiSetResult hashMultiSetResult = factory.createHashMultiSetResult(); + int hashCode = HashCodeUtil.murmurHash(key, 0, key.length); + map.hashMultiSetContains(key, 0, key.length, hashCode, hashMultiSetResult); + if (hashMultiSetResult.getMapJoinResult() != MapJoinResult.MATCH) { assertTrue(false); } @@ -613,7 +619,7 @@ public long getKey(int index) { return array[index].getKey(); } - public void verify(VectorMapJoinFastLongHashSet map) { + public void verify(VectorMapJoinFastHashTableFactory factory, VectorMapJoinFastLongHashSet map) { int mapSize = map.size(); if (mapSize != count) { TestCase.fail("map.size() does not match expected count"); @@ -623,9 +629,10 @@ public void verify(VectorMapJoinFastLongHashSet map) { FastLongHashSetElement element = array[index]; long key = element.getKey(); - VectorMapJoinHashSetResult hashSetResult = map.createHashSetResult(); - JoinUtil.JoinResult joinResult = map.contains(key, hashSetResult); - if (joinResult != JoinUtil.JoinResult.MATCH) { + MapJoinHashSetResult hashSetResult = factory.createHashSetResult(); + int hashCode = HashCodeUtil.calculateLongHashCode(key); + map.hashSetContains(key, hashCode, hashSetResult); + if (hashSetResult.getMapJoinResult() != MapJoinResult.MATCH) { assertTrue(false); } } @@ -705,7 +712,7 @@ public void add(byte[] key) { return array[index].getKey(); } - public void verify(VectorMapJoinFastBytesHashSet map) { + public void verify(VectorMapJoinFastHashTableFactory factory, VectorMapJoinFastBytesHashSet map) { int mapSize = map.size(); if (mapSize != count) { TestCase.fail("map.size() does not match expected count"); @@ -715,9 +722,10 @@ public void verify(VectorMapJoinFastBytesHashSet map) { FastBytesHashSetElement element = array[index]; byte[] key = element.getKey(); - VectorMapJoinHashSetResult hashSetResult = map.createHashSetResult(); - JoinUtil.JoinResult joinResult = map.contains(key, 0, key.length, hashSetResult); - if (joinResult != JoinUtil.JoinResult.MATCH) { + MapJoinHashSetResult hashSetResult = factory.createHashSetResult(); + int hashCode = HashCodeUtil.murmurHash(key, 0, key.length); + map.hashSetContains(key, 0, 
key.length, hashCode, hashSetResult); + if (hashSetResult.getMapJoinResult() != MapJoinResult.MATCH) { assertTrue(false); } } diff --git ql/src/test/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/CheckFastRowHashMap.java ql/src/test/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/CheckFastRowHashMap.java index 0bcfb56..a9d584b 100644 --- ql/src/test/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/CheckFastRowHashMap.java +++ ql/src/test/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/CheckFastRowHashMap.java @@ -31,7 +31,9 @@ import junit.framework.TestCase; import org.apache.hadoop.hive.ql.exec.JoinUtil; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashMapResult; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashMapResult; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableFactory; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableResult.MapJoinResult; import org.apache.hadoop.hive.ql.plan.VectorMapJoinDesc.HashTableKeyType; import org.apache.hadoop.hive.serde2.WriteBuffers; import org.apache.hadoop.hive.serde2.io.ByteWritable; @@ -45,13 +47,14 @@ import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.Writable; +import org.apache.hive.common.util.HashCodeUtil; import com.google.common.base.Preconditions; public class CheckFastRowHashMap extends CheckFastHashTable { public static void verifyHashMapRows(List rows, int[] actualToValueMap, - VectorMapJoinHashMapResult hashMapResult, TypeInfo[] typeInfos) throws IOException { + MapJoinHashMapResult hashMapResult, TypeInfo[] typeInfos) throws IOException { final int count = rows.size(); final int columnCount = typeInfos.length; @@ -99,7 +102,7 @@ public static void verifyHashMapRows(List rows, int[] actualToValueMap private static StackTraceElement[] debugStackTrace; public static void verifyHashMapRowsMore(List rows, int[] actualToValueMap, - VectorMapJoinHashMapResult hashMapResult, TypeInfo[] typeInfos, + MapJoinHashMapResult hashMapResult, TypeInfo[] typeInfos, int clipIndex, boolean useExactBytes) throws IOException { final int count = rows.size(); @@ -288,7 +291,7 @@ public void add(byte[] key, Object[] keyRow, byte[] value, Object[] valueRow) { return array[index].getValues(); } - public void verify(VectorMapJoinFastHashTable map, + public void verify(MapJoinHashTableFactory factory, VectorMapJoinFastHashTable map, HashTableKeyType hashTableKeyType, PrimitiveTypeInfo[] valuePrimitiveTypeInfos, boolean doClipping, boolean useExactBytes, Random random) throws IOException { @@ -302,8 +305,7 @@ public void verify(VectorMapJoinFastHashTable map, List values = element.getValues(); - VectorMapJoinHashMapResult hashMapResult = null; - JoinUtil.JoinResult joinResult = JoinUtil.JoinResult.NOMATCH; + MapJoinHashMapResult hashMapResult = null; switch (hashTableKeyType) { case BOOLEAN: case BYTE: @@ -314,7 +316,7 @@ public void verify(VectorMapJoinFastHashTable map, Object[] keyRow = element.getKeyRow(); Object keyObject = keyRow[0]; VectorMapJoinFastLongHashMap longHashMap = (VectorMapJoinFastLongHashMap) map; - hashMapResult = longHashMap.createHashMapResult(); + hashMapResult = factory.createHashMapResult(); long longKey; switch (hashTableKeyType) { case BOOLEAN: @@ -335,8 +337,9 @@ public void verify(VectorMapJoinFastHashTable map, default: throw new RuntimeException("Unexpected hash table key type " + hashTableKeyType.name()); } - joinResult = 
longHashMap.lookup(longKey, hashMapResult); - if (joinResult != JoinUtil.JoinResult.MATCH) { + int hashCode = HashCodeUtil.calculateLongHashCode(longKey); + longHashMap.hashMapLookup(longKey, hashCode, hashMapResult); + if (hashMapResult.getMapJoinResult() != MapJoinResult.MATCH) { assertTrue(false); } } @@ -346,12 +349,13 @@ public void verify(VectorMapJoinFastHashTable map, Object[] keyRow = element.getKeyRow(); Object keyObject = keyRow[0]; VectorMapJoinFastStringHashMap stringHashMap = (VectorMapJoinFastStringHashMap) map; - hashMapResult = stringHashMap.createHashMapResult(); + hashMapResult = factory.createHashMapResult(); Text text = (Text) keyObject; byte[] bytes = text.getBytes(); int length = text.getLength(); - joinResult = stringHashMap.lookup(bytes, 0, length, hashMapResult); - if (joinResult != JoinUtil.JoinResult.MATCH) { + int hashCode = HashCodeUtil.murmurHash(bytes, 0, length); + stringHashMap.hashMapLookup(bytes, 0, length, hashCode, hashMapResult); + if (hashMapResult.getMapJoinResult() != MapJoinResult.MATCH) { assertTrue(false); } } @@ -360,9 +364,10 @@ public void verify(VectorMapJoinFastHashTable map, { byte[] keyBytes = element.getKey(); VectorMapJoinFastMultiKeyHashMap stringHashMap = (VectorMapJoinFastMultiKeyHashMap) map; - hashMapResult = stringHashMap.createHashMapResult(); - joinResult = stringHashMap.lookup(keyBytes, 0, keyBytes.length, hashMapResult); - if (joinResult != JoinUtil.JoinResult.MATCH) { + hashMapResult = factory.createHashMapResult(); + int hashCode = HashCodeUtil.murmurHash(keyBytes, 0, keyBytes.length); + stringHashMap.hashMapLookup(keyBytes, 0, keyBytes.length, hashCode, hashMapResult); + if (hashMapResult.getMapJoinResult() != MapJoinResult.MATCH) { assertTrue(false); } } diff --git ql/src/test/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/CommonFastHashTable.java ql/src/test/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/CommonFastHashTable.java index 90e8f33..093a36d 100644 --- ql/src/test/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/CommonFastHashTable.java +++ ql/src/test/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/CommonFastHashTable.java @@ -18,8 +18,16 @@ package org.apache.hadoop.hive.ql.exec.vector.mapjoin.fast; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; import java.util.Random; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashMapResult; +import org.apache.hadoop.hive.serde2.WriteBuffers; + +import static org.junit.Assert.*; + public class CommonFastHashTable { protected static final float LOAD_FACTOR = 0.75f; @@ -71,4 +79,53 @@ public static int generateLargeCount() { } return count; } + + public static void verifyHashMapResult(MapJoinHashMapResult hashMapResult, + RandomByteArrayStream randomByteArrayStream ) { + + List resultBytes = new ArrayList(); + int count = 0; + if (hashMapResult.hasRows()) { + WriteBuffers.ByteSegmentRef ref = hashMapResult.first(); + while (ref != null) { + count++; + byte[] bytes = ref.getBytes(); + int offset = (int) ref.getOffset(); + int length = ref.getLength(); + resultBytes.add(Arrays.copyOfRange(bytes, offset, offset + length)); + ref = hashMapResult.next(); + } + } + if (randomByteArrayStream.size() != count) { + assertTrue(false); + } + + for (int i = 0; i < count; ++i) { + byte[] bytes = resultBytes.get(i); + if (!randomByteArrayStream.contains(bytes)) { + assertTrue(false); + } + } + } + + public static void verifyHashMapResult(MapJoinHashMapResult hashMapResult, + byte[] valueBytes ) { + + 
assertTrue(hashMapResult.hasRows()); + WriteBuffers.ByteSegmentRef ref = hashMapResult.first(); + byte[] bytes = ref.getBytes(); + int offset = (int) ref.getOffset(); + int length = ref.getLength(); + assertTrue(valueBytes.length == length); + boolean match = true; // Assume + for (int j = 0; j < length; j++) { + if (valueBytes[j] != bytes[offset + j]) { + match = false; + break; + } + } + if (!match) { + assertTrue(false); + } + } } \ No newline at end of file diff --git ql/src/test/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/TestKeyValueLong.java ql/src/test/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/TestKeyValueLong.java new file mode 100644 index 0000000..2182e89 --- /dev/null +++ ql/src/test/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/TestKeyValueLong.java @@ -0,0 +1,42 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.vector.mapjoin.fast; + +import java.io.IOException; +import java.util.Arrays; + +import org.apache.hadoop.hive.serde2.ByteStream.Output; +import org.apache.hadoop.hive.serde2.binarysortable.fast.BinarySortableSerializeWrite; +import org.apache.hadoop.io.BytesWritable; + +public class TestKeyValueLong { + + private BinarySortableSerializeWrite serializeWrite = new BinarySortableSerializeWrite(1); + private Output output = new Output(); + + public TestKeyValueLong() { + } + + public BytesWritable getLongAsBytesWritable(long longValue) throws IOException { + serializeWrite.set(output); + serializeWrite.writeLong(longValue); + byte[] bytes = Arrays.copyOf(output.getData(), output.getLength()); + return new BytesWritable(bytes); + } +} \ No newline at end of file diff --git ql/src/test/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/TestLongKeyValueWriter.java ql/src/test/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/TestLongKeyValueWriter.java new file mode 100644 index 0000000..273fa51 --- /dev/null +++ ql/src/test/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/TestLongKeyValueWriter.java @@ -0,0 +1,82 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.vector.mapjoin.fast; + +import java.io.IOException; + +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTable; +import org.apache.hadoop.hive.serde2.ByteStream.RandomAccessOutput; +import org.apache.hadoop.hive.serde2.lazybinary.LazyBinaryUtils; +import org.apache.hadoop.hive.serde2.SerDeException; +import org.apache.hadoop.io.Writable; +import org.apache.hive.common.util.HashCodeUtil; + +public class TestLongKeyValueWriter implements MapJoinHashTable.KeyValuePutWriter { + + private long key; + private byte[] value; + + public TestLongKeyValueWriter() { + } + + void setLongKeyValue(long key, byte[] value) { + this.key = key; + this.value = value; + } + + @Override + public void writeKey(RandomAccessOutput dest) throws SerDeException { + LazyBinaryUtils.writeVLong(dest, key); + } + + @Override + public void writeValue(RandomAccessOutput dest) throws SerDeException { + try { + dest.write(value); + } catch (Exception e) { + throw new SerDeException(e); + } + } + + @Override + public byte updateStateByte(Byte previousValue) { + return 0; + } + + @Override + public void setKeyValue(Writable key, Writable value) throws SerDeException, + IOException { + throw new RuntimeException("Not used"); + } + + @Override + public boolean hasHashCode() { + return true; + } + + @Override + public int getKeyHashCode() throws SerDeException { + return HashCodeUtil.calculateLongHashCode(key); + } + + @Override + public long getLongKey() { + throw new RuntimeException("Not used"); + } +} \ No newline at end of file diff --git ql/src/test/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/TestVectorMapJoinFastBytesHashMap.java ql/src/test/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/TestVectorMapJoinFastBytesHashMap.java index 8525e99..e37174c 100644 --- ql/src/test/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/TestVectorMapJoinFastBytesHashMap.java +++ ql/src/test/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/TestVectorMapJoinFastBytesHashMap.java @@ -24,13 +24,17 @@ import java.util.Random; import org.apache.hadoop.hive.ql.exec.JoinUtil; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashMapResult; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableResult.MapJoinResult; import org.apache.hadoop.hive.ql.exec.vector.mapjoin.fast.CheckFastHashTable.VerifyFastBytesHashMap; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashMapResult; import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.plan.VectorMapJoinDesc.HashTableKeyType; +import org.apache.hadoop.hive.ql.plan.VectorMapJoinDesc.HashTableKind; +import org.apache.hive.common.util.HashCodeUtil; import org.junit.Test; /* - * An multi-key value hash map optimized for vector map join. + * An multi-key value hash hashMap optimized for vector hashMap join. * * The key is uninterpreted bytes. 
*/ @@ -40,60 +44,66 @@ public void testOneKey() throws Exception { random = new Random(82733); - VectorMapJoinFastMultiKeyHashMap map = - new VectorMapJoinFastMultiKeyHashMap( - false,CAPACITY, LOAD_FACTOR, WB_SIZE); + VectorMapJoinFastHashTableFactory factory = + new VectorMapJoinFastHashTableFactory(HashTableKind.HASH_MAP, HashTableKeyType.MULTI_KEY); - VerifyFastBytesHashMap verifyTable = new VerifyFastBytesHashMap(); + VectorMapJoinFastMultiKeyHashMap hashMap = + (VectorMapJoinFastMultiKeyHashMap) + factory.createHashTable(CAPACITY, LOAD_FACTOR, WB_SIZE, 0); + + VerifyFastBytesHashMap verifyFastBytesHashMap = new VerifyFastBytesHashMap(); byte[] key = new byte[random.nextInt(MAX_KEY_LENGTH)]; random.nextBytes(key); byte[] value = new byte[random.nextInt(MAX_VALUE_LENGTH)]; random.nextBytes(value); - map.testPutRow(key, value); - verifyTable.add(key, value); - verifyTable.verify(map); + hashMap.testPutRow(key, value); + verifyFastBytesHashMap.add(key, value); + verifyFastBytesHashMap.verify(factory, hashMap); // Second value. value = new byte[random.nextInt(MAX_VALUE_LENGTH)]; random.nextBytes(value); - map.testPutRow(key, value); - verifyTable.add(key, value); - verifyTable.verify(map); + hashMap.testPutRow(key, value); + verifyFastBytesHashMap.add(key, value); + verifyFastBytesHashMap.verify(factory, hashMap); // Third value. value = new byte[random.nextInt(MAX_VALUE_LENGTH)]; random.nextBytes(value); - map.testPutRow(key, value); - verifyTable.add(key, value); - verifyTable.verify(map); + hashMap.testPutRow(key, value); + verifyFastBytesHashMap.add(key, value); + verifyFastBytesHashMap.verify(factory, hashMap); } @Test public void testMultipleKeysSingleValue() throws Exception { random = new Random(29383); - VectorMapJoinFastMultiKeyHashMap map = - new VectorMapJoinFastMultiKeyHashMap( - false,CAPACITY, LOAD_FACTOR, WB_SIZE); + VectorMapJoinFastHashTableFactory factory = + new VectorMapJoinFastHashTableFactory(HashTableKind.HASH_MAP, HashTableKeyType.MULTI_KEY); + + VectorMapJoinFastMultiKeyHashMap hashMap = + (VectorMapJoinFastMultiKeyHashMap) + factory.createHashTable(CAPACITY, LOAD_FACTOR, WB_SIZE, 0); - VerifyFastBytesHashMap verifyTable = new VerifyFastBytesHashMap(); + VerifyFastBytesHashMap verifyFastBytesHashMap = new VerifyFastBytesHashMap(); int keyCount = 100 + random.nextInt(1000); for (int i = 0; i < keyCount; i++) { byte[] key = new byte[random.nextInt(MAX_KEY_LENGTH)]; random.nextBytes(key); - if (!verifyTable.contains(key)) { + if (!verifyFastBytesHashMap.contains(key)) { // Unique keys for this test. 
break; } byte[] value = new byte[random.nextInt(MAX_VALUE_LENGTH)]; random.nextBytes(value); - map.testPutRow(key, value); - verifyTable.add(key, value); - verifyTable.verify(map); + hashMap.testPutRow(key, value); + verifyFastBytesHashMap.add(key, value); + verifyFastBytesHashMap.verify(factory, hashMap); } } @@ -101,37 +111,42 @@ public void testMultipleKeysSingleValue() throws Exception { public void testGetNonExistent() throws Exception { random = new Random(1002); - VectorMapJoinFastMultiKeyHashMap map = - new VectorMapJoinFastMultiKeyHashMap( - false,CAPACITY, LOAD_FACTOR, WB_SIZE); + VectorMapJoinFastHashTableFactory factory = + new VectorMapJoinFastHashTableFactory(HashTableKind.HASH_MAP, HashTableKeyType.MULTI_KEY); - VerifyFastBytesHashMap verifyTable = new VerifyFastBytesHashMap(); + VectorMapJoinFastMultiKeyHashMap hashMap = + (VectorMapJoinFastMultiKeyHashMap) + factory.createHashTable(CAPACITY, LOAD_FACTOR, WB_SIZE, 0); + + VerifyFastBytesHashMap verifyFastBytesHashMap = new VerifyFastBytesHashMap(); byte[] key1 = new byte[random.nextInt(MAX_KEY_LENGTH)]; random.nextBytes(key1); byte[] value = new byte[random.nextInt(MAX_VALUE_LENGTH)]; random.nextBytes(value); - map.testPutRow(key1, value); - verifyTable.add(key1, value); - verifyTable.verify(map); + hashMap.testPutRow(key1, value); + verifyFastBytesHashMap.add(key1, value); + verifyFastBytesHashMap.verify(factory, hashMap); byte[] key2 = new byte[random.nextInt(MAX_KEY_LENGTH)]; random.nextBytes(key2); - VectorMapJoinHashMapResult hashMapResult = map.createHashMapResult(); - JoinUtil.JoinResult joinResult = map.lookup(key2, 0, key2.length, hashMapResult); - assertTrue(joinResult == JoinUtil.JoinResult.NOMATCH); + MapJoinHashMapResult hashMapResult = factory.createHashMapResult(); + int hashCode = HashCodeUtil.murmurHash(key2, 0, key2.length); + hashMap.hashMapLookup(key2, 0, key2.length, hashCode, hashMapResult); + assertTrue(hashMapResult.getMapJoinResult() == MapJoinResult.NO_MATCH); assertTrue(!hashMapResult.hasRows()); - map.testPutRow(key2, value); - verifyTable.add(key2, value); - verifyTable.verify(map); + hashMap.testPutRow(key2, value); + verifyFastBytesHashMap.add(key2, value); + verifyFastBytesHashMap.verify(factory, hashMap); byte[] key3 = new byte[random.nextInt(MAX_KEY_LENGTH)]; random.nextBytes(key3); - hashMapResult = map.createHashMapResult(); - joinResult = map.lookup(key3, 0, key3.length, hashMapResult); - assertTrue(joinResult == JoinUtil.JoinResult.NOMATCH); + hashMapResult = factory.createHashMapResult(); + hashCode = HashCodeUtil.murmurHash(key3, 0, key3.length); + hashMap.hashMapLookup(key3, 0, key3.length, hashCode, hashMapResult); + assertTrue(hashMapResult.getMapJoinResult() == MapJoinResult.NO_MATCH); assertTrue(!hashMapResult.hasRows()); } @@ -139,18 +154,22 @@ public void testGetNonExistent() throws Exception { public void testFullMap() throws Exception { random = new Random(200001); - // Make sure the map does not expand; should be able to find space. - VectorMapJoinFastMultiKeyHashMap map = - new VectorMapJoinFastMultiKeyHashMap(false,CAPACITY, 1f, WB_SIZE); + VectorMapJoinFastHashTableFactory factory = + new VectorMapJoinFastHashTableFactory(HashTableKind.HASH_MAP, HashTableKeyType.MULTI_KEY); + + // Make sure the hashMap does not expand; should be able to find space. 
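+ // (A load factor of 1f means the CAPACITY inserts below are expected to fit without triggering a resize.)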
+ VectorMapJoinFastMultiKeyHashMap hashMap = + (VectorMapJoinFastMultiKeyHashMap) + factory.createHashTable(CAPACITY, 1f, WB_SIZE, 0); - VerifyFastBytesHashMap verifyTable = new VerifyFastBytesHashMap(); + VerifyFastBytesHashMap verifyFastBytesHashMap = new VerifyFastBytesHashMap(); for (int i = 0; i < CAPACITY; i++) { byte[] key; while (true) { key = new byte[random.nextInt(MAX_KEY_LENGTH)]; random.nextBytes(key); - if (!verifyTable.contains(key)) { + if (!verifyFastBytesHashMap.contains(key)) { // Unique keys for this test. break; } @@ -158,43 +177,48 @@ public void testFullMap() throws Exception { byte[] value = new byte[random.nextInt(MAX_VALUE_LENGTH)]; random.nextBytes(value); - map.testPutRow(key, value); - verifyTable.add(key, value); - // verifyTable.verify(map); + hashMap.testPutRow(key, value); + verifyFastBytesHashMap.add(key, value); + // verifyFastBytesHashMap.verify(factory, hashMap); } - verifyTable.verify(map); + verifyFastBytesHashMap.verify(factory, hashMap); byte[] anotherKey; while (true) { anotherKey = new byte[random.nextInt(MAX_KEY_LENGTH)]; random.nextBytes(anotherKey); - if (!verifyTable.contains(anotherKey)) { + if (!verifyFastBytesHashMap.contains(anotherKey)) { // Unique keys for this test. break; } } - VectorMapJoinHashMapResult hashMapResult = map.createHashMapResult(); - JoinUtil.JoinResult joinResult = map.lookup(anotherKey, 0, anotherKey.length, hashMapResult); - assertTrue(joinResult == JoinUtil.JoinResult.NOMATCH); + MapJoinHashMapResult hashMapResult = factory.createHashMapResult(); + int hashCode = HashCodeUtil.murmurHash(anotherKey, 0, anotherKey.length); + hashMap.hashMapLookup(anotherKey, 0, anotherKey.length, hashCode, hashMapResult); + assertTrue(hashMapResult.getMapJoinResult() == MapJoinResult.NO_MATCH); } @Test public void testExpand() throws Exception { random = new Random(99221); + VectorMapJoinFastHashTableFactory factory = + new VectorMapJoinFastHashTableFactory(HashTableKind.HASH_MAP, HashTableKeyType.MULTI_KEY); + // Start with capacity 1; make sure we expand on every put. - VectorMapJoinFastMultiKeyHashMap map = - new VectorMapJoinFastMultiKeyHashMap(false,1, 0.0000001f, WB_SIZE); + VectorMapJoinFastMultiKeyHashMap hashMap = + (VectorMapJoinFastMultiKeyHashMap) + factory.createHashTable(1, 0.0000001f, WB_SIZE, 0); - VerifyFastBytesHashMap verifyTable = new VerifyFastBytesHashMap(); + VerifyFastBytesHashMap verifyFastBytesHashMap = new VerifyFastBytesHashMap(); for (int i = 0; i < 18; ++i) { byte[] key; while (true) { key = new byte[random.nextInt(MAX_KEY_LENGTH)]; random.nextBytes(key); - if (!verifyTable.contains(key)) { + if (!verifyFastBytesHashMap.contains(key)) { // Unique keys for this test. 
break; } @@ -202,86 +226,96 @@ public void testExpand() throws Exception { byte[] value = new byte[random.nextInt(MAX_VALUE_LENGTH)]; random.nextBytes(value); - map.testPutRow(key, value); - verifyTable.add(key, value); - // verifyTable.verify(map); + hashMap.testPutRow(key, value); + verifyFastBytesHashMap.add(key, value); + // verifyFastBytesHashMap.verify(factory, hashMap); } - verifyTable.verify(map); - // assertEquals(1 << 18, map.getCapacity()); + verifyFastBytesHashMap.verify(factory, hashMap); + // assertEquals(1 << 18, hashMap.getCapacity()); } public void addAndVerifyMultipleKeyMultipleValue(int keyCount, - VectorMapJoinFastMultiKeyHashMap map, VerifyFastBytesHashMap verifyTable) + VectorMapJoinFastHashTableFactory factory, + VectorMapJoinFastMultiKeyHashMap hashMap, VerifyFastBytesHashMap verifyFastBytesHashMap) throws HiveException, IOException { for (int i = 0; i < keyCount; i++) { byte[] value = new byte[generateLargeCount() - 1]; random.nextBytes(value); // Add a new key or add a value to an existing key? - if (random.nextBoolean() || verifyTable.getCount() == 0) { + if (random.nextBoolean() || verifyFastBytesHashMap.getCount() == 0) { byte[] key; while (true) { key = new byte[random.nextInt(MAX_KEY_LENGTH)]; random.nextBytes(key); - if (!verifyTable.contains(key)) { + if (!verifyFastBytesHashMap.contains(key)) { // Unique keys for this test. break; } } - map.testPutRow(key, value); - verifyTable.add(key, value); - // verifyTable.verify(map); + hashMap.testPutRow(key, value); + verifyFastBytesHashMap.add(key, value); + // verifyFastBytesHashMap.verify(factory, hashMap); } else { - byte[] randomExistingKey = verifyTable.addRandomExisting(value, random); - map.testPutRow(randomExistingKey, value); - // verifyTable.verify(map); + byte[] randomExistingKey = verifyFastBytesHashMap.addRandomExisting(value, random); + hashMap.testPutRow(randomExistingKey, value); + // verifyFastBytesHashMap.verify(factory, hashMap); } } - verifyTable.verify(map); + verifyFastBytesHashMap.verify(factory, hashMap); } @Test public void testMultipleKeysMultipleValue() throws Exception { random = new Random(9332); + VectorMapJoinFastHashTableFactory factory = + new VectorMapJoinFastHashTableFactory(HashTableKind.HASH_MAP, HashTableKeyType.MULTI_KEY); + // Use a large capacity that doesn't require expansion, yet. - VectorMapJoinFastMultiKeyHashMap map = - new VectorMapJoinFastMultiKeyHashMap( - false,LARGE_CAPACITY, LOAD_FACTOR, LARGE_WB_SIZE); + VectorMapJoinFastMultiKeyHashMap hashMap = + (VectorMapJoinFastMultiKeyHashMap) + factory.createHashTable(LARGE_CAPACITY, LOAD_FACTOR, LARGE_WB_SIZE, 0); - VerifyFastBytesHashMap verifyTable = new VerifyFastBytesHashMap(); + VerifyFastBytesHashMap verifyFastBytesHashMap = new VerifyFastBytesHashMap(); int keyCount = 1000; - addAndVerifyMultipleKeyMultipleValue(keyCount, map, verifyTable); + addAndVerifyMultipleKeyMultipleValue(keyCount, factory, hashMap, verifyFastBytesHashMap); } @Test public void testLargeAndExpand() throws Exception { random = new Random(21111); + VectorMapJoinFastHashTableFactory factory = + new VectorMapJoinFastHashTableFactory(HashTableKind.HASH_MAP, HashTableKeyType.MULTI_KEY); + // Use a large capacity that doesn't require expansion, yet. 
- VectorMapJoinFastMultiKeyHashMap map = - new VectorMapJoinFastMultiKeyHashMap( - false,MODERATE_CAPACITY, LOAD_FACTOR, MODERATE_WB_SIZE); + VectorMapJoinFastMultiKeyHashMap hashMap = + (VectorMapJoinFastMultiKeyHashMap) + factory.createHashTable(MODERATE_CAPACITY, LOAD_FACTOR, MODERATE_WB_SIZE, 0); - VerifyFastBytesHashMap verifyTable = new VerifyFastBytesHashMap(); + VerifyFastBytesHashMap verifyFastBytesHashMap = new VerifyFastBytesHashMap(); int keyCount = 1000; - addAndVerifyMultipleKeyMultipleValue(keyCount, map, verifyTable); + addAndVerifyMultipleKeyMultipleValue(keyCount, factory, hashMap, verifyFastBytesHashMap); } @Test public void testReallyBig() throws Exception { random = new Random(42662); + VectorMapJoinFastHashTableFactory factory = + new VectorMapJoinFastHashTableFactory(HashTableKind.HASH_MAP, HashTableKeyType.MULTI_KEY); + // Use a large capacity that doesn't require expansion, yet. - VectorMapJoinFastMultiKeyHashMap map = - new VectorMapJoinFastMultiKeyHashMap( - false,LARGE_CAPACITY, LOAD_FACTOR, MODERATE_WB_SIZE); + VectorMapJoinFastMultiKeyHashMap hashMap = + (VectorMapJoinFastMultiKeyHashMap) + factory.createHashTable(MODERATE_CAPACITY, LOAD_FACTOR, MODERATE_WB_SIZE, 0); - VerifyFastBytesHashMap verifyTable = new VerifyFastBytesHashMap(); + VerifyFastBytesHashMap verifyFastBytesHashMap = new VerifyFastBytesHashMap(); int keyCount = 1000000; - addAndVerifyMultipleKeyMultipleValue(keyCount, map, verifyTable); + addAndVerifyMultipleKeyMultipleValue(keyCount, factory, hashMap, verifyFastBytesHashMap); } } diff --git ql/src/test/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/TestVectorMapJoinFastBytesHashMultiSet.java ql/src/test/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/TestVectorMapJoinFastBytesHashMultiSet.java index 449a8b2..52990a9 100644 --- ql/src/test/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/TestVectorMapJoinFastBytesHashMultiSet.java +++ ql/src/test/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/TestVectorMapJoinFastBytesHashMultiSet.java @@ -23,10 +23,13 @@ import java.io.IOException; import java.util.Random; -import org.apache.hadoop.hive.ql.exec.JoinUtil; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashMultiSetResult; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableResult.MapJoinResult; import org.apache.hadoop.hive.ql.exec.vector.mapjoin.fast.CheckFastHashTable.VerifyFastBytesHashMultiSet; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashMultiSetResult; import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.plan.VectorMapJoinDesc.HashTableKeyType; +import org.apache.hadoop.hive.ql.plan.VectorMapJoinDesc.HashTableKind; +import org.apache.hive.common.util.HashCodeUtil; import org.junit.Test; public class TestVectorMapJoinFastBytesHashMultiSet extends CommonFastHashTable { @@ -35,88 +38,100 @@ public void testOneKey() throws Exception { random = new Random(5255); - VectorMapJoinFastMultiKeyHashMultiSet map = - new VectorMapJoinFastMultiKeyHashMultiSet( - false,CAPACITY, LOAD_FACTOR, WB_SIZE); + VectorMapJoinFastHashTableFactory factory = + new VectorMapJoinFastHashTableFactory(HashTableKind.HASH_MULTISET, HashTableKeyType.MULTI_KEY); - VerifyFastBytesHashMultiSet verifyTable = new VerifyFastBytesHashMultiSet(); + VectorMapJoinFastMultiKeyHashMultiSet hashMultiSet = + (VectorMapJoinFastMultiKeyHashMultiSet) + factory.createHashTable(CAPACITY, LOAD_FACTOR, WB_SIZE, 0); + + VerifyFastBytesHashMultiSet 
verifyFastBytesHashMultiSet = new VerifyFastBytesHashMultiSet(); byte[] key = new byte[random.nextInt(MAX_KEY_LENGTH)]; random.nextBytes(key); - map.testPutRow(key); - verifyTable.add(key); - verifyTable.verify(map); + hashMultiSet.testPutRow(key); + verifyFastBytesHashMultiSet.add(key); + verifyFastBytesHashMultiSet.verify(factory, hashMultiSet); // Second time. - map.testPutRow(key); - verifyTable.add(key); - verifyTable.verify(map); + hashMultiSet.testPutRow(key); + verifyFastBytesHashMultiSet.add(key); + verifyFastBytesHashMultiSet.verify(factory, hashMultiSet); // Third time. - map.testPutRow(key); - verifyTable.add(key); - verifyTable.verify(map); + hashMultiSet.testPutRow(key); + verifyFastBytesHashMultiSet.add(key); + verifyFastBytesHashMultiSet.verify(factory, hashMultiSet); } @Test public void testMultipleKeysSingleValue() throws Exception { random = new Random(2374); - VectorMapJoinFastMultiKeyHashMultiSet map = - new VectorMapJoinFastMultiKeyHashMultiSet( - false,CAPACITY, LOAD_FACTOR, WB_SIZE); + VectorMapJoinFastHashTableFactory factory = + new VectorMapJoinFastHashTableFactory(HashTableKind.HASH_MULTISET, HashTableKeyType.MULTI_KEY); + + VectorMapJoinFastMultiKeyHashMultiSet hashMultiSet = + (VectorMapJoinFastMultiKeyHashMultiSet) + factory.createHashTable(CAPACITY, LOAD_FACTOR, WB_SIZE, 0); - VerifyFastBytesHashMultiSet verifyTable = new VerifyFastBytesHashMultiSet(); + VerifyFastBytesHashMultiSet verifyFastBytesHashMultiSet = new VerifyFastBytesHashMultiSet(); int keyCount = 100 + random.nextInt(1000); for (int i = 0; i < keyCount; i++) { byte[] key = new byte[random.nextInt(MAX_KEY_LENGTH)]; random.nextBytes(key); - if (!verifyTable.contains(key)) { + if (!verifyFastBytesHashMultiSet.contains(key)) { // Unique keys for this test. break; } - map.testPutRow(key); - verifyTable.add(key); - // verifyTable.verify(map); + hashMultiSet.testPutRow(key); + verifyFastBytesHashMultiSet.add(key); + // verifyFastBytesHashMultiSet.verify(factory, hashMultiSet); } - verifyTable.verify(map); + verifyFastBytesHashMultiSet.verify(factory, hashMultiSet); } @Test public void testGetNonExistent() throws Exception { random = new Random(98222); - VectorMapJoinFastMultiKeyHashMultiSet map = - new VectorMapJoinFastMultiKeyHashMultiSet( - false,CAPACITY, LOAD_FACTOR, WB_SIZE); - VerifyFastBytesHashMultiSet verifyTable = new VerifyFastBytesHashMultiSet(); + VectorMapJoinFastHashTableFactory factory = + new VectorMapJoinFastHashTableFactory(HashTableKind.HASH_MULTISET, HashTableKeyType.MULTI_KEY); + + VectorMapJoinFastMultiKeyHashMultiSet hashMultiSet = + (VectorMapJoinFastMultiKeyHashMultiSet) + factory.createHashTable(CAPACITY, LOAD_FACTOR, WB_SIZE, 0); + + VerifyFastBytesHashMultiSet verifyFastBytesHashMultiSet = new VerifyFastBytesHashMultiSet(); byte[] key1 = new byte[random.nextInt(MAX_KEY_LENGTH)]; random.nextBytes(key1); - map.testPutRow(key1); - verifyTable.add(key1); - verifyTable.verify(map); + hashMultiSet.testPutRow(key1); + verifyFastBytesHashMultiSet.add(key1); + verifyFastBytesHashMultiSet.verify(factory, hashMultiSet); byte[] key2 = new byte[random.nextInt(MAX_KEY_LENGTH)]; random.nextBytes(key2); - VectorMapJoinHashMultiSetResult hashMultiSetResult = map.createHashMultiSetResult(); - JoinUtil.JoinResult joinResult = map.contains(key2, 0, key2.length, hashMultiSetResult); - assertTrue(joinResult == JoinUtil.JoinResult.NOMATCH); + MapJoinHashMultiSetResult hashMultiSetResult = factory.createHashMultiSetResult(); + int hashCode = HashCodeUtil.murmurHash(key2, 0, key2.length); + 
hashMultiSet.hashMultiSetContains(key2, 0, key2.length, hashCode, hashMultiSetResult); + assertTrue(hashMultiSetResult.getMapJoinResult() == MapJoinResult.NO_MATCH); - map.testPutRow(key2); - verifyTable.add(key2); - verifyTable.verify(map); + hashMultiSet.testPutRow(key2); + verifyFastBytesHashMultiSet.add(key2); + verifyFastBytesHashMultiSet.verify(factory, hashMultiSet); byte[] key3 = new byte[random.nextInt(MAX_KEY_LENGTH)]; random.nextBytes(key3); - hashMultiSetResult = map.createHashMultiSetResult(); - joinResult = map.contains(key3, 0, key3.length, hashMultiSetResult); - assertTrue(joinResult == JoinUtil.JoinResult.NOMATCH); + hashMultiSetResult = factory.createHashMultiSetResult(); + hashCode = HashCodeUtil.murmurHash(key3, 0, key3.length); + hashMultiSet.hashMultiSetContains(key3, 0, key3.length, hashCode, hashMultiSetResult); + assertTrue(hashMultiSetResult.getMapJoinResult() == MapJoinResult.NO_MATCH); assertEquals(hashMultiSetResult.count(), 0); } @@ -124,130 +139,146 @@ public void testGetNonExistent() throws Exception { public void testFullMap() throws Exception { random = new Random(9024); - // Make sure the map does not expand; should be able to find space. - VectorMapJoinFastMultiKeyHashMultiSet map = - new VectorMapJoinFastMultiKeyHashMultiSet(false,CAPACITY, 1f, WB_SIZE); + VectorMapJoinFastHashTableFactory factory = + new VectorMapJoinFastHashTableFactory(HashTableKind.HASH_MULTISET, HashTableKeyType.MULTI_KEY); - VerifyFastBytesHashMultiSet verifyTable = new VerifyFastBytesHashMultiSet(); + // Make sure the hashMultiSet does not expand; should be able to find space. + VectorMapJoinFastMultiKeyHashMultiSet hashMultiSet = + (VectorMapJoinFastMultiKeyHashMultiSet) + factory.createHashTable(CAPACITY, 1f, WB_SIZE, 0); + + VerifyFastBytesHashMultiSet verifyFastBytesHashMultiSet = new VerifyFastBytesHashMultiSet(); for (int i = 0; i < CAPACITY; i++) { byte[] key; while (true) { key = new byte[random.nextInt(MAX_KEY_LENGTH)]; random.nextBytes(key); - if (!verifyTable.contains(key)) { + if (!verifyFastBytesHashMultiSet.contains(key)) { // Unique keys for this test. break; } } - map.testPutRow(key); - verifyTable.add(key); - // verifyTable.verify(map); + hashMultiSet.testPutRow(key); + verifyFastBytesHashMultiSet.add(key); + // verifyFastBytesHashMultiSet.verify(factory, hashMultiSet); } - verifyTable.verify(map); + verifyFastBytesHashMultiSet.verify(factory, hashMultiSet); byte[] anotherKey; while (true) { anotherKey = new byte[random.nextInt(MAX_KEY_LENGTH)]; random.nextBytes(anotherKey); - if (!verifyTable.contains(anotherKey)) { + if (!verifyFastBytesHashMultiSet.contains(anotherKey)) { // Unique keys for this test. 
break; } } - VectorMapJoinHashMultiSetResult hashMultiSetResult = map.createHashMultiSetResult(); - JoinUtil.JoinResult joinResult = map.contains(anotherKey, 0, anotherKey.length, hashMultiSetResult); - assertTrue(joinResult == JoinUtil.JoinResult.NOMATCH); + MapJoinHashMultiSetResult hashMultiSetResult = factory.createHashMultiSetResult(); + int hashCode = HashCodeUtil.murmurHash(anotherKey, 0, anotherKey.length); + hashMultiSet.hashMultiSetContains(anotherKey, 0, anotherKey.length, hashCode, hashMultiSetResult); + assertTrue(hashMultiSetResult.getMapJoinResult() == MapJoinResult.NO_MATCH); } @Test public void testExpand() throws Exception { random = new Random(2933); + VectorMapJoinFastHashTableFactory factory = + new VectorMapJoinFastHashTableFactory(HashTableKind.HASH_MULTISET, HashTableKeyType.MULTI_KEY); + // Start with capacity 1; make sure we expand on every put. - VectorMapJoinFastMultiKeyHashMultiSet map = - new VectorMapJoinFastMultiKeyHashMultiSet(false,1, 0.0000001f, WB_SIZE); + VectorMapJoinFastMultiKeyHashMultiSet hashMultiSet = + (VectorMapJoinFastMultiKeyHashMultiSet) + factory.createHashTable(1, 0.0000001f, WB_SIZE, 0); - VerifyFastBytesHashMultiSet verifyTable = new VerifyFastBytesHashMultiSet(); + VerifyFastBytesHashMultiSet verifyFastBytesHashMultiSet = new VerifyFastBytesHashMultiSet(); for (int i = 0; i < 18; ++i) { byte[] key; while (true) { key = new byte[random.nextInt(MAX_KEY_LENGTH)]; random.nextBytes(key); - if (!verifyTable.contains(key)) { + if (!verifyFastBytesHashMultiSet.contains(key)) { // Unique keys for this test. break; } } - map.testPutRow(key); - verifyTable.add(key); - // verifyTable.verify(map); + hashMultiSet.testPutRow(key); + verifyFastBytesHashMultiSet.add(key); + // verifyFastBytesHashMultiSet.verify(factory, hashMultiSet); } - verifyTable.verify(map); - // assertEquals(1 << 18, map.getCapacity()); + verifyFastBytesHashMultiSet.verify(factory, hashMultiSet); + // assertEquals(1 << 18, hashMultiSet.getCapacity()); } public void addAndVerifyMultipleKeyMultipleValue(int keyCount, - VectorMapJoinFastMultiKeyHashMultiSet map, VerifyFastBytesHashMultiSet verifyTable) + VectorMapJoinFastHashTableFactory factory, + VectorMapJoinFastMultiKeyHashMultiSet hashMultiSet, VerifyFastBytesHashMultiSet verifyFastBytesHashMultiSet) throws HiveException, IOException { for (int i = 0; i < keyCount; i++) { byte[] value = new byte[generateLargeCount() - 1]; random.nextBytes(value); // Add a new key or add a value to an existing key? - if (random.nextBoolean() || verifyTable.getCount() == 0) { + if (random.nextBoolean() || verifyFastBytesHashMultiSet.getCount() == 0) { byte[] key; while (true) { key = new byte[random.nextInt(MAX_KEY_LENGTH)]; random.nextBytes(key); - if (!verifyTable.contains(key)) { + if (!verifyFastBytesHashMultiSet.contains(key)) { // Unique keys for this test. 
break; } } - map.testPutRow(key); - verifyTable.add(key); - // verifyTable.verify(map); + hashMultiSet.testPutRow(key); + verifyFastBytesHashMultiSet.add(key); + // verifyFastBytesHashMultiSet.verify(factory, hashMultiSet); } else { - byte[] randomExistingKey = verifyTable.addRandomExisting(value, random); - map.testPutRow(randomExistingKey); - // verifyTable.verify(map); + byte[] randomExistingKey = verifyFastBytesHashMultiSet.addRandomExisting(value, random); + hashMultiSet.testPutRow(randomExistingKey); + // verifyFastBytesHashMultiSet.verify(factory, hashMultiSet); } } - verifyTable.verify(map); + verifyFastBytesHashMultiSet.verify(factory, hashMultiSet); } @Test public void testMultipleKeysMultipleValue() throws Exception { random = new Random(5445); + VectorMapJoinFastHashTableFactory factory = + new VectorMapJoinFastHashTableFactory(HashTableKind.HASH_MULTISET, HashTableKeyType.MULTI_KEY); + // Use a large capacity that doesn't require expansion, yet. - VectorMapJoinFastMultiKeyHashMultiSet map = - new VectorMapJoinFastMultiKeyHashMultiSet( - false,LARGE_CAPACITY, LOAD_FACTOR, LARGE_WB_SIZE); + VectorMapJoinFastMultiKeyHashMultiSet hashMultiSet = + (VectorMapJoinFastMultiKeyHashMultiSet) + factory.createHashTable(LARGE_CAPACITY, LOAD_FACTOR, WB_SIZE, 0); - VerifyFastBytesHashMultiSet verifyTable = new VerifyFastBytesHashMultiSet(); + VerifyFastBytesHashMultiSet verifyFastBytesHashMultiSet = new VerifyFastBytesHashMultiSet(); int keyCount = 1000; - addAndVerifyMultipleKeyMultipleValue(keyCount, map, verifyTable); + addAndVerifyMultipleKeyMultipleValue(keyCount, factory, hashMultiSet, verifyFastBytesHashMultiSet); } @Test public void testLargeAndExpand() throws Exception { random = new Random(5637); + VectorMapJoinFastHashTableFactory factory = + new VectorMapJoinFastHashTableFactory(HashTableKind.HASH_MULTISET, HashTableKeyType.MULTI_KEY); + // Use a large capacity that doesn't require expansion, yet. 
- VectorMapJoinFastMultiKeyHashMultiSet map = - new VectorMapJoinFastMultiKeyHashMultiSet( - false,MODERATE_CAPACITY, LOAD_FACTOR, MODERATE_WB_SIZE); + VectorMapJoinFastMultiKeyHashMultiSet hashMultiSet = + (VectorMapJoinFastMultiKeyHashMultiSet) + factory.createHashTable(MODERATE_CAPACITY, LOAD_FACTOR, MODERATE_WB_SIZE, 0); - VerifyFastBytesHashMultiSet verifyTable = new VerifyFastBytesHashMultiSet(); + VerifyFastBytesHashMultiSet verifyFastBytesHashMultiSet = new VerifyFastBytesHashMultiSet(); int keyCount = 1000; - addAndVerifyMultipleKeyMultipleValue(keyCount, map, verifyTable); + addAndVerifyMultipleKeyMultipleValue(keyCount, factory, hashMultiSet, verifyFastBytesHashMultiSet); } } diff --git ql/src/test/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/TestVectorMapJoinFastBytesHashSet.java ql/src/test/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/TestVectorMapJoinFastBytesHashSet.java index ef7c91c..3d4cdcc 100644 --- ql/src/test/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/TestVectorMapJoinFastBytesHashSet.java +++ ql/src/test/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/TestVectorMapJoinFastBytesHashSet.java @@ -23,10 +23,13 @@ import java.io.IOException; import java.util.Random; -import org.apache.hadoop.hive.ql.exec.JoinUtil; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashSetResult; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableResult.MapJoinResult; import org.apache.hadoop.hive.ql.exec.vector.mapjoin.fast.CheckFastHashTable.VerifyFastBytesHashSet; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashSetResult; import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.plan.VectorMapJoinDesc.HashTableKeyType; +import org.apache.hadoop.hive.ql.plan.VectorMapJoinDesc.HashTableKind; +import org.apache.hive.common.util.HashCodeUtil; import org.junit.Test; public class TestVectorMapJoinFastBytesHashSet extends CommonFastHashTable { @@ -35,218 +38,245 @@ public void testOneKey() throws Exception { random = new Random(81104); - VectorMapJoinFastMultiKeyHashSet map = - new VectorMapJoinFastMultiKeyHashSet( - false,CAPACITY, LOAD_FACTOR, WB_SIZE); + VectorMapJoinFastHashTableFactory factory = + new VectorMapJoinFastHashTableFactory(HashTableKind.HASH_SET, HashTableKeyType.MULTI_KEY); - VerifyFastBytesHashSet verifyTable = new VerifyFastBytesHashSet(); + VectorMapJoinFastMultiKeyHashSet hashSet = + (VectorMapJoinFastMultiKeyHashSet) + factory.createHashTable(CAPACITY, LOAD_FACTOR, WB_SIZE, 0); + + VerifyFastBytesHashSet verifyFastBytesHashSet = new VerifyFastBytesHashSet(); byte[] key = new byte[random.nextInt(MAX_KEY_LENGTH)]; random.nextBytes(key); - map.testPutRow(key); - verifyTable.add(key); - verifyTable.verify(map); + hashSet.testPutRow(key); + verifyFastBytesHashSet.add(key); + verifyFastBytesHashSet.verify(factory, hashSet); // Second time. - map.testPutRow(key); - verifyTable.add(key); - verifyTable.verify(map); + hashSet.testPutRow(key); + verifyFastBytesHashSet.add(key); + verifyFastBytesHashSet.verify(factory, hashSet); // Third time. 
- map.testPutRow(key); - verifyTable.add(key); - verifyTable.verify(map); + hashSet.testPutRow(key); + verifyFastBytesHashSet.add(key); + verifyFastBytesHashSet.verify(factory, hashSet); } @Test public void testMultipleKeysSingleValue() throws Exception { random = new Random(1120); - VectorMapJoinFastMultiKeyHashSet map = - new VectorMapJoinFastMultiKeyHashSet( - false,CAPACITY, LOAD_FACTOR, WB_SIZE); + VectorMapJoinFastHashTableFactory factory = + new VectorMapJoinFastHashTableFactory(HashTableKind.HASH_SET, HashTableKeyType.MULTI_KEY); + + VectorMapJoinFastMultiKeyHashSet hashSet = + (VectorMapJoinFastMultiKeyHashSet) + factory.createHashTable(CAPACITY, LOAD_FACTOR, WB_SIZE, 0); - VerifyFastBytesHashSet verifyTable = new VerifyFastBytesHashSet(); + VerifyFastBytesHashSet verifyFastBytesHashSet = new VerifyFastBytesHashSet(); int keyCount = 100 + random.nextInt(1000); for (int i = 0; i < keyCount; i++) { byte[] key = new byte[random.nextInt(MAX_KEY_LENGTH)]; random.nextBytes(key); - if (!verifyTable.contains(key)) { + if (!verifyFastBytesHashSet.contains(key)) { // Unique keys for this test. break; } - map.testPutRow(key); - verifyTable.add(key); - // verifyTable.verify(map); + hashSet.testPutRow(key); + verifyFastBytesHashSet.add(key); + // verifyFastBytesHashSet.verify(factory, hashSet); } - verifyTable.verify(map); + verifyFastBytesHashSet.verify(factory, hashSet); } @Test public void testGetNonExistent() throws Exception { random = new Random(2293); - VectorMapJoinFastMultiKeyHashSet map = - new VectorMapJoinFastMultiKeyHashSet( - false,CAPACITY, LOAD_FACTOR, WB_SIZE); + VectorMapJoinFastHashTableFactory factory = + new VectorMapJoinFastHashTableFactory(HashTableKind.HASH_SET, HashTableKeyType.MULTI_KEY); - VerifyFastBytesHashSet verifyTable = new VerifyFastBytesHashSet(); + VectorMapJoinFastMultiKeyHashSet hashSet = + (VectorMapJoinFastMultiKeyHashSet) + factory.createHashTable(CAPACITY, LOAD_FACTOR, WB_SIZE, 0); + + VerifyFastBytesHashSet verifyFastBytesHashSet = new VerifyFastBytesHashSet(); byte[] key1 = new byte[random.nextInt(MAX_KEY_LENGTH)]; random.nextBytes(key1); - map.testPutRow(key1); - verifyTable.add(key1); - verifyTable.verify(map); + hashSet.testPutRow(key1); + verifyFastBytesHashSet.add(key1); + verifyFastBytesHashSet.verify(factory, hashSet); byte[] key2 = new byte[random.nextInt(MAX_KEY_LENGTH)]; random.nextBytes(key2); - VectorMapJoinHashSetResult hashSetResult = map.createHashSetResult(); - JoinUtil.JoinResult joinResult = map.contains(key2, 0, key2.length, hashSetResult); - assertTrue(joinResult == JoinUtil.JoinResult.NOMATCH); + MapJoinHashSetResult hashSetResult = factory.createHashSetResult(); + int hashCode = HashCodeUtil.murmurHash(key2, 0, key2.length); + hashSet.hashSetContains(key2, 0, key2.length, hashCode, hashSetResult); + assertTrue(hashSetResult.getMapJoinResult() == MapJoinResult.NO_MATCH); - map.testPutRow(key2); - verifyTable.add(key2); - verifyTable.verify(map); + hashSet.testPutRow(key2); + verifyFastBytesHashSet.add(key2); + verifyFastBytesHashSet.verify(factory, hashSet); byte[] key3 = new byte[random.nextInt(MAX_KEY_LENGTH)]; random.nextBytes(key3); - hashSetResult = map.createHashSetResult(); - joinResult = map.contains(key3, 0, key3.length, hashSetResult); - assertTrue(joinResult == JoinUtil.JoinResult.NOMATCH); + hashSetResult = factory.createHashSetResult(); + hashCode = HashCodeUtil.murmurHash(key3, 0, key3.length); + hashSet.hashSetContains(key3, 0, key3.length, hashCode, hashSetResult); + assertTrue(hashSetResult.getMapJoinResult() == 
MapJoinResult.NO_MATCH); } @Test public void testFullMap() throws Exception { random = new Random(219); - // Make sure the map does not expand; should be able to find space. - VectorMapJoinFastMultiKeyHashSet map = - new VectorMapJoinFastMultiKeyHashSet(false,CAPACITY, 1f, WB_SIZE); + VectorMapJoinFastHashTableFactory factory = + new VectorMapJoinFastHashTableFactory(HashTableKind.HASH_SET, HashTableKeyType.MULTI_KEY); + + // Make sure the hashSet does not expand; should be able to find space. + VectorMapJoinFastMultiKeyHashSet hashSet = + (VectorMapJoinFastMultiKeyHashSet) + factory.createHashTable(CAPACITY, 1f, WB_SIZE, 0); - VerifyFastBytesHashSet verifyTable = new VerifyFastBytesHashSet(); + VerifyFastBytesHashSet verifyFastBytesHashSet = new VerifyFastBytesHashSet(); for (int i = 0; i < CAPACITY; i++) { byte[] key; while (true) { key = new byte[random.nextInt(MAX_KEY_LENGTH)]; random.nextBytes(key); - if (!verifyTable.contains(key)) { + if (!verifyFastBytesHashSet.contains(key)) { // Unique keys for this test. break; } } - map.testPutRow(key); - verifyTable.add(key); - // verifyTable.verify(map); + hashSet.testPutRow(key); + verifyFastBytesHashSet.add(key); + // verifyFastBytesHashSet.verify(factory, hashSet); } - verifyTable.verify(map); + verifyFastBytesHashSet.verify(factory, hashSet); byte[] anotherKey; while (true) { anotherKey = new byte[random.nextInt(MAX_KEY_LENGTH)]; random.nextBytes(anotherKey); - if (!verifyTable.contains(anotherKey)) { + if (!verifyFastBytesHashSet.contains(anotherKey)) { // Unique keys for this test. break; } } - VectorMapJoinHashSetResult hashSetResult = map.createHashSetResult(); - JoinUtil.JoinResult joinResult = map.contains(anotherKey, 0, anotherKey.length, hashSetResult); - assertTrue(joinResult == JoinUtil.JoinResult.NOMATCH); + MapJoinHashSetResult hashSetResult = factory.createHashSetResult(); + int hashCode = HashCodeUtil.murmurHash(anotherKey, 0, anotherKey.length); + hashSet.hashSetContains(anotherKey, 0, anotherKey.length, hashCode, hashSetResult); + assertTrue(hashSetResult.getMapJoinResult() == MapJoinResult.NO_MATCH); } @Test public void testExpand() throws Exception { random = new Random(773); + VectorMapJoinFastHashTableFactory factory = + new VectorMapJoinFastHashTableFactory(HashTableKind.HASH_SET, HashTableKeyType.MULTI_KEY); + // Start with capacity 1; make sure we expand on every put. - VectorMapJoinFastMultiKeyHashSet map = - new VectorMapJoinFastMultiKeyHashSet(false,1, 0.0000001f, WB_SIZE); + VectorMapJoinFastMultiKeyHashSet hashSet = + (VectorMapJoinFastMultiKeyHashSet) + factory.createHashTable(1, 0.0000001f, WB_SIZE, 0); - VerifyFastBytesHashSet verifyTable = new VerifyFastBytesHashSet(); + VerifyFastBytesHashSet verifyFastBytesHashSet = new VerifyFastBytesHashSet(); for (int i = 0; i < 18; ++i) { byte[] key; while (true) { key = new byte[random.nextInt(MAX_KEY_LENGTH)]; random.nextBytes(key); - if (!verifyTable.contains(key)) { + if (!verifyFastBytesHashSet.contains(key)) { // Unique keys for this test. 
break; } } - map.testPutRow(key); - verifyTable.add(key); - // verifyTable.verify(map); + hashSet.testPutRow(key); + verifyFastBytesHashSet.add(key); + // verifyFastBytesHashSet.verify(factory, hashSet); } - verifyTable.verify(map); - // assertEquals(1 << 18, map.getCapacity()); + verifyFastBytesHashSet.verify(factory, hashSet); + // assertEquals(1 << 18, hashSet.getCapacity()); } public void addAndVerifyMultipleKeyMultipleValue(int keyCount, - VectorMapJoinFastMultiKeyHashSet map, VerifyFastBytesHashSet verifyTable) + VectorMapJoinFastHashTableFactory factory, + VectorMapJoinFastMultiKeyHashSet hashSet, VerifyFastBytesHashSet verifyFastBytesHashSet) throws HiveException, IOException { for (int i = 0; i < keyCount; i++) { byte[] value = new byte[generateLargeCount() - 1]; random.nextBytes(value); // Add a new key or add a value to an existing key? - if (random.nextBoolean() || verifyTable.getCount() == 0) { + if (random.nextBoolean() || verifyFastBytesHashSet.getCount() == 0) { byte[] key; while (true) { key = new byte[random.nextInt(MAX_KEY_LENGTH)]; random.nextBytes(key); - if (!verifyTable.contains(key)) { + if (!verifyFastBytesHashSet.contains(key)) { // Unique keys for this test. break; } } - map.testPutRow(key); - verifyTable.add(key); - // verifyTable.verify(map); + hashSet.testPutRow(key); + verifyFastBytesHashSet.add(key); + // verifyFastBytesHashSet.verify(factory, hashSet); } else { - byte[] randomExistingKey = verifyTable.addRandomExisting(value, random); - map.testPutRow(randomExistingKey); - // verifyTable.verify(map); + byte[] randomExistingKey = verifyFastBytesHashSet.addRandomExisting(value, random); + hashSet.testPutRow(randomExistingKey); + // verifyFastBytesHashSet.verify(factory, hashSet); } } - verifyTable.verify(map); + verifyFastBytesHashSet.verify(factory, hashSet); } @Test public void testMultipleKeysMultipleValue() throws Exception { random = new Random(9); + VectorMapJoinFastHashTableFactory factory = + new VectorMapJoinFastHashTableFactory(HashTableKind.HASH_SET, HashTableKeyType.MULTI_KEY); + // Use a large capacity that doesn't require expansion, yet. - VectorMapJoinFastMultiKeyHashSet map = - new VectorMapJoinFastMultiKeyHashSet( - false,LARGE_CAPACITY, LOAD_FACTOR, LARGE_WB_SIZE); + VectorMapJoinFastMultiKeyHashSet hashSet = + (VectorMapJoinFastMultiKeyHashSet) + factory.createHashTable(CAPACITY, LOAD_FACTOR, LARGE_WB_SIZE, 0); - VerifyFastBytesHashSet verifyTable = new VerifyFastBytesHashSet(); + VerifyFastBytesHashSet verifyFastBytesHashSet = new VerifyFastBytesHashSet(); int keyCount = 1000; - addAndVerifyMultipleKeyMultipleValue(keyCount, map, verifyTable); + addAndVerifyMultipleKeyMultipleValue(keyCount, factory, hashSet, verifyFastBytesHashSet); } @Test public void testLargeAndExpand() throws Exception { random = new Random(8462); + VectorMapJoinFastHashTableFactory factory = + new VectorMapJoinFastHashTableFactory(HashTableKind.HASH_SET, HashTableKeyType.MULTI_KEY); + // Use a large capacity that doesn't require expansion, yet. 
- VectorMapJoinFastMultiKeyHashSet map = - new VectorMapJoinFastMultiKeyHashSet( - false,MODERATE_CAPACITY, LOAD_FACTOR, MODERATE_WB_SIZE); + VectorMapJoinFastMultiKeyHashSet hashSet = + (VectorMapJoinFastMultiKeyHashSet) + factory.createHashTable(CAPACITY, LOAD_FACTOR, MODERATE_WB_SIZE, 0); - VerifyFastBytesHashSet verifyTable = new VerifyFastBytesHashSet(); + VerifyFastBytesHashSet verifyFastBytesHashSet = new VerifyFastBytesHashSet(); int keyCount = 1000; - addAndVerifyMultipleKeyMultipleValue(keyCount, map, verifyTable); + addAndVerifyMultipleKeyMultipleValue(keyCount, factory, hashSet, verifyFastBytesHashSet); } } diff --git ql/src/test/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/TestVectorMapJoinFastLongHashMap.java ql/src/test/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/TestVectorMapJoinFastLongHashMap.java index e8bbee3..550a7ea 100644 --- ql/src/test/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/TestVectorMapJoinFastLongHashMap.java +++ ql/src/test/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/TestVectorMapJoinFastLongHashMap.java @@ -21,12 +21,14 @@ import java.io.IOException; import java.util.Random; -import org.apache.hadoop.hive.ql.exec.JoinUtil; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashMapResult; import org.apache.hadoop.hive.ql.exec.vector.mapjoin.fast.CheckFastHashTable.VerifyFastLongHashMap; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashMapResult; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableResult.MapJoinResult; import org.apache.hadoop.hive.ql.exec.vector.mapjoin.fast.VectorMapJoinFastLongHashMap; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.plan.VectorMapJoinDesc.HashTableKeyType; +import org.apache.hadoop.hive.ql.plan.VectorMapJoinDesc.HashTableKind; +import org.apache.hive.common.util.HashCodeUtil; import org.junit.Test; import static org.junit.Assert.*; @@ -37,51 +39,57 @@ public void testOneKey() throws Exception { random = new Random(33221); - VectorMapJoinFastLongHashMap map = - new VectorMapJoinFastLongHashMap( - false, false, HashTableKeyType.LONG, CAPACITY, LOAD_FACTOR, WB_SIZE); + VectorMapJoinFastHashTableFactory factory = + new VectorMapJoinFastHashTableFactory(HashTableKind.HASH_MAP, HashTableKeyType.LONG); - VerifyFastLongHashMap verifyTable = new VerifyFastLongHashMap(); + VectorMapJoinFastLongHashMap hashMap = + (VectorMapJoinFastLongHashMap) + factory.createHashTable(CAPACITY, LOAD_FACTOR, WB_SIZE, 0); + + VerifyFastLongHashMap verifyFastLongHashMap = new VerifyFastLongHashMap(); long key = random.nextLong(); byte[] value = new byte[random.nextInt(MAX_VALUE_LENGTH)]; random.nextBytes(value); - map.testPutRow(key, value); - verifyTable.add(key, value); - verifyTable.verify(map); + hashMap.testPutRow(key, value); + verifyFastLongHashMap.add(key, value); + verifyFastLongHashMap.verify(factory, hashMap); // Second value. value = new byte[random.nextInt(MAX_VALUE_LENGTH)]; random.nextBytes(value); - map.testPutRow(key, value); - verifyTable.add(key, value); - verifyTable.verify(map); + hashMap.testPutRow(key, value); + verifyFastLongHashMap.add(key, value); + verifyFastLongHashMap.verify(factory, hashMap); // Third value. 
value = new byte[random.nextInt(MAX_VALUE_LENGTH)]; random.nextBytes(value); - map.testPutRow(key, value); - verifyTable.add(key, value); - verifyTable.verify(map); + hashMap.testPutRow(key, value); + verifyFastLongHashMap.add(key, value); + verifyFastLongHashMap.verify(factory, hashMap); } @Test public void testMultipleKeysSingleValue() throws Exception { random = new Random(900); - VectorMapJoinFastLongHashMap map = - new VectorMapJoinFastLongHashMap( - false, false, HashTableKeyType.LONG, CAPACITY, LOAD_FACTOR, WB_SIZE); + VectorMapJoinFastHashTableFactory factory = + new VectorMapJoinFastHashTableFactory(HashTableKind.HASH_MAP, HashTableKeyType.LONG); + + VectorMapJoinFastLongHashMap hashMap = + (VectorMapJoinFastLongHashMap) + factory.createHashTable(CAPACITY, LOAD_FACTOR, WB_SIZE, 0); - VerifyFastLongHashMap verifyTable = new VerifyFastLongHashMap(); + VerifyFastLongHashMap verifyFastLongHashMap = new VerifyFastLongHashMap(); int keyCount = 100 + random.nextInt(1000); for (int i = 0; i < keyCount; i++) { long key; while (true) { key = random.nextLong(); - if (!verifyTable.contains(key)) { + if (!verifyFastLongHashMap.contains(key)) { // Unique keys for this test. break; } @@ -89,45 +97,50 @@ public void testMultipleKeysSingleValue() throws Exception { byte[] value = new byte[random.nextInt(MAX_VALUE_LENGTH)]; random.nextBytes(value); - map.testPutRow(key, value); - verifyTable.add(key, value); - // verifyTable.verify(map); + hashMap.testPutRow(key, value); + verifyFastLongHashMap.add(key, value); + // verifyFastLongHashMap.verify(factory, hashMap); } - verifyTable.verify(map); + verifyFastLongHashMap.verify(factory, hashMap); } @Test public void testGetNonExistent() throws Exception { random = new Random(450); - VectorMapJoinFastLongHashMap map = - new VectorMapJoinFastLongHashMap( - false, false, HashTableKeyType.LONG, CAPACITY, LOAD_FACTOR, WB_SIZE); + VectorMapJoinFastHashTableFactory factory = + new VectorMapJoinFastHashTableFactory(HashTableKind.HASH_MAP, HashTableKeyType.LONG); - VerifyFastLongHashMap verifyTable = new VerifyFastLongHashMap(); + VectorMapJoinFastLongHashMap hashMap = + (VectorMapJoinFastLongHashMap) + factory.createHashTable(CAPACITY, LOAD_FACTOR, WB_SIZE, 0); + + VerifyFastLongHashMap verifyFastLongHashMap = new VerifyFastLongHashMap(); long key1 = random.nextLong(); byte[] value = new byte[random.nextInt(MAX_VALUE_LENGTH)]; random.nextBytes(value); - map.testPutRow(key1, value); - verifyTable.add(key1, value); - verifyTable.verify(map); + hashMap.testPutRow(key1, value); + verifyFastLongHashMap.add(key1, value); + verifyFastLongHashMap.verify(factory, hashMap); long key2 = key1 += 1; - VectorMapJoinHashMapResult hashMapResult = map.createHashMapResult(); - JoinUtil.JoinResult joinResult = map.lookup(key2, hashMapResult); - assertTrue(joinResult == JoinUtil.JoinResult.NOMATCH); + MapJoinHashMapResult hashMapResult = factory.createHashMapResult(); + int hashCode = HashCodeUtil.calculateLongHashCode(key2); + hashMap.hashMapLookup(key2, hashCode, hashMapResult); + assertTrue(hashMapResult.getMapJoinResult() == MapJoinResult.NO_MATCH); assertTrue(!hashMapResult.hasRows()); - map.testPutRow(key2, value); - verifyTable.add(key2, value); - verifyTable.verify(map); + hashMap.testPutRow(key2, value); + verifyFastLongHashMap.add(key2, value); + verifyFastLongHashMap.verify(factory, hashMap); long key3 = key2 += 1; - hashMapResult = map.createHashMapResult(); - joinResult = map.lookup(key3, hashMapResult); - assertTrue(joinResult == JoinUtil.JoinResult.NOMATCH); + 
hashMapResult = factory.createHashMapResult(); + hashCode = HashCodeUtil.calculateLongHashCode(key3); + hashMap.hashMapLookup(key3, hashCode, hashMapResult); + assertTrue(hashMapResult.getMapJoinResult() == MapJoinResult.NO_MATCH); assertTrue(!hashMapResult.hasRows()); } @@ -135,18 +148,21 @@ public void testGetNonExistent() throws Exception { public void testFullMap() throws Exception { random = new Random(93440); - // Make sure the map does not expand; should be able to find space. - VectorMapJoinFastLongHashMap map = - new VectorMapJoinFastLongHashMap( - false, false, HashTableKeyType.LONG, CAPACITY, 1f, WB_SIZE); + VectorMapJoinFastHashTableFactory factory = + new VectorMapJoinFastHashTableFactory(HashTableKind.HASH_MAP, HashTableKeyType.LONG); + + // Make sure the hashMap does not expand; should be able to find space. + VectorMapJoinFastLongHashMap hashMap = + (VectorMapJoinFastLongHashMap) + factory.createHashTable(CAPACITY, 1f, WB_SIZE, 0); - VerifyFastLongHashMap verifyTable = new VerifyFastLongHashMap(); + VerifyFastLongHashMap verifyFastLongHashMap = new VerifyFastLongHashMap(); for (int i = 0; i < CAPACITY; i++) { long key; while (true) { key = random.nextLong(); - if (!verifyTable.contains(key)) { + if (!verifyFastLongHashMap.contains(key)) { // Unique keys for this test. break; } @@ -154,42 +170,47 @@ public void testFullMap() throws Exception { byte[] value = new byte[random.nextInt(MAX_VALUE_LENGTH)]; random.nextBytes(value); - map.testPutRow(key, value); - verifyTable.add(key, value); - // verifyTable.verify(map); + hashMap.testPutRow(key, value); + verifyFastLongHashMap.add(key, value); + // verifyFastLongHashMap.verify(factory, hashMap); } - verifyTable.verify(map); + verifyFastLongHashMap.verify(factory, hashMap); long anotherKey; while (true) { anotherKey = random.nextLong(); - if (!verifyTable.contains(anotherKey)) { + if (!verifyFastLongHashMap.contains(anotherKey)) { // Unique keys for this test. break; } } - VectorMapJoinHashMapResult hashMapResult = map.createHashMapResult(); - JoinUtil.JoinResult joinResult = map.lookup(anotherKey, hashMapResult); - assertTrue(joinResult == JoinUtil.JoinResult.NOMATCH); + MapJoinHashMapResult hashMapResult = factory.createHashMapResult(); + int hashCode = HashCodeUtil.calculateLongHashCode(anotherKey); + hashMap.hashMapLookup(anotherKey, hashCode, hashMapResult); + assertTrue(hashMapResult.getMapJoinResult() == MapJoinResult.NO_MATCH); } @Test public void testExpand() throws Exception { - random = new Random(5227); + + random = new Random(22470); + + VectorMapJoinFastHashTableFactory factory = + new VectorMapJoinFastHashTableFactory(HashTableKind.HASH_MAP, HashTableKeyType.LONG); // Start with capacity 1; make sure we expand on every put. - VectorMapJoinFastLongHashMap map = - new VectorMapJoinFastLongHashMap( - false, false, HashTableKeyType.LONG, 1, 0.0000001f, WB_SIZE); + VectorMapJoinFastLongHashMap hashMap = + (VectorMapJoinFastLongHashMap) + factory.createHashTable(1, 0.0000001f, WB_SIZE, 0); - VerifyFastLongHashMap verifyTable = new VerifyFastLongHashMap(); + VerifyFastLongHashMap verifyFastLongHashMap = new VerifyFastLongHashMap(); for (int i = 0; i < 18; ++i) { long key; while (true) { key = random.nextLong(); - if (!verifyTable.contains(key)) { + if (!verifyFastLongHashMap.contains(key)) { // Unique keys for this test. 
break; } @@ -197,70 +218,77 @@ public void testExpand() throws Exception { byte[] value = new byte[random.nextInt(MAX_VALUE_LENGTH)]; random.nextBytes(value); - map.testPutRow(key, value); - verifyTable.add(key, value); - // verifyTable.verify(map); + hashMap.testPutRow(key, value); + verifyFastLongHashMap.add(key, value); + // verifyFastLongHashMap.verify(factory, hashMap); } - verifyTable.verify(map); - // assertEquals(1 << 18, map.getCapacity()); + verifyFastLongHashMap.verify(factory, hashMap); + // assertEquals(1 << 18, hashMap.getCapacity()); } public void addAndVerifyMultipleKeyMultipleValue(int keyCount, - VectorMapJoinFastLongHashMap map, VerifyFastLongHashMap verifyTable) + VectorMapJoinFastHashTableFactory factory, + VectorMapJoinFastLongHashMap hashMap, VerifyFastLongHashMap verifyFastLongHashMap) throws HiveException, IOException { for (int i = 0; i < keyCount; i++) { byte[] value = new byte[generateLargeCount() - 1]; random.nextBytes(value); // Add a new key or add a value to an existing key? - if (random.nextBoolean() || verifyTable.getCount() == 0) { + if (random.nextBoolean() || verifyFastLongHashMap.getCount() == 0) { long key; while (true) { key = random.nextLong(); - if (!verifyTable.contains(key)) { + if (!verifyFastLongHashMap.contains(key)) { // Unique keys for this test. break; } } - map.testPutRow(key, value); - verifyTable.add(key, value); - verifyTable.verify(map); + hashMap.testPutRow(key, value); + verifyFastLongHashMap.add(key, value); + verifyFastLongHashMap.verify(factory, hashMap); } else { - long randomExistingKey = verifyTable.addRandomExisting(value, random); - map.testPutRow(randomExistingKey, value); - // verifyTable.verify(map); + long randomExistingKey = verifyFastLongHashMap.addRandomExisting(value, random); + hashMap.testPutRow(randomExistingKey, value); + // verifyFastLongHashMap.verify(factory, hashMap); } - verifyTable.verify(map); + verifyFastLongHashMap.verify(factory, hashMap); } } @Test public void testMultipleKeysMultipleValue() throws Exception { random = new Random(8); - // Use a large capacity that doesn't require expansion, yet. - VectorMapJoinFastLongHashMap map = - new VectorMapJoinFastLongHashMap( - false, false, HashTableKeyType.LONG, LARGE_CAPACITY, LOAD_FACTOR, LARGE_WB_SIZE); + VectorMapJoinFastHashTableFactory factory = + new VectorMapJoinFastHashTableFactory(HashTableKind.HASH_MAP, HashTableKeyType.LONG); - VerifyFastLongHashMap verifyTable = new VerifyFastLongHashMap(); + // Use a large capacity that doesn't require expansion, yet. + VectorMapJoinFastLongHashMap hashMap = + (VectorMapJoinFastLongHashMap) + factory.createHashTable(LARGE_CAPACITY, LOAD_FACTOR, WB_SIZE, 0); + + VerifyFastLongHashMap verifyFastLongHashMap = new VerifyFastLongHashMap(); int keyCount = 1000; - addAndVerifyMultipleKeyMultipleValue(keyCount, map, verifyTable); + addAndVerifyMultipleKeyMultipleValue(keyCount, factory, hashMap, verifyFastLongHashMap); } @Test public void testLargeAndExpand() throws Exception { random = new Random(20); - // Use a large capacity that doesn't require expansion, yet. - VectorMapJoinFastLongHashMap map = - new VectorMapJoinFastLongHashMap( - false, false, HashTableKeyType.LONG, MODERATE_CAPACITY, LOAD_FACTOR, MODERATE_WB_SIZE); + VectorMapJoinFastHashTableFactory factory = + new VectorMapJoinFastHashTableFactory(HashTableKind.HASH_MAP, HashTableKeyType.LONG); + + // Use a large capacity that doesn't require expansion, yet. 
+ VectorMapJoinFastLongHashMap hashMap = + (VectorMapJoinFastLongHashMap) + factory.createHashTable(MODERATE_CAPACITY, LOAD_FACTOR, WB_SIZE, 0); - VerifyFastLongHashMap verifyTable = new VerifyFastLongHashMap(); + VerifyFastLongHashMap verifyFastLongHashMap = new VerifyFastLongHashMap(); int keyCount = 1000; - addAndVerifyMultipleKeyMultipleValue(keyCount, map, verifyTable); + addAndVerifyMultipleKeyMultipleValue(keyCount, factory, hashMap, verifyFastLongHashMap); } } diff --git ql/src/test/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/TestVectorMapJoinFastLongHashMultiSet.java ql/src/test/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/TestVectorMapJoinFastLongHashMultiSet.java index 9e94611..bd20a8c 100644 --- ql/src/test/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/TestVectorMapJoinFastLongHashMultiSet.java +++ ql/src/test/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/TestVectorMapJoinFastLongHashMultiSet.java @@ -21,12 +21,14 @@ import java.io.IOException; import java.util.Random; -import org.apache.hadoop.hive.ql.exec.JoinUtil; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashMultiSetResult; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashMultiSetResult; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableResult.MapJoinResult; import org.apache.hadoop.hive.ql.exec.vector.mapjoin.fast.CheckFastHashTable.VerifyFastLongHashMultiSet; import org.apache.hadoop.hive.ql.exec.vector.mapjoin.fast.VectorMapJoinFastLongHashMultiSet; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.plan.VectorMapJoinDesc.HashTableKeyType; +import org.apache.hadoop.hive.ql.plan.VectorMapJoinDesc.HashTableKind; +import org.apache.hive.common.util.HashCodeUtil; import org.junit.Test; import static org.junit.Assert.*; @@ -37,87 +39,98 @@ public void testOneKey() throws Exception { random = new Random(458); - VectorMapJoinFastLongHashMultiSet map = - new VectorMapJoinFastLongHashMultiSet( - false, false, HashTableKeyType.LONG, CAPACITY, LOAD_FACTOR, WB_SIZE); + VectorMapJoinFastHashTableFactory factory = + new VectorMapJoinFastHashTableFactory(HashTableKind.HASH_MULTISET, HashTableKeyType.LONG); - VerifyFastLongHashMultiSet verifyTable = new VerifyFastLongHashMultiSet(); + VectorMapJoinFastLongHashMultiSet hashMultiSet = + (VectorMapJoinFastLongHashMultiSet) + factory.createHashTable(CAPACITY, LOAD_FACTOR, WB_SIZE, 0); + + VerifyFastLongHashMultiSet verifyFastLongHashMultiSet = new VerifyFastLongHashMultiSet(); long key = random.nextLong(); - map.testPutRow(key); - verifyTable.add(key); - verifyTable.verify(map); + hashMultiSet.testPutRow(key); + verifyFastLongHashMultiSet.add(key); + verifyFastLongHashMultiSet.verify(factory, hashMultiSet); // Second time. - map.testPutRow(key); - verifyTable.add(key); - verifyTable.verify(map); + hashMultiSet.testPutRow(key); + verifyFastLongHashMultiSet.add(key); + verifyFastLongHashMultiSet.verify(factory, hashMultiSet); // Third time. 
- map.testPutRow(key); - verifyTable.add(key); - verifyTable.verify(map); + hashMultiSet.testPutRow(key); + verifyFastLongHashMultiSet.add(key); + verifyFastLongHashMultiSet.verify(factory, hashMultiSet); } @Test public void testMultipleKeysSingleValue() throws Exception { random = new Random(8000); - VectorMapJoinFastLongHashMultiSet map = - new VectorMapJoinFastLongHashMultiSet( - false, false, HashTableKeyType.LONG, CAPACITY, LOAD_FACTOR, WB_SIZE); + VectorMapJoinFastHashTableFactory factory = + new VectorMapJoinFastHashTableFactory(HashTableKind.HASH_MULTISET, HashTableKeyType.LONG); + + VectorMapJoinFastLongHashMultiSet hashMultiSet = + (VectorMapJoinFastLongHashMultiSet) + factory.createHashTable(CAPACITY, LOAD_FACTOR, WB_SIZE, 0); - VerifyFastLongHashMultiSet verifyTable = new VerifyFastLongHashMultiSet(); + VerifyFastLongHashMultiSet verifyFastLongHashMultiSet = new VerifyFastLongHashMultiSet(); int keyCount = 100 + random.nextInt(1000); for (int i = 0; i < keyCount; i++) { long key; while (true) { key = random.nextLong(); - if (!verifyTable.contains(key)) { + if (!verifyFastLongHashMultiSet.contains(key)) { // Unique keys for this test. break; } } - map.testPutRow(key); - verifyTable.add(key); - // verifyTable.verify(map); + hashMultiSet.testPutRow(key); + verifyFastLongHashMultiSet.add(key); + // verifyFastLongHashMultiSet.verify(factory, hashMultiSet); } - verifyTable.verify(map); + verifyFastLongHashMultiSet.verify(factory, hashMultiSet); } @Test public void testGetNonExistent() throws Exception { random = new Random(4000); - VectorMapJoinFastLongHashMultiSet map = - new VectorMapJoinFastLongHashMultiSet( - false, false, HashTableKeyType.LONG, CAPACITY, LOAD_FACTOR, WB_SIZE); + VectorMapJoinFastHashTableFactory factory = + new VectorMapJoinFastHashTableFactory(HashTableKind.HASH_MULTISET, HashTableKeyType.LONG); - VerifyFastLongHashMultiSet verifyTable = new VerifyFastLongHashMultiSet(); + VectorMapJoinFastLongHashMultiSet hashMultiSet = + (VectorMapJoinFastLongHashMultiSet) + factory.createHashTable(CAPACITY, LOAD_FACTOR, WB_SIZE, 0); + + VerifyFastLongHashMultiSet verifyFastLongHashMultiSet = new VerifyFastLongHashMultiSet(); long key1 = random.nextLong(); - map.testPutRow(key1); - verifyTable.add(key1); - verifyTable.verify(map); + hashMultiSet.testPutRow(key1); + verifyFastLongHashMultiSet.add(key1); + verifyFastLongHashMultiSet.verify(factory, hashMultiSet); long key2 = key1 += 1; - VectorMapJoinHashMultiSetResult hashMultiSetResult = map.createHashMultiSetResult(); - JoinUtil.JoinResult joinResult = map.contains(key2, hashMultiSetResult); - assertTrue(joinResult == JoinUtil.JoinResult.NOMATCH); + MapJoinHashMultiSetResult hashMultiSetResult = factory.createHashMultiSetResult(); + int hashCode = HashCodeUtil.calculateLongHashCode(key2); + hashMultiSet.hashMultiSetContains(key2, hashCode, hashMultiSetResult); + assertTrue(hashMultiSetResult.getMapJoinResult() == MapJoinResult.NO_MATCH); assertEquals(hashMultiSetResult.count(), 0); - map.testPutRow(key2); - verifyTable.add(key2); - verifyTable.verify(map); + hashMultiSet.testPutRow(key2); + verifyFastLongHashMultiSet.add(key2); + verifyFastLongHashMultiSet.verify(factory, hashMultiSet); long key3 = key2 += 1; - hashMultiSetResult = map.createHashMultiSetResult(); - joinResult = map.contains(key3, hashMultiSetResult); - assertTrue(joinResult == JoinUtil.JoinResult.NOMATCH); + hashMultiSetResult = factory.createHashMultiSetResult(); + hashCode = HashCodeUtil.calculateLongHashCode(key3); + 
hashMultiSet.hashMultiSetContains(key3, hashCode, hashMultiSetResult); + assertTrue(hashMultiSetResult.getMapJoinResult() == MapJoinResult.NO_MATCH); assertEquals(hashMultiSetResult.count(), 0); } @@ -125,128 +138,142 @@ public void testGetNonExistent() throws Exception { public void testFullMap() throws Exception { random = new Random(25000); - // Make sure the map does not expand; should be able to find space. - VectorMapJoinFastLongHashMultiSet map = - new VectorMapJoinFastLongHashMultiSet( - false, false, HashTableKeyType.LONG, CAPACITY, 1f, WB_SIZE); + VectorMapJoinFastHashTableFactory factory = + new VectorMapJoinFastHashTableFactory(HashTableKind.HASH_MULTISET, HashTableKeyType.LONG); + + // Make sure the hashMultiSet does not expand; should be able to find space. + VectorMapJoinFastLongHashMultiSet hashMultiSet = + (VectorMapJoinFastLongHashMultiSet) + factory.createHashTable(CAPACITY, 1f, WB_SIZE, 0); - VerifyFastLongHashMultiSet verifyTable = new VerifyFastLongHashMultiSet(); + VerifyFastLongHashMultiSet verifyFastLongHashMultiSet = new VerifyFastLongHashMultiSet(); for (int i = 0; i < CAPACITY; i++) { long key; while (true) { key = random.nextLong(); - if (!verifyTable.contains(key)) { + if (!verifyFastLongHashMultiSet.contains(key)) { // Unique keys for this test. break; } } - map.testPutRow(key); - verifyTable.add(key); - // verifyTable.verify(map); + hashMultiSet.testPutRow(key); + verifyFastLongHashMultiSet.add(key); + // verifyFastLongHashMultiSet.verify(factory, hashMultiSet); } - verifyTable.verify(map); + verifyFastLongHashMultiSet.verify(factory, hashMultiSet); long anotherKey; while (true) { anotherKey = random.nextLong(); - if (!verifyTable.contains(anotherKey)) { + if (!verifyFastLongHashMultiSet.contains(anotherKey)) { // Unique keys for this test. break; } } - VectorMapJoinHashMultiSetResult hashMultiSetResult = map.createHashMultiSetResult(); - JoinUtil.JoinResult joinResult = map.contains(anotherKey, hashMultiSetResult); - assertTrue(joinResult == JoinUtil.JoinResult.NOMATCH); + MapJoinHashMultiSetResult hashMultiSetResult = factory.createHashMultiSetResult(); + int hashCode = HashCodeUtil.calculateLongHashCode(anotherKey); + hashMultiSet.hashMultiSetContains(anotherKey, hashCode, hashMultiSetResult); + assertTrue(hashMultiSetResult.getMapJoinResult() == MapJoinResult.NO_MATCH); } @Test public void testExpand() throws Exception { random = new Random(30000); + VectorMapJoinFastHashTableFactory factory = + new VectorMapJoinFastHashTableFactory(HashTableKind.HASH_MULTISET, HashTableKeyType.LONG); + // Start with capacity 1; make sure we expand on every put. - VectorMapJoinFastLongHashMultiSet map = - new VectorMapJoinFastLongHashMultiSet( - false, false, HashTableKeyType.LONG, 1, 0.0000001f, WB_SIZE); + VectorMapJoinFastLongHashMultiSet hashMultiSet = + (VectorMapJoinFastLongHashMultiSet) + factory.createHashTable(1, 0.0000001f, WB_SIZE, 0); - VerifyFastLongHashMultiSet verifyTable = new VerifyFastLongHashMultiSet(); + VerifyFastLongHashMultiSet verifyFastLongHashMultiSet = new VerifyFastLongHashMultiSet(); for (int i = 0; i < 18; ++i) { long key; while (true) { key = random.nextLong(); - if (!verifyTable.contains(key)) { + if (!verifyFastLongHashMultiSet.contains(key)) { // Unique keys for this test. 
break; } } - map.testPutRow(key); - verifyTable.add(key); - // verifyTable.verify(map); + hashMultiSet.testPutRow(key); + verifyFastLongHashMultiSet.add(key); + // verifyFastLongHashMultiSet.verify(factory, hashMultiSet); } - verifyTable.verify(map); - // assertEquals(1 << 18, map.getCapacity()); + verifyFastLongHashMultiSet.verify(factory, hashMultiSet); + // assertEquals(1 << 18, hashMultiSet.getCapacity()); } public void addAndVerifyMultipleKeyMultipleValue(int keyCount, - VectorMapJoinFastLongHashMultiSet map, VerifyFastLongHashMultiSet verifyTable) + VectorMapJoinFastHashTableFactory factory, + VectorMapJoinFastLongHashMultiSet hashMultiSet, VerifyFastLongHashMultiSet verifyFastLongHashMultiSet) throws HiveException, IOException { for (int i = 0; i < keyCount; i++) { byte[] value = new byte[generateLargeCount() - 1]; random.nextBytes(value); // Add a new key or add a value to an existing key? - if (random.nextBoolean() || verifyTable.getCount() == 0) { + if (random.nextBoolean() || verifyFastLongHashMultiSet.getCount() == 0) { long key; while (true) { key = random.nextLong(); - if (!verifyTable.contains(key)) { + if (!verifyFastLongHashMultiSet.contains(key)) { // Unique keys for this test. break; } } - map.testPutRow(key); - verifyTable.add(key); - verifyTable.verify(map); + hashMultiSet.testPutRow(key); + verifyFastLongHashMultiSet.add(key); + verifyFastLongHashMultiSet.verify(factory, hashMultiSet); } else { - long randomExistingKey = verifyTable.addRandomExisting(value, random); - map.testPutRow(randomExistingKey); - // verifyTable.verify(map); + long randomExistingKey = verifyFastLongHashMultiSet.addRandomExisting(value, random); + hashMultiSet.testPutRow(randomExistingKey); + // verifyFastLongHashMultiSet.verify(factory, hashMultiSet); } - verifyTable.verify(map); + verifyFastLongHashMultiSet.verify(factory, hashMultiSet); } } @Test public void testMultipleKeysMultipleValue() throws Exception { random = new Random(333); + VectorMapJoinFastHashTableFactory factory = + new VectorMapJoinFastHashTableFactory(HashTableKind.HASH_MULTISET, HashTableKeyType.LONG); + // Use a large capacity that doesn't require expansion, yet. - VectorMapJoinFastLongHashMultiSet map = - new VectorMapJoinFastLongHashMultiSet( - false, false, HashTableKeyType.LONG, LARGE_CAPACITY, LOAD_FACTOR, LARGE_WB_SIZE); + VectorMapJoinFastLongHashMultiSet hashMultiSet = + (VectorMapJoinFastLongHashMultiSet) + factory.createHashTable(LARGE_CAPACITY, LOAD_FACTOR, WB_SIZE, 0); - VerifyFastLongHashMultiSet verifyTable = new VerifyFastLongHashMultiSet(); + VerifyFastLongHashMultiSet verifyFastLongHashMultiSet = new VerifyFastLongHashMultiSet(); int keyCount = 1000; - addAndVerifyMultipleKeyMultipleValue(keyCount, map, verifyTable); + addAndVerifyMultipleKeyMultipleValue(keyCount, factory, hashMultiSet, verifyFastLongHashMultiSet); } @Test public void testLargeAndExpand() throws Exception { random = new Random(790); + VectorMapJoinFastHashTableFactory factory = + new VectorMapJoinFastHashTableFactory(HashTableKind.HASH_MULTISET, HashTableKeyType.LONG); + // Use a large capacity that doesn't require expansion, yet. 
- VectorMapJoinFastLongHashMultiSet map = - new VectorMapJoinFastLongHashMultiSet( - false, false, HashTableKeyType.LONG, MODERATE_CAPACITY, LOAD_FACTOR, MODERATE_WB_SIZE); + VectorMapJoinFastLongHashMultiSet hashMultiSet = + (VectorMapJoinFastLongHashMultiSet) + factory.createHashTable(MODERATE_CAPACITY, LOAD_FACTOR, WB_SIZE, 0); - VerifyFastLongHashMultiSet verifyTable = new VerifyFastLongHashMultiSet(); + VerifyFastLongHashMultiSet verifyFastLongHashMultiSet = new VerifyFastLongHashMultiSet(); int keyCount = 1000; - addAndVerifyMultipleKeyMultipleValue(keyCount, map, verifyTable); + addAndVerifyMultipleKeyMultipleValue(keyCount, factory, hashMultiSet, verifyFastLongHashMultiSet); } } diff --git ql/src/test/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/TestVectorMapJoinFastLongHashSet.java ql/src/test/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/TestVectorMapJoinFastLongHashSet.java index 698bcdc..c57be07 100644 --- ql/src/test/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/TestVectorMapJoinFastLongHashSet.java +++ ql/src/test/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/TestVectorMapJoinFastLongHashSet.java @@ -22,11 +22,14 @@ import java.util.Random; import org.apache.hadoop.hive.ql.exec.JoinUtil; -import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashSetResult; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashSetResult; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableResult.MapJoinResult; import org.apache.hadoop.hive.ql.exec.vector.mapjoin.fast.CheckFastHashTable.VerifyFastLongHashSet; import org.apache.hadoop.hive.ql.exec.vector.mapjoin.fast.VectorMapJoinFastLongHashSet; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.plan.VectorMapJoinDesc.HashTableKeyType; +import org.apache.hadoop.hive.ql.plan.VectorMapJoinDesc.HashTableKind; +import org.apache.hive.common.util.HashCodeUtil; import org.junit.Test; import static org.junit.Assert.*; @@ -37,214 +40,239 @@ public void testOneKey() throws Exception { random = new Random(4186); - VectorMapJoinFastLongHashSet map = - new VectorMapJoinFastLongHashSet( - false, false, HashTableKeyType.LONG, CAPACITY, LOAD_FACTOR, WB_SIZE); + VectorMapJoinFastHashTableFactory factory = + new VectorMapJoinFastHashTableFactory(HashTableKind.HASH_SET, HashTableKeyType.LONG); - VerifyFastLongHashSet verifyTable = new VerifyFastLongHashSet(); + VectorMapJoinFastLongHashSet hashSet = + (VectorMapJoinFastLongHashSet) + factory.createHashTable(CAPACITY, LOAD_FACTOR, WB_SIZE, 0); + + VerifyFastLongHashSet verifyFastLongHashSet = new VerifyFastLongHashSet(); long key = random.nextLong(); - map.testPutRow(key); - verifyTable.add(key); - verifyTable.verify(map); + hashSet.testPutRow(key); + verifyFastLongHashSet.add(key); + verifyFastLongHashSet.verify(factory, hashSet); // Second time. - map.testPutRow(key); - verifyTable.add(key); - verifyTable.verify(map); + hashSet.testPutRow(key); + verifyFastLongHashSet.add(key); + verifyFastLongHashSet.verify(factory, hashSet); // Third time. 
- map.testPutRow(key); - verifyTable.add(key); - verifyTable.verify(map); + hashSet.testPutRow(key); + verifyFastLongHashSet.add(key); + verifyFastLongHashSet.verify(factory, hashSet); } @Test public void testMultipleKeysSingleValue() throws Exception { random = new Random(1412); - VectorMapJoinFastLongHashSet map = - new VectorMapJoinFastLongHashSet( - false, false, HashTableKeyType.LONG, CAPACITY, LOAD_FACTOR, WB_SIZE); + VectorMapJoinFastHashTableFactory factory = + new VectorMapJoinFastHashTableFactory(HashTableKind.HASH_SET, HashTableKeyType.LONG); + + VectorMapJoinFastLongHashSet hashSet = + (VectorMapJoinFastLongHashSet) + factory.createHashTable(CAPACITY, LOAD_FACTOR, WB_SIZE, 0); - VerifyFastLongHashSet verifyTable = new VerifyFastLongHashSet(); + VerifyFastLongHashSet verifyFastLongHashSet = new VerifyFastLongHashSet(); int keyCount = 100 + random.nextInt(1000); for (int i = 0; i < keyCount; i++) { long key; while (true) { key = random.nextLong(); - if (!verifyTable.contains(key)) { + if (!verifyFastLongHashSet.contains(key)) { // Unique keys for this test. break; } } - map.testPutRow(key); - verifyTable.add(key); - // verifyTable.verify(map); + hashSet.testPutRow(key); + verifyFastLongHashSet.add(key); + // verifyFastLongHashSet.verify(factory, hashSet); } - verifyTable.verify(map); + verifyFastLongHashSet.verify(factory, hashSet); } @Test public void testGetNonExistent() throws Exception { random = new Random(100); - VectorMapJoinFastLongHashSet map = - new VectorMapJoinFastLongHashSet( - false, false, HashTableKeyType.LONG, CAPACITY, LOAD_FACTOR, WB_SIZE); + VectorMapJoinFastHashTableFactory factory = + new VectorMapJoinFastHashTableFactory(HashTableKind.HASH_SET, HashTableKeyType.LONG); - VerifyFastLongHashSet verifyTable = new VerifyFastLongHashSet(); + VectorMapJoinFastLongHashSet hashSet = + (VectorMapJoinFastLongHashSet) + factory.createHashTable(CAPACITY, LOAD_FACTOR, WB_SIZE, 0); + + VerifyFastLongHashSet verifyFastLongHashSet = new VerifyFastLongHashSet(); long key1 = random.nextLong(); - map.testPutRow(key1); - verifyTable.add(key1); - verifyTable.verify(map); + hashSet.testPutRow(key1); + verifyFastLongHashSet.add(key1); + verifyFastLongHashSet.verify(factory, hashSet); long key2 = key1 += 1; - VectorMapJoinHashSetResult hashSetResult = map.createHashSetResult(); - JoinUtil.JoinResult joinResult = map.contains(key2, hashSetResult); - assertTrue(joinResult == JoinUtil.JoinResult.NOMATCH); + MapJoinHashSetResult hashSetResult = factory.createHashSetResult(); + int hashCode = HashCodeUtil.calculateLongHashCode(key2); + hashSet.hashSetContains(key2, hashCode, hashSetResult); + assertTrue(hashSetResult.getMapJoinResult() == MapJoinResult.NO_MATCH); - map.testPutRow(key2); - verifyTable.add(key2); - verifyTable.verify(map); + hashSet.testPutRow(key2); + verifyFastLongHashSet.add(key2); + verifyFastLongHashSet.verify(factory, hashSet); long key3 = key2 += 1; - hashSetResult = map.createHashSetResult(); - joinResult = map.contains(key3, hashSetResult); - assertTrue(joinResult == JoinUtil.JoinResult.NOMATCH); + hashSetResult = factory.createHashSetResult(); + hashCode = HashCodeUtil.calculateLongHashCode(key3); + hashSet.hashSetContains(key3, hashCode, hashSetResult); + assertTrue(hashSetResult.getMapJoinResult() == MapJoinResult.NO_MATCH); } @Test public void testFullMap() throws Exception { random = new Random(2520); - // Make sure the map does not expand; should be able to find space. 
- VectorMapJoinFastLongHashSet map = - new VectorMapJoinFastLongHashSet( - false, false, HashTableKeyType.LONG, CAPACITY, 1f, WB_SIZE); + VectorMapJoinFastHashTableFactory factory = + new VectorMapJoinFastHashTableFactory(HashTableKind.HASH_SET, HashTableKeyType.LONG); + + // Make sure the hashSet does not expand; should be able to find space. + VectorMapJoinFastLongHashSet hashSet = + (VectorMapJoinFastLongHashSet) + factory.createHashTable(CAPACITY, 1f, WB_SIZE, 0); - VerifyFastLongHashSet verifyTable = new VerifyFastLongHashSet(); + VerifyFastLongHashSet verifyFastLongHashSet = new VerifyFastLongHashSet(); for (int i = 0; i < CAPACITY; i++) { long key; while (true) { key = random.nextLong(); - if (!verifyTable.contains(key)) { + if (!verifyFastLongHashSet.contains(key)) { // Unique keys for this test. break; } } - map.testPutRow(key); - verifyTable.add(key); - // verifyTable.verify(map); + hashSet.testPutRow(key); + verifyFastLongHashSet.add(key); + // verifyFastLongHashSet.verify(factory, hashSet); } - verifyTable.verify(map); + verifyFastLongHashSet.verify(factory, hashSet); long anotherKey; while (true) { anotherKey = random.nextLong(); - if (!verifyTable.contains(anotherKey)) { + if (!verifyFastLongHashSet.contains(anotherKey)) { // Unique keys for this test. break; } } - VectorMapJoinHashSetResult hashSetResult = map.createHashSetResult(); - JoinUtil.JoinResult joinResult = map.contains(anotherKey, hashSetResult); - assertTrue(joinResult == JoinUtil.JoinResult.NOMATCH); + MapJoinHashSetResult hashSetResult = factory.createHashSetResult(); + int hashCode = HashCodeUtil.calculateLongHashCode(anotherKey); + hashSet.hashSetContains(anotherKey, hashCode, hashSetResult); + assertTrue(hashSetResult.getMapJoinResult() == MapJoinResult.NO_MATCH); } @Test public void testExpand() throws Exception { random = new Random(348); + VectorMapJoinFastHashTableFactory factory = + new VectorMapJoinFastHashTableFactory(HashTableKind.HASH_SET, HashTableKeyType.LONG); + // Start with capacity 1; make sure we expand on every put. - VectorMapJoinFastLongHashSet map = - new VectorMapJoinFastLongHashSet( - false, false, HashTableKeyType.LONG, 1, 0.0000001f, WB_SIZE); + VectorMapJoinFastLongHashSet hashSet = + (VectorMapJoinFastLongHashSet) + factory.createHashTable(1, 0.0000001f, WB_SIZE, 0); - VerifyFastLongHashSet verifyTable = new VerifyFastLongHashSet(); + VerifyFastLongHashSet verifyFastLongHashSet = new VerifyFastLongHashSet(); for (int i = 0; i < 18; ++i) { long key; while (true) { key = random.nextLong(); - if (!verifyTable.contains(key)) { + if (!verifyFastLongHashSet.contains(key)) { // Unique keys for this test. break; } } - map.testPutRow(key); - verifyTable.add(key); - // verifyTable.verify(map); + hashSet.testPutRow(key); + verifyFastLongHashSet.add(key); + // verifyFastLongHashSet.verify(factory, hashSet); } - verifyTable.verify(map); - // assertEquals(1 << 18, map.getCapacity()); + verifyFastLongHashSet.verify(factory, hashSet); + // assertEquals(1 << 18, hashSet.getCapacity()); } public void addAndVerifyMultipleKeyMultipleValue(int keyCount, - VectorMapJoinFastLongHashSet map, VerifyFastLongHashSet verifyTable) + VectorMapJoinFastHashTableFactory factory, + VectorMapJoinFastLongHashSet hashSet, VerifyFastLongHashSet verifyFastLongHashSet) throws HiveException, IOException { for (int i = 0; i < keyCount; i++) { byte[] value = new byte[generateLargeCount() - 1]; random.nextBytes(value); // Add a new key or add a value to an existing key? 
- if (random.nextBoolean() || verifyTable.getCount() == 0) { + if (random.nextBoolean() || verifyFastLongHashSet.getCount() == 0) { long key; while (true) { key = random.nextLong(); - if (!verifyTable.contains(key)) { + if (!verifyFastLongHashSet.contains(key)) { // Unique keys for this test. break; } } - map.testPutRow(key); - verifyTable.add(key); - verifyTable.verify(map); + hashSet.testPutRow(key); + verifyFastLongHashSet.add(key); + verifyFastLongHashSet.verify(factory, hashSet); } else { - long randomExistingKey = verifyTable.addRandomExisting(value, random); - map.testPutRow(randomExistingKey); - // verifyTable.verify(map); + long randomExistingKey = verifyFastLongHashSet.addRandomExisting(value, random); + hashSet.testPutRow(randomExistingKey); + // verifyFastLongHashSet.verify(factory, hashSet); } - verifyTable.verify(map); + verifyFastLongHashSet.verify(factory, hashSet); } } @Test public void testMultipleKeysMultipleValue() throws Exception { random = new Random(7778); + VectorMapJoinFastHashTableFactory factory = + new VectorMapJoinFastHashTableFactory(HashTableKind.HASH_SET, HashTableKeyType.LONG); + // Use a large capacity that doesn't require expansion, yet. - VectorMapJoinFastLongHashSet map = - new VectorMapJoinFastLongHashSet( - false, false, HashTableKeyType.LONG, LARGE_CAPACITY, LOAD_FACTOR, LARGE_WB_SIZE); + VectorMapJoinFastLongHashSet hashSet = + (VectorMapJoinFastLongHashSet) + factory.createHashTable(LARGE_CAPACITY, LOAD_FACTOR, WB_SIZE, 0); - VerifyFastLongHashSet verifyTable = new VerifyFastLongHashSet(); + VerifyFastLongHashSet verifyFastLongHashSet = new VerifyFastLongHashSet(); int keyCount = 1000; - addAndVerifyMultipleKeyMultipleValue(keyCount, map, verifyTable); + addAndVerifyMultipleKeyMultipleValue(keyCount, factory, hashSet, verifyFastLongHashSet); } @Test public void testLargeAndExpand() throws Exception { random = new Random(56); + VectorMapJoinFastHashTableFactory factory = + new VectorMapJoinFastHashTableFactory(HashTableKind.HASH_SET, HashTableKeyType.LONG); + // Use a large capacity that doesn't require expansion, yet. 
- VectorMapJoinFastLongHashSet map = - new VectorMapJoinFastLongHashSet( - false, false, HashTableKeyType.LONG, MODERATE_CAPACITY, LOAD_FACTOR, MODERATE_WB_SIZE); + VectorMapJoinFastLongHashSet hashSet = + (VectorMapJoinFastLongHashSet) + factory.createHashTable(MODERATE_CAPACITY, LOAD_FACTOR, WB_SIZE, 0); - VerifyFastLongHashSet verifyTable = new VerifyFastLongHashSet(); + VerifyFastLongHashSet verifyFastLongHashSet = new VerifyFastLongHashSet(); int keyCount = 1000; - addAndVerifyMultipleKeyMultipleValue(keyCount, map, verifyTable); + addAndVerifyMultipleKeyMultipleValue(keyCount, factory, hashSet, verifyFastLongHashSet); } } diff --git ql/src/test/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/TestVectorMapJoinFastRowHashMap.java ql/src/test/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/TestVectorMapJoinFastRowHashMap.java index 3f02eb3..d184f2b 100644 --- ql/src/test/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/TestVectorMapJoinFastRowHashMap.java +++ ql/src/test/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/TestVectorMapJoinFastRowHashMap.java @@ -23,10 +23,13 @@ import java.util.Arrays; import java.util.Random; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableFactory; +import org.apache.hadoop.hive.ql.exec.persistence.mapjoinhashtable.MapJoinHashTableManage.KeyValuePut; import org.apache.hadoop.hive.ql.exec.vector.VectorRandomRowSource; import org.apache.hadoop.hive.ql.exec.vector.mapjoin.fast.CheckFastRowHashMap.VerifyFastRowHashMap; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.plan.VectorMapJoinDesc.HashTableKeyType; +import org.apache.hadoop.hive.ql.plan.VectorMapJoinDesc.HashTableKind; import org.apache.hadoop.hive.serde2.SerDeException; import org.apache.hadoop.hive.serde2.ByteStream.Output; import org.apache.hadoop.hive.serde2.binarysortable.BinarySortableSerDe; @@ -51,6 +54,7 @@ public class TestVectorMapJoinFastRowHashMap extends CommonFastHashTable { private void addAndVerifyRows(VectorRandomRowSource valueSource, Object[][] rows, + MapJoinHashTableFactory factory, VectorMapJoinFastHashTable map, HashTableKeyType hashTableKeyType, VerifyFastRowHashMap verifyTable, String[] keyTypeNames, boolean doClipping, boolean useExactBytes) throws HiveException, IOException, SerDeException { @@ -132,10 +136,12 @@ private void addAndVerifyRows(VectorRandomRowSource valueSource, Object[][] rows // Serialize keyRow into key bytes. BytesWritable keyWritable = new BytesWritable(key); BytesWritable valueWritable = new BytesWritable(value); - map.putRow(keyWritable, valueWritable); + KeyValuePut keyValuePut = factory.createKeyValuePut(); + keyValuePut.setKeyValue(keyWritable, valueWritable); + map.put(keyValuePut); // verifyTable.verify(map); } - verifyTable.verify(map, hashTableKeyType, valuePrimitiveTypeInfos, + verifyTable.verify(factory, map, hashTableKeyType, valuePrimitiveTypeInfos, doClipping, useExactBytes, random); } @@ -143,11 +149,13 @@ private void addAndVerifyRows(VectorRandomRowSource valueSource, Object[][] rows public void testBigIntRows() throws Exception { random = new Random(927337); + VectorMapJoinFastHashTableFactory factory = + new VectorMapJoinFastHashTableFactory(HashTableKind.HASH_MAP, HashTableKeyType.LONG); + // Use a large capacity that doesn't require expansion, yet. 
- VectorMapJoinFastLongHashMap map = - new VectorMapJoinFastLongHashMap( - false, false, HashTableKeyType.LONG, - LARGE_CAPACITY, LOAD_FACTOR, LARGE_WB_SIZE); + VectorMapJoinFastLongHashMap hashMap = + (VectorMapJoinFastLongHashMap) + factory.createHashTable(LARGE_CAPACITY, LOAD_FACTOR, LARGE_WB_SIZE, 0); VerifyFastRowHashMap verifyTable = new VerifyFastRowHashMap(); @@ -158,7 +166,7 @@ public void testBigIntRows() throws Exception { Object[][] rows = valueSource.randomRows(rowCount); addAndVerifyRows(valueSource, rows, - map, HashTableKeyType.LONG, verifyTable, + factory, hashMap, HashTableKeyType.LONG, verifyTable, new String[] { "bigint" }, /* doClipping */ false, /* useExactBytes */ false); } @@ -167,11 +175,13 @@ public void testBigIntRows() throws Exception { public void testIntRows() throws Exception { random = new Random(927337); + VectorMapJoinFastHashTableFactory factory = + new VectorMapJoinFastHashTableFactory(HashTableKind.HASH_MAP, HashTableKeyType.INT); + // Use a large capacity that doesn't require expansion, yet. - VectorMapJoinFastLongHashMap map = - new VectorMapJoinFastLongHashMap( - false, false, HashTableKeyType.INT, - LARGE_CAPACITY, LOAD_FACTOR, LARGE_WB_SIZE); + VectorMapJoinFastLongHashMap hashMap = + (VectorMapJoinFastLongHashMap) + factory.createHashTable(LARGE_CAPACITY, LOAD_FACTOR, LARGE_WB_SIZE, 0); VerifyFastRowHashMap verifyTable = new VerifyFastRowHashMap(); @@ -182,7 +192,7 @@ public void testIntRows() throws Exception { Object[][] rows = valueSource.randomRows(rowCount); addAndVerifyRows(valueSource, rows, - map, HashTableKeyType.INT, verifyTable, + factory, hashMap, HashTableKeyType.INT, verifyTable, new String[] { "int" }, /* doClipping */ false, /* useExactBytes */ false); } @@ -191,11 +201,13 @@ public void testIntRows() throws Exception { public void testStringRows() throws Exception { random = new Random(927337); + VectorMapJoinFastHashTableFactory factory = + new VectorMapJoinFastHashTableFactory(HashTableKind.HASH_MAP, HashTableKeyType.STRING); + // Use a large capacity that doesn't require expansion, yet. - VectorMapJoinFastStringHashMap map = - new VectorMapJoinFastStringHashMap( - false, - LARGE_CAPACITY, LOAD_FACTOR, LARGE_WB_SIZE); + VectorMapJoinFastStringHashMap hashMap = + (VectorMapJoinFastStringHashMap) + factory.createHashTable(LARGE_CAPACITY, LOAD_FACTOR, LARGE_WB_SIZE, 0); VerifyFastRowHashMap verifyTable = new VerifyFastRowHashMap(); @@ -206,7 +218,7 @@ public void testStringRows() throws Exception { Object[][] rows = valueSource.randomRows(rowCount); addAndVerifyRows(valueSource, rows, - map, HashTableKeyType.STRING, verifyTable, + factory, hashMap, HashTableKeyType.STRING, verifyTable, new String[] { "string" }, /* doClipping */ false, /* useExactBytes */ false); } @@ -215,11 +227,13 @@ public void testStringRows() throws Exception { public void testMultiKeyRows1() throws Exception { random = new Random(833); + VectorMapJoinFastHashTableFactory factory = + new VectorMapJoinFastHashTableFactory(HashTableKind.HASH_MAP, HashTableKeyType.MULTI_KEY); + // Use a large capacity that doesn't require expansion, yet. 
- VectorMapJoinFastMultiKeyHashMap map = - new VectorMapJoinFastMultiKeyHashMap( - false, - LARGE_CAPACITY, LOAD_FACTOR, LARGE_WB_SIZE); + VectorMapJoinFastMultiKeyHashMap hashMap = + (VectorMapJoinFastMultiKeyHashMap) + factory.createHashTable(LARGE_CAPACITY, LOAD_FACTOR, LARGE_WB_SIZE, 0); VerifyFastRowHashMap verifyTable = new VerifyFastRowHashMap(); @@ -230,7 +244,7 @@ public void testMultiKeyRows1() throws Exception { Object[][] rows = valueSource.randomRows(rowCount); addAndVerifyRows(valueSource, rows, - map, HashTableKeyType.MULTI_KEY, verifyTable, + factory, hashMap, HashTableKeyType.MULTI_KEY, verifyTable, new String[] { "int", "int" }, /* doClipping */ false, /* useExactBytes */ false); } @@ -239,11 +253,13 @@ public void testMultiKeyRows1() throws Exception { public void testMultiKeyRows2() throws Exception { random = new Random(833099); + VectorMapJoinFastHashTableFactory factory = + new VectorMapJoinFastHashTableFactory(HashTableKind.HASH_MAP, HashTableKeyType.MULTI_KEY); + // Use a large capacity that doesn't require expansion, yet. - VectorMapJoinFastMultiKeyHashMap map = - new VectorMapJoinFastMultiKeyHashMap( - false, - LARGE_CAPACITY, LOAD_FACTOR, LARGE_WB_SIZE); + VectorMapJoinFastMultiKeyHashMap hashMap = + (VectorMapJoinFastMultiKeyHashMap) + factory.createHashTable(LARGE_CAPACITY, LOAD_FACTOR, LARGE_WB_SIZE, 0); VerifyFastRowHashMap verifyTable = new VerifyFastRowHashMap(); @@ -254,7 +270,7 @@ public void testMultiKeyRows2() throws Exception { Object[][] rows = valueSource.randomRows(rowCount); addAndVerifyRows(valueSource, rows, - map, HashTableKeyType.MULTI_KEY, verifyTable, + factory, hashMap, HashTableKeyType.MULTI_KEY, verifyTable, new String[] { "string", "string" }, /* doClipping */ false, /* useExactBytes */ false); } @@ -263,11 +279,13 @@ public void testMultiKeyRows2() throws Exception { public void testMultiKeyRows3() throws Exception { random = new Random(833099); + VectorMapJoinFastHashTableFactory factory = + new VectorMapJoinFastHashTableFactory(HashTableKind.HASH_MAP, HashTableKeyType.MULTI_KEY); + // Use a large capacity that doesn't require expansion, yet. - VectorMapJoinFastMultiKeyHashMap map = - new VectorMapJoinFastMultiKeyHashMap( - false, - LARGE_CAPACITY, LOAD_FACTOR, LARGE_WB_SIZE); + VectorMapJoinFastMultiKeyHashMap hashMap = + (VectorMapJoinFastMultiKeyHashMap) + factory.createHashTable(LARGE_CAPACITY, LOAD_FACTOR, LARGE_WB_SIZE, 0); VerifyFastRowHashMap verifyTable = new VerifyFastRowHashMap(); @@ -278,7 +296,7 @@ public void testMultiKeyRows3() throws Exception { Object[][] rows = valueSource.randomRows(rowCount); addAndVerifyRows(valueSource, rows, - map, HashTableKeyType.MULTI_KEY, verifyTable, + factory, hashMap, HashTableKeyType.MULTI_KEY, verifyTable, new String[] { "bigint", "timestamp", "double" }, /* doClipping */ false, /* useExactBytes */ false); } @@ -287,11 +305,13 @@ public void testMultiKeyRows3() throws Exception { public void testBigIntRowsClipped() throws Exception { random = new Random(326232); + VectorMapJoinFastHashTableFactory factory = + new VectorMapJoinFastHashTableFactory(HashTableKind.HASH_MAP, HashTableKeyType.LONG); + // Use a large capacity that doesn't require expansion, yet. 
- VectorMapJoinFastLongHashMap map = - new VectorMapJoinFastLongHashMap( - false, false, HashTableKeyType.LONG, - LARGE_CAPACITY, LOAD_FACTOR, LARGE_WB_SIZE); + VectorMapJoinFastLongHashMap hashMap = + (VectorMapJoinFastLongHashMap) + factory.createHashTable(LARGE_CAPACITY, LOAD_FACTOR, LARGE_WB_SIZE, 0); VerifyFastRowHashMap verifyTable = new VerifyFastRowHashMap(); @@ -302,7 +322,7 @@ public void testBigIntRowsClipped() throws Exception { Object[][] rows = valueSource.randomRows(rowCount); addAndVerifyRows(valueSource, rows, - map, HashTableKeyType.LONG, verifyTable, + factory, hashMap, HashTableKeyType.LONG, verifyTable, new String[] { "bigint" }, /* doClipping */ true, /* useExactBytes */ false); } @@ -311,11 +331,13 @@ public void testBigIntRowsClipped() throws Exception { public void testIntRowsClipped() throws Exception { random = new Random(326232); + VectorMapJoinFastHashTableFactory factory = + new VectorMapJoinFastHashTableFactory(HashTableKind.HASH_MAP, HashTableKeyType.INT); + // Use a large capacity that doesn't require expansion, yet. - VectorMapJoinFastLongHashMap map = - new VectorMapJoinFastLongHashMap( - false, false, HashTableKeyType.INT, - LARGE_CAPACITY, LOAD_FACTOR, LARGE_WB_SIZE); + VectorMapJoinFastLongHashMap hashMap = + (VectorMapJoinFastLongHashMap) + factory.createHashTable(LARGE_CAPACITY, LOAD_FACTOR, LARGE_WB_SIZE, 0); VerifyFastRowHashMap verifyTable = new VerifyFastRowHashMap(); @@ -326,7 +348,7 @@ public void testIntRowsClipped() throws Exception { Object[][] rows = valueSource.randomRows(rowCount); addAndVerifyRows(valueSource, rows, - map, HashTableKeyType.INT, verifyTable, + factory, hashMap, HashTableKeyType.INT, verifyTable, new String[] { "int" }, /* doClipping */ true, /* useExactBytes */ false); } @@ -335,11 +357,13 @@ public void testIntRowsClipped() throws Exception { public void testStringRowsClipped() throws Exception { random = new Random(326232); + VectorMapJoinFastHashTableFactory factory = + new VectorMapJoinFastHashTableFactory(HashTableKind.HASH_MAP, HashTableKeyType.STRING); + // Use a large capacity that doesn't require expansion, yet. - VectorMapJoinFastStringHashMap map = - new VectorMapJoinFastStringHashMap( - false, - LARGE_CAPACITY, LOAD_FACTOR, LARGE_WB_SIZE); + VectorMapJoinFastStringHashMap hashMap = + (VectorMapJoinFastStringHashMap) + factory.createHashTable(LARGE_CAPACITY, LOAD_FACTOR, LARGE_WB_SIZE, 0); VerifyFastRowHashMap verifyTable = new VerifyFastRowHashMap(); @@ -350,7 +374,7 @@ public void testStringRowsClipped() throws Exception { Object[][] rows = valueSource.randomRows(rowCount); addAndVerifyRows(valueSource, rows, - map, HashTableKeyType.STRING, verifyTable, + factory, hashMap, HashTableKeyType.STRING, verifyTable, new String[] { "string" }, /* doClipping */ true, /* useExactBytes */ false); } @@ -359,11 +383,13 @@ public void testStringRowsClipped() throws Exception { public void testMultiKeyRowsClipped1() throws Exception { random = new Random(2331); + VectorMapJoinFastHashTableFactory factory = + new VectorMapJoinFastHashTableFactory(HashTableKind.HASH_MAP, HashTableKeyType.MULTI_KEY); + // Use a large capacity that doesn't require expansion, yet. 
- VectorMapJoinFastMultiKeyHashMap map = - new VectorMapJoinFastMultiKeyHashMap( - false, - LARGE_CAPACITY, LOAD_FACTOR, LARGE_WB_SIZE); + VectorMapJoinFastMultiKeyHashMap hashMap = + (VectorMapJoinFastMultiKeyHashMap) + factory.createHashTable(LARGE_CAPACITY, LOAD_FACTOR, LARGE_WB_SIZE, 0); VerifyFastRowHashMap verifyTable = new VerifyFastRowHashMap(); @@ -374,7 +400,7 @@ public void testMultiKeyRowsClipped1() throws Exception { Object[][] rows = valueSource.randomRows(rowCount); addAndVerifyRows(valueSource, rows, - map, HashTableKeyType.MULTI_KEY, verifyTable, + factory, hashMap, HashTableKeyType.MULTI_KEY, verifyTable, new String[] { "varchar(20)", "date", "interval_day_time" }, /* doClipping */ true, /* useExactBytes */ false); } @@ -383,11 +409,13 @@ public void testMultiKeyRowsClipped1() throws Exception { public void testMultiKeyRowsClipped2() throws Exception { random = new Random(7403); + VectorMapJoinFastHashTableFactory factory = + new VectorMapJoinFastHashTableFactory(HashTableKind.HASH_MAP, HashTableKeyType.MULTI_KEY); + // Use a large capacity that doesn't require expansion, yet. - VectorMapJoinFastMultiKeyHashMap map = - new VectorMapJoinFastMultiKeyHashMap( - false, - LARGE_CAPACITY, LOAD_FACTOR, LARGE_WB_SIZE); + VectorMapJoinFastMultiKeyHashMap hashMap = + (VectorMapJoinFastMultiKeyHashMap) + factory.createHashTable(LARGE_CAPACITY, LOAD_FACTOR, LARGE_WB_SIZE, 0); VerifyFastRowHashMap verifyTable = new VerifyFastRowHashMap(); @@ -398,7 +426,7 @@ public void testMultiKeyRowsClipped2() throws Exception { Object[][] rows = valueSource.randomRows(rowCount); addAndVerifyRows(valueSource, rows, - map, HashTableKeyType.MULTI_KEY, verifyTable, + factory, hashMap, HashTableKeyType.MULTI_KEY, verifyTable, new String[] { "varchar(20)", "varchar(40)" }, /* doClipping */ true, /* useExactBytes */ false); } @@ -407,11 +435,13 @@ public void testMultiKeyRowsClipped2() throws Exception { public void testMultiKeyRowsClipped3() throws Exception { random = new Random(99); + VectorMapJoinFastHashTableFactory factory = + new VectorMapJoinFastHashTableFactory(HashTableKind.HASH_MAP, HashTableKeyType.MULTI_KEY); + // Use a large capacity that doesn't require expansion, yet. - VectorMapJoinFastMultiKeyHashMap map = - new VectorMapJoinFastMultiKeyHashMap( - false, - LARGE_CAPACITY, LOAD_FACTOR, LARGE_WB_SIZE); + VectorMapJoinFastMultiKeyHashMap hashMap = + (VectorMapJoinFastMultiKeyHashMap) + factory.createHashTable(LARGE_CAPACITY, LOAD_FACTOR, LARGE_WB_SIZE, 0); VerifyFastRowHashMap verifyTable = new VerifyFastRowHashMap(); @@ -422,7 +452,7 @@ public void testMultiKeyRowsClipped3() throws Exception { Object[][] rows = valueSource.randomRows(rowCount); addAndVerifyRows(valueSource, rows, - map, HashTableKeyType.MULTI_KEY, verifyTable, + factory, hashMap, HashTableKeyType.MULTI_KEY, verifyTable, new String[] { "float", "tinyint" }, /* doClipping */ true, /* useExactBytes */ false); } @@ -432,11 +462,13 @@ public void testMultiKeyRowsClipped3() throws Exception { public void testBigIntRowsExact() throws Exception { random = new Random(27722); + VectorMapJoinFastHashTableFactory factory = + new VectorMapJoinFastHashTableFactory(HashTableKind.HASH_MAP, HashTableKeyType.LONG); + // Use a large capacity that doesn't require expansion, yet. 
- VectorMapJoinFastLongHashMap map = - new VectorMapJoinFastLongHashMap( - false, false, HashTableKeyType.LONG, - LARGE_CAPACITY, LOAD_FACTOR, LARGE_WB_SIZE); + VectorMapJoinFastLongHashMap hashMap = + (VectorMapJoinFastLongHashMap) + factory.createHashTable(LARGE_CAPACITY, LOAD_FACTOR, LARGE_WB_SIZE, 0); VerifyFastRowHashMap verifyTable = new VerifyFastRowHashMap(); @@ -447,7 +479,7 @@ public void testBigIntRowsExact() throws Exception { Object[][] rows = valueSource.randomRows(rowCount); addAndVerifyRows(valueSource, rows, - map, HashTableKeyType.LONG, verifyTable, + factory, hashMap, HashTableKeyType.LONG, verifyTable, new String[] { "bigint" }, /* doClipping */ false, /* useExactBytes */ true); } @@ -456,11 +488,13 @@ public void testBigIntRowsExact() throws Exception { public void testIntRowsExact() throws Exception { random = new Random(8238383); + VectorMapJoinFastHashTableFactory factory = + new VectorMapJoinFastHashTableFactory(HashTableKind.HASH_MAP, HashTableKeyType.INT); + // Use a large capacity that doesn't require expansion, yet. - VectorMapJoinFastLongHashMap map = - new VectorMapJoinFastLongHashMap( - false, false, HashTableKeyType.INT, - LARGE_CAPACITY, LOAD_FACTOR, LARGE_WB_SIZE); + VectorMapJoinFastLongHashMap hashMap = + (VectorMapJoinFastLongHashMap) + factory.createHashTable(LARGE_CAPACITY, LOAD_FACTOR, LARGE_WB_SIZE, 0); VerifyFastRowHashMap verifyTable = new VerifyFastRowHashMap(); @@ -471,7 +505,7 @@ public void testIntRowsExact() throws Exception { Object[][] rows = valueSource.randomRows(rowCount); addAndVerifyRows(valueSource, rows, - map, HashTableKeyType.INT, verifyTable, + factory, hashMap, HashTableKeyType.INT, verifyTable, new String[] { "int" }, /* doClipping */ false, /* useExactBytes */ true); } @@ -480,11 +514,13 @@ public void testIntRowsExact() throws Exception { public void testStringRowsExact() throws Exception { random = new Random(8235); + VectorMapJoinFastHashTableFactory factory = + new VectorMapJoinFastHashTableFactory(HashTableKind.HASH_MAP, HashTableKeyType.STRING); + // Use a large capacity that doesn't require expansion, yet. - VectorMapJoinFastStringHashMap map = - new VectorMapJoinFastStringHashMap( - false, - LARGE_CAPACITY, LOAD_FACTOR, LARGE_WB_SIZE); + VectorMapJoinFastStringHashMap hashMap = + (VectorMapJoinFastStringHashMap) + factory.createHashTable(LARGE_CAPACITY, LOAD_FACTOR, LARGE_WB_SIZE, 0); VerifyFastRowHashMap verifyTable = new VerifyFastRowHashMap(); @@ -495,7 +531,7 @@ public void testStringRowsExact() throws Exception { Object[][] rows = valueSource.randomRows(rowCount); addAndVerifyRows(valueSource, rows, - map, HashTableKeyType.STRING, verifyTable, + factory, hashMap, HashTableKeyType.STRING, verifyTable, new String[] { "string" }, /* doClipping */ false, /* useExactBytes */ true); } @@ -504,11 +540,13 @@ public void testStringRowsExact() throws Exception { public void testMultiKeyRowsExact1() throws Exception { random = new Random(8235); + VectorMapJoinFastHashTableFactory factory = + new VectorMapJoinFastHashTableFactory(HashTableKind.HASH_MAP, HashTableKeyType.MULTI_KEY); + // Use a large capacity that doesn't require expansion, yet. 
- VectorMapJoinFastMultiKeyHashMap map = - new VectorMapJoinFastMultiKeyHashMap( - false, - LARGE_CAPACITY, LOAD_FACTOR, LARGE_WB_SIZE); + VectorMapJoinFastMultiKeyHashMap hashMap = + (VectorMapJoinFastMultiKeyHashMap) + factory.createHashTable(LARGE_CAPACITY, LOAD_FACTOR, LARGE_WB_SIZE, 0); VerifyFastRowHashMap verifyTable = new VerifyFastRowHashMap(); @@ -519,7 +557,7 @@ public void testMultiKeyRowsExact1() throws Exception { Object[][] rows = valueSource.randomRows(rowCount); addAndVerifyRows(valueSource, rows, - map, HashTableKeyType.MULTI_KEY, verifyTable, + factory, hashMap, HashTableKeyType.MULTI_KEY, verifyTable, new String[] { "string", "string", "string", "string" }, /* doClipping */ false, /* useExactBytes */ true); } @@ -528,11 +566,13 @@ public void testMultiKeyRowsExact1() throws Exception { public void testMultiKeyRowsExact2() throws Exception { random = new Random(8235); + VectorMapJoinFastHashTableFactory factory = + new VectorMapJoinFastHashTableFactory(HashTableKind.HASH_MAP, HashTableKeyType.MULTI_KEY); + // Use a large capacity that doesn't require expansion, yet. - VectorMapJoinFastMultiKeyHashMap map = - new VectorMapJoinFastMultiKeyHashMap( - false, - LARGE_CAPACITY, LOAD_FACTOR, LARGE_WB_SIZE); + VectorMapJoinFastMultiKeyHashMap hashMap = + (VectorMapJoinFastMultiKeyHashMap) + factory.createHashTable(LARGE_CAPACITY, LOAD_FACTOR, LARGE_WB_SIZE, 0); VerifyFastRowHashMap verifyTable = new VerifyFastRowHashMap(); @@ -543,7 +583,7 @@ public void testMultiKeyRowsExact2() throws Exception { Object[][] rows = valueSource.randomRows(rowCount); addAndVerifyRows(valueSource, rows, - map, HashTableKeyType.MULTI_KEY, verifyTable, + factory, hashMap, HashTableKeyType.MULTI_KEY, verifyTable, new String[] { "smallint" }, /* doClipping */ false, /* useExactBytes */ true); } @@ -552,11 +592,13 @@ public void testMultiKeyRowsExact2() throws Exception { public void testMultiKeyRowsExact3() throws Exception { random = new Random(8235); + VectorMapJoinFastHashTableFactory factory = + new VectorMapJoinFastHashTableFactory(HashTableKind.HASH_MAP, HashTableKeyType.MULTI_KEY); + // Use a large capacity that doesn't require expansion, yet. - VectorMapJoinFastMultiKeyHashMap map = - new VectorMapJoinFastMultiKeyHashMap( - false, - LARGE_CAPACITY, LOAD_FACTOR, LARGE_WB_SIZE); + VectorMapJoinFastMultiKeyHashMap hashMap = + (VectorMapJoinFastMultiKeyHashMap) + factory.createHashTable(LARGE_CAPACITY, LOAD_FACTOR, LARGE_WB_SIZE, 0); VerifyFastRowHashMap verifyTable = new VerifyFastRowHashMap(); @@ -567,7 +609,7 @@ public void testMultiKeyRowsExact3() throws Exception { Object[][] rows = valueSource.randomRows(rowCount); addAndVerifyRows(valueSource, rows, - map, HashTableKeyType.MULTI_KEY, verifyTable, + factory, hashMap, HashTableKeyType.MULTI_KEY, verifyTable, new String[] { "int", "binary" }, /* doClipping */ false, /* useExactBytes */ true); } @@ -576,11 +618,13 @@ public void testMultiKeyRowsExact3() throws Exception { public void testBigIntRowsClippedExact() throws Exception { random = new Random(2122); + VectorMapJoinFastHashTableFactory factory = + new VectorMapJoinFastHashTableFactory(HashTableKind.HASH_MAP, HashTableKeyType.LONG); + // Use a large capacity that doesn't require expansion, yet. 
- VectorMapJoinFastLongHashMap map = - new VectorMapJoinFastLongHashMap( - false, false, HashTableKeyType.LONG, - LARGE_CAPACITY, LOAD_FACTOR, LARGE_WB_SIZE); + VectorMapJoinFastLongHashMap hashMap = + (VectorMapJoinFastLongHashMap) + factory.createHashTable(LARGE_CAPACITY, LOAD_FACTOR, LARGE_WB_SIZE, 0); VerifyFastRowHashMap verifyTable = new VerifyFastRowHashMap(); @@ -591,7 +635,7 @@ public void testBigIntRowsClippedExact() throws Exception { Object[][] rows = valueSource.randomRows(rowCount); addAndVerifyRows(valueSource, rows, - map, HashTableKeyType.LONG, verifyTable, + factory, hashMap, HashTableKeyType.LONG, verifyTable, new String[] { "bigint" }, /* doClipping */ true, /* useExactBytes */ true); } @@ -600,11 +644,13 @@ public void testBigIntRowsClippedExact() throws Exception { public void testIntRowsClippedExact() throws Exception { random = new Random(7520); + VectorMapJoinFastHashTableFactory factory = + new VectorMapJoinFastHashTableFactory(HashTableKind.HASH_MAP, HashTableKeyType.INT); + // Use a large capacity that doesn't require expansion, yet. - VectorMapJoinFastLongHashMap map = - new VectorMapJoinFastLongHashMap( - false, false, HashTableKeyType.INT, - LARGE_CAPACITY, LOAD_FACTOR, LARGE_WB_SIZE); + VectorMapJoinFastLongHashMap hashMap = + (VectorMapJoinFastLongHashMap) + factory.createHashTable(LARGE_CAPACITY, LOAD_FACTOR, LARGE_WB_SIZE, 0); VerifyFastRowHashMap verifyTable = new VerifyFastRowHashMap(); @@ -615,7 +661,7 @@ public void testIntRowsClippedExact() throws Exception { Object[][] rows = valueSource.randomRows(rowCount); addAndVerifyRows(valueSource, rows, - map, HashTableKeyType.INT, verifyTable, + factory, hashMap, HashTableKeyType.INT, verifyTable, new String[] { "int" }, /* doClipping */ true, /* useExactBytes */ true); } @@ -624,12 +670,13 @@ public void testIntRowsClippedExact() throws Exception { public void testStringRowsClippedExact() throws Exception { random = new Random(7539); - // Use a large capacity that doesn't require expansion, yet. - VectorMapJoinFastStringHashMap map = - new VectorMapJoinFastStringHashMap( - false, - LARGE_CAPACITY, LOAD_FACTOR, LARGE_WB_SIZE); + VectorMapJoinFastHashTableFactory factory = + new VectorMapJoinFastHashTableFactory(HashTableKind.HASH_MAP, HashTableKeyType.STRING); + // Use a large capacity that doesn't require expansion, yet. + VectorMapJoinFastStringHashMap hashMap = + (VectorMapJoinFastStringHashMap) + factory.createHashTable(LARGE_CAPACITY, LOAD_FACTOR, LARGE_WB_SIZE, 0); VerifyFastRowHashMap verifyTable = new VerifyFastRowHashMap(); VectorRandomRowSource valueSource = new VectorRandomRowSource(); @@ -639,7 +686,7 @@ public void testStringRowsClippedExact() throws Exception { Object[][] rows = valueSource.randomRows(rowCount); addAndVerifyRows(valueSource, rows, - map, HashTableKeyType.STRING, verifyTable, + factory, hashMap, HashTableKeyType.STRING, verifyTable, new String[] { "string" }, /* doClipping */ true, /* useExactBytes */ true); } @@ -648,11 +695,13 @@ public void testStringRowsClippedExact() throws Exception { public void testMultiKeyRowsClippedExact1() throws Exception { random = new Random(13); + VectorMapJoinFastHashTableFactory factory = + new VectorMapJoinFastHashTableFactory(HashTableKind.HASH_MAP, HashTableKeyType.MULTI_KEY); + // Use a large capacity that doesn't require expansion, yet. 
- VectorMapJoinFastMultiKeyHashMap map = - new VectorMapJoinFastMultiKeyHashMap( - false, - LARGE_CAPACITY, LOAD_FACTOR, LARGE_WB_SIZE); + VectorMapJoinFastMultiKeyHashMap hashMap = + (VectorMapJoinFastMultiKeyHashMap) + factory.createHashTable(LARGE_CAPACITY, LOAD_FACTOR, LARGE_WB_SIZE, 0); VerifyFastRowHashMap verifyTable = new VerifyFastRowHashMap(); @@ -663,7 +712,7 @@ public void testMultiKeyRowsClippedExact1() throws Exception { Object[][] rows = valueSource.randomRows(rowCount); addAndVerifyRows(valueSource, rows, - map, HashTableKeyType.MULTI_KEY, verifyTable, + factory, hashMap, HashTableKeyType.MULTI_KEY, verifyTable, new String[] { "interval_year_month", "decimal(12,8)" }, /* doClipping */ true, /* useExactBytes */ true); } @@ -672,11 +721,13 @@ public void testMultiKeyRowsClippedExact1() throws Exception { public void testMultiKeyRowsClippedExact2() throws Exception { random = new Random(12); + VectorMapJoinFastHashTableFactory factory = + new VectorMapJoinFastHashTableFactory(HashTableKind.HASH_MAP, HashTableKeyType.MULTI_KEY); + // Use a large capacity that doesn't require expansion, yet. - VectorMapJoinFastMultiKeyHashMap map = - new VectorMapJoinFastMultiKeyHashMap( - false, - LARGE_CAPACITY, LOAD_FACTOR, LARGE_WB_SIZE); + VectorMapJoinFastMultiKeyHashMap hashMap = + (VectorMapJoinFastMultiKeyHashMap) + factory.createHashTable(LARGE_CAPACITY, LOAD_FACTOR, LARGE_WB_SIZE, 0); VerifyFastRowHashMap verifyTable = new VerifyFastRowHashMap(); @@ -687,7 +738,7 @@ public void testMultiKeyRowsClippedExact2() throws Exception { Object[][] rows = valueSource.randomRows(rowCount); addAndVerifyRows(valueSource, rows, - map, HashTableKeyType.MULTI_KEY, verifyTable, + factory, hashMap, HashTableKeyType.MULTI_KEY, verifyTable, new String[] { "bigint", "string", "int" }, /* doClipping */ true, /* useExactBytes */ true); } @@ -696,11 +747,13 @@ public void testMultiKeyRowsClippedExact2() throws Exception { public void testMultiKeyRowsClippedExact3() throws Exception { random = new Random(7); + VectorMapJoinFastHashTableFactory factory = + new VectorMapJoinFastHashTableFactory(HashTableKind.HASH_MAP, HashTableKeyType.MULTI_KEY); + // Use a large capacity that doesn't require expansion, yet. 
- VectorMapJoinFastMultiKeyHashMap map = - new VectorMapJoinFastMultiKeyHashMap( - false, - LARGE_CAPACITY, LOAD_FACTOR, LARGE_WB_SIZE); + VectorMapJoinFastMultiKeyHashMap hashMap = + (VectorMapJoinFastMultiKeyHashMap) + factory.createHashTable(LARGE_CAPACITY, LOAD_FACTOR, LARGE_WB_SIZE, 0); VerifyFastRowHashMap verifyTable = new VerifyFastRowHashMap(); @@ -711,7 +764,7 @@ public void testMultiKeyRowsClippedExact3() throws Exception { Object[][] rows = valueSource.randomRows(rowCount); addAndVerifyRows(valueSource, rows, - map, HashTableKeyType.MULTI_KEY, verifyTable, + factory, hashMap, HashTableKeyType.MULTI_KEY, verifyTable, new String[] { "bigint", "string", "varchar(5000)" }, /* doClipping */ true, /* useExactBytes */ true); } diff --git serde/src/java/org/apache/hadoop/hive/serde2/WriteBuffers.java serde/src/java/org/apache/hadoop/hive/serde2/WriteBuffers.java index a4ecd9f..63b7fbb 100644 --- serde/src/java/org/apache/hadoop/hive/serde2/WriteBuffers.java +++ serde/src/java/org/apache/hadoop/hive/serde2/WriteBuffers.java @@ -47,6 +47,11 @@ public void clear() { buffer = null; bufferIndex = offset = -1; } + public void set(Position pos) { + buffer = pos.buffer; + bufferIndex = pos.bufferIndex; + offset = pos.offset; + } } Position writePos = new Position(); // Position where we'd write @@ -542,6 +547,21 @@ public long readNByteLong(long offset, int bytes, Position readPos) { return v; } + public long readNByteLong(int bytes, Position readPos) { + long v = 0; + if (isAllInOneReadBuffer(bytes, readPos)) { + for (int i = 0; i < bytes; ++i) { + v = (v << 8) + (readPos.buffer[readPos.offset + i] & 0xff); + } + readPos.offset += bytes; + } else { + for (int i = 0; i < bytes; ++i) { + v = (v << 8) + (readNextByte(readPos) & 0xff); + } + } + return v; + } + public void writeFiveByteULong(long offset, long v) { int prevIndex = writePos.bufferIndex, prevOffset = writePos.offset; setWritePoint(offset); @@ -564,6 +584,23 @@ public void writeFiveByteULong(long offset, long v) { writePos.offset = prevOffset; } + public void writeFiveByteULong(long v) { + if (isAllInOneWriteBuffer(5)) { + writePos.buffer[writePos.offset] = (byte)(v >>> 32); + writePos.buffer[writePos.offset + 1] = (byte)(v >>> 24); + writePos.buffer[writePos.offset + 2] = (byte)(v >>> 16); + writePos.buffer[writePos.offset + 3] = (byte)(v >>> 8); + writePos.buffer[writePos.offset + 4] = (byte)(v); + writePos.offset += 5; + } else { + write((byte)(v >>> 32)); + write((byte)(v >>> 24)); + write((byte)(v >>> 16)); + write((byte)(v >>> 8)); + write((byte)(v)); + } + } + public int readInt(long offset) { return (int)unsafeReadNByteLong(offset, 4); } diff --git serde/src/java/org/apache/hadoop/hive/serde2/binarysortable/fast/BinarySortableDeserializeRead.java serde/src/java/org/apache/hadoop/hive/serde2/binarysortable/fast/BinarySortableDeserializeRead.java index 003a2d4..2d6f7aa 100644 --- serde/src/java/org/apache/hadoop/hive/serde2/binarysortable/fast/BinarySortableDeserializeRead.java +++ serde/src/java/org/apache/hadoop/hive/serde2/binarysortable/fast/BinarySortableDeserializeRead.java @@ -77,8 +77,8 @@ /* * Use this constructor when only ascending sort order is used. 
*/ - public BinarySortableDeserializeRead(PrimitiveTypeInfo[] primitiveTypeInfos) { - this(primitiveTypeInfos, null); + public BinarySortableDeserializeRead(TypeInfo[] typeInfos) { + this(typeInfos, null); } public BinarySortableDeserializeRead(TypeInfo[] typeInfos, diff --git serde/src/java/org/apache/hadoop/hive/serde2/lazybinary/fast/LazyBinaryDeserializeRead.java serde/src/java/org/apache/hadoop/hive/serde2/lazybinary/fast/LazyBinaryDeserializeRead.java index 0df1d79..37b5824 100644 --- serde/src/java/org/apache/hadoop/hive/serde2/lazybinary/fast/LazyBinaryDeserializeRead.java +++ serde/src/java/org/apache/hadoop/hive/serde2/lazybinary/fast/LazyBinaryDeserializeRead.java @@ -100,22 +100,26 @@ public void set(byte[] bytes, int offset, int length) { public String getDetailedReadPositionString() { StringBuffer sb = new StringBuffer(); - sb.append("Reading byte[] of length "); - sb.append(bytes.length); - sb.append(" at start offset "); - sb.append(start); - sb.append(" for length "); - sb.append(end - start); - sb.append(" to read "); - sb.append(fieldCount); - sb.append(" fields with types "); - sb.append(Arrays.toString(typeInfos)); - sb.append(". Read field #"); - sb.append(fieldIndex); - sb.append(" at field start position "); - sb.append(fieldStart); - sb.append(" current read offset "); - sb.append(offset); + if (bytes == null) { + sb.append("byte[] is null"); + } else { + sb.append("Reading byte[] of length "); + sb.append(bytes.length); + sb.append(" at start offset "); + sb.append(start); + sb.append(" for length "); + sb.append(end - start); + sb.append(" to read "); + sb.append(fieldCount); + sb.append(" fields with types "); + sb.append(Arrays.toString(typeInfos)); + sb.append(". Read field #"); + sb.append(fieldIndex); + sb.append(" at field start position "); + sb.append(fieldStart); + sb.append(" current read offset "); + sb.append(offset); + } return sb.toString(); } diff --git serde/src/java/org/apache/hadoop/hive/serde2/lazybinary/fast/LazyBinarySerializeWrite.java serde/src/java/org/apache/hadoop/hive/serde2/lazybinary/fast/LazyBinarySerializeWrite.java index 91ef12d..98d4d0b 100644 --- serde/src/java/org/apache/hadoop/hive/serde2/lazybinary/fast/LazyBinarySerializeWrite.java +++ serde/src/java/org/apache/hadoop/hive/serde2/lazybinary/fast/LazyBinarySerializeWrite.java @@ -45,7 +45,7 @@ * * This is an alternative way to serialize than what is provided by LazyBinarySerDe. */ -public class LazyBinarySerializeWrite implements SerializeWrite { +public final class LazyBinarySerializeWrite implements SerializeWrite { public static final Logger LOG = LoggerFactory.getLogger(LazyBinarySerializeWrite.class.getName()); private Output output; diff --git storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/BytesColumnVector.java storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/BytesColumnVector.java index a6d932c..0b60a62 100644 --- storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/BytesColumnVector.java +++ storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/BytesColumnVector.java @@ -170,6 +170,23 @@ public void setVal(int elementNum, byte[] sourceBuf) { } /** + * A variation on setVal that allocates room for the value but lets the caller do the copy. + * Afterward, the caller must immediately set the bytes referenced by vector[elementNum] for + * the correct length at the start index. 
+ * @param elementNum index within column vector to set + * @param length number of bytes to reserve for the value + */ + public void allocateVal(int elementNum, int length) { + if ((nextFree + length) > buffer.length) { + increaseBufferSpace(length); + } + vector[elementNum] = buffer; + this.start[elementNum] = nextFree; + this.length[elementNum] = length; + nextFree += length; + } + + /** * Set a field to the concatenation of two string values. Result data is copied * into the internal buffer. *
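For context on the allocateVal contract added above (reserve room first, then the caller copies the value bytes in), the following rough usage sketch may help; it is illustrative only and not part of the patch. The row index, value bytes, and sizes are made up; BytesColumnVector, its constructor, initBuffer(), and the public vector/start/length arrays are existing Hive APIs, while allocateVal is the method introduced by this diff.

import java.nio.charset.StandardCharsets;
import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;

public class AllocateValSketch {
  public static void main(String[] args) {
    byte[] src = "example-value".getBytes(StandardCharsets.UTF_8);
    BytesColumnVector col = new BytesColumnVector(1024); // 1024 rows
    col.initBuffer();                                    // allocate the shared byte buffer
    int row = 0;
    col.allocateVal(row, src.length);                    // reserve room; vector/start/length now point at it
    // Per the javadoc above, the caller must copy the value in immediately:
    System.arraycopy(src, 0, col.vector[row], col.start[row], src.length);
  }
}

Compared with setVal, this presumably saves one staging copy when a caller (for example a deserializer) can write the value bytes directly into the column's buffer.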