diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorFilterOperator.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorFilterOperator.java
index 37c0ed8889..f56105dc6b 100644
--- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorFilterOperator.java
+++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorFilterOperator.java
@@ -23,6 +23,7 @@
 import org.apache.hadoop.hive.ql.CompilationOpContext;
 import org.apache.hadoop.hive.ql.exec.FilterOperator;
 import org.apache.hadoop.hive.ql.exec.vector.expressions.ConstantVectorExpression;
+import org.apache.hadoop.hive.ql.exec.vector.expressions.FilterExprAndExpr;
 import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpression;
 import org.apache.hadoop.hive.ql.metadata.HiveException;
 import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
@@ -32,6 +33,7 @@
 import org.apache.hadoop.hive.ql.plan.VectorFilterDesc;
 
 import com.google.common.annotations.VisibleForTesting;
+import org.apache.hadoop.io.LongWritable;
 
 /**
  * Filter operator implementation.
@@ -150,6 +152,21 @@ public VectorExpression getPredicateExpression() {
     return predicateExpression;
   }
 
+  @Override
+  protected void closeOp(boolean abort) throws HiveException {
+    if (predicateExpression instanceof FilterExprAndExpr) {
+      int lookupCount = ((FilterExprAndExpr) predicateExpression).getLookupCount();
+      int earlyReturnCount = ((FilterExprAndExpr) predicateExpression).getEarlyReturnCount();
+      if (lookupCount != 0) {
+        statsMap.put("CUCKOO_LOOKUP_" + getOperatorId(), new LongWritable(lookupCount));
+      }
+      if (earlyReturnCount != 0) {
+        statsMap.put("CUCKOO_EARLY_" + getOperatorId(), new LongWritable(earlyReturnCount));
+      }
+    }
+    super.closeOp(abort);
+  }
+
   @Override
   public VectorDesc getVectorDesc() {
     return vectorDesc;
diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CuckooSetBytes.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CuckooSetBytes.java
index 9de2e92fbd..638db37866 100644
--- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CuckooSetBytes.java
+++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CuckooSetBytes.java
@@ -41,6 +41,10 @@
   private int rehashCount = 0;
   private static final long INT_MASK = 0x00000000ffffffffL;
   private static final long BYTE_MASK = 0x00000000000000ffL;
+  // Minimum and maximum lengths of the values in the set, maintained by the
+  // caller while loading and used to prune lookups that cannot match.
+  public int minLen;
+  public int maxLen;
 
   /**
    * Allocate a new set to hold expectedSize values. Re-allocation to expand
@@ -76,7 +80,6 @@ public CuckooSetBytes(int expectedSize) {
    * and ending at start+len is present in the set.
    */
   public boolean lookup(byte[] b, int start, int len) {
-
     return entryEqual(t1, h1(b, start, len), b, start, len)
         || entryEqual(t2, h2(b, start, len), b, start, len);
   }
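Note on the CuckooSetBytes change: the two new fields enable a cheap pre-check, since any probe whose length lies outside [minLen, maxLen] can be rejected without hashing. A minimal runnable sketch of the idea in isolation, assuming nothing from Hive (LengthPrunedSet and its HashSet backing are illustrative stand-ins for CuckooSetBytes and its cuckoo tables):

```java
import java.nio.charset.StandardCharsets;
import java.util.HashSet;
import java.util.Set;

// Stand-in for CuckooSetBytes: a byte[]-valued set that tracks the min/max
// member length so impossible lookups can be rejected without hashing.
public class LengthPrunedSet {
  private final Set<String> values = new HashSet<>(); // stand-in for the cuckoo tables
  private int minLen = Integer.MAX_VALUE;
  private int maxLen = 0;

  public void insert(byte[] v) {
    values.add(new String(v, StandardCharsets.UTF_8));
    minLen = Math.min(minLen, v.length);
    maxLen = Math.max(maxLen, v.length);
  }

  public boolean lookup(byte[] b, int start, int len) {
    // Early return: a value whose length is outside [minLen, maxLen]
    // cannot be a member, so the hash probes are skipped entirely.
    if (len < minLen || len > maxLen) {
      return false;
    }
    return values.contains(new String(b, start, len, StandardCharsets.UTF_8));
  }

  public static void main(String[] args) {
    LengthPrunedSet set = new LengthPrunedSet();
    set.insert("kerim".getBytes(StandardCharsets.UTF_8));
    set.insert("fatih".getBytes(StandardCharsets.UTF_8));
    byte[] probe = "ali".getBytes(StandardCharsets.UTF_8);
    System.out.println(set.lookup(probe, 0, probe.length)); // false: len 3 < minLen 5
  }
}
```

The check is attractive here because BytesColumnVector already carries a per-row length array, so rejecting a row costs two integer comparisons instead of two hash computations plus byte-wise equality checks.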
diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterExprAndExpr.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterExprAndExpr.java
index 9a239b6820..4094ebb52a 100644
--- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterExprAndExpr.java
+++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterExprAndExpr.java
@@ -43,6 +43,26 @@ public void evaluate(VectorizedRowBatch batch) throws HiveException {
     }
   }
 
+  // Delegate the cuckoo-filter statistics to the first string IN-list child,
+  // if any; other child expressions do not collect these counters.
+  public int getLookupCount() {
+    for (VectorExpression exp : childExpressions) {
+      if (exp instanceof FilterStringColumnInList) {
+        return ((FilterStringColumnInList) exp).getLookupCount();
+      }
+    }
+    return 0;
+  }
+
+  public int getEarlyReturnCount() {
+    for (VectorExpression exp : childExpressions) {
+      if (exp instanceof FilterStringColumnInList) {
+        return ((FilterStringColumnInList) exp).getEarlyReturnCount();
+      }
+    }
+    return 0;
+  }
+
   @Override
   public String vectorExpressionParameters() {
     // The children are input.
diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterStringColumnInList.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterStringColumnInList.java
index 3456abb9c9..f72ac0730f 100644
--- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterStringColumnInList.java
+++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterStringColumnInList.java
@@ -54,6 +54,20 @@ public FilterStringColumnInList(int colNum) {
     inSet = null;
   }
 
+  // Runtime statistics: lookupCount is the number of rows that reached the
+  // cuckoo set lookup; earlyReturnCount is the number of rows pruned by the
+  // min/max length check alone.
+  private transient int lookupCount = 0;
+  private transient int earlyReturnCount = 0;
+
+  public int getLookupCount() {
+    return lookupCount;
+  }
+
+  public int getEarlyReturnCount() {
+    return earlyReturnCount;
+  }
+
   @Override
   public void evaluate(VectorizedRowBatch batch) throws HiveException {
@@ -63,7 +77,14 @@ public void evaluate(VectorizedRowBatch batch) throws HiveException {
 
     if (inSet == null) {
       inSet = new CuckooSetBytes(inListValues.length);
-      inSet.load(inListValues);
+      // Load the IN-list values one at a time so that the minimum and
+      // maximum value lengths can be tracked for the length pre-filter.
+      inSet.minLen = Integer.MAX_VALUE;
+      for (byte[] a : inListValues) {
+        inSet.insert(a);
+        inSet.maxLen = Math.max(a.length, inSet.maxLen);
+        inSet.minLen = Math.min(a.length, inSet.minLen);
+      }
     }
 
     BytesColumnVector inputColVector = (BytesColumnVector) batch.cols[inputCol];
@@ -74,6 +95,74 @@ public void evaluate(VectorizedRowBatch batch) throws HiveException {
     int[] start = inputColVector.start;
     int[] len = inputColVector.length;
 
+    // Pre-filter on value length: a row whose length falls outside the
+    // [minLen, maxLen] range of the IN-list values cannot match, so it is
+    // dropped before any cuckoo set lookup is attempted.
+    if (n == 0) {
+      return;
+    }
+
+    if (inputColVector.noNulls) {
+      if (inputColVector.isRepeating) {
+        // A single repeated value; pre-filtering is not worthwhile.
+      } else if (batch.selectedInUse) {
+        int newSize = 0;
+        for (int j = 0; j != n; j++) {
+          int i = sel[j];
+          if (inSet.maxLen >= len[i] && inSet.minLen <= len[i]) {
+            sel[newSize++] = i;
+          }
+        }
+        batch.size = newSize;
+      } else {
+        int newSize = 0;
+        for (int i = 0; i != n; i++) {
+          if (inSet.maxLen >= len[i] && inSet.minLen <= len[i]) {
+            sel[newSize++] = i;
+          }
+        }
+        if (newSize < n) {
+          batch.size = newSize;
+          batch.selectedInUse = true;
+        }
+      }
+    } else {
+      if (inputColVector.isRepeating) {
+        // A single repeated value; pre-filtering is not worthwhile.
+      } else if (batch.selectedInUse) {
+        int newSize = 0;
+        for (int j = 0; j != n; j++) {
+          int i = sel[j];
+          if (!nullPos[i]) {
+            if (inSet.maxLen >= len[i] && inSet.minLen <= len[i]) {
+              sel[newSize++] = i;
+            }
+          }
+        }
+        // Change the selected vector.
+        batch.size = newSize;
+      } else {
+        int newSize = 0;
+        for (int i = 0; i != n; i++) {
+          if (!nullPos[i]) {
+            if (inSet.maxLen >= len[i] && inSet.minLen <= len[i]) {
+              sel[newSize++] = i;
+            }
+          }
+        }
+        if (newSize < n) {
+          batch.size = newSize;
+          batch.selectedInUse = true;
+        }
+      }
+    }
+
+    // Record how many rows the length check pruned and how many continue
+    // on to the cuckoo set lookup.
+    earlyReturnCount += n - batch.size;
+    lookupCount += batch.size;
+    n = batch.size;
+
     // return immediately if batch is empty
     if (n == 0) {
       return;
@@ -93,7 +182,7 @@ public void evaluate(VectorizedRowBatch batch) throws HiveException {
       int newSize = 0;
       for(int j = 0; j != n; j++) {
         int i = sel[j];
-        if (inSet.lookup(vector[i], start[i], len[i])) {
+        if (inSet.maxLen >= len[i] && inSet.minLen <= len[i] && inSet.lookup(vector[i], start[i], len[i])) {
           sel[newSize++] = i;
         }
       }
@@ -101,7 +190,7 @@ public void evaluate(VectorizedRowBatch batch) throws HiveException {
     } else {
       int newSize = 0;
       for(int i = 0; i != n; i++) {
-        if (inSet.lookup(vector[i], start[i], len[i])) {
+        if (inSet.maxLen >= len[i] && inSet.minLen <= len[i] && inSet.lookup(vector[i], start[i], len[i])) {
           sel[newSize++] = i;
         }
       }
@@ -129,7 +218,7 @@ public void evaluate(VectorizedRowBatch batch) throws HiveException {
       for(int j = 0; j != n; j++) {
         int i = sel[j];
         if (!nullPos[i]) {
-          if (inSet.lookup(vector[i], start[i], len[i])) {
+          if (inSet.maxLen >= len[i] && inSet.minLen <= len[i] && inSet.lookup(vector[i], start[i], len[i])) {
            sel[newSize++] = i;
           }
         }
@@ -141,7 +230,7 @@ public void evaluate(VectorizedRowBatch batch) throws HiveException {
       int newSize = 0;
       for(int i = 0; i != n; i++) {
         if (!nullPos[i]) {
-          if (inSet.lookup(vector[i], start[i], len[i])) {
+          if (inSet.maxLen >= len[i] && inSet.minLen <= len[i] && inSet.lookup(vector[i], start[i], len[i])) {
            sel[newSize++] = i;
           }
         }
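The pre-filter inserted into evaluate() follows the standard vectorized-filter pattern: rows are never copied; the selected index array is compacted in place and batch.size shrinks. A self-contained sketch of that pattern over plain arrays (SelectionPrefilter is an illustrative name, not a Hive class):

```java
// In-place compaction of a selection vector: keep only indices whose value
// length lies within [minLen, maxLen], mirroring the batch.selectedInUse
// handling in the patch above.
public final class SelectionPrefilter {
  public static int prefilter(int[] sel, int n, int[] len, int minLen, int maxLen) {
    int newSize = 0;
    for (int j = 0; j != n; j++) {
      int i = sel[j];
      if (len[i] >= minLen && len[i] <= maxLen) {
        sel[newSize++] = i; // surviving row indices are rewritten in place
      }
    }
    return newSize; // the caller sets batch.size = newSize
  }

  public static void main(String[] args) {
    int[] len = {3, 5, 6, 5}; // per-row value lengths
    int[] sel = {0, 1, 2, 3}; // all four rows selected
    int n = prefilter(sel, sel.length, len, 5, 5);
    System.out.println(n + " rows survive: " + sel[0] + ", " + sel[1]); // 2 rows: 1 and 3
  }
}
```

When no selection vector is active yet (selectedInUse == false), the patch materializes one only if at least one row was actually dropped, which leaves the batch untouched in the common case where nothing is pruned.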
diff --git ql/src/test/queries/clientpositive/a.q ql/src/test/queries/clientpositive/a.q
new file mode 100644
index 0000000000..39c41be14c
--- /dev/null
+++ ql/src/test/queries/clientpositive/a.q
@@ -0,0 +1,22 @@
+set hive.mapred.mode=nonstrict;
+set hive.support.concurrency=true;
+set hive.txn.manager=org.apache.hadoop.hive.ql.lockmgr.DbTxnManager;
+set hive.fetch.task.conversion=none;
+
+create table a(id int, name string);
+insert into a(id, name) values
+(1, 'ali'),
+(2, 'barak'),
+(3, 'tan'),
+(4, 'kerim'),
+(5, 'veli'),
+(6, 'burcak'),
+(7, 'fatih'),
+(8, 'sedat'),
+(9, 'salim'),
+(10, 'fatih'),
+(11, 'fatih');
+
+set hive.explain.user=false;
+--explain vectorization detail
+select * from a where name in ('kerim', 'fatih');
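For the data in a.q the counters are easy to predict: the IN list ('kerim', 'fatih') gives minLen = maxLen = 5, so 'ali', 'tan', 'veli', and 'burcak' are pruned by length alone and only the seven 5-byte names reach the cuckoo lookup. A small sketch checking that arithmetic (plain Java, not the q-test harness; it assumes the whole table arrives in one batch, the counters added above, and a plan in which the IN test sits under a FilterExprAndExpr so closeOp publishes the stats):

```java
import java.util.List;

public class CuckooEarlyReturnArithmetic {
  public static void main(String[] args) {
    List<String> names = List.of("ali", "barak", "tan", "kerim", "veli",
        "burcak", "fatih", "sedat", "salim", "fatih", "fatih");
    int minLen = 5; // both IN-list values, 'kerim' and 'fatih', have length 5
    int maxLen = 5;
    long lookups = names.stream()
        .filter(s -> s.length() >= minLen && s.length() <= maxLen)
        .count();
    long early = names.size() - lookups;
    // Under those assumptions: CUCKOO_LOOKUP_* = 7, CUCKOO_EARLY_* = 4.
    System.out.println("lookups=" + lookups + ", early=" + early);
  }
}
```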