diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java index bdcb1cb..9630616 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java @@ -43,6 +43,8 @@ import org.apache.hadoop.hive.ql.exec.vector.expressions.StringConcatColCol; import org.apache.hadoop.hive.ql.exec.vector.expressions.StringConcatColScalar; import org.apache.hadoop.hive.ql.exec.vector.expressions.StringConcatScalarCol; +import org.apache.hadoop.hive.ql.exec.vector.expressions.StringSubstrColStart; +import org.apache.hadoop.hive.ql.exec.vector.expressions.StringSubstrColStartLen; import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpression; import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorUDFUnixTimeStampLong; import org.apache.hadoop.hive.ql.exec.vector.expressions.aggregates.VectorAggregateExpression; @@ -91,6 +93,7 @@ import org.apache.hadoop.hive.ql.udf.UDFOPPositive; import org.apache.hadoop.hive.ql.udf.UDFRTrim; import org.apache.hadoop.hive.ql.udf.UDFSecond; +import org.apache.hadoop.hive.ql.udf.UDFSubstr; import org.apache.hadoop.hive.ql.udf.UDFTrim; import org.apache.hadoop.hive.ql.udf.UDFUpper; import org.apache.hadoop.hive.ql.udf.UDFWeekOfYear; @@ -477,7 +480,9 @@ private VectorExpression getVectorExpression(GenericUDFBridge udf, return getUnaryStringExpression("StringTrim", "String", childExpr); } else if (cl.equals(UDFConcat.class)) { return getConcatExpression(childExpr); - } + } else if (cl.equals(UDFSubstr.class)) { + return getSubstrExpression(childExpr); + } throw new HiveException("Udf: "+udf.getClass().getSimpleName()+", is not supported"); } @@ -609,6 +614,65 @@ private VectorExpression getUnaryStringExpression(String vectorExprClassName, } return expr; } + + private VectorExpression getSubstrExpression( + List childExprList) throws HiveException { + + ExprNodeDesc childExpr = childExprList.get(0); + ExprNodeDesc startExpr = childExprList.get(1); + startExpr = foldConstantsForUnaryExpression(startExpr); + + // Get second and optionally third arguments + int start; + if (startExpr instanceof ExprNodeConstantDesc) { + ExprNodeConstantDesc constDesc = (ExprNodeConstantDesc) startExpr; + start = ((Integer) constDesc.getValue()).intValue(); + } else { + throw new HiveException("Cannot vectorize non-constant start argument for SUBSTR"); + } + ExprNodeDesc lengthExpr = null; + int length = 0; + if (childExprList.size() == 3) { + lengthExpr = childExprList.get(2); + lengthExpr = foldConstantsForUnaryExpression(lengthExpr); + if (lengthExpr instanceof ExprNodeConstantDesc) { + ExprNodeConstantDesc constDesc = (ExprNodeConstantDesc) lengthExpr; + length = ((Integer) constDesc.getValue()).intValue(); + } else { + throw new HiveException("Cannot vectorize non-constant length argument for SUBSTR"); + } + } + + // Prepare first argument (whether it is a column or an expression) + int inputCol; + VectorExpression v1 = null; + if (childExpr instanceof ExprNodeGenericFuncDesc) { + v1 = getVectorExpression(childExpr); + inputCol = v1.getOutputColumn(); + } else if (childExpr instanceof ExprNodeColumnDesc) { + ExprNodeColumnDesc colDesc = (ExprNodeColumnDesc) childExpr; + inputCol = getInputColumnIndex(colDesc.getColumn()); + } else { + throw new HiveException("Expression not supported: " + childExpr); + } + int outputCol = ocm.allocateOutputColumn("String"); + + // Create appropriate vector expression for 2 or 3 argument version of SUBSTR() + VectorExpression expr = null; + if (childExprList.size() == 2) { + expr = new StringSubstrColStart(inputCol, start, outputCol); + } else if (childExprList.size() == 3) { + expr = new StringSubstrColStartLen(inputCol, start, length, outputCol); + } else { + throw new HiveException("Invalid number of arguments for SUBSTR()"); + } + + if (v1 != null) { + expr.setChildExpressions(new VectorExpression [] {v1}); + ocm.freeOutputColumn(v1.getOutputColumn()); + } + return expr; + } private VectorExpression getLikeExpression(List childExpr) throws HiveException { ExprNodeDesc leftExpr = childExpr.get(0); diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringSubstrColStart.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringSubstrColStart.java index 37610f1..28dc66a 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringSubstrColStart.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringSubstrColStart.java @@ -26,7 +26,22 @@ public StringSubstrColStart(int colNum, int startIdx, int outputColumn) { this.colNum = colNum; - this.startIdx = startIdx; + + /* Switch from a 1-based start offset (the Hive end user convention) to a 0-based start offset + * (the internal convention). + */ + if (startIdx >= 1) { + this.startIdx = startIdx - 1; + } else if (startIdx == 0) { + + // If start index is 0 in query, that is equivalent to using 1 in query. + // So internal offset is 0. + this.startIdx = 0; + } else { + + // start index of -n means give the last n characters of the string + this.startIdx = startIdx; + } this.outputColumn = outputColumn; } @@ -39,24 +54,27 @@ public StringSubstrColStart(int colNum, int startIdx, int outputColumn) { * @param substrStart the Start index for the substring operation */ static int getSubstrStartOffset(byte[] utf8String, int start, int len, int substrStart) { - int curIdx = -1; + int end = start + len; if (substrStart < 0) { int length = 0; - for (int i = start; i != len; ++i) { + for (int i = start; i != end; ++i) { if ((utf8String[i] & 0xc0) != 0x80) { ++length; } } - - if (-length > substrStart) { + if (-substrStart > length) { + + /* The result is empty string if a negative start is provided + * whose absolute value is greater than the string length. + */ return -1; } substrStart = length + substrStart; } - - int end = start + len; + + int curIdx = -1; for (int i = start; i != end; ++i) { if ((utf8String[i] & 0xc0) != 0x80) { ++curIdx; @@ -83,26 +101,26 @@ public void evaluate(VectorizedRowBatch batch) { return; } - byte[][] vector = inV.vector; int[] sel = batch.selected; int[] len = inV.length; int[] start = inV.start; + outV.initBuffer(); if (inV.isRepeating) { outV.isRepeating = true; if (!inV.noNulls && inV.isNull[0]) { outV.isNull[0] = true; outV.noNulls = false; - outV.setRef(0, EMPTY_STRING, 0, EMPTY_STRING.length); + outV.setVal(0, EMPTY_STRING, 0, EMPTY_STRING.length); return; } else { outV.noNulls = true; - int offset = getSubstrStartOffset(vector[0], sel[0], len[0], startIdx); + int offset = getSubstrStartOffset(vector[0], start[0], len[0], startIdx); if (offset != -1) { - outV.setRef(0, vector[0], offset, len[0] - offset); + outV.setVal(0, vector[0], offset, len[0] - (offset - start[0])); } else { - outV.setRef(0, EMPTY_STRING, 0, EMPTY_STRING.length); + outV.setVal(0, EMPTY_STRING, 0, EMPTY_STRING.length); } } } else { @@ -116,9 +134,9 @@ public void evaluate(VectorizedRowBatch batch) { int offset = getSubstrStartOffset(vector[selected], start[selected], len[selected], startIdx); outV.isNull[selected] = false; if (offset != -1) { - outV.setRef(selected, vector[selected], offset, len[selected] - offset); + outV.setVal(selected, vector[selected], offset, len[selected] - (offset - start[selected])); } else { - outV.setRef(selected, EMPTY_STRING, 0, EMPTY_STRING.length); + outV.setVal(selected, EMPTY_STRING, 0, EMPTY_STRING.length); } } else { outV.isNull[selected] = true; @@ -130,9 +148,9 @@ public void evaluate(VectorizedRowBatch batch) { int selected = sel[i]; int offset = getSubstrStartOffset(vector[selected], start[selected], len[selected], startIdx); if (offset != -1) { - outV.setRef(selected, vector[selected], offset, len[selected] - offset); + outV.setVal(selected, vector[selected], offset, len[selected] - (offset - start[selected])); } else { - outV.setRef(selected, EMPTY_STRING, 0, EMPTY_STRING.length); + outV.setVal(selected, EMPTY_STRING, 0, EMPTY_STRING.length); } } } @@ -144,9 +162,9 @@ public void evaluate(VectorizedRowBatch batch) { if (!inV.isNull[i]) { int offset = getSubstrStartOffset(vector[i], start[i], len[i], startIdx); if (offset != -1) { - outV.setRef(i, vector[i], offset, len[i] - offset); + outV.setVal(i, vector[i], offset, len[i] - (offset - start[i])); } else { - outV.setRef(i, EMPTY_STRING, 0, EMPTY_STRING.length); + outV.setVal(i, EMPTY_STRING, 0, EMPTY_STRING.length); } } } @@ -155,9 +173,9 @@ public void evaluate(VectorizedRowBatch batch) { for (int i = 0; i != n; ++i) { int offset = getSubstrStartOffset(vector[i], start[i], len[i], startIdx); if (offset != -1) { - outV.setRef(i, vector[i], offset, len[i] - offset); + outV.setVal(i, vector[i], offset, len[i] - (offset - start[i])); } else { - outV.setRef(i, EMPTY_STRING, 0, EMPTY_STRING.length); + outV.setVal(i, EMPTY_STRING, 0, EMPTY_STRING.length); } } } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringSubstrColStartLen.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringSubstrColStartLen.java index 5864375..0cd8379 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringSubstrColStartLen.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringSubstrColStartLen.java @@ -29,7 +29,23 @@ public StringSubstrColStartLen(int colNum, int startIdx, int length, int outputColumn) { this.colNum = colNum; - this.startIdx = startIdx; + + /* Switch from a 1-based start offset (the Hive end user convention) to a 0-based start offset + * (the internal convention). + */ + if (startIdx >= 1) { + this.startIdx = startIdx - 1; + } else if (startIdx == 0) { + + // If start index is 0 in query, that is equivalent to using 1 in query. + // So internal offset is 0. + this.startIdx = 0; + } else { + + // start index of -n means give the last n characters of the string + this.startIdx = startIdx; + } + this.length = length; this.outputColumn = outputColumn; offsetArray = new int[2]; @@ -49,25 +65,24 @@ static void populateSubstrOffsets(byte[] utf8String, int start, int len, int sub int curIdx = -1; offsetArray[0] = -1; offsetArray[1] = -1; + int end = start + len; if (substrStart < 0) { int length = 0; - for (int i = start; i != len; ++i) { + for (int i = start; i != end; ++i) { if ((utf8String[i] & 0xc0) != 0x80) { ++length; } } - if (-length > substrStart) { + if (-substrStart > length) { return; } substrStart = length + substrStart; } - int endIdx = substrStart + substrLength - 1; - int end = start + len; for (int i = start; i != end; ++i) { if ((utf8String[i] & 0xc0) != 0x80) { ++curIdx; @@ -103,21 +118,22 @@ public void evaluate(VectorizedRowBatch batch) { int[] sel = batch.selected; int[] len = inV.length; int[] start = inV.start; + outV.initBuffer(); if (inV.isRepeating) { outV.isRepeating = true; if (!inV.noNulls && inV.isNull[0]) { outV.isNull[0] = true; outV.noNulls = false; - outV.setRef(0, EMPTY_STRING, 0, EMPTY_STRING.length); + outV.setVal(0, EMPTY_STRING, 0, EMPTY_STRING.length); return; } else { outV.noNulls = true; - populateSubstrOffsets(vector[0], sel[0], len[0], startIdx, length, offsetArray); + populateSubstrOffsets(vector[0], start[0], len[0], startIdx, length, offsetArray); if (offsetArray[0] != -1) { - outV.setRef(0, vector[0], offsetArray[0], offsetArray[1]); + outV.setVal(0, vector[0], offsetArray[0], offsetArray[1]); } else { - outV.setRef(0, EMPTY_STRING, 0, EMPTY_STRING.length); + outV.setVal(0, EMPTY_STRING, 0, EMPTY_STRING.length); } } } else { @@ -131,9 +147,9 @@ public void evaluate(VectorizedRowBatch batch) { outV.isNull[selected] = false; populateSubstrOffsets(vector[selected], start[selected], len[selected], startIdx, length, offsetArray); if (offsetArray[0] != -1) { - outV.setRef(selected, vector[selected], offsetArray[0], offsetArray[1]); + outV.setVal(selected, vector[selected], offsetArray[0], offsetArray[1]); } else { - outV.setRef(selected, EMPTY_STRING, 0, EMPTY_STRING.length); + outV.setVal(selected, EMPTY_STRING, 0, EMPTY_STRING.length); } } else { outV.isNull[selected] = true; @@ -146,9 +162,9 @@ public void evaluate(VectorizedRowBatch batch) { outV.isNull[selected] = false; populateSubstrOffsets(vector[selected], start[selected], len[selected], startIdx, length, offsetArray); if (offsetArray[0] != -1) { - outV.setRef(selected, vector[selected], offsetArray[0], offsetArray[1]); + outV.setVal(selected, vector[selected], offsetArray[0], offsetArray[1]); } else { - outV.setRef(selected, EMPTY_STRING, 0, EMPTY_STRING.length); + outV.setVal(selected, EMPTY_STRING, 0, EMPTY_STRING.length); } } } @@ -160,9 +176,9 @@ public void evaluate(VectorizedRowBatch batch) { if (!inV.isNull[i]) { populateSubstrOffsets(vector[i], start[i], len[i], startIdx, length, offsetArray); if (offsetArray[0] != -1) { - outV.setRef(i, vector[i], offsetArray[0], offsetArray[1]); + outV.setVal(i, vector[i], offsetArray[0], offsetArray[1]); } else { - outV.setRef(i, EMPTY_STRING, 0, EMPTY_STRING.length); + outV.setVal(i, EMPTY_STRING, 0, EMPTY_STRING.length); } } } @@ -172,9 +188,9 @@ public void evaluate(VectorizedRowBatch batch) { outV.isNull[i] = false; populateSubstrOffsets(vector[i], start[i], len[i], startIdx, length, offsetArray); if (offsetArray[0] != -1) { - outV.setRef(i, vector[i], offsetArray[0], offsetArray[1]); + outV.setVal(i, vector[i], offsetArray[0], offsetArray[1]); } else { - outV.setRef(i, EMPTY_STRING, 0, EMPTY_STRING.length); + outV.setVal(i, EMPTY_STRING, 0, EMPTY_STRING.length); } } } diff --git a/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorStringExpressions.java b/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorStringExpressions.java index 0084558..1d44abf 100644 --- a/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorStringExpressions.java +++ b/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorStringExpressions.java @@ -1000,7 +1000,7 @@ public void testSubstrStart() throws UnsupportedEncodingException { v.setRef(2, data3, 0, data3.length); v.isNull[2] = false; - StringSubstrColStart expr = new StringSubstrColStart(0, 5, 1); + StringSubstrColStart expr = new StringSubstrColStart(0, 6, 1); expr.evaluate(batch); BytesColumnVector outCol = (BytesColumnVector) batch.cols[1]; Assert.assertEquals(3, batch.size); @@ -1030,7 +1030,7 @@ public void testSubstrStart() throws UnsupportedEncodingException { outCol.isRepeating = true; // Testing negative substring index. - // For a string with length 11, start idx 5 should yield same results as -6 + // Start index -6 should yield the last 6 characters of the string expr = new StringSubstrColStart(0, -6, 1); expr.evaluate(batch); @@ -1059,9 +1059,9 @@ public void testSubstrStart() throws UnsupportedEncodingException { outCol.noNulls = false; outCol.isRepeating = true; - // Testing substring starting from index 0 + // Testing substring starting from index 1 - expr = new StringSubstrColStart(0, 0, 1); + expr = new StringSubstrColStart(0, 1, 1); expr.evaluate(batch); Assert.assertEquals(3, batch.size); Assert.assertTrue(outCol.noNulls); @@ -1090,7 +1090,7 @@ public void testSubstrStart() throws UnsupportedEncodingException { // Testing with nulls - expr = new StringSubstrColStart(0, 5, 1); + expr = new StringSubstrColStart(0, 6, 1); v.noNulls = false; v.isNull[0] = true; expr.evaluate(batch); @@ -1144,7 +1144,7 @@ public void testSubstrStart() throws UnsupportedEncodingException { batch.cols[1] = outV; outV.isRepeating = true; outV.noNulls = false; - expr = new StringSubstrColStart(0, 2, 1); + expr = new StringSubstrColStart(0, 3, 1); batch.size = 1; expr.evaluate(batch); outCol = (BytesColumnVector) batch.cols[1]; @@ -1163,21 +1163,22 @@ public void testSubstrStart() throws UnsupportedEncodingException { v = new BytesColumnVector(); v.isRepeating = false; v.noNulls = true; - v.setRef(0, multiByte, 3, 10); + + // string is 2 chars long (a 3 byte and a 4 byte char) + v.setRef(0, multiByte, 3, 7); batch.cols[0] = v; batch.cols[1] = outV; outV.isRepeating = true; outV.noNulls = false; outCol = (BytesColumnVector) batch.cols[1]; - expr = new StringSubstrColStart(0, 1, 1); + expr = new StringSubstrColStart(0, 2, 1); expr.evaluate(batch); Assert.assertFalse(outV.isRepeating); Assert.assertTrue(outV.noNulls); Assert.assertEquals(0, StringExpr.compare( - // Since references starts at index 3 (2nd char), substring with start idx 1 - // will start at the 3rd char which starts at index 6 - multiByte, 6, 10 - 6, outCol.vector[0], outCol.start[0], outCol.length[0] + // the result is the last 1 character, which occupies 4 bytes + multiByte, 6, 4, outCol.vector[0], outCol.start[0], outCol.length[0] ) ); } @@ -1206,7 +1207,7 @@ public void testSubstrStartLen() throws UnsupportedEncodingException { outV.isRepeating = true; outV.noNulls = false; - StringSubstrColStartLen expr = new StringSubstrColStartLen(0, 5, 6, 1); + StringSubstrColStartLen expr = new StringSubstrColStartLen(0, 6, 6, 1); expr.evaluate(batch); BytesColumnVector outCol = (BytesColumnVector) batch.cols[1]; Assert.assertEquals(3, batch.size); @@ -1296,7 +1297,7 @@ public void testSubstrStartLen() throws UnsupportedEncodingException { outV.isRepeating = true; outV.noNulls = false; - expr = new StringSubstrColStartLen(0, 5, 10, 1); + expr = new StringSubstrColStartLen(0, 6, 10, 1); expr.evaluate(batch); outCol = (BytesColumnVector) batch.cols[1]; Assert.assertEquals(3, batch.size); @@ -1377,7 +1378,7 @@ public void testSubstrStartLen() throws UnsupportedEncodingException { batch.cols[1] = outV; outV.isRepeating = true; outV.noNulls = false; - expr = new StringSubstrColStartLen(0, 2, 2, 1); + expr = new StringSubstrColStartLen(0, 3, 2, 1); expr.evaluate(batch); Assert.assertEquals(1, batch.size); Assert.assertFalse(outV.isRepeating); @@ -1400,7 +1401,7 @@ public void testSubstrStartLen() throws UnsupportedEncodingException { batch.cols[1] = outV; outV.isRepeating = true; outV.noNulls = false; - expr = new StringSubstrColStartLen(0, 1, 2, 1); + expr = new StringSubstrColStartLen(0, 2, 2, 1); expr.evaluate(batch); outCol = (BytesColumnVector) batch.cols[1]; Assert.assertEquals(1, batch.size);