commit c924d38e86bf83cc94852d285a00d9ef8515a611 Author: Owen O'Malley Date: Sat Jul 18 17:23:53 2015 -0700 HIVE-11253. Create the storage-api module. diff --git common/pom.xml common/pom.xml index aedf7ba..a7997e2 100644 --- common/pom.xml +++ common/pom.xml @@ -39,6 +39,11 @@ <artifactId>hive-shims</artifactId> <version>${project.version}</version> </dependency> + <dependency> + <groupId>org.apache.hive</groupId> + <artifactId>hive-storage-api</artifactId> + <version>${project.version}</version> + </dependency> <dependency> <groupId>commons-cli</groupId> diff --git common/src/java/org/apache/hadoop/hive/common/type/HiveDecimal.java common/src/java/org/apache/hadoop/hive/common/type/HiveDecimal.java deleted file mode 100644 index 7d7fb28..0000000 --- common/src/java/org/apache/hadoop/hive/common/type/HiveDecimal.java +++ /dev/null @@ -1,312 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.hadoop.hive.common.type; - -import java.math.BigDecimal; -import java.math.BigInteger; -import java.math.RoundingMode; - -/** - * - * HiveDecimal. Simple wrapper for BigDecimal. Adds fixed max precision and non-scientific string - * representation - * - */ -public class HiveDecimal implements Comparable<HiveDecimal> { - public static final int MAX_PRECISION = 38; - public static final int MAX_SCALE = 38; - - /** - * Default precision/scale when user doesn't specify in the column metadata, such as - * decimal and decimal(8). - */ - public static final int USER_DEFAULT_PRECISION = 10; - public static final int USER_DEFAULT_SCALE = 0; - - /** - * Default precision/scale when system is not able to determine them, such as in case - * of a non-generic udf. - */ - public static final int SYSTEM_DEFAULT_PRECISION = 38; - public static final int SYSTEM_DEFAULT_SCALE = 18; - - public static final HiveDecimal ZERO = new HiveDecimal(BigDecimal.ZERO); - public static final HiveDecimal ONE = new HiveDecimal(BigDecimal.ONE); - - public static final int ROUND_FLOOR = BigDecimal.ROUND_FLOOR; - public static final int ROUND_CEILING = BigDecimal.ROUND_CEILING; - public static final int ROUND_HALF_UP = BigDecimal.ROUND_HALF_UP; - - private BigDecimal bd = BigDecimal.ZERO; - - private HiveDecimal(BigDecimal bd) { - this.bd = bd; - } - - public static HiveDecimal create(BigDecimal b) { - return create(b, true); - } - - public static HiveDecimal create(BigDecimal b, boolean allowRounding) { - BigDecimal bd = normalize(b, allowRounding); - return bd == null ? null : new HiveDecimal(bd); - } - - public static HiveDecimal create(BigInteger unscaled, int scale) { - BigDecimal bd = normalize(new BigDecimal(unscaled, scale), true); - return bd == null ? null : new HiveDecimal(bd); - } - - public static HiveDecimal create(String dec) { - BigDecimal bd; - try { - bd = new BigDecimal(dec.trim()); - } catch (NumberFormatException ex) { - return null; - } - - bd = normalize(bd, true); - return bd == null ?
null : new HiveDecimal(bd); - } - - public static HiveDecimal create(BigInteger bi) { - BigDecimal bd = normalize(new BigDecimal(bi), true); - return bd == null ? null : new HiveDecimal(bd); - } - - public static HiveDecimal create(int i) { - return new HiveDecimal(new BigDecimal(i)); - } - - public static HiveDecimal create(long l) { - return new HiveDecimal(new BigDecimal(l)); - } - - @Override - public String toString() { - return bd.toPlainString(); - } - - public HiveDecimal setScale(int i) { - return new HiveDecimal(bd.setScale(i, RoundingMode.HALF_UP)); - } - - @Override - public int compareTo(HiveDecimal dec) { - return bd.compareTo(dec.bd); - } - - @Override - public int hashCode() { - return bd.hashCode(); - } - - @Override - public boolean equals(Object obj) { - if (obj == null || obj.getClass() != getClass()) { - return false; - } - return bd.equals(((HiveDecimal) obj).bd); - } - - public int scale() { - return bd.scale(); - } - - /** - * Returns the number of digits (integer and fractional) in the number, which is equivalent - * to SQL decimal precision. Note that this is different from BigDecimal.precision(), - * which returns the precision of the unscaled value (BigDecimal.valueOf(0.01).precision() = 1, - * whereas HiveDecimal.create("0.01").precision() = 2). - * If you want the BigDecimal precision, use HiveDecimal.bigDecimalValue().precision() - * @return - */ - public int precision() { - int bdPrecision = bd.precision(); - int bdScale = bd.scale(); - - if (bdPrecision < bdScale) { - // This can happen for numbers less than 0.1 - // For 0.001234: bdPrecision=4, bdScale=6 - // In this case, we'll set the type to have the same precision as the scale. - return bdScale; - } - return bdPrecision; - } - - public int intValue() { - return bd.intValue(); - } - - public double doubleValue() { - return bd.doubleValue(); - } - - public long longValue() { - return bd.longValue(); - } - - public short shortValue() { - return bd.shortValue(); - } - - public float floatValue() { - return bd.floatValue(); - } - - public BigDecimal bigDecimalValue() { - return bd; - } - - public byte byteValue() { - return bd.byteValue(); - } - - public HiveDecimal setScale(int adjustedScale, int rm) { - return create(bd.setScale(adjustedScale, rm)); - } - - public HiveDecimal subtract(HiveDecimal dec) { - return create(bd.subtract(dec.bd)); - } - - public HiveDecimal multiply(HiveDecimal dec) { - return create(bd.multiply(dec.bd), false); - } - - public BigInteger unscaledValue() { - return bd.unscaledValue(); - } - - public HiveDecimal scaleByPowerOfTen(int n) { - return create(bd.scaleByPowerOfTen(n)); - } - - public HiveDecimal abs() { - return create(bd.abs()); - } - - public HiveDecimal negate() { - return create(bd.negate()); - } - - public HiveDecimal add(HiveDecimal dec) { - return create(bd.add(dec.bd)); - } - - public HiveDecimal pow(int n) { - BigDecimal result = normalize(bd.pow(n), false); - return result == null ? null : new HiveDecimal(result); - } - - public HiveDecimal remainder(HiveDecimal dec) { - return create(bd.remainder(dec.bd)); - } - - public HiveDecimal divide(HiveDecimal dec) { - return create(bd.divide(dec.bd, MAX_SCALE, RoundingMode.HALF_UP), true); - } - - /** - * Get the sign of the underlying decimal. 
- * @return 0 if the decimal is equal to 0, -1 if less than zero, and 1 if greater than 0 - */ - public int signum() { - return bd.signum(); - } - - private static BigDecimal trim(BigDecimal d) { - if (d.compareTo(BigDecimal.ZERO) == 0) { - // Special case for 0, because Java doesn't strip zeros correctly on that number. - d = BigDecimal.ZERO; - } else { - d = d.stripTrailingZeros(); - if (d.scale() < 0) { - // no negative scale decimals - d = d.setScale(0); - } - } - return d; - } - - private static BigDecimal normalize(BigDecimal bd, boolean allowRounding) { - if (bd == null) { - return null; - } - - bd = trim(bd); - - int intDigits = bd.precision() - bd.scale(); - - if (intDigits > MAX_PRECISION) { - return null; - } - - int maxScale = Math.min(MAX_SCALE, Math.min(MAX_PRECISION - intDigits, bd.scale())); - if (bd.scale() > maxScale) { - if (allowRounding) { - bd = bd.setScale(maxScale, RoundingMode.HALF_UP); - // Trimming is again necessary, because rounding may introduce new trailing 0's. - bd = trim(bd); - } else { - bd = null; - } - } - - return bd; - } - - public static BigDecimal enforcePrecisionScale(BigDecimal bd, int maxPrecision, int maxScale) { - if (bd == null) { - return null; - } - - bd = trim(bd); - - if (bd.scale() > maxScale) { - bd = bd.setScale(maxScale, RoundingMode.HALF_UP); - } - - int maxIntDigits = maxPrecision - maxScale; - int intDigits = bd.precision() - bd.scale(); - if (intDigits > maxIntDigits) { - return null; - } - - return bd; - } - - public static HiveDecimal enforcePrecisionScale(HiveDecimal dec, int maxPrecision, int maxScale) { - if (dec == null) { - return null; - } - - // Minor optimization, avoiding creating new objects. - if (dec.precision() - dec.scale() <= maxPrecision - maxScale && - dec.scale() <= maxScale) { - return dec; - } - - BigDecimal bd = enforcePrecisionScale(dec.bd, maxPrecision, maxScale); - if (bd == null) { - return null; - } - - return HiveDecimal.create(bd); - } -} diff --git pom.xml pom.xml index 1abf738..196a0a4 100644 --- pom.xml +++ pom.xml @@ -49,6 +49,7 @@ <module>service</module> <module>shims</module> <module>spark-client</module> + <module>storage-api</module> <module>testutils</module> <module>packaging</module> diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/BytesColumnVector.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/BytesColumnVector.java deleted file mode 100644 index 02c52fa..0000000 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/BytesColumnVector.java +++ /dev/null @@ -1,322 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.hive.ql.exec.vector; - -/** - * This class supports string and binary data by value reference -- i.e. each field is - * explicitly present, as opposed to provided by a dictionary reference. 
- * In some cases, all the values will be in the same byte array to begin with, - * but this need not be the case. If each value is in a separate byte - * array to start with, or not all of the values are in the same original - * byte array, you can still assign data by reference into this column vector. - * This gives flexibility to use this in multiple situations. - *

- * When setting data by reference, the caller - * is responsible for allocating the byte arrays used to hold the data. - * You can also set data by value, as long as you call the initBuffer() method first. - * You can mix "by value" and "by reference" in the same column vector, - * though that use is probably not typical. - */ -public class BytesColumnVector extends ColumnVector { - public byte[][] vector; - public int[] start; // start offset of each field - - /* - * The length of each field. If the value repeats for every entry, then it is stored - * in vector[0] and isRepeating from the superclass is set to true. - */ - public int[] length; - private byte[] buffer; // optional buffer to use when actually copying in data - private int nextFree; // next free position in buffer - - // Estimate that there will be 16 bytes per entry - static final int DEFAULT_BUFFER_SIZE = 16 * VectorizedRowBatch.DEFAULT_SIZE; - - // Proportion of extra space to provide when allocating more buffer space. - static final float EXTRA_SPACE_FACTOR = (float) 1.2; - - /** - * Use this constructor for normal operation. - * All column vectors should be the default size normally. - */ - public BytesColumnVector() { - this(VectorizedRowBatch.DEFAULT_SIZE); - } - - /** - * Don't call this constructor except for testing purposes. - * - * @param size number of elements in the column vector - */ - public BytesColumnVector(int size) { - super(size); - vector = new byte[size][]; - start = new int[size]; - length = new int[size]; - } - - /** - * Additional reset work for BytesColumnVector (releasing scratch bytes for by value strings). - */ - @Override - public void reset() { - super.reset(); - initBuffer(0); - } - - /** Set a field by reference. - * - * @param elementNum index within column vector to set - * @param sourceBuf container of source data - * @param start start byte position within source - * @param length length of source byte sequence - */ - public void setRef(int elementNum, byte[] sourceBuf, int start, int length) { - vector[elementNum] = sourceBuf; - this.start[elementNum] = start; - this.length[elementNum] = length; - } - - /** - * You must call initBuffer first before using setVal(). - * Provide the estimated number of bytes needed to hold - * a full column vector worth of byte string data. - * - * @param estimatedValueSize Estimated size of buffer space needed - */ - public void initBuffer(int estimatedValueSize) { - nextFree = 0; - - // if buffer is already allocated, keep using it, don't re-allocate - if (buffer != null) { - return; - } - - // allocate a little extra space to limit need to re-allocate - int bufferSize = this.vector.length * (int)(estimatedValueSize * EXTRA_SPACE_FACTOR); - if (bufferSize < DEFAULT_BUFFER_SIZE) { - bufferSize = DEFAULT_BUFFER_SIZE; - } - buffer = new byte[bufferSize]; - } - - /** - * Initialize buffer to default size. - */ - public void initBuffer() { - initBuffer(0); - } - - /** - * @return amount of buffer space currently allocated - */ - public int bufferSize() { - if (buffer == null) { - return 0; - } - return buffer.length; - } - - /** - * Set a field by actually copying in to a local buffer. - * If you must actually copy data in to the array, use this method. - * DO NOT USE this method unless it's not practical to set data by reference with setRef(). - * Setting data by reference tends to run a lot faster than copying data in. 
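To make the by-reference/by-value split concrete, here is a minimal sketch; it is not part of the commit, and the wrapper class name is invented for illustration:

    import java.nio.charset.StandardCharsets;
    import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;

    public class BytesColumnVectorSketch {
      public static void main(String[] args) {
        BytesColumnVector col = new BytesColumnVector();
        byte[] data = "hello".getBytes(StandardCharsets.UTF_8);
        // By reference: records (array, start, length) only; no copy is made,
        // so the caller must keep the array alive and unmodified.
        col.setRef(0, data, 0, data.length);
        // By value: initBuffer() allocates the scratch buffer that setVal() copies into.
        col.initBuffer();
        col.setVal(1, data, 0, data.length);
      }
    }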
- * - * @param elementNum index within column vector to set - * @param sourceBuf container of source data - * @param start start byte position within source - * @param length length of source byte sequence - */ - public void setVal(int elementNum, byte[] sourceBuf, int start, int length) { - if ((nextFree + length) > buffer.length) { - increaseBufferSpace(length); - } - System.arraycopy(sourceBuf, start, buffer, nextFree, length); - vector[elementNum] = buffer; - this.start[elementNum] = nextFree; - this.length[elementNum] = length; - nextFree += length; - } - - /** - * Set a field to the concatenation of two string values. Result data is copied - * into the internal buffer. - * - * @param elementNum index within column vector to set - * @param leftSourceBuf container of left argument - * @param leftStart start of left argument - * @param leftLen length of left argument - * @param rightSourceBuf container of right argument - * @param rightStart start of right argument - * @param rightLen length of right argument - */ - public void setConcat(int elementNum, byte[] leftSourceBuf, int leftStart, int leftLen, - byte[] rightSourceBuf, int rightStart, int rightLen) { - int newLen = leftLen + rightLen; - if ((nextFree + newLen) > buffer.length) { - increaseBufferSpace(newLen); - } - vector[elementNum] = buffer; - this.start[elementNum] = nextFree; - this.length[elementNum] = newLen; - - System.arraycopy(leftSourceBuf, leftStart, buffer, nextFree, leftLen); - nextFree += leftLen; - System.arraycopy(rightSourceBuf, rightStart, buffer, nextFree, rightLen); - nextFree += rightLen; - } - - /** - * Increase buffer space enough to accommodate next element. - * This uses an exponential increase mechanism to rapidly - * increase buffer size to enough to hold all data. - * As batches get re-loaded, buffer space allocated will quickly - * stabilize. - * - * @param nextElemLength size of next element to be added - */ - public void increaseBufferSpace(int nextElemLength) { - - // Keep doubling buffer size until there will be enough space for next element. - int newLength = 2 * buffer.length; - while((nextFree + nextElemLength) > newLength) { - newLength *= 2; - } - - // Allocate new buffer, copy data to it, and set buffer to new buffer. - byte[] newBuffer = new byte[newLength]; - System.arraycopy(buffer, 0, newBuffer, 0, nextFree); - buffer = newBuffer; - } - - /** Copy the current object contents into the output. Only copy selected entries, - * as indicated by selectedInUse and the sel array. - */ - public void copySelected( - boolean selectedInUse, int[] sel, int size, BytesColumnVector output) { - - // Output has nulls if and only if input has nulls. 
- output.noNulls = noNulls; - output.isRepeating = false; - - // Handle repeating case - if (isRepeating) { - output.setVal(0, vector[0], start[0], length[0]); - output.isNull[0] = isNull[0]; - output.isRepeating = true; - return; - } - - // Handle normal case - - // Copy data values over - if (selectedInUse) { - for (int j = 0; j < size; j++) { - int i = sel[j]; - output.setVal(i, vector[i], start[i], length[i]); - } - } - else { - for (int i = 0; i < size; i++) { - output.setVal(i, vector[i], start[i], length[i]); - } - } - - // Copy nulls over if needed - if (!noNulls) { - if (selectedInUse) { - for (int j = 0; j < size; j++) { - int i = sel[j]; - output.isNull[i] = isNull[i]; - } - } - else { - System.arraycopy(isNull, 0, output.isNull, 0, size); - } - } - } - - /** Simplify vector by brute-force flattening noNulls and isRepeating - * This can be used to reduce combinatorial explosion of code paths in VectorExpressions - * with many arguments, at the expense of loss of some performance. - */ - public void flatten(boolean selectedInUse, int[] sel, int size) { - flattenPush(); - if (isRepeating) { - isRepeating = false; - - // setRef is used below and this is safe, because the reference - // is to data owned by this column vector. If this column vector - // gets re-used, the whole thing is re-used together so there - // is no danger of a dangling reference. - - // Only copy data values if entry is not null. The string value - // at position 0 is undefined if the position 0 value is null. - if (noNulls || !isNull[0]) { - - // loops start at position 1 because position 0 is already set - if (selectedInUse) { - for (int j = 1; j < size; j++) { - int i = sel[j]; - this.setRef(i, vector[0], start[0], length[0]); - } - } else { - for (int i = 1; i < size; i++) { - this.setRef(i, vector[0], start[0], length[0]); - } - } - } - flattenRepeatingNulls(selectedInUse, sel, size); - } - flattenNoNulls(selectedInUse, sel, size); - } - - // Fill all the vector entries with the provided value - public void fill(byte[] value) { - noNulls = true; - isRepeating = true; - setRef(0, value, 0, value.length); - } - - @Override - public void setElement(int outElementNum, int inputElementNum, ColumnVector inputVector) { - BytesColumnVector in = (BytesColumnVector) inputVector; - setVal(outElementNum, in.vector[inputElementNum], in.start[inputElementNum], in.length[inputElementNum]); - } - - @Override - public void init() { - initBuffer(0); - } - - @Override - public void stringifyValue(StringBuilder buffer, int row) { - if (isRepeating) { - row = 0; - } - if (noNulls || !isNull[row]) { - buffer.append('"'); - buffer.append(new String(this.buffer, start[row], length[row])); - buffer.append('"'); - } else { - buffer.append("null"); - } - } -} diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/ColumnVector.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/ColumnVector.java deleted file mode 100644 index 4b5cf39..0000000 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/ColumnVector.java +++ /dev/null @@ -1,174 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.hive.ql.exec.vector; - -import java.util.Arrays; - -/** - * ColumnVector contains the shared structure for the sub-types, - * including NULL information, and whether this vector - * repeats, i.e. has all values the same, so only the first - * one is set. This is used to accelerate query performance - * by handling a whole vector in O(1) time when applicable. - * - * The fields are public by design since this is a performance-critical - * structure that is used in the inner loop of query execution. - */ -public abstract class ColumnVector { - - /* - * The current kinds of column vectors. - */ - public static enum Type { - LONG, - DOUBLE, - BYTES, - DECIMAL - } - - /* - * If noNulls is false, then this array contains true if the value - * is null, otherwise false. The array is always allocated, so a batch can be re-used - * later and nulls added. - */ - public boolean[] isNull; - - // If the whole column vector has no nulls, this is true, otherwise false. - public boolean noNulls; - - /* - * True if same value repeats for whole column vector. - * If so, vector[0] holds the repeating value. - */ - public boolean isRepeating; - - // Variables to hold state from before flattening so it can be easily restored. - private boolean preFlattenIsRepeating; - private boolean preFlattenNoNulls; - - /** - * Constructor for super-class ColumnVector. This is not called directly, - * but used to initialize inherited fields. - * - * @param len Vector length - */ - public ColumnVector(int len) { - isNull = new boolean[len]; - noNulls = true; - isRepeating = false; - } - - /** - * Resets the column to default state - * - fills the isNull array with false - * - sets noNulls to true - * - sets isRepeating to false - */ - public void reset() { - if (!noNulls) { - Arrays.fill(isNull, false); - } - noNulls = true; - isRepeating = false; - } - - abstract public void flatten(boolean selectedInUse, int[] sel, int size); - - // Simplify vector by brute-force flattening noNulls if isRepeating - // This can be used to reduce combinatorial explosion of code paths in VectorExpressions - // with many arguments. - public void flattenRepeatingNulls(boolean selectedInUse, int[] sel, int size) { - - boolean nullFillValue; - - if (noNulls) { - nullFillValue = false; - } else { - nullFillValue = isNull[0]; - } - - if (selectedInUse) { - for (int j = 0; j < size; j++) { - int i = sel[j]; - isNull[i] = nullFillValue; - } - } else { - Arrays.fill(isNull, 0, size, nullFillValue); - } - - // all nulls are now explicit - noNulls = false; - } - - public void flattenNoNulls(boolean selectedInUse, int[] sel, int size) { - if (noNulls) { - noNulls = false; - if (selectedInUse) { - for (int j = 0; j < size; j++) { - int i = sel[j]; - isNull[i] = false; - } - } else { - Arrays.fill(isNull, 0, size, false); - } - } - } - - /** - * Restore the state of isRepeating and noNulls to what it was - * before flattening. This must only be called just after flattening - * and then evaluating a VectorExpression on the column vector. 
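Taken together with flattenPush() below, the intended life cycle is: flatten() saves the isRepeating/noNulls flags and materializes every entry, the expression is evaluated, and unFlatten() restores the flags. A rough sketch of that protocol; the helper names are invented, and only the three ColumnVector calls come from this commit:

    import org.apache.hadoop.hive.ql.exec.vector.ColumnVector;

    public class FlattenSketch {
      static void evaluateFlattened(ColumnVector col, boolean selectedInUse, int[] sel, int size) {
        col.flatten(selectedInUse, sel, size); // subclasses call flattenPush() in here to save state
        try {
          evaluate(col, sel, size);            // expression code may ignore isRepeating/noNulls
        } finally {
          col.unFlatten();                     // restore the fast-path flags for later operators
        }
      }

      static void evaluate(ColumnVector col, int[] sel, int size) {
        // stands in for any VectorExpression
      }
    }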
- * It is an optimization that allows other operations on the same - * column to continue to benefit from the isRepeating and noNulls - * indicators. - */ - public void unFlatten() { - isRepeating = preFlattenIsRepeating; - noNulls = preFlattenNoNulls; - } - - // Record repeating and no nulls state to be restored later. - protected void flattenPush() { - preFlattenIsRepeating = isRepeating; - preFlattenNoNulls = noNulls; - } - - /** - * Set the element in this column vector from the given input vector. - */ - public abstract void setElement(int outElementNum, int inputElementNum, ColumnVector inputVector); - - /** - * Initialize the column vector. This method can be overridden by specific column vector types. - * Use this method only if the individual type of the column vector is not known, otherwise it's - * preferable to call specific initialization methods. - */ - public void init() { - // Do nothing by default - } - - /** - * Print the value for this column into the given string builder. - * @param buffer the buffer to print into - * @param row the id of the row to print - */ - public abstract void stringifyValue(StringBuilder buffer, - int row); - } - diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/DecimalColumnVector.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/DecimalColumnVector.java deleted file mode 100644 index 74a9d5f..0000000 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/DecimalColumnVector.java +++ /dev/null @@ -1,106 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.hive.ql.exec.vector; - -import java.math.BigInteger; - -import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable; -import org.apache.hadoop.hive.common.type.HiveDecimal; - -public class DecimalColumnVector extends ColumnVector { - - /** - * A vector of HiveDecimalWritable objects. - * - * For high performance and easy access to this low-level structure, - * the fields are public by design (as they are in other ColumnVector - * types). 
- */ - public HiveDecimalWritable[] vector; - public short scale; - public short precision; - - public DecimalColumnVector(int precision, int scale) { - this(VectorizedRowBatch.DEFAULT_SIZE, precision, scale); - } - - public DecimalColumnVector(int size, int precision, int scale) { - super(size); - this.precision = (short) precision; - this.scale = (short) scale; - vector = new HiveDecimalWritable[size]; - for (int i = 0; i < size; i++) { - vector[i] = new HiveDecimalWritable(HiveDecimal.ZERO); - } - } - - @Override - public void flatten(boolean selectedInUse, int[] sel, int size) { - // TODO Auto-generated method stub - } - - @Override - public void setElement(int outElementNum, int inputElementNum, ColumnVector inputVector) { - HiveDecimal hiveDec = ((DecimalColumnVector) inputVector).vector[inputElementNum].getHiveDecimal(precision, scale); - if (hiveDec == null) { - noNulls = false; - isNull[outElementNum] = true; - } else { - vector[outElementNum].set(hiveDec); - } - } - - @Override - public void stringifyValue(StringBuilder buffer, int row) { - if (isRepeating) { - row = 0; - } - if (noNulls || !isNull[row]) { - buffer.append(vector[row].toString()); - } else { - buffer.append("null"); - } - } - - public void set(int elementNum, HiveDecimalWritable writeable) { - HiveDecimal hiveDec = writeable.getHiveDecimal(precision, scale); - if (hiveDec == null) { - noNulls = false; - isNull[elementNum] = true; - } else { - vector[elementNum].set(hiveDec); - } - } - - public void set(int elementNum, HiveDecimal hiveDec) { - HiveDecimal checkedDec = HiveDecimal.enforcePrecisionScale(hiveDec, precision, scale); - if (checkedDec == null) { - noNulls = false; - isNull[elementNum] = true; - } else { - vector[elementNum].set(checkedDec); - } - } - - public void setNullDataValue(int elementNum) { - // E.g. For scale 2 the minimum is "0.01" - HiveDecimal minimumNonZeroValue = HiveDecimal.create(BigInteger.ONE, scale); - vector[elementNum].set(minimumNonZeroValue); - } -} diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/DoubleColumnVector.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/DoubleColumnVector.java deleted file mode 100644 index 4a7811d..0000000 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/DoubleColumnVector.java +++ /dev/null @@ -1,143 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.hadoop.hive.ql.exec.vector; - -import java.util.Arrays; - -/** - * This class represents a nullable double precision floating point column vector. - * This class will be used for operations on all floating point types (float, double) - * and as such will use a 64-bit double value to hold the biggest possible value. - * During copy-in/copy-out, smaller types (i.e. float) will be converted as needed. 
This will - * reduce the amount of code that needs to be generated and also will run fast since the - * machine operates with 64-bit words. - * - * The vector[] field is public by design for high-performance access in the inner - * loop of query execution. - */ -public class DoubleColumnVector extends ColumnVector { - public double[] vector; - public static final double NULL_VALUE = Double.NaN; - - /** - * Use this constructor by default. All column vectors - * should normally be the default size. - */ - public DoubleColumnVector() { - this(VectorizedRowBatch.DEFAULT_SIZE); - } - - /** - * Don't use this except for testing purposes. - * - * @param len - */ - public DoubleColumnVector(int len) { - super(len); - vector = new double[len]; - } - - // Copy the current object contents into the output. Only copy selected entries, - // as indicated by selectedInUse and the sel array. - public void copySelected( - boolean selectedInUse, int[] sel, int size, DoubleColumnVector output) { - - // Output has nulls if and only if input has nulls. - output.noNulls = noNulls; - output.isRepeating = false; - - // Handle repeating case - if (isRepeating) { - output.vector[0] = vector[0]; - output.isNull[0] = isNull[0]; - output.isRepeating = true; - return; - } - - // Handle normal case - - // Copy data values over - if (selectedInUse) { - for (int j = 0; j < size; j++) { - int i = sel[j]; - output.vector[i] = vector[i]; - } - } - else { - System.arraycopy(vector, 0, output.vector, 0, size); - } - - // Copy nulls over if needed - if (!noNulls) { - if (selectedInUse) { - for (int j = 0; j < size; j++) { - int i = sel[j]; - output.isNull[i] = isNull[i]; - } - } - else { - System.arraycopy(isNull, 0, output.isNull, 0, size); - } - } - } - - // Fill the column vector with the provided value - public void fill(double value) { - noNulls = true; - isRepeating = true; - vector[0] = value; - } - - // Simplify vector by brute-force flattening noNulls and isRepeating - // This can be used to reduce combinatorial explosion of code paths in VectorExpressions - // with many arguments. - public void flatten(boolean selectedInUse, int[] sel, int size) { - flattenPush(); - if (isRepeating) { - isRepeating = false; - double repeatVal = vector[0]; - if (selectedInUse) { - for (int j = 0; j < size; j++) { - int i = sel[j]; - vector[i] = repeatVal; - } - } else { - Arrays.fill(vector, 0, size, repeatVal); - } - flattenRepeatingNulls(selectedInUse, sel, size); - } - flattenNoNulls(selectedInUse, sel, size); - } - - @Override - public void setElement(int outElementNum, int inputElementNum, ColumnVector inputVector) { - vector[outElementNum] = ((DoubleColumnVector) inputVector).vector[inputElementNum]; - } - - @Override - public void stringifyValue(StringBuilder buffer, int row) { - if (isRepeating) { - row = 0; - } - if (noNulls || !isNull[row]) { - buffer.append(vector[row]); - } else { - buffer.append("null"); - } - } -} diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/LongColumnVector.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/LongColumnVector.java deleted file mode 100644 index 5702584..0000000 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/LongColumnVector.java +++ /dev/null @@ -1,189 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.hadoop.hive.ql.exec.vector; - -import java.util.Arrays; - -/** - * This class represents a nullable int column vector. - * This class will be used for operations on all integer types (tinyint, smallint, int, bigint) - * and as such will use a 64-bit long value to hold the biggest possible value. - * During copy-in/copy-out, smaller int types will be converted as needed. This will - * reduce the amount of code that needs to be generated and also will run fast since the - * machine operates with 64-bit words. - * - * The vector[] field is public by design for high-performance access in the inner - * loop of query execution. - */ -public class LongColumnVector extends ColumnVector { - public long[] vector; - public static final long NULL_VALUE = 1; - - /** - * Use this constructor by default. All column vectors - * should normally be the default size. - */ - public LongColumnVector() { - this(VectorizedRowBatch.DEFAULT_SIZE); - } - - /** - * Don't use this except for testing purposes. - * - * @param len the number of rows - */ - public LongColumnVector(int len) { - super(len); - vector = new long[len]; - } - - // Copy the current object contents into the output. Only copy selected entries, - // as indicated by selectedInUse and the sel array. - public void copySelected( - boolean selectedInUse, int[] sel, int size, LongColumnVector output) { - - // Output has nulls if and only if input has nulls. - output.noNulls = noNulls; - output.isRepeating = false; - - // Handle repeating case - if (isRepeating) { - output.vector[0] = vector[0]; - output.isNull[0] = isNull[0]; - output.isRepeating = true; - return; - } - - // Handle normal case - - // Copy data values over - if (selectedInUse) { - for (int j = 0; j < size; j++) { - int i = sel[j]; - output.vector[i] = vector[i]; - } - } - else { - System.arraycopy(vector, 0, output.vector, 0, size); - } - - // Copy nulls over if needed - if (!noNulls) { - if (selectedInUse) { - for (int j = 0; j < size; j++) { - int i = sel[j]; - output.isNull[i] = isNull[i]; - } - } - else { - System.arraycopy(isNull, 0, output.isNull, 0, size); - } - } - } - - // Copy the current object contents into the output. Only copy selected entries, - // as indicated by selectedInUse and the sel array. - public void copySelected( - boolean selectedInUse, int[] sel, int size, DoubleColumnVector output) { - - // Output has nulls if and only if input has nulls. 
- output.noNulls = noNulls; - output.isRepeating = false; - - // Handle repeating case - if (isRepeating) { - output.vector[0] = vector[0]; // automatic conversion to double is done here - output.isNull[0] = isNull[0]; - output.isRepeating = true; - return; - } - - // Handle normal case - - // Copy data values over - if (selectedInUse) { - for (int j = 0; j < size; j++) { - int i = sel[j]; - output.vector[i] = vector[i]; - } - } - else { - for(int i = 0; i < size; ++i) { - output.vector[i] = vector[i]; - } - } - - // Copy nulls over if needed - if (!noNulls) { - if (selectedInUse) { - for (int j = 0; j < size; j++) { - int i = sel[j]; - output.isNull[i] = isNull[i]; - } - } - else { - System.arraycopy(isNull, 0, output.isNull, 0, size); - } - } - } - - // Fill the column vector with the provided value - public void fill(long value) { - noNulls = true; - isRepeating = true; - vector[0] = value; - } - - // Simplify vector by brute-force flattening noNulls and isRepeating - // This can be used to reduce combinatorial explosion of code paths in VectorExpressions - // with many arguments. - public void flatten(boolean selectedInUse, int[] sel, int size) { - flattenPush(); - if (isRepeating) { - isRepeating = false; - long repeatVal = vector[0]; - if (selectedInUse) { - for (int j = 0; j < size; j++) { - int i = sel[j]; - vector[i] = repeatVal; - } - } else { - Arrays.fill(vector, 0, size, repeatVal); - } - flattenRepeatingNulls(selectedInUse, sel, size); - } - flattenNoNulls(selectedInUse, sel, size); - } - - @Override - public void setElement(int outElementNum, int inputElementNum, ColumnVector inputVector) { - vector[outElementNum] = ((LongColumnVector) inputVector).vector[inputElementNum]; - } - - @Override - public void stringifyValue(StringBuilder buffer, int row) { - if (isRepeating) { - row = 0; - } - if (noNulls || !isNull[row]) { - buffer.append(vector[row]); - } else { - buffer.append("null"); - } - } -} diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizedRowBatch.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizedRowBatch.java deleted file mode 100644 index 7c18da6..0000000 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizedRowBatch.java +++ /dev/null @@ -1,186 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.hadoop.hive.ql.exec.vector; - -import java.io.DataInput; -import java.io.DataOutput; -import java.io.IOException; - -import org.apache.hadoop.io.NullWritable; -import org.apache.hadoop.io.Writable; - -/** - * A VectorizedRowBatch is a set of rows, organized with each column - * as a vector. It is the unit of query execution, organized to minimize - * the cost per row and achieve low cycles-per-instruction. 
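The selected[]/selectedInUse contract described below implies one canonical read loop. A sketch, assuming a single long column; the helper class is invented, but every field it touches is defined in this file and in LongColumnVector above:

    import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
    import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;

    public class BatchSumSketch {
      static long sum(VectorizedRowBatch batch) {
        LongColumnVector col = (LongColumnVector) batch.cols[0];
        long total = 0;
        for (int j = 0; j < batch.size; ++j) {
          int row = batch.selectedInUse ? batch.selected[j] : j;
          int valueRow = col.isRepeating ? 0 : row; // a repeating vector keeps its value in slot 0
          if (col.noNulls || !col.isNull[valueRow]) {
            total += col.vector[valueRow];
          }
        }
        return total;
      }
    }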
- * The major fields are public by design to allow fast and convenient - * access by the vectorized query execution code. - */ -public class VectorizedRowBatch implements Writable { - public int numCols; // number of columns - public ColumnVector[] cols; // a vector for each column - public int size; // number of rows that qualify (i.e. haven't been filtered out) - public int[] selected; // array of positions of selected values - public int[] projectedColumns; - public int projectionSize; - - /* - * If no filtering has been applied yet, selectedInUse is false, - * meaning that all rows qualify. If it is true, then the selected[] array - * records the offsets of qualifying rows. - */ - public boolean selectedInUse; - - // If this is true, then there is no data in the batch -- we have hit the end of input. - public boolean endOfFile; - - /* - * This number is carefully chosen to minimize overhead and typically allows - * one VectorizedRowBatch to fit in cache. - */ - public static final int DEFAULT_SIZE = 1024; - - /** - * Return a batch with the specified number of columns. - * This is the standard constructor -- all batches should be the same size - * - * @param numCols the number of columns to include in the batch - */ - public VectorizedRowBatch(int numCols) { - this(numCols, DEFAULT_SIZE); - } - - /** - * Return a batch with the specified number of columns and rows. - * Only call this constructor directly for testing purposes. - * Batch size should normally always be defaultSize. - * - * @param numCols the number of columns to include in the batch - * @param size the number of rows to include in the batch - */ - public VectorizedRowBatch(int numCols, int size) { - this.numCols = numCols; - this.size = size; - selected = new int[size]; - selectedInUse = false; - this.cols = new ColumnVector[numCols]; - projectedColumns = new int[numCols]; - - // Initially all columns are projected and in the same order - projectionSize = numCols; - for (int i = 0; i < numCols; i++) { - projectedColumns[i] = i; - } - } - - /** - * Returns the maximum size of the batch (number of rows it can hold) - */ - public int getMaxSize() { - return selected.length; - } - - /** - * Return count of qualifying rows. 
- * - * @return number of rows that have not been filtered out - */ - public long count() { - return size; - } - - private static String toUTF8(Object o) { - if(o == null || o instanceof NullWritable) { - return "\\N"; /* as found in LazySimpleSerDe's nullSequence */ - } - return o.toString(); - } - - @Override - public String toString() { - if (size == 0) { - return ""; - } - StringBuilder b = new StringBuilder(); - if (this.selectedInUse) { - for (int j = 0; j < size; j++) { - int i = selected[j]; - b.append('['); - for (int k = 0; k < projectionSize; k++) { - int projIndex = projectedColumns[k]; - ColumnVector cv = cols[projIndex]; - if (k > 0) { - b.append(", "); - } - cv.stringifyValue(b, i); - } - b.append(']'); - if (j < size - 1) { - b.append('\n'); - } - } - } else { - for (int i = 0; i < size; i++) { - b.append('['); - for (int k = 0; k < projectionSize; k++) { - int projIndex = projectedColumns[k]; - ColumnVector cv = cols[projIndex]; - if (k > 0) { - b.append(", "); - } - cv.stringifyValue(b, i); - } - b.append(']'); - if (i < size - 1) { - b.append('\n'); - } - } - } - return b.toString(); - } - - @Override - public void readFields(DataInput arg0) throws IOException { - throw new UnsupportedOperationException("Do you really need me?"); - } - - @Override - public void write(DataOutput arg0) throws IOException { - throw new UnsupportedOperationException("Don't call me"); - } - - /** - * Resets the row batch to default state - * - sets selectedInUse to false - * - sets size to 0 - * - sets endOfFile to false - * - resets each column - * - inits each column - */ - public void reset() { - selectedInUse = false; - size = 0; - endOfFile = false; - for (ColumnVector vc : cols) { - if (vc != null) { - vc.reset(); - vc.init(); - } - } - } -} diff --git ql/src/java/org/apache/hadoop/hive/ql/io/sarg/SearchArgumentFactory.java ql/src/java/org/apache/hadoop/hive/ql/io/sarg/SearchArgumentFactory.java deleted file mode 100644 index 6ad927d..0000000 --- ql/src/java/org/apache/hadoop/hive/ql/io/sarg/SearchArgumentFactory.java +++ /dev/null @@ -1,39 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.hive.ql.io.sarg; - -import com.esotericsoftware.kryo.Kryo; -import com.esotericsoftware.kryo.io.Input; -import org.apache.commons.codec.binary.Base64; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.hive.serde2.ColumnProjectionUtils; -import org.apache.hadoop.hive.ql.io.sarg.SearchArgument; -import org.apache.hadoop.hive.ql.io.sarg.SearchArgument.Builder; -import org.apache.hadoop.hive.ql.exec.Utilities; -import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc; -import org.apache.hadoop.hive.ql.plan.TableScanDesc; - -/** - * A factory for creating SearchArguments. 
- */ -public class SearchArgumentFactory { - public static Builder newBuilder() { - return new SearchArgumentImpl.BuilderImpl(); - } -} diff --git ql/src/java/org/apache/hadoop/hive/ql/io/sarg/SearchArgumentImpl.java ql/src/java/org/apache/hadoop/hive/ql/io/sarg/SearchArgumentImpl.java deleted file mode 100644 index 1582a75..0000000 --- ql/src/java/org/apache/hadoop/hive/ql/io/sarg/SearchArgumentImpl.java +++ /dev/null @@ -1,697 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.hive.ql.io.sarg; - -import java.sql.Timestamp; -import java.util.ArrayDeque; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Deque; -import java.util.HashMap; -import java.util.List; -import java.util.Map; - -import org.apache.commons.codec.binary.Base64; -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; - -import com.esotericsoftware.kryo.Kryo; -import com.esotericsoftware.kryo.io.Output; - -/** - * The implementation of SearchArguments. - */ -final class SearchArgumentImpl implements SearchArgument { - public static final Log LOG = LogFactory.getLog(SearchArgumentImpl.class); - - static final class PredicateLeafImpl implements PredicateLeaf { - private final Operator operator; - private final Type type; - private final String columnName; - private final Object literal; - private final List<Object> literalList; - - // Used by kryo - @SuppressWarnings("unused") - PredicateLeafImpl() { - operator = null; - type = null; - columnName = null; - literal = null; - literalList = null; - } - - PredicateLeafImpl(Operator operator, - Type type, - String columnName, - Object literal, - List<Object> literalList) { - this.operator = operator; - this.type = type; - this.columnName = columnName; - this.literal = literal; - if (literal != null) { - if (literal.getClass() != type.getValueClass()) { - throw new IllegalArgumentException("Wrong value class " + - literal.getClass().getName() + " for " + type + "." + operator + - " leaf"); - } - } - this.literalList = literalList; - if (literalList != null) { - Class valueCls = type.getValueClass(); - for(Object lit: literalList) { - if (lit != null && lit.getClass() != valueCls) { - throw new IllegalArgumentException("Wrong value class item " + - lit.getClass().getName() + " for " + type + "." 
+ operator + " leaf"); - } - } - } - } - - @Override - public Operator getOperator() { - return operator; - } - - @Override - public Type getType() { - return type; - } - - @Override - public String getColumnName() { - return columnName; - } - - @Override - public Object getLiteral() { - // To get around a kryo 2.22 bug while deserializing a Timestamp into Date - // (https://github.com/EsotericSoftware/kryo/issues/88) - // When we see a Date, convert back into Timestamp - if (literal instanceof java.util.Date) { - return new Timestamp(((java.util.Date)literal).getTime()); - } - return literal; - } - - @Override - public List<Object> getLiteralList() { - return literalList; - } - - @Override - public String toString() { - StringBuilder buffer = new StringBuilder(); - buffer.append('('); - buffer.append(operator); - buffer.append(' '); - buffer.append(columnName); - if (literal != null) { - buffer.append(' '); - buffer.append(literal); - } else if (literalList != null) { - for(Object lit: literalList) { - buffer.append(' '); - buffer.append(lit == null ? "null" : lit.toString()); - } - } - buffer.append(')'); - return buffer.toString(); - } - - private static boolean isEqual(Object left, Object right) { - - return left == right || - (left != null && right != null && left.equals(right)); - } - - @Override - public boolean equals(Object other) { - if (other == null || other.getClass() != getClass()) { - return false; - } else if (other == this) { - return true; - } else { - PredicateLeafImpl o = (PredicateLeafImpl) other; - return operator == o.operator && - type == o.type && - columnName.equals(o.columnName) && - isEqual(literal, o.literal) && - isEqual(literalList, o.literalList); - } - } - - @Override - public int hashCode() { - return operator.hashCode() + - type.hashCode() * 17 + - columnName.hashCode() * 3 * 17 + - (literal == null ? 0 : literal.hashCode()) * 101 * 3 * 17 + - (literalList == null ? 0 : literalList.hashCode()) * - 103 * 101 * 3 * 17; - } - } - - - private final List<PredicateLeaf> leaves; - private final ExpressionTree expression; - - SearchArgumentImpl(ExpressionTree expression, List<PredicateLeaf> leaves) { - this.expression = expression; - this.leaves = leaves; - } - - // Used by kryo - @SuppressWarnings("unused") - SearchArgumentImpl() { - leaves = null; - expression = null; - } - - @Override - public List<PredicateLeaf> getLeaves() { - return leaves; - } - - @Override - public TruthValue evaluate(TruthValue[] leaves) { - return expression == null ? TruthValue.YES : expression.evaluate(leaves); - } - - @Override - public ExpressionTree getExpression() { - return expression; - } - - @Override - public String toString() { - StringBuilder buffer = new StringBuilder(); - for(int i=0; i < leaves.size(); ++i) { - buffer.append("leaf-"); - buffer.append(i); - buffer.append(" = "); - buffer.append(leaves.get(i).toString()); - buffer.append('\n'); - } - buffer.append("expr = "); - buffer.append(expression); - return buffer.toString(); - } - - public String toKryo() { - Output out = new Output(4 * 1024, 10 * 1024 * 1024); - new Kryo().writeObject(out, this); - out.close(); - return Base64.encodeBase64String(out.toBytes()); - } - - static class BuilderImpl implements Builder { - - // max threshold for CNF conversion. 
having >8 elements in andList will be - // converted to maybe - private static final int CNF_COMBINATIONS_THRESHOLD = 256; - - private final Deque<ExpressionTree> currentTree = - new ArrayDeque<ExpressionTree>(); - private final Map<PredicateLeaf, Integer> leaves = - new HashMap<PredicateLeaf, Integer>(); - private final ExpressionTree root = - new ExpressionTree(ExpressionTree.Operator.AND); - { - currentTree.add(root); - } - - @Override - public Builder startOr() { - ExpressionTree node = new ExpressionTree(ExpressionTree.Operator.OR); - currentTree.getFirst().getChildren().add(node); - currentTree.addFirst(node); - return this; - } - - @Override - public Builder startAnd() { - ExpressionTree node = new ExpressionTree(ExpressionTree.Operator.AND); - currentTree.getFirst().getChildren().add(node); - currentTree.addFirst(node); - return this; - } - - @Override - public Builder startNot() { - ExpressionTree node = new ExpressionTree(ExpressionTree.Operator.NOT); - currentTree.getFirst().getChildren().add(node); - currentTree.addFirst(node); - return this; - } - - @Override - public Builder end() { - ExpressionTree current = currentTree.removeFirst(); - if (current.getChildren().size() == 0) { - throw new IllegalArgumentException("Can't create expression " + root + - " with no children."); - } - if (current.getOperator() == ExpressionTree.Operator.NOT && - current.getChildren().size() != 1) { - throw new IllegalArgumentException("Can't create not expression " + - current + " with more than 1 child."); - } - return this; - } - - private int addLeaf(PredicateLeaf leaf) { - Integer result = leaves.get(leaf); - if (result == null) { - int id = leaves.size(); - leaves.put(leaf, id); - return id; - } else { - return result; - } - } - - @Override - public Builder lessThan(String column, PredicateLeaf.Type type, - Object literal) { - ExpressionTree parent = currentTree.getFirst(); - if (column == null || literal == null) { - parent.getChildren().add(new ExpressionTree(TruthValue.YES_NO_NULL)); - } else { - PredicateLeaf leaf = - new PredicateLeafImpl(PredicateLeaf.Operator.LESS_THAN, - type, column, literal, null); - parent.getChildren().add(new ExpressionTree(addLeaf(leaf))); - } - return this; - } - - @Override - public Builder lessThanEquals(String column, PredicateLeaf.Type type, - Object literal) { - ExpressionTree parent = currentTree.getFirst(); - if (column == null || literal == null) { - parent.getChildren().add(new ExpressionTree(TruthValue.YES_NO_NULL)); - } else { - PredicateLeaf leaf = - new PredicateLeafImpl(PredicateLeaf.Operator.LESS_THAN_EQUALS, - type, column, literal, null); - parent.getChildren().add(new ExpressionTree(addLeaf(leaf))); - } - return this; - } - - @Override - public Builder equals(String column, PredicateLeaf.Type type, - Object literal) { - ExpressionTree parent = currentTree.getFirst(); - if (column == null || literal == null) { - parent.getChildren().add(new ExpressionTree(TruthValue.YES_NO_NULL)); - } else { - PredicateLeaf leaf = - new PredicateLeafImpl(PredicateLeaf.Operator.EQUALS, - type, column, literal, null); - parent.getChildren().add(new ExpressionTree(addLeaf(leaf))); - } - return this; - } - - @Override - public Builder nullSafeEquals(String column, PredicateLeaf.Type type, - Object literal) { - ExpressionTree parent = currentTree.getFirst(); - if (column == null || literal == null) { - parent.getChildren().add(new ExpressionTree(TruthValue.YES_NO_NULL)); - } else { - PredicateLeaf leaf = - new PredicateLeafImpl(PredicateLeaf.Operator.NULL_SAFE_EQUALS, - type, column, literal, null); - parent.getChildren().add(new 
ExpressionTree(addLeaf(leaf))); - } - return this; - } - - @Override - public Builder in(String column, PredicateLeaf.Type type, - Object... literal) { - ExpressionTree parent = currentTree.getFirst(); - if (column == null || literal == null) { - parent.getChildren().add(new ExpressionTree(TruthValue.YES_NO_NULL)); - } else { - if (literal.length == 0) { - throw new IllegalArgumentException("Can't create in expression with " - + "no arguments"); - } - List<Object> argList = new ArrayList<Object>(); - argList.addAll(Arrays.asList(literal)); - - PredicateLeaf leaf = - new PredicateLeafImpl(PredicateLeaf.Operator.IN, - type, column, null, argList); - parent.getChildren().add(new ExpressionTree(addLeaf(leaf))); - } - return this; - } - - @Override - public Builder isNull(String column, PredicateLeaf.Type type) { - ExpressionTree parent = currentTree.getFirst(); - if (column == null) { - parent.getChildren().add(new ExpressionTree(TruthValue.YES_NO_NULL)); - } else { - PredicateLeaf leaf = - new PredicateLeafImpl(PredicateLeaf.Operator.IS_NULL, - type, column, null, null); - parent.getChildren().add(new ExpressionTree(addLeaf(leaf))); - } - return this; - } - - @Override - public Builder between(String column, PredicateLeaf.Type type, Object lower, - Object upper) { - ExpressionTree parent = currentTree.getFirst(); - if (column == null || lower == null || upper == null) { - parent.getChildren().add(new ExpressionTree(TruthValue.YES_NO_NULL)); - } else { - List<Object> argList = new ArrayList<Object>(); - argList.add(lower); - argList.add(upper); - PredicateLeaf leaf = - new PredicateLeafImpl(PredicateLeaf.Operator.BETWEEN, - type, column, null, argList); - parent.getChildren().add(new ExpressionTree(addLeaf(leaf))); - } - return this; - } - - @Override - public Builder literal(TruthValue truth) { - ExpressionTree parent = currentTree.getFirst(); - parent.getChildren().add(new ExpressionTree(truth)); - return this; - } - - /** - * Recursively explore the tree to find the leaves that are still reachable - * after optimizations. - * @param tree the node to check next - * @param next the next available leaf id - * @param leafReorder the array mapping old leaf ids to new ones - * @return the next available leaf id - */ - static int compactLeaves(ExpressionTree tree, int next, int[] leafReorder) { - if (tree.getOperator() == ExpressionTree.Operator.LEAF) { - int oldLeaf = tree.getLeaf(); - if (leafReorder[oldLeaf] == -1) { - leafReorder[oldLeaf] = next++; - } - } else if (tree.getChildren() != null) { - for(ExpressionTree child: tree.getChildren()) { - next = compactLeaves(child, next, leafReorder); - } - } - return next; - } - - /** - * Rewrite expression tree to update the leaves. 
- * @param root the root of the tree to fix - * @param leafReorder a map from old leaf ids to new leaf ids - * @return the fixed root - */ - static ExpressionTree rewriteLeaves(ExpressionTree root, - int[] leafReorder) { - if (root.getOperator() == ExpressionTree.Operator.LEAF) { - return new ExpressionTree(leafReorder[root.getLeaf()]); - } else if (root.getChildren() != null){ - List children = root.getChildren(); - for(int i=0; i < children.size(); ++i) { - children.set(i, rewriteLeaves(children.get(i), leafReorder)); - } - } - return root; - } - - @Override - public SearchArgument build() { - if (currentTree.size() != 1) { - throw new IllegalArgumentException("Failed to end " + - currentTree.size() + " operations."); - } - ExpressionTree optimized = pushDownNot(root); - optimized = foldMaybe(optimized); - optimized = flatten(optimized); - optimized = convertToCNF(optimized); - optimized = flatten(optimized); - int leafReorder[] = new int[leaves.size()]; - Arrays.fill(leafReorder, -1); - int newLeafCount = compactLeaves(optimized, 0, leafReorder); - optimized = rewriteLeaves(optimized, leafReorder); - ArrayList leafList = new ArrayList<>(newLeafCount); - // expand list to correct size - for(int i=0; i < newLeafCount; ++i) { - leafList.add(null); - } - // build the new list - for(Map.Entry elem: leaves.entrySet()) { - int newLoc = leafReorder[elem.getValue()]; - if (newLoc != -1) { - leafList.set(newLoc, elem.getKey()); - } - } - return new SearchArgumentImpl(optimized, leafList); - } - - /** - * Push the negations all the way to just before the leaves. Also remove - * double negatives. - * @param root the expression to normalize - * @return the normalized expression, which may share some or all of the - * nodes of the original expression. - */ - static ExpressionTree pushDownNot(ExpressionTree root) { - if (root.getOperator() == ExpressionTree.Operator.NOT) { - ExpressionTree child = root.getChildren().get(0); - switch (child.getOperator()) { - case NOT: - return pushDownNot(child.getChildren().get(0)); - case CONSTANT: - return new ExpressionTree(child.getConstant().not()); - case AND: - root = new ExpressionTree(ExpressionTree.Operator.OR); - for(ExpressionTree kid: child.getChildren()) { - root.getChildren().add(pushDownNot(new - ExpressionTree(ExpressionTree.Operator.NOT, kid))); - } - break; - case OR: - root = new ExpressionTree(ExpressionTree.Operator.AND); - for(ExpressionTree kid: child.getChildren()) { - root.getChildren().add(pushDownNot(new ExpressionTree - (ExpressionTree.Operator.NOT, kid))); - } - break; - // for leaf, we don't do anything - default: - break; - } - } else if (root.getChildren() != null) { - // iterate through children and push down not for each one - for(int i=0; i < root.getChildren().size(); ++i) { - root.getChildren().set(i, pushDownNot(root.getChildren().get(i))); - } - } - return root; - } - - /** - * Remove MAYBE values from the expression. If they are in an AND operator, - * they are dropped. If they are in an OR operator, they kill their parent. - * This assumes that pushDownNot has already been called. 
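For intuition, pushDownNot is De Morgan's laws plus double-negation elimination applied recursively. A minimal sketch (hypothetical test code, assuming package-local access to the package-private ExpressionTree constructors):

    // Build "not (and leaf-0 leaf-1)".
    ExpressionTree expr =
        new ExpressionTree(ExpressionTree.Operator.NOT,
            new ExpressionTree(ExpressionTree.Operator.AND,
                new ExpressionTree(0), new ExpressionTree(1)));
    // De Morgan: the NOT distributes over the AND and flips it to OR, so
    // pushDownNot(expr).toString() renders as "(or (not leaf-0) (not leaf-1))".
    // A double negation such as "not (not leaf-0)" collapses to plain "leaf-0".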
- * @param expr The expression to clean up - * @return The cleaned up expression - */ - static ExpressionTree foldMaybe(ExpressionTree expr) { - if (expr.getChildren() != null) { - for(int i=0; i < expr.getChildren().size(); ++i) { - ExpressionTree child = foldMaybe(expr.getChildren().get(i)); - if (child.getConstant() == TruthValue.YES_NO_NULL) { - switch (expr.getOperator()) { - case AND: - expr.getChildren().remove(i); - i -= 1; - break; - case OR: - // a maybe will kill the or condition - return child; - default: - throw new IllegalStateException("Got a maybe as child of " + - expr); - } - } else { - expr.getChildren().set(i, child); - } - } - if (expr.getChildren().isEmpty()) { - return new ExpressionTree(TruthValue.YES_NO_NULL); - } - } - return expr; - } - - /** - * Converts multi-level ands and ors into single level ones. - * @param root the expression to flatten - * @return the flattened expression, which will always be root with - * potentially modified children. - */ - static ExpressionTree flatten(ExpressionTree root) { - if (root.getChildren() != null) { - // iterate through the index, so that if we add more children, - // they don't get re-visited - for(int i=0; i < root.getChildren().size(); ++i) { - ExpressionTree child = flatten(root.getChildren().get(i)); - // do we need to flatten? - if (child.getOperator() == root.getOperator() && - child.getOperator() != ExpressionTree.Operator.NOT) { - boolean first = true; - for(ExpressionTree grandkid: child.getChildren()) { - // for the first grandkid replace the original parent - if (first) { - first = false; - root.getChildren().set(i, grandkid); - } else { - root.getChildren().add(++i, grandkid); - } - } - } else { - root.getChildren().set(i, child); - } - } - // if we have a singleton AND or OR, just return the child - if ((root.getOperator() == ExpressionTree.Operator.OR || - root.getOperator() == ExpressionTree.Operator.AND) && - root.getChildren().size() == 1) { - return root.getChildren().get(0); - } - } - return root; - } - - /** - * Generate all combinations of items on the andList. For each item on the - * andList, it generates all combinations of one child from each and - * expression. Thus, (and a b) (and c d) will be expanded to: (or a c) - * (or a d) (or b c) (or b d). If there are items on the nonAndList, they - * are added to each or expression. - * @param result a list to put the results onto - * @param andList a list of and expressions - * @param nonAndList a list of non-and expressions - */ - private static void generateAllCombinations(List result, - List andList, - List nonAndList - ) { - List kids = andList.get(0).getChildren(); - if (result.isEmpty()) { - for(ExpressionTree kid: kids) { - ExpressionTree or = new ExpressionTree(ExpressionTree.Operator.OR); - result.add(or); - for(ExpressionTree node: nonAndList) { - or.getChildren().add(new ExpressionTree(node)); - } - or.getChildren().add(kid); - } - } else { - List work = new ArrayList(result); - result.clear(); - for(ExpressionTree kid: kids) { - for(ExpressionTree or: work) { - ExpressionTree copy = new ExpressionTree(or); - copy.getChildren().add(kid); - result.add(copy); - } - } - } - if (andList.size() > 1) { - generateAllCombinations(result, andList.subList(1, andList.size()), - nonAndList); - } - } - - /** - * Convert an expression so that the top level operator is AND with OR - * operators under it. This routine assumes that all of the NOT operators - * have been pushed to the leaves via pushDownNot. 
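A worked example of the distribution that convertToCNF performs, under the same package-local assumptions as the sketch above:

    // Distribute an OR over the AND beneath it:
    //   (or c (and a b))  ==>  (and (or c a) (or c b))
    ExpressionTree or = new ExpressionTree(ExpressionTree.Operator.OR,
        new ExpressionTree(2),                               // c = leaf-2
        new ExpressionTree(ExpressionTree.Operator.AND,
            new ExpressionTree(0), new ExpressionTree(1)));  // a = leaf-0, b = leaf-1
    // convertToCNF(or).toString() == "(and (or leaf-2 leaf-0) (or leaf-2 leaf-1))"

Note the safety valve: if the and-expressions under an OR would multiply out to more than CNF_COMBINATIONS_THRESHOLD (256) combinations, the subtree is replaced wholesale by the constant YES_NO_NULL (maybe) instead of being expanded.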
- * @param root the expression - * @return the normalized expression - */ - static ExpressionTree convertToCNF(ExpressionTree root) { - if (root.getChildren() != null) { - // convert all of the children to CNF - int size = root.getChildren().size(); - for(int i=0; i < size; ++i) { - root.getChildren().set(i, convertToCNF(root.getChildren().get(i))); - } - if (root.getOperator() == ExpressionTree.Operator.OR) { - // a list of leaves that weren't under AND expressions - List nonAndList = new ArrayList(); - // a list of AND expressions that we need to distribute - List andList = new ArrayList(); - for(ExpressionTree child: root.getChildren()) { - if (child.getOperator() == ExpressionTree.Operator.AND) { - andList.add(child); - } else if (child.getOperator() == ExpressionTree.Operator.OR) { - // pull apart the kids of the OR expression - for(ExpressionTree grandkid: child.getChildren()) { - nonAndList.add(grandkid); - } - } else { - nonAndList.add(child); - } - } - if (!andList.isEmpty()) { - if (checkCombinationsThreshold(andList)) { - root = new ExpressionTree(ExpressionTree.Operator.AND); - generateAllCombinations(root.getChildren(), andList, nonAndList); - } else { - root = new ExpressionTree(TruthValue.YES_NO_NULL); - } - } - } - } - return root; - } - - private static boolean checkCombinationsThreshold(List andList) { - int numComb = 1; - for (ExpressionTree tree : andList) { - numComb *= tree.getChildren().size(); - if (numComb > CNF_COMBINATIONS_THRESHOLD) { - return false; - } - } - return true; - } - - } -} diff --git ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java index 46deda5..6cb8529 100644 --- ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java +++ ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java @@ -17,6 +17,9 @@ */ package org.apache.hadoop.hive.ql.io.orc; +import com.esotericsoftware.kryo.Kryo; +import com.esotericsoftware.kryo.io.Output; + import static org.junit.Assert.assertArrayEquals; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertTrue; @@ -40,6 +43,7 @@ import java.util.TimeZone; import java.util.TreeSet; +import org.apache.commons.codec.binary.Base64; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.BlockLocation; import org.apache.hadoop.fs.FSDataInputStream; @@ -106,6 +110,13 @@ public class TestInputOutputFormat { + public static String toKryo(SearchArgument sarg) { + Output out = new Output(4 * 1024, 10 * 1024 * 1024); + new Kryo().writeObject(out, sarg); + out.close(); + return Base64.encodeBase64String(out.toBytes()); + } + Path workDir = new Path(System.getProperty("test.tmp.dir","target/tmp")); static final int MILLIS_IN_DAY = 1000 * 60 * 60 * 24; private static final SimpleDateFormat DATE_FORMAT = @@ -1751,7 +1762,7 @@ public void testSetSearchArgument() throws Exception { types.add(builder.build()); SearchArgument isNull = SearchArgumentFactory.newBuilder() .startAnd().isNull("cost", PredicateLeaf.Type.INTEGER).end().build(); - conf.set(ConvertAstToSearchArg.SARG_PUSHDOWN, isNull.toKryo()); + conf.set(ConvertAstToSearchArg.SARG_PUSHDOWN, toKryo(isNull)); conf.set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, "url,cost"); options.include(new boolean[]{true, true, false, true, false}); @@ -1798,7 +1809,7 @@ public void testSplitElimination() throws Exception { .lessThan("z", PredicateLeaf.Type.INTEGER, new Integer(0)) .end() .build(); - conf.set("sarg.pushdown", 
sarg.toKryo()); + conf.set("sarg.pushdown", toKryo(sarg)); conf.set("hive.io.file.readcolumn.names", "z,r"); properties.setProperty("columns", "z,r"); properties.setProperty("columns.types", "int:struct"); @@ -1840,7 +1851,7 @@ public void testSplitEliminationNullStats() throws Exception { .lessThan("z", PredicateLeaf.Type.STRING, new String("foo")) .end() .build(); - conf.set("sarg.pushdown", sarg.toKryo()); + conf.set("sarg.pushdown", toKryo(sarg)); conf.set("hive.io.file.readcolumn.names", "z"); properties.setProperty("columns", "z"); properties.setProperty("columns.types", "string"); diff --git ql/src/test/org/apache/hadoop/hive/ql/io/sarg/TestSearchArgumentImpl.java ql/src/test/org/apache/hadoop/hive/ql/io/sarg/TestSearchArgumentImpl.java index 433474b..71d2adf 100644 --- ql/src/test/org/apache/hadoop/hive/ql/io/sarg/TestSearchArgumentImpl.java +++ ql/src/test/org/apache/hadoop/hive/ql/io/sarg/TestSearchArgumentImpl.java @@ -25,6 +25,7 @@ import org.apache.hadoop.hive.common.type.HiveChar; import org.apache.hadoop.hive.common.type.HiveVarchar; +import org.apache.hadoop.hive.ql.io.orc.TestInputOutputFormat; import org.apache.hadoop.hive.ql.io.sarg.SearchArgument.TruthValue; import org.apache.hadoop.hive.ql.io.sarg.SearchArgumentImpl.PredicateLeafImpl; import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc; @@ -464,7 +465,7 @@ public void testTimestampSerialization() throws Exception { .end() .build(); - String serializedSarg = sarg.toKryo(); + String serializedSarg = TestInputOutputFormat.toKryo(sarg); SearchArgument sarg2 = ConvertAstToSearchArg.create(serializedSarg); Field literalField = PredicateLeafImpl.class.getDeclaredField("literal"); diff --git serde/src/java/org/apache/hadoop/hive/ql/io/sarg/ExpressionTree.java serde/src/java/org/apache/hadoop/hive/ql/io/sarg/ExpressionTree.java deleted file mode 100644 index 2dd3a45..0000000 --- serde/src/java/org/apache/hadoop/hive/ql/io/sarg/ExpressionTree.java +++ /dev/null @@ -1,157 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.hive.ql.io.sarg; - -import java.util.ArrayList; -import java.util.Collections; -import java.util.List; - -/** - * The inner representation of the SearchArgument. Most users should not - * need this interface, it is only for file formats that need to translate - * the SearchArgument into an internal form. - */ -public class ExpressionTree { - public enum Operator {OR, AND, NOT, LEAF, CONSTANT} - private final Operator operator; - private final List children; - private final int leaf; - private final SearchArgument.TruthValue constant; - - ExpressionTree() { - operator = null; - children = null; - leaf = 0; - constant = null; - } - - ExpressionTree(Operator op, ExpressionTree... 
kids) { - operator = op; - children = new ArrayList(); - leaf = -1; - this.constant = null; - Collections.addAll(children, kids); - } - - ExpressionTree(int leaf) { - operator = Operator.LEAF; - children = null; - this.leaf = leaf; - this.constant = null; - } - - ExpressionTree(SearchArgument.TruthValue constant) { - operator = Operator.CONSTANT; - children = null; - this.leaf = -1; - this.constant = constant; - } - - ExpressionTree(ExpressionTree other) { - this.operator = other.operator; - if (other.children == null) { - this.children = null; - } else { - this.children = new ArrayList(); - for(ExpressionTree child: other.children) { - children.add(new ExpressionTree(child)); - } - } - this.leaf = other.leaf; - this.constant = other.constant; - } - - public SearchArgument.TruthValue evaluate(SearchArgument.TruthValue[] leaves - ) { - SearchArgument.TruthValue result = null; - switch (operator) { - case OR: - for(ExpressionTree child: children) { - result = child.evaluate(leaves).or(result); - } - return result; - case AND: - for(ExpressionTree child: children) { - result = child.evaluate(leaves).and(result); - } - return result; - case NOT: - return children.get(0).evaluate(leaves).not(); - case LEAF: - return leaves[leaf]; - case CONSTANT: - return constant; - default: - throw new IllegalStateException("Unknown operator: " + operator); - } - } - - @Override - public String toString() { - StringBuilder buffer = new StringBuilder(); - switch (operator) { - case OR: - buffer.append("(or"); - for(ExpressionTree child: children) { - buffer.append(' '); - buffer.append(child.toString()); - } - buffer.append(')'); - break; - case AND: - buffer.append("(and"); - for(ExpressionTree child: children) { - buffer.append(' '); - buffer.append(child.toString()); - } - buffer.append(')'); - break; - case NOT: - buffer.append("(not "); - buffer.append(children.get(0)); - buffer.append(')'); - break; - case LEAF: - buffer.append("leaf-"); - buffer.append(leaf); - break; - case CONSTANT: - buffer.append(constant); - break; - } - return buffer.toString(); - } - - public Operator getOperator() { - return operator; - } - - public List getChildren() { - return children; - } - - public SearchArgument.TruthValue getConstant() { - return constant; - } - - public int getLeaf() { - return leaf; - } -} - diff --git serde/src/java/org/apache/hadoop/hive/ql/io/sarg/PredicateLeaf.java serde/src/java/org/apache/hadoop/hive/ql/io/sarg/PredicateLeaf.java deleted file mode 100644 index 3a92565..0000000 --- serde/src/java/org/apache/hadoop/hive/ql/io/sarg/PredicateLeaf.java +++ /dev/null @@ -1,104 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
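The evaluate method of ExpressionTree above folds the tree bottom-up over an array of per-leaf truth values. A small sketch (again assuming package-local access):

    ExpressionTree expr =
        new ExpressionTree(ExpressionTree.Operator.AND,
            new ExpressionTree(0), new ExpressionTree(1));
    SearchArgument.TruthValue[] leaves =
        { SearchArgument.TruthValue.YES, SearchArgument.TruthValue.YES_NO };
    // YES.and(YES_NO) == YES_NO, so the AND as a whole evaluates to YES_NO.
    assert expr.evaluate(leaves) == SearchArgument.TruthValue.YES_NO;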
- */ - -package org.apache.hadoop.hive.ql.io.sarg; - -import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable; - -import java.sql.Date; -import java.sql.Timestamp; -import java.util.List; - -/** - * The primitive predicates that form a SearchArgument. - */ -public interface PredicateLeaf { - - /** - * The possible operators for predicates. To get the opposites, construct - * an expression with a not operator. - */ - public static enum Operator { - EQUALS, - NULL_SAFE_EQUALS, - LESS_THAN, - LESS_THAN_EQUALS, - IN, - BETWEEN, - IS_NULL - } - - /** - * The possible types for sargs. - */ - public static enum Type { - INTEGER(Integer.class), // all of the integer types except long - LONG(Long.class), - FLOAT(Double.class), // float and double - STRING(String.class), // string, char, varchar - DATE(Date.class), - DECIMAL(HiveDecimalWritable.class), - TIMESTAMP(Timestamp.class), - BOOLEAN(Boolean.class); - - private final Class cls; - Type(Class cls) { - this.cls = cls; - } - - /** - * For all SARG leaves, the values must be the matching class. - * @return the value class - */ - public Class getValueClass() { - return cls; - } - } - - /** - * Get the operator for the leaf. - */ - public Operator getOperator(); - - /** - * Get the type of the column and literal by the file format. - */ - public Type getType(); - - /** - * Get the simple column name. - * @return the column name - */ - public String getColumnName(); - - /** - * Get the literal half of the predicate leaf. Adapt the original type for what orc needs - * - * @return an Integer, Long, Double, or String - */ - public Object getLiteral(); - - /** - * For operators with multiple literals (IN and BETWEEN), get the literals. - * - * @return the list of literals (Integer, Longs, Doubles, or Strings) - * - */ - public List getLiteralList(); - -} diff --git serde/src/java/org/apache/hadoop/hive/ql/io/sarg/SearchArgument.java serde/src/java/org/apache/hadoop/hive/ql/io/sarg/SearchArgument.java deleted file mode 100644 index bc0d503..0000000 --- serde/src/java/org/apache/hadoop/hive/ql/io/sarg/SearchArgument.java +++ /dev/null @@ -1,298 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.hive.ql.io.sarg; - -import java.util.List; - -/** - * Primary interface for - * SearchArgument, which is the subset of predicates - * that can be pushed down to the RecordReader. Each SearchArgument consists - * of a series of SearchClauses that must each be true for the row to be - * accepted by the filter. - * - * This requires that the filter be normalized into conjunctive normal form - * (CNF). - */ -public interface SearchArgument { - - /** - * The potential result sets of logical operations. 
- */ - public static enum TruthValue { - YES, NO, NULL, YES_NULL, NO_NULL, YES_NO, YES_NO_NULL; - - /** - * Compute logical or between the two values. - * @param right the other argument or null - * @return the result - */ - public TruthValue or(TruthValue right) { - if (right == null || right == this) { - return this; - } - if (right == YES || this == YES) { - return YES; - } - if (right == YES_NULL || this == YES_NULL) { - return YES_NULL; - } - if (right == NO) { - return this; - } - if (this == NO) { - return right; - } - if (this == NULL) { - if (right == NO_NULL) { - return NULL; - } else { - return YES_NULL; - } - } - if (right == NULL) { - if (this == NO_NULL) { - return NULL; - } else { - return YES_NULL; - } - } - return YES_NO_NULL; - } - - /** - * Compute logical AND between the two values. - * @param right the other argument or null - * @return the result - */ - public TruthValue and(TruthValue right) { - if (right == null || right == this) { - return this; - } - if (right == NO || this == NO) { - return NO; - } - if (right == NO_NULL || this == NO_NULL) { - return NO_NULL; - } - if (right == YES) { - return this; - } - if (this == YES) { - return right; - } - if (this == NULL) { - if (right == YES_NULL) { - return NULL; - } else { - return NO_NULL; - } - } - if (right == NULL) { - if (this == YES_NULL) { - return NULL; - } else { - return NO_NULL; - } - } - return YES_NO_NULL; - } - - public TruthValue not() { - switch (this) { - case NO: - return YES; - case YES: - return NO; - case NULL: - case YES_NO: - case YES_NO_NULL: - return this; - case NO_NULL: - return YES_NULL; - case YES_NULL: - return NO_NULL; - default: - throw new IllegalArgumentException("Unknown value: " + this); - } - } - - /** - * Does the RecordReader need to include this set of records? - * @return true unless none of the rows qualify - */ - public boolean isNeeded() { - switch (this) { - case NO: - case NULL: - case NO_NULL: - return false; - default: - return true; - } - } - } - - /** - * Get the leaf predicates that are required to evaluate the predicate. The - * list will have the duplicates removed. - * @return the list of leaf predicates - */ - public List getLeaves(); - - /** - * Get the expression tree. This should only be needed for file formats that - * need to translate the expression to an internal form. - */ - public ExpressionTree getExpression(); - - /** - * Evaluate the entire predicate based on the values for the leaf predicates. - * @param leaves the value of each leaf predicate - * @return the value of the entire predicate - */ - public TruthValue evaluate(TruthValue[] leaves); - - /** - * Serialize the SARG as a kryo object and return the base64 string. - * - * Hive should replace the current XML-based AST serialization for predicate pushdown - * with the Kryo serialization of the SARG because the representation is much more - * compact and focused on what is needed for predicate pushdown. - * - * @return the serialized SARG - */ - public String toKryo(); - - /** - * A builder object for contexts outside of Hive where it isn't easy to - * get an ExprNodeDesc. The user must call startOr, startAnd, or startNot - * before adding any leaves. - */ - public interface Builder { - - /** - * Start building an or operation and push it on the stack. - * @return this - */ - public Builder startOr(); - - /** - * Start building an and operation and push it on the stack. - * @return this - */ - public Builder startAnd(); - - /** - * Start building a not operation and push it on the stack. 
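To make the three-valued logic above concrete, a few sample evaluations (illustrative only):

    TruthValue.YES_NULL.and(TruthValue.NO);  // NO: a definite NO dominates an AND
    TruthValue.YES_NULL.or(TruthValue.NO);   // YES_NULL: NO is the identity for OR
    TruthValue.NULL.not();                   // NULL: the negation of "unknown" is unknown
    TruthValue.YES_NO_NULL.isNeeded();       // true: some rows may qualify, so keep reading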
- * @return this - */ - public Builder startNot(); - - /** - * Finish the current operation and pop it off of the stack. Each start - * call must have a matching end. - * @return this - */ - public Builder end(); - - /** - * Add a less than leaf to the current item on the stack. - * @param column the name of the column - * @param type the type of the expression - * @param literal the literal - * @return this - */ - public Builder lessThan(String column, PredicateLeaf.Type type, - Object literal); - - /** - * Add a less than equals leaf to the current item on the stack. - * @param column the name of the column - * @param type the type of the expression - * @param literal the literal - * @return this - */ - public Builder lessThanEquals(String column, PredicateLeaf.Type type, - Object literal); - - /** - * Add an equals leaf to the current item on the stack. - * @param column the name of the column - * @param type the type of the expression - * @param literal the literal - * @return this - */ - public Builder equals(String column, PredicateLeaf.Type type, - Object literal); - - /** - * Add a null safe equals leaf to the current item on the stack. - * @param column the name of the column - * @param type the type of the expression - * @param literal the literal - * @return this - */ - public Builder nullSafeEquals(String column, PredicateLeaf.Type type, - Object literal); - - /** - * Add an in leaf to the current item on the stack. - * @param column the name of the column - * @param type the type of the expression - * @param literal the literal - * @return this - */ - public Builder in(String column, PredicateLeaf.Type type, - Object... literal); - - /** - * Add an is null leaf to the current item on the stack. - * @param column the name of the column - * @param type the type of the expression - * @return this - */ - public Builder isNull(String column, PredicateLeaf.Type type); - - /** - * Add a between leaf to the current item on the stack. - * @param column the name of the column - * @param type the type of the expression - * @param lower the literal - * @param upper the literal - * @return this - */ - public Builder between(String column, PredicateLeaf.Type type, - Object lower, Object upper); - - /** - * Add a truth value to the expression. - * @param truth - * @return this - */ - public Builder literal(TruthValue truth); - - /** - * Build and return the SearchArgument that has been defined. All of the - * starts must have been ended before this call. - * @return the new SearchArgument - */ - public SearchArgument build(); - } -} diff --git serde/src/java/org/apache/hadoop/hive/serde2/io/HiveDecimalWritable.java serde/src/java/org/apache/hadoop/hive/serde2/io/HiveDecimalWritable.java deleted file mode 100644 index 0578d24..0000000 --- serde/src/java/org/apache/hadoop/hive/serde2/io/HiveDecimalWritable.java +++ /dev/null @@ -1,174 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
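Tying the Builder contract together: a caller outside Hive assembles a predicate fluently and lets build() verify the start/end pairing and normalize to CNF. A sketch using SearchArgumentFactory (which the test changes earlier in this patch also use; the column names and literals here are invented):

    SearchArgument sarg = SearchArgumentFactory.newBuilder()
        .startAnd()
          .startNot()
            .lessThan("id", PredicateLeaf.Type.LONG, 10L)
          .end()
          .in("state", PredicateLeaf.Type.STRING, "CA", "OR")
        .end()
        .build();
    // The normalized expression prints as "(and (not leaf-0) leaf-1)".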
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.hadoop.hive.serde2.io; - -import java.io.DataInput; -import java.io.DataOutput; -import java.io.IOException; -import java.math.BigInteger; - -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; -import org.apache.hadoop.hive.common.type.HiveDecimal; - -import org.apache.hadoop.io.WritableComparable; -import org.apache.hadoop.io.WritableUtils; - -public class HiveDecimalWritable implements WritableComparable { - - static final private Log LOG = LogFactory.getLog(HiveDecimalWritable.class); - - private byte[] internalStorage = new byte[0]; - private int scale; - - public HiveDecimalWritable() { - } - - public HiveDecimalWritable(String value) { - set(HiveDecimal.create(value)); - } - - public HiveDecimalWritable(byte[] bytes, int scale) { - set(bytes, scale); - } - - public HiveDecimalWritable(HiveDecimalWritable writable) { - set(writable.getHiveDecimal()); - } - - public HiveDecimalWritable(HiveDecimal value) { - set(value); - } - - public HiveDecimalWritable(long value) { - set((HiveDecimal.create(value))); - } - - public void set(HiveDecimal value) { - set(value.unscaledValue().toByteArray(), value.scale()); - } - - public void set(HiveDecimal value, int maxPrecision, int maxScale) { - set(HiveDecimal.enforcePrecisionScale(value, maxPrecision, maxScale)); - } - - public void set(HiveDecimalWritable writable) { - set(writable.getHiveDecimal()); - } - - public void set(byte[] bytes, int scale) { - this.internalStorage = bytes; - this.scale = scale; - } - - public HiveDecimal getHiveDecimal() { - return HiveDecimal.create(new BigInteger(internalStorage), scale); - } - - /** - * Get a HiveDecimal instance from the writable and constrain it to a maximum precision/scale. - * - * @param maxPrecision maximum precision - * @param maxScale maximum scale - * @return HiveDecimal instance - */ - public HiveDecimal getHiveDecimal(int maxPrecision, int maxScale) { - return HiveDecimal.enforcePrecisionScale(HiveDecimal. - create(new BigInteger(internalStorage), scale), - maxPrecision, maxScale); - } - - @Override - public void readFields(DataInput in) throws IOException { - scale = WritableUtils.readVInt(in); - int byteArrayLen = WritableUtils.readVInt(in); - if (internalStorage.length != byteArrayLen) { - internalStorage = new byte[byteArrayLen]; - } - in.readFully(internalStorage); - } - - @Override - public void write(DataOutput out) throws IOException { - WritableUtils.writeVInt(out, scale); - WritableUtils.writeVInt(out, internalStorage.length); - out.write(internalStorage); - } - - @Override - public int compareTo(HiveDecimalWritable that) { - return getHiveDecimal().compareTo(that.getHiveDecimal()); - } - - @Override - public String toString() { - return getHiveDecimal().toString(); - } - - @Override - public boolean equals(Object other) { - if (this == other) { - return true; - } - if (other == null || getClass() != other.getClass()) { - return false; - } - HiveDecimalWritable bdw = (HiveDecimalWritable) other; - - // 'equals' and 'compareTo' are not compatible with HiveDecimals. 
We want - // compareTo which returns true iff the numbers are equal (e.g.: 3.14 is - // the same as 3.140). 'Equals' returns true iff equal and the same scale - // is set in the decimals (e.g.: 3.14 is not the same as 3.140) - return getHiveDecimal().compareTo(bdw.getHiveDecimal()) == 0; - } - - @Override - public int hashCode() { - return getHiveDecimal().hashCode(); - } - - /* (non-Javadoc) - * In order to update a Decimal128 fast (w/o allocation) we need to expose access to the - * internal storage bytes and scale. - * @return - */ - public byte[] getInternalStorage() { - return internalStorage; - } - - /* (non-Javadoc) - * In order to update a Decimal128 fast (w/o allocation) we need to expose access to the - * internal storage bytes and scale. - */ - public int getScale() { - return scale; - } - - public static - HiveDecimalWritable enforcePrecisionScale(HiveDecimalWritable writable, - int precision, int scale) { - if (writable == null) { - return null; - } - - HiveDecimal dec = - HiveDecimal.enforcePrecisionScale(writable.getHiveDecimal(), precision, - scale); - return dec == null ? null : new HiveDecimalWritable(dec); - } -} diff --git storage-api/pom.xml storage-api/pom.xml new file mode 100644 index 0000000..71b51b8 --- /dev/null +++ storage-api/pom.xml @@ -0,0 +1,85 @@ + + + + 4.0.0 + + org.apache.hive + hive + 2.0.0-SNAPSHOT + ../pom.xml + + + hive-storage-api + jar + Hive Storage API + + + .. + + + + + + + log4j + log4j + ${log4j.version} + + + + junit + junit + ${junit.version} + test + + + + + + hadoop-1 + + + org.apache.hadoop + hadoop-core + ${hadoop-20S.version} + true + + + + + hadoop-2 + + + org.apache.hadoop + hadoop-common + ${hadoop-23.version} + true + + + + + + + ${basedir}/src/java + ${basedir}/src/test + + + ${basedir}/src/test/resources + + + + diff --git storage-api/src/java/org/apache/hadoop/hive/common/type/HiveDecimal.java storage-api/src/java/org/apache/hadoop/hive/common/type/HiveDecimal.java new file mode 100644 index 0000000..7d7fb28 --- /dev/null +++ storage-api/src/java/org/apache/hadoop/hive/common/type/HiveDecimal.java @@ -0,0 +1,312 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.common.type; + +import java.math.BigDecimal; +import java.math.BigInteger; +import java.math.RoundingMode; + +/** + * + * HiveDecimal. Simple wrapper for BigDecimal. Adds fixed max precision and non scientific string + * representation + * + */ +public class HiveDecimal implements Comparable { + public static final int MAX_PRECISION = 38; + public static final int MAX_SCALE = 38; + + /** + * Default precision/scale when user doesn't specify in the column metadata, such as + * decimal and decimal(8). 
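The equals/compareTo caveat spelled out in HiveDecimalWritable above comes straight from BigDecimal semantics:

    boolean eq = new BigDecimal("3.14").equals(new BigDecimal("3.140"));    // false: scales differ
    int cmp    = new BigDecimal("3.14").compareTo(new BigDecimal("3.140")); // 0: numerically equal

which is why HiveDecimalWritable.equals is implemented as compareTo(...) == 0 rather than delegating to BigDecimal.equals.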
+ */ + public static final int USER_DEFAULT_PRECISION = 10; + public static final int USER_DEFAULT_SCALE = 0; + + /** + * Default precision/scale when system is not able to determine them, such as in case + * of a non-generic udf. + */ + public static final int SYSTEM_DEFAULT_PRECISION = 38; + public static final int SYSTEM_DEFAULT_SCALE = 18; + + public static final HiveDecimal ZERO = new HiveDecimal(BigDecimal.ZERO); + public static final HiveDecimal ONE = new HiveDecimal(BigDecimal.ONE); + + public static final int ROUND_FLOOR = BigDecimal.ROUND_FLOOR; + public static final int ROUND_CEILING = BigDecimal.ROUND_CEILING; + public static final int ROUND_HALF_UP = BigDecimal.ROUND_HALF_UP; + + private BigDecimal bd = BigDecimal.ZERO; + + private HiveDecimal(BigDecimal bd) { + this.bd = bd; + } + + public static HiveDecimal create(BigDecimal b) { + return create(b, true); + } + + public static HiveDecimal create(BigDecimal b, boolean allowRounding) { + BigDecimal bd = normalize(b, allowRounding); + return bd == null ? null : new HiveDecimal(bd); + } + + public static HiveDecimal create(BigInteger unscaled, int scale) { + BigDecimal bd = normalize(new BigDecimal(unscaled, scale), true); + return bd == null ? null : new HiveDecimal(bd); + } + + public static HiveDecimal create(String dec) { + BigDecimal bd; + try { + bd = new BigDecimal(dec.trim()); + } catch (NumberFormatException ex) { + return null; + } + + bd = normalize(bd, true); + return bd == null ? null : new HiveDecimal(bd); + } + + public static HiveDecimal create(BigInteger bi) { + BigDecimal bd = normalize(new BigDecimal(bi), true); + return bd == null ? null : new HiveDecimal(bd); + } + + public static HiveDecimal create(int i) { + return new HiveDecimal(new BigDecimal(i)); + } + + public static HiveDecimal create(long l) { + return new HiveDecimal(new BigDecimal(l)); + } + + @Override + public String toString() { + return bd.toPlainString(); + } + + public HiveDecimal setScale(int i) { + return new HiveDecimal(bd.setScale(i, RoundingMode.HALF_UP)); + } + + @Override + public int compareTo(HiveDecimal dec) { + return bd.compareTo(dec.bd); + } + + @Override + public int hashCode() { + return bd.hashCode(); + } + + @Override + public boolean equals(Object obj) { + if (obj == null || obj.getClass() != getClass()) { + return false; + } + return bd.equals(((HiveDecimal) obj).bd); + } + + public int scale() { + return bd.scale(); + } + + /** + * Returns the number of digits (integer and fractional) in the number, which is equivalent + * to SQL decimal precision. Note that this is different from BigDecimal.precision(), + * which returns the precision of the unscaled value (BigDecimal.valueOf(0.01).precision() = 1, + * whereas HiveDecimal.create("0.01").precision() = 2). + * If you want the BigDecimal precision, use HiveDecimal.bigDecimalValue().precision() + * @return + */ + public int precision() { + int bdPrecision = bd.precision(); + int bdScale = bd.scale(); + + if (bdPrecision < bdScale) { + // This can happen for numbers less than 0.1 + // For 0.001234: bdPrecision=4, bdScale=6 + // In this case, we'll set the type to have the same precision as the scale. 
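+ // e.g. HiveDecimal.create("0.001234"): precision() == 6 and scale() == 6, + // while bigDecimalValue().precision() == 4 (just the unscaled digits 1234).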
+ return bdScale; + } + return bdPrecision; + } + + public int intValue() { + return bd.intValue(); + } + + public double doubleValue() { + return bd.doubleValue(); + } + + public long longValue() { + return bd.longValue(); + } + + public short shortValue() { + return bd.shortValue(); + } + + public float floatValue() { + return bd.floatValue(); + } + + public BigDecimal bigDecimalValue() { + return bd; + } + + public byte byteValue() { + return bd.byteValue(); + } + + public HiveDecimal setScale(int adjustedScale, int rm) { + return create(bd.setScale(adjustedScale, rm)); + } + + public HiveDecimal subtract(HiveDecimal dec) { + return create(bd.subtract(dec.bd)); + } + + public HiveDecimal multiply(HiveDecimal dec) { + return create(bd.multiply(dec.bd), false); + } + + public BigInteger unscaledValue() { + return bd.unscaledValue(); + } + + public HiveDecimal scaleByPowerOfTen(int n) { + return create(bd.scaleByPowerOfTen(n)); + } + + public HiveDecimal abs() { + return create(bd.abs()); + } + + public HiveDecimal negate() { + return create(bd.negate()); + } + + public HiveDecimal add(HiveDecimal dec) { + return create(bd.add(dec.bd)); + } + + public HiveDecimal pow(int n) { + BigDecimal result = normalize(bd.pow(n), false); + return result == null ? null : new HiveDecimal(result); + } + + public HiveDecimal remainder(HiveDecimal dec) { + return create(bd.remainder(dec.bd)); + } + + public HiveDecimal divide(HiveDecimal dec) { + return create(bd.divide(dec.bd, MAX_SCALE, RoundingMode.HALF_UP), true); + } + + /** + * Get the sign of the underlying decimal. + * @return 0 if the decimal is equal to 0, -1 if less than zero, and 1 if greater than 0 + */ + public int signum() { + return bd.signum(); + } + + private static BigDecimal trim(BigDecimal d) { + if (d.compareTo(BigDecimal.ZERO) == 0) { + // Special case for 0, because java doesn't strip zeros correctly on that number. + d = BigDecimal.ZERO; + } else { + d = d.stripTrailingZeros(); + if (d.scale() < 0) { + // no negative scale decimals + d = d.setScale(0); + } + } + return d; + } + + private static BigDecimal normalize(BigDecimal bd, boolean allowRounding) { + if (bd == null) { + return null; + } + + bd = trim(bd); + + int intDigits = bd.precision() - bd.scale(); + + if (intDigits > MAX_PRECISION) { + return null; + } + + int maxScale = Math.min(MAX_SCALE, Math.min(MAX_PRECISION - intDigits, bd.scale())); + if (bd.scale() > maxScale ) { + if (allowRounding) { + bd = bd.setScale(maxScale, RoundingMode.HALF_UP); + // Trimming is again necessary, because rounding may introduce new trailing 0's. + bd = trim(bd); + } else { + bd = null; + } + } + + return bd; + } + + public static BigDecimal enforcePrecisionScale(BigDecimal bd, int maxPrecision, int maxScale) { + if (bd == null) { + return null; + } + + bd = trim(bd); + + if (bd.scale() > maxScale) { + bd = bd.setScale(maxScale, RoundingMode.HALF_UP); + } + + int maxIntDigits = maxPrecision - maxScale; + int intDigits = bd.precision() - bd.scale(); + if (intDigits > maxIntDigits) { + return null; + } + + return bd; + } + + public static HiveDecimal enforcePrecisionScale(HiveDecimal dec, int maxPrecision, int maxScale) { + if (dec == null) { + return null; + } + + // Minor optimization, avoiding creating new objects. 
+ if (dec.precision() - dec.scale() <= maxPrecision - maxScale && + dec.scale() <= maxScale) { + return dec; + } + + BigDecimal bd = enforcePrecisionScale(dec.bd, maxPrecision, maxScale); + if (bd == null) { + return null; + } + + return HiveDecimal.create(bd); + } +} diff --git storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/BytesColumnVector.java storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/BytesColumnVector.java new file mode 100644 index 0000000..02c52fa --- /dev/null +++ storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/BytesColumnVector.java @@ -0,0 +1,322 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.vector; + +/** + * This class supports string and binary data by value reference -- i.e. each field is + * explicitly present, as opposed to provided by a dictionary reference. + * In some cases, all the values will be in the same byte array to begin with, + * but this need not be the case. If each value is in a separate byte + * array to start with, or not all of the values are in the same original + * byte array, you can still assign data by reference into this column vector. + * This gives flexibility to use this in multiple situations. + *

+ * When setting data by reference, the caller + * is responsible for allocating the byte arrays used to hold the data. + * You can also set data by value, as long as you call the initBuffer() method first. + * You can mix "by value" and "by reference" in the same column vector, + * though that use is probably not typical. + */ +public class BytesColumnVector extends ColumnVector { + public byte[][] vector; + public int[] start; // start offset of each field + + /* + * The length of each field. If the value repeats for every entry, then it is stored + * in vector[0] and isRepeating from the superclass is set to true. + */ + public int[] length; + private byte[] buffer; // optional buffer to use when actually copying in data + private int nextFree; // next free position in buffer + + // Estimate that there will be 16 bytes per entry + static final int DEFAULT_BUFFER_SIZE = 16 * VectorizedRowBatch.DEFAULT_SIZE; + + // Proportion of extra space to provide when allocating more buffer space. + static final float EXTRA_SPACE_FACTOR = (float) 1.2; + + /** + * Use this constructor for normal operation. + * All column vectors should be the default size normally. + */ + public BytesColumnVector() { + this(VectorizedRowBatch.DEFAULT_SIZE); + } + + /** + * Don't call this constructor except for testing purposes. + * + * @param size number of elements in the column vector + */ + public BytesColumnVector(int size) { + super(size); + vector = new byte[size][]; + start = new int[size]; + length = new int[size]; + } + + /** + * Additional reset work for BytesColumnVector (releasing scratch bytes for by value strings). + */ + @Override + public void reset() { + super.reset(); + initBuffer(0); + } + + /** Set a field by reference. + * + * @param elementNum index within column vector to set + * @param sourceBuf container of source data + * @param start start byte position within source + * @param length length of source byte sequence + */ + public void setRef(int elementNum, byte[] sourceBuf, int start, int length) { + vector[elementNum] = sourceBuf; + this.start[elementNum] = start; + this.length[elementNum] = length; + } + + /** + * You must call initBuffer first before using setVal(). + * Provide the estimated number of bytes needed to hold + * a full column vector worth of byte string data. + * + * @param estimatedValueSize Estimated size of buffer space needed + */ + public void initBuffer(int estimatedValueSize) { + nextFree = 0; + + // if buffer is already allocated, keep using it, don't re-allocate + if (buffer != null) { + return; + } + + // allocate a little extra space to limit need to re-allocate + int bufferSize = this.vector.length * (int)(estimatedValueSize * EXTRA_SPACE_FACTOR); + if (bufferSize < DEFAULT_BUFFER_SIZE) { + bufferSize = DEFAULT_BUFFER_SIZE; + } + buffer = new byte[bufferSize]; + } + + /** + * Initialize buffer to default size. + */ + public void initBuffer() { + initBuffer(0); + } + + /** + * @return amount of buffer space currently allocated + */ + public int bufferSize() { + if (buffer == null) { + return 0; + } + return buffer.length; + } + + /** + * Set a field by actually copying in to a local buffer. + * If you must actually copy data in to the array, use this method. + * DO NOT USE this method unless it's not practical to set data by reference with setRef(). + * Setting data by reference tends to run a lot faster than copying data in. 
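In other words, there are two write paths; a hypothetical usage sketch:

    BytesColumnVector col = new BytesColumnVector();
    byte[] external = "hello".getBytes();
    // By reference: no copy; the caller keeps ownership of the backing array.
    col.setRef(0, external, 0, external.length);
    // By value: initBuffer() must have been called so the shared scratch buffer exists.
    col.initBuffer();
    byte[] scratch = "world".getBytes();
    col.setVal(1, scratch, 0, scratch.length);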
+ * + * @param elementNum index within column vector to set + * @param sourceBuf container of source data + * @param start start byte position within source + * @param length length of source byte sequence + */ + public void setVal(int elementNum, byte[] sourceBuf, int start, int length) { + if ((nextFree + length) > buffer.length) { + increaseBufferSpace(length); + } + System.arraycopy(sourceBuf, start, buffer, nextFree, length); + vector[elementNum] = buffer; + this.start[elementNum] = nextFree; + this.length[elementNum] = length; + nextFree += length; + } + + /** + * Set a field to the concatenation of two string values. Result data is copied + * into the internal buffer. + * + * @param elementNum index within column vector to set + * @param leftSourceBuf container of left argument + * @param leftStart start of left argument + * @param leftLen length of left argument + * @param rightSourceBuf container of right argument + * @param rightStart start of right argument + * @param rightLen length of right argument + */ + public void setConcat(int elementNum, byte[] leftSourceBuf, int leftStart, int leftLen, + byte[] rightSourceBuf, int rightStart, int rightLen) { + int newLen = leftLen + rightLen; + if ((nextFree + newLen) > buffer.length) { + increaseBufferSpace(newLen); + } + vector[elementNum] = buffer; + this.start[elementNum] = nextFree; + this.length[elementNum] = newLen; + + System.arraycopy(leftSourceBuf, leftStart, buffer, nextFree, leftLen); + nextFree += leftLen; + System.arraycopy(rightSourceBuf, rightStart, buffer, nextFree, rightLen); + nextFree += rightLen; + } + + /** + * Increase buffer space enough to accommodate next element. + * This uses an exponential increase mechanism to rapidly + * increase buffer size to enough to hold all data. + * As batches get re-loaded, buffer space allocated will quickly + * stabilize. + * + * @param nextElemLength size of next element to be added + */ + public void increaseBufferSpace(int nextElemLength) { + + // Keep doubling buffer size until there will be enough space for next element. + int newLength = 2 * buffer.length; + while((nextFree + nextElemLength) > newLength) { + newLength *= 2; + } + + // Allocate new buffer, copy data to it, and set buffer to new buffer. + byte[] newBuffer = new byte[newLength]; + System.arraycopy(buffer, 0, newBuffer, 0, nextFree); + buffer = newBuffer; + } + + /** Copy the current object contents into the output. Only copy selected entries, + * as indicated by selectedInUse and the sel array. + */ + public void copySelected( + boolean selectedInUse, int[] sel, int size, BytesColumnVector output) { + + // Output has nulls if and only if input has nulls. 
+ output.noNulls = noNulls; + output.isRepeating = false; + + // Handle repeating case + if (isRepeating) { + output.setVal(0, vector[0], start[0], length[0]); + output.isNull[0] = isNull[0]; + output.isRepeating = true; + return; + } + + // Handle normal case + + // Copy data values over + if (selectedInUse) { + for (int j = 0; j < size; j++) { + int i = sel[j]; + output.setVal(i, vector[i], start[i], length[i]); + } + } + else { + for (int i = 0; i < size; i++) { + output.setVal(i, vector[i], start[i], length[i]); + } + } + + // Copy nulls over if needed + if (!noNulls) { + if (selectedInUse) { + for (int j = 0; j < size; j++) { + int i = sel[j]; + output.isNull[i] = isNull[i]; + } + } + else { + System.arraycopy(isNull, 0, output.isNull, 0, size); + } + } + } + + /** Simplify vector by brute-force flattening noNulls and isRepeating + * This can be used to reduce combinatorial explosion of code paths in VectorExpressions + * with many arguments, at the expense of loss of some performance. + */ + public void flatten(boolean selectedInUse, int[] sel, int size) { + flattenPush(); + if (isRepeating) { + isRepeating = false; + + // setRef is used below and this is safe, because the reference + // is to data owned by this column vector. If this column vector + // gets re-used, the whole thing is re-used together so there + // is no danger of a dangling reference. + + // Only copy data values if entry is not null. The string value + // at position 0 is undefined if the position 0 value is null. + if (noNulls || !isNull[0]) { + + // loops start at position 1 because position 0 is already set + if (selectedInUse) { + for (int j = 1; j < size; j++) { + int i = sel[j]; + this.setRef(i, vector[0], start[0], length[0]); + } + } else { + for (int i = 1; i < size; i++) { + this.setRef(i, vector[0], start[0], length[0]); + } + } + } + flattenRepeatingNulls(selectedInUse, sel, size); + } + flattenNoNulls(selectedInUse, sel, size); + } + + // Fill all the vector entries with the provided value + public void fill(byte[] value) { + noNulls = true; + isRepeating = true; + setRef(0, value, 0, value.length); + } + + @Override + public void setElement(int outElementNum, int inputElementNum, ColumnVector inputVector) { + BytesColumnVector in = (BytesColumnVector) inputVector; + setVal(outElementNum, in.vector[inputElementNum], in.start[inputElementNum], in.length[inputElementNum]); + } + + @Override + public void init() { + initBuffer(0); + } + + @Override + public void stringifyValue(StringBuilder buffer, int row) { + if (isRepeating) { + row = 0; + } + if (noNulls || !isNull[row]) { + buffer.append('"'); + buffer.append(new String(this.buffer, start[row], length[row])); + buffer.append('"'); + } else { + buffer.append("null"); + } + } +} diff --git storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/ColumnVector.java storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/ColumnVector.java new file mode 100644 index 0000000..4b5cf39 --- /dev/null +++ storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/ColumnVector.java @@ -0,0 +1,173 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.vector; + +import java.util.Arrays; + +/** + * ColumnVector contains the shared structure for the sub-types, + * including NULL information, and whether this vector + * repeats, i.e. has all values the same, so only the first + * one is set. This is used to accelerate query performance + * by handling a whole vector in O(1) time when applicable. + * + * The fields are public by design since this is a performance-critical + * structure that is used in the inner loop of query execution. + */ +public abstract class ColumnVector { + + /* + * The current kinds of column vectors. + */ + public static enum Type { + LONG, + DOUBLE, + BYTES, + DECIMAL + } + + /* + * If hasNulls is true, then this array contains true if the value + * is null, otherwise false. The array is always allocated, so a batch can be re-used + * later and nulls added. + */ + public boolean[] isNull; + + // If the whole column vector has no nulls, this is true, otherwise false. + public boolean noNulls; + + /* + * True if same value repeats for whole column vector. + * If so, vector[0] holds the repeating value. + */ + public boolean isRepeating; + + // Variables to hold state from before flattening so it can be easily restored. + private boolean preFlattenIsRepeating; + private boolean preFlattenNoNulls; + + /** + * Constructor for super-class ColumnVector. This is not called directly, + * but used to initialize inherited fields. + * + * @param len Vector length + */ + public ColumnVector(int len) { + isNull = new boolean[len]; + noNulls = true; + isRepeating = false; + } + + /** + * Resets the column to default state + * - fills the isNull array with false + * - sets noNulls to true + * - sets isRepeating to false + */ + public void reset() { + if (false == noNulls) { + Arrays.fill(isNull, false); + } + noNulls = true; + isRepeating = false; + } + + abstract public void flatten(boolean selectedInUse, int[] sel, int size); + + // Simplify vector by brute-force flattening noNulls if isRepeating + // This can be used to reduce combinatorial explosion of code paths in VectorExpressions + // with many arguments. + public void flattenRepeatingNulls(boolean selectedInUse, int[] sel, int size) { + + boolean nullFillValue; + + if (noNulls) { + nullFillValue = false; + } else { + nullFillValue = isNull[0]; + } + + if (selectedInUse) { + for (int j = 0; j < size; j++) { + int i = sel[j]; + isNull[i] = nullFillValue; + } + } else { + Arrays.fill(isNull, 0, size, nullFillValue); + } + + // all nulls are now explicit + noNulls = false; + } + + public void flattenNoNulls(boolean selectedInUse, int[] sel, int size) { + if (noNulls) { + noNulls = false; + if (selectedInUse) { + for (int j = 0; j < size; j++) { + int i = sel[j]; + isNull[i] = false; + } + } else { + Arrays.fill(isNull, 0, size, false); + } + } + } + + /** + * Restore the state of isRepeating and noNulls to what it was + * before flattening. This must only be called just after flattening + * and then evaluating a VectorExpression on the column vector. 
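A typical lifecycle, sketched against a hypothetical batch (the field names follow VectorizedRowBatch):

    col.flatten(batch.selectedInUse, batch.selected, batch.size); // records state via flattenPush()
    // ... evaluate a VectorExpression that ignores the isRepeating/noNulls fast paths ...
    col.unFlatten(); // restore the recorded indicators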
+ * It is an optimization that allows other operations on the same + * column to continue to benefit from the isRepeating and noNulls + * indicators. + */ + public void unFlatten() { + isRepeating = preFlattenIsRepeating; + noNulls = preFlattenNoNulls; + } + + // Record repeating and no nulls state to be restored later. + protected void flattenPush() { + preFlattenIsRepeating = isRepeating; + preFlattenNoNulls = noNulls; + } + + /** + * Set the element in this column vector from the given input vector. + */ + public abstract void setElement(int outElementNum, int inputElementNum, ColumnVector inputVector); + + /** + * Initialize the column vector. This method can be overridden by specific column vector types. + * Use this method only if the individual type of the column vector is not known, otherwise it's + * preferable to call specific initialization methods. + */ + public void init() { + // Do nothing by default + } + + /** + * Print the value for this column into the given string builder. + * @param buffer the buffer to print into + * @param row the id of the row to print + */ + public abstract void stringifyValue(StringBuilder buffer, + int row); + } diff --git storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/DecimalColumnVector.java storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/DecimalColumnVector.java new file mode 100644 index 0000000..74a9d5f --- /dev/null +++ storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/DecimalColumnVector.java @@ -0,0 +1,106 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.vector; + +import java.math.BigInteger; + +import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable; +import org.apache.hadoop.hive.common.type.HiveDecimal; + +public class DecimalColumnVector extends ColumnVector { + + /** + * A vector of HiveDecimalWritable objects. + * + * For high performance and easy access to this low-level structure, + * the fields are public by design (as they are in other ColumnVector + * types). 
+ */ + public HiveDecimalWritable[] vector; + public short scale; + public short precision; + + public DecimalColumnVector(int precision, int scale) { + this(VectorizedRowBatch.DEFAULT_SIZE, precision, scale); + } + + public DecimalColumnVector(int size, int precision, int scale) { + super(size); + this.precision = (short) precision; + this.scale = (short) scale; + vector = new HiveDecimalWritable[size]; + for (int i = 0; i < size; i++) { + vector[i] = new HiveDecimalWritable(HiveDecimal.ZERO); + } + } + + @Override + public void flatten(boolean selectedInUse, int[] sel, int size) { + // TODO Auto-generated method stub + } + + @Override + public void setElement(int outElementNum, int inputElementNum, ColumnVector inputVector) { + HiveDecimal hiveDec = ((DecimalColumnVector) inputVector).vector[inputElementNum].getHiveDecimal(precision, scale); + if (hiveDec == null) { + noNulls = false; + isNull[outElementNum] = true; + } else { + vector[outElementNum].set(hiveDec); + } + } + + @Override + public void stringifyValue(StringBuilder buffer, int row) { + if (isRepeating) { + row = 0; + } + if (noNulls || !isNull[row]) { + buffer.append(vector[row].toString()); + } else { + buffer.append("null"); + } + } + + public void set(int elementNum, HiveDecimalWritable writeable) { + HiveDecimal hiveDec = writeable.getHiveDecimal(precision, scale); + if (hiveDec == null) { + noNulls = false; + isNull[elementNum] = true; + } else { + vector[elementNum].set(hiveDec); + } + } + + public void set(int elementNum, HiveDecimal hiveDec) { + HiveDecimal checkedDec = HiveDecimal.enforcePrecisionScale(hiveDec, precision, scale); + if (checkedDec == null) { + noNulls = false; + isNull[elementNum] = true; + } else { + vector[elementNum].set(checkedDec); + } + } + + public void setNullDataValue(int elementNum) { + // E.g. For scale 2 the minimum is "0.01" + HiveDecimal minimumNonZeroValue = HiveDecimal.create(BigInteger.ONE, scale); + vector[elementNum].set(minimumNonZeroValue); + } +} diff --git storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/DoubleColumnVector.java storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/DoubleColumnVector.java new file mode 100644 index 0000000..4a7811d --- /dev/null +++ storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/DoubleColumnVector.java @@ -0,0 +1,143 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.ql.exec.vector; + +import java.util.Arrays; + +/** + * This class represents a nullable double precision floating point column vector. + * This class will be used for operations on all floating point types (float, double) + * and as such will use a 64-bit double value to hold the biggest possible value. + * During copy-in/copy-out, smaller types (i.e. 
float) will be converted as needed. This will + * reduce the amount of code that needs to be generated and also will run fast since the + * machine operates with 64-bit words. + * + * The vector[] field is public by design for high-performance access in the inner + * loop of query execution. + */ +public class DoubleColumnVector extends ColumnVector { + public double[] vector; + public static final double NULL_VALUE = Double.NaN; + + /** + * Use this constructor by default. All column vectors + * should normally be the default size. + */ + public DoubleColumnVector() { + this(VectorizedRowBatch.DEFAULT_SIZE); + } + + /** + * Don't use this except for testing purposes. + * + * @param len + */ + public DoubleColumnVector(int len) { + super(len); + vector = new double[len]; + } + + // Copy the current object contents into the output. Only copy selected entries, + // as indicated by selectedInUse and the sel array. + public void copySelected( + boolean selectedInUse, int[] sel, int size, DoubleColumnVector output) { + + // Output has nulls if and only if input has nulls. + output.noNulls = noNulls; + output.isRepeating = false; + + // Handle repeating case + if (isRepeating) { + output.vector[0] = vector[0]; + output.isNull[0] = isNull[0]; + output.isRepeating = true; + return; + } + + // Handle normal case + + // Copy data values over + if (selectedInUse) { + for (int j = 0; j < size; j++) { + int i = sel[j]; + output.vector[i] = vector[i]; + } + } + else { + System.arraycopy(vector, 0, output.vector, 0, size); + } + + // Copy nulls over if needed + if (!noNulls) { + if (selectedInUse) { + for (int j = 0; j < size; j++) { + int i = sel[j]; + output.isNull[i] = isNull[i]; + } + } + else { + System.arraycopy(isNull, 0, output.isNull, 0, size); + } + } + } + + // Fill the column vector with the provided value + public void fill(double value) { + noNulls = true; + isRepeating = true; + vector[0] = value; + } + + // Simplify vector by brute-force flattening noNulls and isRepeating + // This can be used to reduce combinatorial explosion of code paths in VectorExpressions + // with many arguments. + public void flatten(boolean selectedInUse, int[] sel, int size) { + flattenPush(); + if (isRepeating) { + isRepeating = false; + double repeatVal = vector[0]; + if (selectedInUse) { + for (int j = 0; j < size; j++) { + int i = sel[j]; + vector[i] = repeatVal; + } + } else { + Arrays.fill(vector, 0, size, repeatVal); + } + flattenRepeatingNulls(selectedInUse, sel, size); + } + flattenNoNulls(selectedInUse, sel, size); + } + + @Override + public void setElement(int outElementNum, int inputElementNum, ColumnVector inputVector) { + vector[outElementNum] = ((DoubleColumnVector) inputVector).vector[inputElementNum]; + } + + @Override + public void stringifyValue(StringBuilder buffer, int row) { + if (isRepeating) { + row = 0; + } + if (noNulls || !isNull[row]) { + buffer.append(vector[row]); + } else { + buffer.append("null"); + } + } +} diff --git storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/LongColumnVector.java storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/LongColumnVector.java new file mode 100644 index 0000000..5702584 --- /dev/null +++ storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/LongColumnVector.java @@ -0,0 +1,189 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.ql.exec.vector; + +import java.util.Arrays; + +/** + * This class represents a nullable int column vector. + * This class will be used for operations on all integer types (tinyint, smallint, int, bigint) + * and as such will use a 64-bit long value to hold the biggest possible value. + * During copy-in/copy-out, smaller int types will be converted as needed. This will + * reduce the amount of code that needs to be generated and also will run fast since the + * machine operates with 64-bit words. + * + * The vector[] field is public by design for high-performance access in the inner + * loop of query execution. + */ +public class LongColumnVector extends ColumnVector { + public long[] vector; + public static final long NULL_VALUE = 1; + + /** + * Use this constructor by default. All column vectors + * should normally be the default size. + */ + public LongColumnVector() { + this(VectorizedRowBatch.DEFAULT_SIZE); + } + + /** + * Don't use this except for testing purposes. + * + * @param len the number of rows + */ + public LongColumnVector(int len) { + super(len); + vector = new long[len]; + } + + // Copy the current object contents into the output. Only copy selected entries, + // as indicated by selectedInUse and the sel array. + public void copySelected( + boolean selectedInUse, int[] sel, int size, LongColumnVector output) { + + // Output has nulls if and only if input has nulls. + output.noNulls = noNulls; + output.isRepeating = false; + + // Handle repeating case + if (isRepeating) { + output.vector[0] = vector[0]; + output.isNull[0] = isNull[0]; + output.isRepeating = true; + return; + } + + // Handle normal case + + // Copy data values over + if (selectedInUse) { + for (int j = 0; j < size; j++) { + int i = sel[j]; + output.vector[i] = vector[i]; + } + } + else { + System.arraycopy(vector, 0, output.vector, 0, size); + } + + // Copy nulls over if needed + if (!noNulls) { + if (selectedInUse) { + for (int j = 0; j < size; j++) { + int i = sel[j]; + output.isNull[i] = isNull[i]; + } + } + else { + System.arraycopy(isNull, 0, output.isNull, 0, size); + } + } + } + + // Copy the current object contents into the output. Only copy selected entries, + // as indicated by selectedInUse and the sel array. + public void copySelected( + boolean selectedInUse, int[] sel, int size, DoubleColumnVector output) { + + // Output has nulls if and only if input has nulls. 
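+ // Each long value is implicitly widened to double as it is assigned to + // the output vector below; no explicit cast is needed.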
+ output.noNulls = noNulls; + output.isRepeating = false; + + // Handle repeating case + if (isRepeating) { + output.vector[0] = vector[0]; // automatic conversion to double is done here + output.isNull[0] = isNull[0]; + output.isRepeating = true; + return; + } + + // Handle normal case + + // Copy data values over + if (selectedInUse) { + for (int j = 0; j < size; j++) { + int i = sel[j]; + output.vector[i] = vector[i]; + } + } + else { + for(int i = 0; i < size; ++i) { + output.vector[i] = vector[i]; + } + } + + // Copy nulls over if needed + if (!noNulls) { + if (selectedInUse) { + for (int j = 0; j < size; j++) { + int i = sel[j]; + output.isNull[i] = isNull[i]; + } + } + else { + System.arraycopy(isNull, 0, output.isNull, 0, size); + } + } + } + + // Fill the column vector with the provided value + public void fill(long value) { + noNulls = true; + isRepeating = true; + vector[0] = value; + } + + // Simplify vector by brute-force flattening noNulls and isRepeating + // This can be used to reduce combinatorial explosion of code paths in VectorExpressions + // with many arguments. + public void flatten(boolean selectedInUse, int[] sel, int size) { + flattenPush(); + if (isRepeating) { + isRepeating = false; + long repeatVal = vector[0]; + if (selectedInUse) { + for (int j = 0; j < size; j++) { + int i = sel[j]; + vector[i] = repeatVal; + } + } else { + Arrays.fill(vector, 0, size, repeatVal); + } + flattenRepeatingNulls(selectedInUse, sel, size); + } + flattenNoNulls(selectedInUse, sel, size); + } + + @Override + public void setElement(int outElementNum, int inputElementNum, ColumnVector inputVector) { + vector[outElementNum] = ((LongColumnVector) inputVector).vector[inputElementNum]; + } + + @Override + public void stringifyValue(StringBuilder buffer, int row) { + if (isRepeating) { + row = 0; + } + if (noNulls || !isNull[row]) { + buffer.append(vector[row]); + } else { + buffer.append("null"); + } + } +} diff --git storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizedRowBatch.java storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizedRowBatch.java new file mode 100644 index 0000000..7c18da6 --- /dev/null +++ storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizedRowBatch.java @@ -0,0 +1,186 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.ql.exec.vector; + +import java.io.DataInput; +import java.io.DataOutput; +import java.io.IOException; + +import org.apache.hadoop.io.NullWritable; +import org.apache.hadoop.io.Writable; + +/** + * A VectorizedRowBatch is a set of rows, organized with each column + * as a vector. It is the unit of query execution, organized to minimize + * the cost per row and keep the cycles-per-instruction count low.
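+ * Operators work on a whole batch at a time, so per-row overheads are + * amortized over up to 1024 (DEFAULT_SIZE) rows.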
+ * The major fields are public by design to allow fast and convenient + * access by the vectorized query execution code. + */ +public class VectorizedRowBatch implements Writable { + public int numCols; // number of columns + public ColumnVector[] cols; // a vector for each column + public int size; // number of rows that qualify (i.e. haven't been filtered out) + public int[] selected; // array of positions of selected values + public int[] projectedColumns; + public int projectionSize; + + /* + * If no filtering has been applied yet, selectedInUse is false, + * meaning that all rows qualify. If it is true, then the selected[] array + * records the offsets of qualifying rows. + */ + public boolean selectedInUse; + + // If this is true, then there is no data in the batch -- we have hit the end of input. + public boolean endOfFile; + + /* + * This number is carefully chosen to minimize overhead and typically allows + * one VectorizedRowBatch to fit in cache. + */ + public static final int DEFAULT_SIZE = 1024; + + /** + * Return a batch with the specified number of columns. + * This is the standard constructor -- all batches should be the same size + * + * @param numCols the number of columns to include in the batch + */ + public VectorizedRowBatch(int numCols) { + this(numCols, DEFAULT_SIZE); + } + + /** + * Return a batch with the specified number of columns and rows. + * Only call this constructor directly for testing purposes. + * Batch size should normally always be defaultSize. + * + * @param numCols the number of columns to include in the batch + * @param size the number of rows to include in the batch + */ + public VectorizedRowBatch(int numCols, int size) { + this.numCols = numCols; + this.size = size; + selected = new int[size]; + selectedInUse = false; + this.cols = new ColumnVector[numCols]; + projectedColumns = new int[numCols]; + + // Initially all columns are projected and in the same order + projectionSize = numCols; + for (int i = 0; i < numCols; i++) { + projectedColumns[i] = i; + } + } + + /** + * Returns the maximum size of the batch (number of rows it can hold) + */ + public int getMaxSize() { + return selected.length; + } + + /** + * Return count of qualifying rows. 
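+ * When selectedInUse is true, this is the number of entries in selected[]; + * otherwise every row in the batch qualifies.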
+ * + * @return number of rows that have not been filtered out + */ + public long count() { + return size; + } + + private static String toUTF8(Object o) { + if(o == null || o instanceof NullWritable) { + return "\\N"; /* as found in LazySimpleSerDe's nullSequence */ + } + return o.toString(); + } + + @Override + public String toString() { + if (size == 0) { + return ""; + } + StringBuilder b = new StringBuilder(); + if (this.selectedInUse) { + for (int j = 0; j < size; j++) { + int i = selected[j]; + b.append('['); + for (int k = 0; k < projectionSize; k++) { + int projIndex = projectedColumns[k]; + ColumnVector cv = cols[projIndex]; + if (k > 0) { + b.append(", "); + } + cv.stringifyValue(b, i); + } + b.append(']'); + if (j < size - 1) { + b.append('\n'); + } + } + } else { + for (int i = 0; i < size; i++) { + b.append('['); + for (int k = 0; k < projectionSize; k++) { + int projIndex = projectedColumns[k]; + ColumnVector cv = cols[projIndex]; + if (k > 0) { + b.append(", "); + } + cv.stringifyValue(b, i); + } + b.append(']'); + if (i < size - 1) { + b.append('\n'); + } + } + } + return b.toString(); + } + + @Override + public void readFields(DataInput arg0) throws IOException { + throw new UnsupportedOperationException("Do you really need me?"); + } + + @Override + public void write(DataOutput arg0) throws IOException { + throw new UnsupportedOperationException("Don't call me"); + } + + /** + * Resets the row batch to default state + * - sets selectedInUse to false + * - sets size to 0 + * - sets endOfFile to false + * - resets each column + * - inits each column + */ + public void reset() { + selectedInUse = false; + size = 0; + endOfFile = false; + for (ColumnVector vc : cols) { + if (vc != null) { + vc.reset(); + vc.init(); + } + } + } +} diff --git storage-api/src/java/org/apache/hadoop/hive/ql/io/sarg/ExpressionTree.java storage-api/src/java/org/apache/hadoop/hive/ql/io/sarg/ExpressionTree.java new file mode 100644 index 0000000..2dd3a45 --- /dev/null +++ storage-api/src/java/org/apache/hadoop/hive/ql/io/sarg/ExpressionTree.java @@ -0,0 +1,156 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.io.sarg; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +/** + * The inner representation of the SearchArgument. Most users should not + * need this interface, it is only for file formats that need to translate + * the SearchArgument into an internal form. 
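+ * The tree is made up of AND, OR, NOT, LEAF and CONSTANT nodes; each LEAF + * refers to a predicate by its index into the SearchArgument's leaf list.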
+ */ +public class ExpressionTree { + public enum Operator {OR, AND, NOT, LEAF, CONSTANT} + private final Operator operator; + private final List children; + private final int leaf; + private final SearchArgument.TruthValue constant; + + ExpressionTree() { + operator = null; + children = null; + leaf = 0; + constant = null; + } + + ExpressionTree(Operator op, ExpressionTree... kids) { + operator = op; + children = new ArrayList(); + leaf = -1; + this.constant = null; + Collections.addAll(children, kids); + } + + ExpressionTree(int leaf) { + operator = Operator.LEAF; + children = null; + this.leaf = leaf; + this.constant = null; + } + + ExpressionTree(SearchArgument.TruthValue constant) { + operator = Operator.CONSTANT; + children = null; + this.leaf = -1; + this.constant = constant; + } + + ExpressionTree(ExpressionTree other) { + this.operator = other.operator; + if (other.children == null) { + this.children = null; + } else { + this.children = new ArrayList(); + for(ExpressionTree child: other.children) { + children.add(new ExpressionTree(child)); + } + } + this.leaf = other.leaf; + this.constant = other.constant; + } + + public SearchArgument.TruthValue evaluate(SearchArgument.TruthValue[] leaves + ) { + SearchArgument.TruthValue result = null; + switch (operator) { + case OR: + for(ExpressionTree child: children) { + result = child.evaluate(leaves).or(result); + } + return result; + case AND: + for(ExpressionTree child: children) { + result = child.evaluate(leaves).and(result); + } + return result; + case NOT: + return children.get(0).evaluate(leaves).not(); + case LEAF: + return leaves[leaf]; + case CONSTANT: + return constant; + default: + throw new IllegalStateException("Unknown operator: " + operator); + } + } + + @Override + public String toString() { + StringBuilder buffer = new StringBuilder(); + switch (operator) { + case OR: + buffer.append("(or"); + for(ExpressionTree child: children) { + buffer.append(' '); + buffer.append(child.toString()); + } + buffer.append(')'); + break; + case AND: + buffer.append("(and"); + for(ExpressionTree child: children) { + buffer.append(' '); + buffer.append(child.toString()); + } + buffer.append(')'); + break; + case NOT: + buffer.append("(not "); + buffer.append(children.get(0)); + buffer.append(')'); + break; + case LEAF: + buffer.append("leaf-"); + buffer.append(leaf); + break; + case CONSTANT: + buffer.append(constant); + break; + } + return buffer.toString(); + } + + public Operator getOperator() { + return operator; + } + + public List getChildren() { + return children; + } + + public SearchArgument.TruthValue getConstant() { + return constant; + } + + public int getLeaf() { + return leaf; + } +} diff --git storage-api/src/java/org/apache/hadoop/hive/ql/io/sarg/PredicateLeaf.java storage-api/src/java/org/apache/hadoop/hive/ql/io/sarg/PredicateLeaf.java new file mode 100644 index 0000000..3a92565 --- /dev/null +++ storage-api/src/java/org/apache/hadoop/hive/ql/io/sarg/PredicateLeaf.java @@ -0,0 +1,104 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.io.sarg; + +import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable; + +import java.sql.Date; +import java.sql.Timestamp; +import java.util.List; + +/** + * The primitive predicates that form a SearchArgument. + */ +public interface PredicateLeaf { + + /** + * The possible operators for predicates. To get the opposites, construct + * an expression with a not operator. + */ + public static enum Operator { + EQUALS, + NULL_SAFE_EQUALS, + LESS_THAN, + LESS_THAN_EQUALS, + IN, + BETWEEN, + IS_NULL + } + + /** + * The possible types for sargs. + */ + public static enum Type { + INTEGER(Integer.class), // all of the integer types except long + LONG(Long.class), + FLOAT(Double.class), // float and double + STRING(String.class), // string, char, varchar + DATE(Date.class), + DECIMAL(HiveDecimalWritable.class), + TIMESTAMP(Timestamp.class), + BOOLEAN(Boolean.class); + + private final Class cls; + Type(Class cls) { + this.cls = cls; + } + + /** + * For all SARG leaves, the values must be the matching class. + * @return the value class + */ + public Class getValueClass() { + return cls; + } + } + + /** + * Get the operator for the leaf. + */ + public Operator getOperator(); + + /** + * Get the type of the column and literal by the file format. + */ + public Type getType(); + + /** + * Get the simple column name. + * @return the column name + */ + public String getColumnName(); + + /** + * Get the literal half of the predicate leaf. Adapt the original type for what orc needs + * + * @return an Integer, Long, Double, or String + */ + public Object getLiteral(); + + /** + * For operators with multiple literals (IN and BETWEEN), get the literals. + * + * @return the list of literals (Integer, Longs, Doubles, or Strings) + * + */ + public List getLiteralList(); + +} diff --git storage-api/src/java/org/apache/hadoop/hive/ql/io/sarg/SearchArgument.java storage-api/src/java/org/apache/hadoop/hive/ql/io/sarg/SearchArgument.java new file mode 100644 index 0000000..d008268 --- /dev/null +++ storage-api/src/java/org/apache/hadoop/hive/ql/io/sarg/SearchArgument.java @@ -0,0 +1,287 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.hive.ql.io.sarg; + +import java.util.List; + +/** + * Primary interface for + * SearchArgument, which is the subset of predicates + * that can be pushed down to the RecordReader. Each SearchArgument consists + * of a series of SearchClauses that must each be true for the row to be + * accepted by the filter. + * + * This requires that the filter be normalized into conjunctive normal form + * (CNF). + */ +public interface SearchArgument { + + /** + * The potential result sets of logical operations. + */ + public static enum TruthValue { + YES, NO, NULL, YES_NULL, NO_NULL, YES_NO, YES_NO_NULL; + + /** + * Compute logical OR between the two values. + * @param right the other argument or null + * @return the result + */ + public TruthValue or(TruthValue right) { + if (right == null || right == this) { + return this; + } + if (right == YES || this == YES) { + return YES; + } + if (right == YES_NULL || this == YES_NULL) { + return YES_NULL; + } + if (right == NO) { + return this; + } + if (this == NO) { + return right; + } + if (this == NULL) { + if (right == NO_NULL) { + return NULL; + } else { + return YES_NULL; + } + } + if (right == NULL) { + if (this == NO_NULL) { + return NULL; + } else { + return YES_NULL; + } + } + return YES_NO_NULL; + } + + /** + * Compute logical AND between the two values. + * @param right the other argument or null + * @return the result + */ + public TruthValue and(TruthValue right) { + if (right == null || right == this) { + return this; + } + if (right == NO || this == NO) { + return NO; + } + if (right == NO_NULL || this == NO_NULL) { + return NO_NULL; + } + if (right == YES) { + return this; + } + if (this == YES) { + return right; + } + if (this == NULL) { + if (right == YES_NULL) { + return NULL; + } else { + return NO_NULL; + } + } + if (right == NULL) { + if (this == YES_NULL) { + return NULL; + } else { + return NO_NULL; + } + } + return YES_NO_NULL; + } + + public TruthValue not() { + switch (this) { + case NO: + return YES; + case YES: + return NO; + case NULL: + case YES_NO: + case YES_NO_NULL: + return this; + case NO_NULL: + return YES_NULL; + case YES_NULL: + return NO_NULL; + default: + throw new IllegalArgumentException("Unknown value: " + this); + } + } + + /** + * Does the RecordReader need to include this set of records? + * @return true unless none of the rows qualify + */ + public boolean isNeeded() { + switch (this) { + case NO: + case NULL: + case NO_NULL: + return false; + default: + return true; + } + } + } + + /** + * Get the leaf predicates that are required to evaluate the predicate. The + * list will have the duplicates removed. + * @return the list of leaf predicates + */ + public List<PredicateLeaf> getLeaves(); + + /** + * Get the expression tree. This should only be needed for file formats that + * need to translate the expression to an internal form. + */ + public ExpressionTree getExpression(); + + /** + * Evaluate the entire predicate based on the values for the leaf predicates. + * @param leaves the value of each leaf predicate + * @return the value of the entire predicate + */ + public TruthValue evaluate(TruthValue[] leaves); + + /** + * A builder object for contexts outside of Hive where it isn't easy to + * get an ExprNodeDesc. The user must call startOr, startAnd, or startNot + * before adding any leaves. + */ + public interface Builder { + + /** + * Start building an or operation and push it on the stack.
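+ * Each start call must later be closed by a matching call to end().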
+ * @return this + */ + public Builder startOr(); + + /** + * Start building an and operation and push it on the stack. + * @return this + */ + public Builder startAnd(); + + /** + * Start building a not operation and push it on the stack. + * @return this + */ + public Builder startNot(); + + /** + * Finish the current operation and pop it off of the stack. Each start + * call must have a matching end. + * @return this + */ + public Builder end(); + + /** + * Add a less than leaf to the current item on the stack. + * @param column the name of the column + * @param type the type of the expression + * @param literal the literal + * @return this + */ + public Builder lessThan(String column, PredicateLeaf.Type type, + Object literal); + + /** + * Add a less than equals leaf to the current item on the stack. + * @param column the name of the column + * @param type the type of the expression + * @param literal the literal + * @return this + */ + public Builder lessThanEquals(String column, PredicateLeaf.Type type, + Object literal); + + /** + * Add an equals leaf to the current item on the stack. + * @param column the name of the column + * @param type the type of the expression + * @param literal the literal + * @return this + */ + public Builder equals(String column, PredicateLeaf.Type type, + Object literal); + + /** + * Add a null safe equals leaf to the current item on the stack. + * @param column the name of the column + * @param type the type of the expression + * @param literal the literal + * @return this + */ + public Builder nullSafeEquals(String column, PredicateLeaf.Type type, + Object literal); + + /** + * Add an in leaf to the current item on the stack. + * @param column the name of the column + * @param type the type of the expression + * @param literal the literal + * @return this + */ + public Builder in(String column, PredicateLeaf.Type type, + Object... literal); + + /** + * Add an is null leaf to the current item on the stack. + * @param column the name of the column + * @param type the type of the expression + * @return this + */ + public Builder isNull(String column, PredicateLeaf.Type type); + + /** + * Add a between leaf to the current item on the stack. + * @param column the name of the column + * @param type the type of the expression + * @param lower the literal + * @param upper the literal + * @return this + */ + public Builder between(String column, PredicateLeaf.Type type, + Object lower, Object upper); + + /** + * Add a truth value to the expression. + * @param truth + * @return this + */ + public Builder literal(TruthValue truth); + + /** + * Build and return the SearchArgument that has been defined. All of the + * starts must have been ended before this call. + * @return the new SearchArgument + */ + public SearchArgument build(); + } +} diff --git storage-api/src/java/org/apache/hadoop/hive/ql/io/sarg/SearchArgumentFactory.java storage-api/src/java/org/apache/hadoop/hive/ql/io/sarg/SearchArgumentFactory.java new file mode 100644 index 0000000..0778935 --- /dev/null +++ storage-api/src/java/org/apache/hadoop/hive/ql/io/sarg/SearchArgumentFactory.java @@ -0,0 +1,28 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.io.sarg; + +/** + * A factory for creating SearchArguments. + */ +public class SearchArgumentFactory { + public static SearchArgument.Builder newBuilder() { + return new SearchArgumentImpl.BuilderImpl(); + } +} diff --git storage-api/src/java/org/apache/hadoop/hive/ql/io/sarg/SearchArgumentImpl.java storage-api/src/java/org/apache/hadoop/hive/ql/io/sarg/SearchArgumentImpl.java new file mode 100644 index 0000000..d27ac16 --- /dev/null +++ storage-api/src/java/org/apache/hadoop/hive/ql/io/sarg/SearchArgumentImpl.java @@ -0,0 +1,687 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.io.sarg; + +import java.sql.Timestamp; +import java.util.ArrayDeque; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Deque; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import org.apache.commons.codec.binary.Base64; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +/** + * The implementation of SearchArguments. + */ +final class SearchArgumentImpl implements SearchArgument { + public static final Log LOG = LogFactory.getLog(SearchArgumentImpl.class); + + static final class PredicateLeafImpl implements PredicateLeaf { + private final Operator operator; + private final Type type; + private final String columnName; + private final Object literal; + private final List literalList; + + // Used by kryo + @SuppressWarnings("unused") + PredicateLeafImpl() { + operator = null; + type = null; + columnName = null; + literal = null; + literalList = null; + } + + PredicateLeafImpl(Operator operator, + Type type, + String columnName, + Object literal, + List literalList) { + this.operator = operator; + this.type = type; + this.columnName = columnName; + this.literal = literal; + if (literal != null) { + if (literal.getClass() != type.getValueClass()) { + throw new IllegalArgumentException("Wrong value class " + + literal.getClass().getName() + " for " + type + "." 
+ operator + + " leaf"); + } + } + this.literalList = literalList; + if (literalList != null) { + Class valueCls = type.getValueClass(); + for(Object lit: literalList) { + if (lit != null && lit.getClass() != valueCls) { + throw new IllegalArgumentException("Wrong value class item " + + lit.getClass().getName() + " for " + type + "." + operator + + " leaf"); + } + } + } + } + + @Override + public Operator getOperator() { + return operator; + } + + @Override + public Type getType() { + return type; + } + + @Override + public String getColumnName() { + return columnName; + } + + @Override + public Object getLiteral() { + // To get around a kryo 2.22 bug while deserializing a Timestamp into Date + // (https://github.com/EsotericSoftware/kryo/issues/88) + // When we see a Date, convert back into Timestamp + if (literal instanceof java.util.Date) { + return new Timestamp(((java.util.Date)literal).getTime()); + } + return literal; + } + + @Override + public List<Object> getLiteralList() { + return literalList; + } + + @Override + public String toString() { + StringBuilder buffer = new StringBuilder(); + buffer.append('('); + buffer.append(operator); + buffer.append(' '); + buffer.append(columnName); + if (literal != null) { + buffer.append(' '); + buffer.append(literal); + } else if (literalList != null) { + for(Object lit: literalList) { + buffer.append(' '); + buffer.append(lit == null ? "null" : lit.toString()); + } + } + buffer.append(')'); + return buffer.toString(); + } + + private static boolean isEqual(Object left, Object right) { + + return left == right || + (left != null && right != null && left.equals(right)); + } + + @Override + public boolean equals(Object other) { + if (other == null || other.getClass() != getClass()) { + return false; + } else if (other == this) { + return true; + } else { + PredicateLeafImpl o = (PredicateLeafImpl) other; + return operator == o.operator && + type == o.type && + columnName.equals(o.columnName) && + isEqual(literal, o.literal) && + isEqual(literalList, o.literalList); + } + } + + @Override + public int hashCode() { + return operator.hashCode() + + type.hashCode() * 17 + + columnName.hashCode() * 3 * 17 + + (literal == null ? 0 : literal.hashCode()) * 101 * 3 * 17 + + (literalList == null ? 0 : literalList.hashCode()) * + 103 * 101 * 3 * 17; + } + } + + + private final List<PredicateLeaf> leaves; + private final ExpressionTree expression; + + SearchArgumentImpl(ExpressionTree expression, List<PredicateLeaf> leaves) { + this.expression = expression; + this.leaves = leaves; + } + + // Used by kryo + @SuppressWarnings("unused") + SearchArgumentImpl() { + leaves = null; + expression = null; + } + + @Override + public List<PredicateLeaf> getLeaves() { + return leaves; + } + + @Override + public TruthValue evaluate(TruthValue[] leaves) { + return expression == null ? TruthValue.YES : expression.evaluate(leaves); + } + + @Override + public ExpressionTree getExpression() { + return expression; + } + + @Override + public String toString() { + StringBuilder buffer = new StringBuilder(); + for(int i=0; i < leaves.size(); ++i) { + buffer.append("leaf-"); + buffer.append(i); + buffer.append(" = "); + buffer.append(leaves.get(i).toString()); + buffer.append('\n'); + } + buffer.append("expr = "); + buffer.append(expression); + return buffer.toString(); + } + + static class BuilderImpl implements Builder { + + // max threshold for CNF conversion.
having >8 elements in andList will be + // converted to maybe + private static final int CNF_COMBINATIONS_THRESHOLD = 256; + + private final Deque currentTree = + new ArrayDeque(); + private final Map leaves = + new HashMap(); + private final ExpressionTree root = + new ExpressionTree(ExpressionTree.Operator.AND); + { + currentTree.add(root); + } + + @Override + public Builder startOr() { + ExpressionTree node = new ExpressionTree(ExpressionTree.Operator.OR); + currentTree.getFirst().getChildren().add(node); + currentTree.addFirst(node); + return this; + } + + @Override + public Builder startAnd() { + ExpressionTree node = new ExpressionTree(ExpressionTree.Operator.AND); + currentTree.getFirst().getChildren().add(node); + currentTree.addFirst(node); + return this; + } + + @Override + public Builder startNot() { + ExpressionTree node = new ExpressionTree(ExpressionTree.Operator.NOT); + currentTree.getFirst().getChildren().add(node); + currentTree.addFirst(node); + return this; + } + + @Override + public Builder end() { + ExpressionTree current = currentTree.removeFirst(); + if (current.getChildren().size() == 0) { + throw new IllegalArgumentException("Can't create expression " + root + + " with no children."); + } + if (current.getOperator() == ExpressionTree.Operator.NOT && + current.getChildren().size() != 1) { + throw new IllegalArgumentException("Can't create not expression " + + current + " with more than 1 child."); + } + return this; + } + + private int addLeaf(PredicateLeaf leaf) { + Integer result = leaves.get(leaf); + if (result == null) { + int id = leaves.size(); + leaves.put(leaf, id); + return id; + } else { + return result; + } + } + + @Override + public Builder lessThan(String column, PredicateLeaf.Type type, + Object literal) { + ExpressionTree parent = currentTree.getFirst(); + if (column == null || literal == null) { + parent.getChildren().add(new ExpressionTree(TruthValue.YES_NO_NULL)); + } else { + PredicateLeaf leaf = + new PredicateLeafImpl(PredicateLeaf.Operator.LESS_THAN, + type, column, literal, null); + parent.getChildren().add(new ExpressionTree(addLeaf(leaf))); + } + return this; + } + + @Override + public Builder lessThanEquals(String column, PredicateLeaf.Type type, + Object literal) { + ExpressionTree parent = currentTree.getFirst(); + if (column == null || literal == null) { + parent.getChildren().add(new ExpressionTree(TruthValue.YES_NO_NULL)); + } else { + PredicateLeaf leaf = + new PredicateLeafImpl(PredicateLeaf.Operator.LESS_THAN_EQUALS, + type, column, literal, null); + parent.getChildren().add(new ExpressionTree(addLeaf(leaf))); + } + return this; + } + + @Override + public Builder equals(String column, PredicateLeaf.Type type, + Object literal) { + ExpressionTree parent = currentTree.getFirst(); + if (column == null || literal == null) { + parent.getChildren().add(new ExpressionTree(TruthValue.YES_NO_NULL)); + } else { + PredicateLeaf leaf = + new PredicateLeafImpl(PredicateLeaf.Operator.EQUALS, + type, column, literal, null); + parent.getChildren().add(new ExpressionTree(addLeaf(leaf))); + } + return this; + } + + @Override + public Builder nullSafeEquals(String column, PredicateLeaf.Type type, + Object literal) { + ExpressionTree parent = currentTree.getFirst(); + if (column == null || literal == null) { + parent.getChildren().add(new ExpressionTree(TruthValue.YES_NO_NULL)); + } else { + PredicateLeaf leaf = + new PredicateLeafImpl(PredicateLeaf.Operator.NULL_SAFE_EQUALS, + type, column, literal, null); + parent.getChildren().add(new 
ExpressionTree(addLeaf(leaf))); + } + return this; + } + + @Override + public Builder in(String column, PredicateLeaf.Type type, + Object... literal) { + ExpressionTree parent = currentTree.getFirst(); + if (column == null || literal == null) { + parent.getChildren().add(new ExpressionTree(TruthValue.YES_NO_NULL)); + } else { + if (literal.length == 0) { + throw new IllegalArgumentException("Can't create in expression with " + + "no arguments"); + } + List argList = new ArrayList(); + argList.addAll(Arrays.asList(literal)); + + PredicateLeaf leaf = + new PredicateLeafImpl(PredicateLeaf.Operator.IN, + type, column, null, argList); + parent.getChildren().add(new ExpressionTree(addLeaf(leaf))); + } + return this; + } + + @Override + public Builder isNull(String column, PredicateLeaf.Type type) { + ExpressionTree parent = currentTree.getFirst(); + if (column == null) { + parent.getChildren().add(new ExpressionTree(TruthValue.YES_NO_NULL)); + } else { + PredicateLeaf leaf = + new PredicateLeafImpl(PredicateLeaf.Operator.IS_NULL, + type, column, null, null); + parent.getChildren().add(new ExpressionTree(addLeaf(leaf))); + } + return this; + } + + @Override + public Builder between(String column, PredicateLeaf.Type type, Object lower, + Object upper) { + ExpressionTree parent = currentTree.getFirst(); + if (column == null || lower == null || upper == null) { + parent.getChildren().add(new ExpressionTree(TruthValue.YES_NO_NULL)); + } else { + List argList = new ArrayList(); + argList.add(lower); + argList.add(upper); + PredicateLeaf leaf = + new PredicateLeafImpl(PredicateLeaf.Operator.BETWEEN, + type, column, null, argList); + parent.getChildren().add(new ExpressionTree(addLeaf(leaf))); + } + return this; + } + + @Override + public Builder literal(TruthValue truth) { + ExpressionTree parent = currentTree.getFirst(); + parent.getChildren().add(new ExpressionTree(truth)); + return this; + } + + /** + * Recursively explore the tree to find the leaves that are still reachable + * after optimizations. + * @param tree the node to check next + * @param next the next available leaf id + * @param leafReorder + * @return the next available leaf id + */ + static int compactLeaves(ExpressionTree tree, int next, int[] leafReorder) { + if (tree.getOperator() == ExpressionTree.Operator.LEAF) { + int oldLeaf = tree.getLeaf(); + if (leafReorder[oldLeaf] == -1) { + leafReorder[oldLeaf] = next++; + } + } else if (tree.getChildren() != null){ + for(ExpressionTree child: tree.getChildren()) { + next = compactLeaves(child, next, leafReorder); + } + } + return next; + } + + /** + * Rewrite expression tree to update the leaves. 
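+ * Each LEAF node is replaced by a new LEAF carrying its remapped id from + * leafReorder; other nodes have their children updated in place.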
+ * @param root the root of the tree to fix + * @param leafReorder a map from old leaf ids to new leaf ids + * @return the fixed root + */ + static ExpressionTree rewriteLeaves(ExpressionTree root, + int[] leafReorder) { + if (root.getOperator() == ExpressionTree.Operator.LEAF) { + return new ExpressionTree(leafReorder[root.getLeaf()]); + } else if (root.getChildren() != null){ + List children = root.getChildren(); + for(int i=0; i < children.size(); ++i) { + children.set(i, rewriteLeaves(children.get(i), leafReorder)); + } + } + return root; + } + + @Override + public SearchArgument build() { + if (currentTree.size() != 1) { + throw new IllegalArgumentException("Failed to end " + + currentTree.size() + " operations."); + } + ExpressionTree optimized = pushDownNot(root); + optimized = foldMaybe(optimized); + optimized = flatten(optimized); + optimized = convertToCNF(optimized); + optimized = flatten(optimized); + int leafReorder[] = new int[leaves.size()]; + Arrays.fill(leafReorder, -1); + int newLeafCount = compactLeaves(optimized, 0, leafReorder); + optimized = rewriteLeaves(optimized, leafReorder); + ArrayList leafList = new ArrayList<>(newLeafCount); + // expand list to correct size + for(int i=0; i < newLeafCount; ++i) { + leafList.add(null); + } + // build the new list + for(Map.Entry elem: leaves.entrySet()) { + int newLoc = leafReorder[elem.getValue()]; + if (newLoc != -1) { + leafList.set(newLoc, elem.getKey()); + } + } + return new SearchArgumentImpl(optimized, leafList); + } + + /** + * Push the negations all the way to just before the leaves. Also remove + * double negatives. + * @param root the expression to normalize + * @return the normalized expression, which may share some or all of the + * nodes of the original expression. + */ + static ExpressionTree pushDownNot(ExpressionTree root) { + if (root.getOperator() == ExpressionTree.Operator.NOT) { + ExpressionTree child = root.getChildren().get(0); + switch (child.getOperator()) { + case NOT: + return pushDownNot(child.getChildren().get(0)); + case CONSTANT: + return new ExpressionTree(child.getConstant().not()); + case AND: + root = new ExpressionTree(ExpressionTree.Operator.OR); + for(ExpressionTree kid: child.getChildren()) { + root.getChildren().add(pushDownNot(new + ExpressionTree(ExpressionTree.Operator.NOT, kid))); + } + break; + case OR: + root = new ExpressionTree(ExpressionTree.Operator.AND); + for(ExpressionTree kid: child.getChildren()) { + root.getChildren().add(pushDownNot(new ExpressionTree + (ExpressionTree.Operator.NOT, kid))); + } + break; + // for leaf, we don't do anything + default: + break; + } + } else if (root.getChildren() != null) { + // iterate through children and push down not for each one + for(int i=0; i < root.getChildren().size(); ++i) { + root.getChildren().set(i, pushDownNot(root.getChildren().get(i))); + } + } + return root; + } + + /** + * Remove MAYBE values from the expression. If they are in an AND operator, + * they are dropped. If they are in an OR operator, they kill their parent. + * This assumes that pushDownNot has already been called. 
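+ * (A maybe is the YES_NO_NULL constant, which provides no filtering + * information.)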
+ * @param expr The expression to clean up + * @return The cleaned up expression + */ + static ExpressionTree foldMaybe(ExpressionTree expr) { + if (expr.getChildren() != null) { + for(int i=0; i < expr.getChildren().size(); ++i) { + ExpressionTree child = foldMaybe(expr.getChildren().get(i)); + if (child.getConstant() == TruthValue.YES_NO_NULL) { + switch (expr.getOperator()) { + case AND: + expr.getChildren().remove(i); + i -= 1; + break; + case OR: + // a maybe will kill the or condition + return child; + default: + throw new IllegalStateException("Got a maybe as child of " + + expr); + } + } else { + expr.getChildren().set(i, child); + } + } + if (expr.getChildren().isEmpty()) { + return new ExpressionTree(TruthValue.YES_NO_NULL); + } + } + return expr; + } + + /** + * Converts multi-level ands and ors into single level ones. + * @param root the expression to flatten + * @return the flattened expression, which will always be root with + * potentially modified children. + */ + static ExpressionTree flatten(ExpressionTree root) { + if (root.getChildren() != null) { + // iterate through the index, so that if we add more children, + // they don't get re-visited + for(int i=0; i < root.getChildren().size(); ++i) { + ExpressionTree child = flatten(root.getChildren().get(i)); + // do we need to flatten? + if (child.getOperator() == root.getOperator() && + child.getOperator() != ExpressionTree.Operator.NOT) { + boolean first = true; + for(ExpressionTree grandkid: child.getChildren()) { + // for the first grandkid replace the original parent + if (first) { + first = false; + root.getChildren().set(i, grandkid); + } else { + root.getChildren().add(++i, grandkid); + } + } + } else { + root.getChildren().set(i, child); + } + } + // if we have a singleton AND or OR, just return the child + if ((root.getOperator() == ExpressionTree.Operator.OR || + root.getOperator() == ExpressionTree.Operator.AND) && + root.getChildren().size() == 1) { + return root.getChildren().get(0); + } + } + return root; + } + + /** + * Generate all combinations of items on the andList. For each item on the + * andList, it generates all combinations of one child from each and + * expression. Thus, (and a b) (and c d) will be expanded to: (or a c) + * (or a d) (or b c) (or b d). If there are items on the nonAndList, they + * are added to each or expression. + * @param result a list to put the results onto + * @param andList a list of and expressions + * @param nonAndList a list of non-and expressions + */ + private static void generateAllCombinations(List<ExpressionTree> result, + List<ExpressionTree> andList, + List<ExpressionTree> nonAndList + ) { + List<ExpressionTree> kids = andList.get(0).getChildren(); + if (result.isEmpty()) { + for(ExpressionTree kid: kids) { + ExpressionTree or = new ExpressionTree(ExpressionTree.Operator.OR); + result.add(or); + for(ExpressionTree node: nonAndList) { + or.getChildren().add(new ExpressionTree(node)); + } + or.getChildren().add(kid); + } + } else { + List<ExpressionTree> work = new ArrayList<ExpressionTree>(result); + result.clear(); + for(ExpressionTree kid: kids) { + for(ExpressionTree or: work) { + ExpressionTree copy = new ExpressionTree(or); + copy.getChildren().add(kid); + result.add(copy); + } + } + } + if (andList.size() > 1) { + generateAllCombinations(result, andList.subList(1, andList.size()), + nonAndList); + } + } + + /** + * Convert an expression so that the top level operator is AND with OR + * operators under it. This routine assumes that all of the NOT operators + * have been pushed to the leaves via pushDownNot.
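+ * For example, (or a (and b c)) is rewritten to (and (or a b) (or a c)).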
+ * @param root the expression + * @return the normalized expression + */ + static ExpressionTree convertToCNF(ExpressionTree root) { + if (root.getChildren() != null) { + // convert all of the children to CNF + int size = root.getChildren().size(); + for(int i=0; i < size; ++i) { + root.getChildren().set(i, convertToCNF(root.getChildren().get(i))); + } + if (root.getOperator() == ExpressionTree.Operator.OR) { + // a list of leaves that weren't under AND expressions + List nonAndList = new ArrayList(); + // a list of AND expressions that we need to distribute + List andList = new ArrayList(); + for(ExpressionTree child: root.getChildren()) { + if (child.getOperator() == ExpressionTree.Operator.AND) { + andList.add(child); + } else if (child.getOperator() == ExpressionTree.Operator.OR) { + // pull apart the kids of the OR expression + for(ExpressionTree grandkid: child.getChildren()) { + nonAndList.add(grandkid); + } + } else { + nonAndList.add(child); + } + } + if (!andList.isEmpty()) { + if (checkCombinationsThreshold(andList)) { + root = new ExpressionTree(ExpressionTree.Operator.AND); + generateAllCombinations(root.getChildren(), andList, nonAndList); + } else { + root = new ExpressionTree(TruthValue.YES_NO_NULL); + } + } + } + } + return root; + } + + private static boolean checkCombinationsThreshold(List andList) { + int numComb = 1; + for (ExpressionTree tree : andList) { + numComb *= tree.getChildren().size(); + if (numComb > CNF_COMBINATIONS_THRESHOLD) { + return false; + } + } + return true; + } + + } +} diff --git storage-api/src/java/org/apache/hadoop/hive/serde2/io/HiveDecimalWritable.java storage-api/src/java/org/apache/hadoop/hive/serde2/io/HiveDecimalWritable.java new file mode 100644 index 0000000..0578d24 --- /dev/null +++ storage-api/src/java/org/apache/hadoop/hive/serde2/io/HiveDecimalWritable.java @@ -0,0 +1,174 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.hadoop.hive.serde2.io; + +import java.io.DataInput; +import java.io.DataOutput; +import java.io.IOException; +import java.math.BigInteger; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.hive.common.type.HiveDecimal; + +import org.apache.hadoop.io.WritableComparable; +import org.apache.hadoop.io.WritableUtils; + +public class HiveDecimalWritable implements WritableComparable<HiveDecimalWritable> { + + static final private Log LOG = LogFactory.getLog(HiveDecimalWritable.class); + + private byte[] internalStorage = new byte[0]; + private int scale; + + public HiveDecimalWritable() { + } + + public HiveDecimalWritable(String value) { + set(HiveDecimal.create(value)); + } + + public HiveDecimalWritable(byte[] bytes, int scale) { + set(bytes, scale); + } + + public HiveDecimalWritable(HiveDecimalWritable writable) { + set(writable.getHiveDecimal()); + } + + public HiveDecimalWritable(HiveDecimal value) { + set(value); + } + + public HiveDecimalWritable(long value) { + set(HiveDecimal.create(value)); + } + + public void set(HiveDecimal value) { + set(value.unscaledValue().toByteArray(), value.scale()); + } + + public void set(HiveDecimal value, int maxPrecision, int maxScale) { + set(HiveDecimal.enforcePrecisionScale(value, maxPrecision, maxScale)); + } + + public void set(HiveDecimalWritable writable) { + set(writable.getHiveDecimal()); + } + + public void set(byte[] bytes, int scale) { + this.internalStorage = bytes; + this.scale = scale; + } + + public HiveDecimal getHiveDecimal() { + return HiveDecimal.create(new BigInteger(internalStorage), scale); + } + + /** + * Get a HiveDecimal instance from the writable and constrain it to the given maximum precision/scale. + * + * @param maxPrecision maximum precision + * @param maxScale maximum scale + * @return HiveDecimal instance + */ + public HiveDecimal getHiveDecimal(int maxPrecision, int maxScale) { + return HiveDecimal.enforcePrecisionScale(HiveDecimal. + create(new BigInteger(internalStorage), scale), + maxPrecision, maxScale); + } + + @Override + public void readFields(DataInput in) throws IOException { + scale = WritableUtils.readVInt(in); + int byteArrayLen = WritableUtils.readVInt(in); + if (internalStorage.length != byteArrayLen) { + internalStorage = new byte[byteArrayLen]; + } + in.readFully(internalStorage); + } + + @Override + public void write(DataOutput out) throws IOException { + WritableUtils.writeVInt(out, scale); + WritableUtils.writeVInt(out, internalStorage.length); + out.write(internalStorage); + } + + @Override + public int compareTo(HiveDecimalWritable that) { + return getHiveDecimal().compareTo(that.getHiveDecimal()); + } + + @Override + public String toString() { + return getHiveDecimal().toString(); + } + + @Override + public boolean equals(Object other) { + if (this == other) { + return true; + } + if (other == null || getClass() != other.getClass()) { + return false; + } + HiveDecimalWritable bdw = (HiveDecimalWritable) other; + + // 'equals' and 'compareTo' are not compatible with HiveDecimals. We want + // compareTo, which considers two numbers equal iff they are numerically equal (e.g.: 3.14 is + // the same as 3.140).
'Equals' returns true iff equal and the same scale + // is set in the decimals (e.g.: 3.14 is not the same as 3.140) + return getHiveDecimal().compareTo(bdw.getHiveDecimal()) == 0; + } + + @Override + public int hashCode() { + return getHiveDecimal().hashCode(); + } + + /* (non-Javadoc) + * In order to update a Decimal128 fast (w/o allocation) we need to expose access to the + * internal storage bytes and scale. + * @return + */ + public byte[] getInternalStorage() { + return internalStorage; + } + + /* (non-Javadoc) + * In order to update a Decimal128 fast (w/o allocation) we need to expose access to the + * internal storage bytes and scale. + */ + public int getScale() { + return scale; + } + + public static + HiveDecimalWritable enforcePrecisionScale(HiveDecimalWritable writable, + int precision, int scale) { + if (writable == null) { + return null; + } + + HiveDecimal dec = + HiveDecimal.enforcePrecisionScale(writable.getHiveDecimal(), precision, + scale); + return dec == null ? null : new HiveDecimalWritable(dec); + } +}
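The classes above are meant to be used together: a reader fills a VectorizedRowBatch while a SearchArgument decides which stripes or row groups need to be read at all. As a rough illustration, the following editor's sketch (not part of the commit; the class name and the stand-in YES_NO truth values are invented) builds a SearchArgument for "x < 10 or y = 'hello'" with the Builder API added here, then evaluates it the way a file format would after consulting its statistics:

import java.util.List;

import org.apache.hadoop.hive.ql.io.sarg.PredicateLeaf;
import org.apache.hadoop.hive.ql.io.sarg.SearchArgument;
import org.apache.hadoop.hive.ql.io.sarg.SearchArgument.TruthValue;
import org.apache.hadoop.hive.ql.io.sarg.SearchArgumentFactory;

public class SearchArgumentSketch {
  public static void main(String[] args) {
    // Build (or (lessThan x 10) (equals y 'hello')); build() normalizes
    // the tree into CNF and compacts the leaf ids.
    SearchArgument sarg = SearchArgumentFactory.newBuilder()
        .startOr()
          .lessThan("x", PredicateLeaf.Type.INTEGER, 10)
          .equals("y", PredicateLeaf.Type.STRING, "hello")
        .end()
        .build();

    // A file format first asks which leaves it has to evaluate; the array
    // passed to evaluate() must be indexed in this order.
    List<PredicateLeaf> leaves = sarg.getLeaves();

    // Derive one TruthValue per leaf, normally from column statistics.
    // YES_NO here stands in for "the stats cannot decide either way".
    TruthValue[] values = new TruthValue[leaves.size()];
    for (int i = 0; i < values.length; ++i) {
      values[i] = TruthValue.YES_NO;
    }

    // Only read the stripe/row group if some rows may qualify.
    System.out.println("needed = " + sarg.evaluate(values).isNeeded());
  }
}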