From 4c6be08440965893b25cdd74f198784844acfdc5 Mon Sep 17 00:00:00 2001 From: Ashutosh Chauhan Date: Thu, 1 Feb 2018 23:37:51 -0800 Subject: [PATCH] HIVE-18611 : Avoid memory allocation of aggregation buffer during stats computation --- .../ql/optimizer/stats/annotation/StatsRulesProcFactory.java | 7 ++++++- .../hadoop/hive/ql/udf/generic/GenericUDAFBloomFilter.java | 12 +++++++++++- .../hadoop/hive/ql/udf/generic/GenericUDAFEvaluator.java | 11 +++++++++++ .../src/java/org/apache/hive/common/util/BloomKFilter.java | 6 +++--- 4 files changed, 31 insertions(+), 5 deletions(-) diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java index cbadfa4f07..9a3f81c98f 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java @@ -488,7 +488,7 @@ private long evaluateInExpr(Statistics stats, ExprNodeDesc pred, long currNumRow factor *= columnFactor > 1d ? 1d : columnFactor; } float inFactor = HiveConf.getFloatVar(aspCtx.getConf(), HiveConf.ConfVars.HIVE_STATS_IN_CLAUSE_FACTOR); - return Math.round( (double) numRows * factor * inFactor); + return Math.round( numRows * factor * inFactor); } private long evaluateBetweenExpr(Statistics stats, ExprNodeDesc pred, long currNumRows, AnnotateStatsProcCtx aspCtx, @@ -1313,6 +1313,11 @@ private boolean checkMapSideAggregation(GroupByOperator gop, // each evaluator has constant java object overhead avgValSize += gop.javaObjectOverHead; GenericUDAFEvaluator.AggregationBuffer agg = null; + int evaluatorEstimate = aggregationEvaluators[i].estimate(); + if (evaluatorEstimate > 0) { + avgValSize += evaluatorEstimate; + continue; + } try { agg = aggregationEvaluators[i].getNewAggregationBuffer(); } catch (HiveException e) { diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFBloomFilter.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFBloomFilter.java index 0c92d2acab..ca8bc8f42e 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFBloomFilter.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFBloomFilter.java @@ -82,7 +82,7 @@ public GenericUDAFEvaluator getEvaluator(TypeInfo[] parameters) throws SemanticE private PrimitiveObjectInspector inputOI; // Bloom filter rest - private ByteArrayOutputStream result = new ByteArrayOutputStream(); + private final ByteArrayOutputStream result = new ByteArrayOutputStream(); private transient byte[] scratchBuffer = new byte[HiveDecimal.SCRATCH_BUFFER_LEN_TO_BYTES]; @@ -102,6 +102,16 @@ public ObjectInspector init(Mode m, ObjectInspector[] parameters) throws HiveExc return PrimitiveObjectInspectorFactory.writableBinaryObjectInspector; } + @Override + public int estimate() { + long entries = Math.min(getExpectedEntries(), maxEntries); + long numBits = (long) (-entries * Math.log(BloomKFilter.DEFAULT_FPP) / (Math.log(2) * Math.log(2))); + int nLongs = (int) Math.ceil((double) numBits / (double) Long.SIZE); + // additional bits to pad long array to block size + int padLongs = 8 - nLongs % 8; + return (nLongs + padLongs) * Long.SIZE / 8; + } + /** * Class for storing the BloomFilter */ diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFEvaluator.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFEvaluator.java index c3498b796d..3a3e4b6158 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFEvaluator.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFEvaluator.java @@ -69,6 +69,17 @@ public static boolean isEstimable(AggregationBuffer buffer) { return false; } + /** + * Although similar to AbstractAggregationBuffer::estimate(), it differs from it in 2 aspects + * 1) This avoids creation of AggregationBuffer which may result in large memory allocation + * 2) This is used only while compiling query as oppose to AbstractAggregationBuffer version + * which may be used in both runtime as well as compile time. + * @return + */ + public int estimate() { + return -1; + } + /** * Mode. * diff --git a/storage-api/src/java/org/apache/hive/common/util/BloomKFilter.java b/storage-api/src/java/org/apache/hive/common/util/BloomKFilter.java index 9ecc2ba98e..6ccf5ab4dd 100644 --- a/storage-api/src/java/org/apache/hive/common/util/BloomKFilter.java +++ b/storage-api/src/java/org/apache/hive/common/util/BloomKFilter.java @@ -36,15 +36,15 @@ * This implementation has much lesser L1 data cache misses than {@link BloomFilter}. */ public class BloomKFilter { - private byte[] BYTE_ARRAY_4 = new byte[4]; - private byte[] BYTE_ARRAY_8 = new byte[8]; + private final byte[] BYTE_ARRAY_4 = new byte[4]; + private final byte[] BYTE_ARRAY_8 = new byte[8]; public static final float DEFAULT_FPP = 0.05f; private static final int DEFAULT_BLOCK_SIZE = 8; private static final int DEFAULT_BLOCK_SIZE_BITS = (int) (Math.log(DEFAULT_BLOCK_SIZE) / Math.log(2)); private static final int DEFAULT_BLOCK_OFFSET_MASK = DEFAULT_BLOCK_SIZE - 1; private static final int DEFAULT_BIT_OFFSET_MASK = Long.SIZE - 1; private final long[] masks = new long[DEFAULT_BLOCK_SIZE]; - private BitSet bitSet; + private final BitSet bitSet; private final int m; private final int k; // spread k-1 bits to adjacent longs, default is 8 -- 2.14.3 (Apple Git-98)