From 5554d92347e1b3e36d79c1ed0c2a7a6465d98c5c Mon Sep 17 00:00:00 2001 From: Zhong Date: Mon, 18 Sep 2017 14:53:49 +0800 Subject: [PATCH] APACHE-KYLIN-2867: split large fuzzy key set --- .../org/apache/kylin/common/KylinConfigBase.java | 4 ++ .../storage/gtrecord/CubeScanRangePlanner.java | 49 +++++++++++++++++----- 2 files changed, 43 insertions(+), 10 deletions(-) diff --git a/core-common/src/main/java/org/apache/kylin/common/KylinConfigBase.java b/core-common/src/main/java/org/apache/kylin/common/KylinConfigBase.java index 66805df..914ba7e 100644 --- a/core-common/src/main/java/org/apache/kylin/common/KylinConfigBase.java +++ b/core-common/src/main/java/org/apache/kylin/common/KylinConfigBase.java @@ -895,6 +895,10 @@ abstract public class KylinConfigBase implements Serializable { return Integer.parseInt(this.getOptional("kylin.storage.hbase.max-fuzzykey-scan", "200")); } + public int getQueryScanFuzzyKeySplitMax() { + return Integer.parseInt(this.getOptional("kylin.storage.hbase.max-fuzzykey-scan-split", "1")); + } + public int getQueryStorageVisitScanRangeMax() { return Integer.valueOf(this.getOptional("kylin.storage.hbase.max-visit-scanrange", "1000000")); } diff --git a/core-storage/src/main/java/org/apache/kylin/storage/gtrecord/CubeScanRangePlanner.java b/core-storage/src/main/java/org/apache/kylin/storage/gtrecord/CubeScanRangePlanner.java index 7e6f7c4..ef1114a 100644 --- a/core-storage/src/main/java/org/apache/kylin/storage/gtrecord/CubeScanRangePlanner.java +++ b/core-storage/src/main/java/org/apache/kylin/storage/gtrecord/CubeScanRangePlanner.java @@ -63,6 +63,7 @@ public class CubeScanRangePlanner extends ScanRangePlannerBase { private static final Logger logger = LoggerFactory.getLogger(CubeScanRangePlanner.class); protected int maxScanRanges; + protected int maxFuzzyKeysPerSplit; protected int maxFuzzyKeys; //non-GT @@ -77,7 +78,8 @@ public class CubeScanRangePlanner extends ScanRangePlannerBase { this.context = context; this.maxScanRanges = 
cubeSegment.getConfig().getQueryStorageVisitScanRangeMax(); - this.maxFuzzyKeys = cubeSegment.getConfig().getQueryScanFuzzyKeyMax(); + this.maxFuzzyKeysPerSplit = cubeSegment.getConfig().getQueryScanFuzzyKeyMax(); + this.maxFuzzyKeys = maxFuzzyKeysPerSplit * cubeSegment.getConfig().getQueryScanFuzzyKeySplitMax(); this.cubeSegment = cubeSegment; this.cubeDesc = cubeSegment.getCubeDesc(); @@ -124,7 +126,8 @@ public class CubeScanRangePlanner extends ScanRangePlannerBase { public CubeScanRangePlanner(GTInfo info, Pair gtStartAndEnd, TblColRef gtPartitionCol, TupleFilter gtFilter) { this.maxScanRanges = KylinConfig.getInstanceFromEnv().getQueryStorageVisitScanRangeMax(); - this.maxFuzzyKeys = KylinConfig.getInstanceFromEnv().getQueryScanFuzzyKeyMax(); + this.maxFuzzyKeysPerSplit = KylinConfig.getInstanceFromEnv().getQueryScanFuzzyKeyMax(); + this.maxFuzzyKeys = maxFuzzyKeysPerSplit * KylinConfig.getInstanceFromEnv().getQueryScanFuzzyKeySplitMax(); this.gtInfo = info; @@ -172,6 +175,7 @@ public class CubeScanRangePlanner extends ScanRangePlannerBase { } List mergedRanges = mergeOverlapRanges(scanRanges); + mergedRanges = splitFuzzyKeys(mergedRanges); mergedRanges = mergeTooManyRanges(mergedRanges, maxScanRanges); return mergedRanges; @@ -196,8 +200,6 @@ public class CubeScanRangePlanner extends ScanRangePlannerBase { GTRecord pkEnd = new GTRecord(gtInfo); Map> fuzzyValues = Maps.newHashMap(); - List fuzzyKeys; - for (ColumnRange range : andDimRanges) { if (gtPartitionCol != null && range.column.equals(gtPartitionCol)) { int beginCompare = rangeStartEndComparator.comparator.compare(range.begin, gtStartAndEnd.getSecond()); @@ -224,9 +226,8 @@ public class CubeScanRangePlanner extends ScanRangePlannerBase { } } - fuzzyKeys = + List fuzzyKeys = buildFuzzyKeys(fuzzyValues); - buildFuzzyKeys(fuzzyValues); return new GTScanRange(pkStart, pkEnd, fuzzyKeys); } @@ -243,7 +244,6 @@ public class CubeScanRangePlanner extends ScanRangePlannerBase { } List> fuzzyValueCombinations = 
FuzzyValueCombination.calculate(fuzzyValueSet, maxFuzzyKeys); - for (Map fuzzyValue : fuzzyValueCombinations) { // BitSet bitSet = new BitSet(gtInfo.getColumnCount()); @@ -309,7 +309,7 @@ public class CubeScanRangePlanner extends ScanRangePlannerBase { GTRecord start = first.pkStart; GTRecord end = first.pkEnd; - List newFuzzyKeys = new ArrayList(); + Set newFuzzyKeys = Sets.newLinkedHashSet(); boolean hasNonFuzzyRange = false; for (GTScanRange range : ranges) { @@ -319,12 +319,15 @@ public class CubeScanRangePlanner extends ScanRangePlannerBase { } // if any range is non-fuzzy, then all fuzzy keys must be cleared - // also too many fuzzy keys will slow down HBase scan + // too many fuzzy keys will slow down HBase scan if (hasNonFuzzyRange || newFuzzyKeys.size() > maxFuzzyKeys) { + if (newFuzzyKeys.size() > maxFuzzyKeys) { + logger.debug("too many FuzzyKeys, clean it!"); + } newFuzzyKeys.clear(); } - return new GTScanRange(start, end, newFuzzyKeys); + return new GTScanRange(start, end, Lists.newArrayList(newFuzzyKeys)); } protected List mergeTooManyRanges(List ranges, int maxRanges) { @@ -336,6 +339,32 @@ public class CubeScanRangePlanner extends ScanRangePlannerBase { List result = new ArrayList(1); GTScanRange mergedRange = mergeKeyRange(ranges); result.add(mergedRange); + + result = splitFuzzyKeys(result); + return result; + } + + private List splitFuzzyKeys(List mergedRanges) { + List result = Lists.newArrayList(); + for (GTScanRange range : mergedRanges) { + // a fuzzy key set larger than maxFuzzyKeysPerSplit but still within maxFuzzyKeys is sorted in place and cut into nSplit balanced sub-lists, so that no sub-list exceeds maxFuzzyKeysPerSplit; any other range is passed through unchanged. + if (range.fuzzyKeys.size() > maxFuzzyKeysPerSplit && range.fuzzyKeys.size() <= maxFuzzyKeys) { + List fuzzyKeys = range.fuzzyKeys; + Collections.sort(fuzzyKeys); + int nFuzzyKeys = fuzzyKeys.size(); + int nSplit = (nFuzzyKeys - 1) / maxFuzzyKeysPerSplit + 1; + int startIndex = 0; + for (int i = 1; i <= nSplit; i++) {
+ int endIndex = (int) ((long) i * nFuzzyKeys / nSplit); + List subFuzzyKeys = fuzzyKeys.subList(startIndex, endIndex); + result.add(new GTScanRange(range.pkStart, range.pkEnd, subFuzzyKeys)); + startIndex = endIndex; + } + logger.debug("large FuzzyKeys split size : " + nSplit); + } else { + result.add(range); + } + } return result; } -- 2.5.4 (Apple Git-61)