From b256585b20ad15de358bbf72de81089c5fdaeea8 Mon Sep 17 00:00:00 2001 From: Zhong Date: Wed, 15 Aug 2018 16:04:30 +0800 Subject: [PATCH] KYLIN-3489 improve the efficiency of enumerating dictionary values by pre-order visiting --- .../org/apache/kylin/common/util/Dictionary.java | 11 +++++ .../java/org/apache/kylin/dict/TrieDictionary.java | 51 ++++++++++++++++++++++ .../org/apache/kylin/dict/TrieDictionaryTest.java | 30 +++++++++++++ 3 files changed, 92 insertions(+) diff --git a/core-common/src/main/java/org/apache/kylin/common/util/Dictionary.java b/core-common/src/main/java/org/apache/kylin/common/util/Dictionary.java index 7c1e8ab..5c47399 100644 --- a/core-common/src/main/java/org/apache/kylin/common/util/Dictionary.java +++ b/core-common/src/main/java/org/apache/kylin/common/util/Dictionary.java @@ -24,11 +24,14 @@ import java.io.IOException; import java.io.PrintStream; import java.io.Serializable; import java.io.UnsupportedEncodingException; +import java.util.List; import org.apache.kylin.common.KylinConfig; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import com.google.common.collect.Lists; + /** * A bi-way dictionary that maps from dimension/column values to IDs and vice * versa. By storing IDs instead of real values, the size of cube is @@ -170,6 +173,14 @@ abstract public class Dictionary implements Serializable { abstract public void dump(PrintStream out); + public List enumeratorValues() { + List ret = Lists.newArrayListWithExpectedSize(getSize()); + for (int i = getMinId(); i <= getMaxId(); i++) { + ret.add(getValueFromId(i)); + } + return ret; + } + public int nullId() { return NULL_ID[getSizeOfId()]; } diff --git a/core-dictionary/src/main/java/org/apache/kylin/dict/TrieDictionary.java b/core-dictionary/src/main/java/org/apache/kylin/dict/TrieDictionary.java index d531c05..8303bb0 100644 --- a/core-dictionary/src/main/java/org/apache/kylin/dict/TrieDictionary.java +++ b/core-dictionary/src/main/java/org/apache/kylin/dict/TrieDictionary.java @@ -28,6 +28,7 @@ import java.io.ObjectInputStream; import java.io.ObjectOutputStream; import java.io.PrintStream; import java.util.Arrays; +import java.util.List; import org.apache.kylin.common.util.Bytes; import org.apache.kylin.common.util.BytesUtil; @@ -36,7 +37,9 @@ import org.apache.kylin.common.util.Dictionary; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import com.google.common.annotations.VisibleForTesting; import com.google.common.base.Preconditions; +import com.google.common.collect.Lists; /** * A dictionary based on Trie data structure that maps enumerations of byte[] to @@ -359,6 +362,54 @@ public class TrieDictionary extends CacheDictionary { } @Override + public List enumeratorValues() { + List result = Lists.newArrayListWithExpectedSize(getSize()); + byte[] buf = new byte[maxValueLength]; + visitNode(headSize, buf, 0, result); + return result; + } + + @VisibleForTesting + List enumeratorValuesByParent() { + return super.enumeratorValues(); + } + + /** + * Visit the trie tree by pre-order + * @param n -- the offset of current node in trieBytes + * @param returnValue -- where return value is written to + */ + private void visitNode(int n, byte[] returnValue, int offset, List result) { + int o = offset; + + // write current node value + int p = n + firstByteOffset; + int len = BytesUtil.readUnsigned(trieBytes, p - 1, 1); + System.arraycopy(trieBytes, p, returnValue, o, len); + o += len; + + // if the value is ended + boolean isEndOfValue = checkFlag(n, BIT_IS_END_OF_VALUE); + if (isEndOfValue) { + T curNodeValue = bytesConvert.convertFromBytes(returnValue, 0, o); + result.add(curNodeValue); + } + + // find a child to continue + int c = getChildOffset(n); + if (c == headSize) // has no children + return; + while (true) { + visitNode(c, returnValue, o, result); + if (checkFlag(c, BIT_IS_LAST_CHILD)) + return; + // go to next child + p = c + firstByteOffset; + c = p + BytesUtil.readUnsigned(trieBytes, p - 1, 1); + } + } + + @Override public void dump(PrintStream out) { out.println("Total " + nValues + " values"); for (int i = 0; i < nValues; i++) { diff --git a/core-dictionary/src/test/java/org/apache/kylin/dict/TrieDictionaryTest.java b/core-dictionary/src/test/java/org/apache/kylin/dict/TrieDictionaryTest.java index 600b072..c873035 100644 --- a/core-dictionary/src/test/java/org/apache/kylin/dict/TrieDictionaryTest.java +++ b/core-dictionary/src/test/java/org/apache/kylin/dict/TrieDictionaryTest.java @@ -35,13 +35,18 @@ import java.io.InputStreamReader; import java.util.ArrayList; import java.util.HashMap; import java.util.Iterator; +import java.util.List; import java.util.NoSuchElementException; import java.util.Random; import java.util.TreeSet; +import java.util.concurrent.TimeUnit; import org.junit.Assert; import org.junit.Test; +import com.google.common.base.Stopwatch; +import com.google.common.collect.Sets; + public class TrieDictionaryTest { public static void main(String[] args) throws Exception { @@ -209,6 +214,31 @@ public class TrieDictionaryTest { } @Test + public void testEnumeratorValues() throws Exception { + testEnumeratorValues("src/test/resources/dict/english-words.80 (scowl-2015.05.18).txt"); + testEnumeratorValues("src/test/resources/dict/dw_category_grouping_names.dat"); + } + + private void testEnumeratorValues(String file) throws Exception { + InputStream is = new FileInputStream(file); + ArrayList str = loadStrings(is); + TrieDictionaryBuilder b = newDictBuilder(str); + TrieDictionary dict = b.build(0); + System.out.println("Dictionary size for file " + file + " is " + dict.getSize()); + + Stopwatch sw = new Stopwatch(); + sw.start(); + List values1 = dict.enumeratorValuesByParent(); + System.out.println("By iterating id visit the time cost " + sw.elapsed(TimeUnit.MILLISECONDS) + " ms"); + sw.reset(); + sw.start(); + List values2 = dict.enumeratorValues(); + System.out.println("By pre-order visit the time cost " + sw.elapsed(TimeUnit.MILLISECONDS) + " ms"); + sw.stop(); + assertEquals(Sets.newHashSet(values1), Sets.newHashSet(values2)); + } + + @Test public void englishWordsTest() throws Exception { InputStream is = new FileInputStream("src/test/resources/dict/english-words.80 (scowl-2015.05.18).txt"); ArrayList str = loadStrings(is); -- 2.5.4 (Apple Git-61)