Index: common/src/java/org/apache/hadoop/hive/conf/HiveConf.java =================================================================== --- common/src/java/org/apache/hadoop/hive/conf/HiveConf.java +++ common/src/java/org/apache/hadoop/hive/conf/HiveConf.java @@ -502,6 +502,8 @@ // Maximum fraction of heap that can be used by ORC file writers HIVE_ORC_FILE_MEMORY_POOL("hive.exec.orc.memory.pool", 0.5f), // 50% + HIVE_ORC_DICTIONARY_KEY_SIZE_THRESHOLD("hive.exec.orc.dictionary.key.size.threshold", 0.8f), + HIVESKEWJOIN("hive.optimize.skewjoin", false), HIVECONVERTJOIN("hive.auto.convert.join", true), HIVECONVERTJOINNOCONDITIONALTASK("hive.auto.convert.join.noconditionaltask", true), Index: conf/hive-default.xml.template =================================================================== --- conf/hive-default.xml.template +++ conf/hive-default.xml.template @@ -1701,6 +1701,15 @@ + hive.exec.orc.dictionary.key.size.threshold + 0.8 + + If the number of keys in a dictionary is greater than this fraction of the total number of + non-null rows, turn off dictionary encoding. Use 1 to always use dictionary encoding. + + + + hive.multi.insert.move.tasks.share.dependencies false Index: ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcFile.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcFile.java +++ ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcFile.java @@ -97,7 +97,7 @@ CompressionKind compress, int bufferSize, int rowIndexStride) throws IOException { - return new WriterImpl(fs, path, inspector, stripeSize, compress, + return new WriterImpl(fs, path, conf, inspector, stripeSize, compress, bufferSize, rowIndexStride, getMemoryManager(conf)); } Index: ql/src/java/org/apache/hadoop/hive/ql/io/orc/OutStream.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/io/orc/OutStream.java +++ ql/src/java/org/apache/hadoop/hive/ql/io/orc/OutStream.java @@ -76,11 +76,7 @@ } public void clear() throws IOException { - uncompressedBytes = 0; - compressedBytes = 0; - compressed = null; - overflow = null; - current = null; + flush(); suppress = false; } @@ -246,7 +242,10 @@ receiver.output(compressed); compressed = null; } - clear(); + uncompressedBytes = 0; + compressedBytes = 0; + overflow = null; + current = null; } @Override Index: ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReaderImpl.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReaderImpl.java +++ ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReaderImpl.java @@ -27,6 +27,7 @@ import java.util.List; import java.util.Map; +import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; @@ -713,18 +714,21 @@ } } + /** + * A tree reader that will read string columns. At the start of the + * stripe, it creates an internal reader based on whether a direct or + * dictionary encoding was used. 
+ */ private static class StringTreeReader extends TreeReader { - private DynamicByteArray dictionaryBuffer = null; - private int dictionarySize; - private int[] dictionaryOffsets; - private RunLengthIntegerReader reader; + private TreeReader reader; StringTreeReader(Path path, int columnId) { super(path, columnId); } void checkEncoding(OrcProto.ColumnEncoding encoding) throws IOException { - if (encoding.getKind() != OrcProto.ColumnEncoding.Kind.DICTIONARY) { + if (encoding.getKind() != OrcProto.ColumnEncoding.Kind.DICTIONARY && + encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT) { throw new IOException("Unknown encoding " + encoding + " in column " + columnId + " of " + path); } @@ -734,10 +738,138 @@ void startStripe(Map streams, List encodings ) throws IOException { + // For each stripe, checks the encoding and initializes the appropriate + // reader + switch (encodings.get(columnId).getKind()) { + case DIRECT: + reader = new StringDirectTreeReader(path, columnId); + break; + case DICTIONARY: + reader = new StringDictionaryTreeReader(path, columnId); + break; + default: + throw new IllegalArgumentException("Unsupported encoding " + + encodings.get(columnId).getKind()); + } + reader.startStripe(streams, encodings); + } + + @Override + void seek(PositionProvider[] index) throws IOException { + reader.seek(index); + } + + @Override + Object next(Object previous) throws IOException { + return reader.next(previous); + } + + @Override + void skipRows(long items) throws IOException { + reader.skipRows(items); + } + } + + /** + * A reader for string columns that are direct encoded in the current + * stripe. + */ + private static class StringDirectTreeReader extends TreeReader { + private InStream stream; + private RunLengthIntegerReader lengths; + + StringDirectTreeReader(Path path, int columnId) { + super(path, columnId); + } + + @Override + void checkEncoding(OrcProto.ColumnEncoding encoding) throws IOException { + // PASS + } + + @Override + void startStripe(Map streams, + List encodings + ) throws IOException { + super.startStripe(streams, encodings); + StreamName name = new StreamName(columnId, + OrcProto.Stream.Kind.DATA); + stream = streams.get(name); + lengths = new RunLengthIntegerReader(streams.get(new + StreamName(columnId, OrcProto.Stream.Kind.LENGTH)), + false); + } + + @Override + void seek(PositionProvider[] index) throws IOException { + super.seek(index); + stream.seek(index[columnId]); + lengths.seek(index[columnId]); + } + + @Override + Object next(Object previous) throws IOException { + super.next(previous); + Text result = null; + if (valuePresent) { + if (previous == null) { + result = new Text(); + } else { + result = (Text) previous; + } + int len = (int) lengths.next(); + int offset = 0; + byte[] bytes = new byte[len]; + while (len > 0) { + int written = stream.read(bytes, offset, len); + if (written < 0) { + throw new EOFException("Can't finish byte read from " + stream); + } + len -= written; + offset += written; + } + result.set(bytes); + } + return result; + } + + @Override + void skipRows(long items) throws IOException { + items = countNonNulls(items); + long lengthToSkip = 0; + for(int i=0; i < items; ++i) { + lengthToSkip += lengths.next(); + } + stream.skip(lengthToSkip); + } + } + + /** + * A reader for string columns that are dictionary encoded in the current + * stripe. 
+ */ + private static class StringDictionaryTreeReader extends TreeReader { + private DynamicByteArray dictionaryBuffer; + private int[] dictionaryOffsets; + private RunLengthIntegerReader reader; + + StringDictionaryTreeReader(Path path, int columnId) { + super(path, columnId); + } + + @Override + void checkEncoding(OrcProto.ColumnEncoding encoding) throws IOException { + // PASS + } + + @Override + void startStripe(Map streams, + List encodings + ) throws IOException { super.startStripe(streams, encodings); // read the dictionary blob - dictionarySize = encodings.get(columnId).getDictionarySize(); + int dictionarySize = encodings.get(columnId).getDictionarySize(); StreamName name = new StreamName(columnId, OrcProto.Stream.Kind.DICTIONARY_DATA); InStream in = streams.get(name); Index: ql/src/java/org/apache/hadoop/hive/ql/io/orc/StringRedBlackTree.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/io/orc/StringRedBlackTree.java +++ ql/src/java/org/apache/hadoop/hive/ql/io/orc/StringRedBlackTree.java @@ -17,11 +17,11 @@ */ package org.apache.hadoop.hive.ql.io.orc; -import org.apache.hadoop.io.Text; - import java.io.IOException; import java.io.OutputStream; +import org.apache.hadoop.io.Text; + /** * A red-black tree that stores strings. The strings are stored as UTF-8 bytes * and an offset for each entry. @@ -147,7 +147,7 @@ /** * Visit all of the nodes in the tree in sorted order. - * @param visitor the action to be applied to each ndoe + * @param visitor the action to be applied to each node * @throws IOException */ public void visit(Visitor visitor) throws IOException { @@ -163,6 +163,17 @@ keyOffsets.clear(); } + public void getText(Text result, int originalPosition) { + int offset = keyOffsets.get(originalPosition); + int length; + if (originalPosition + 1 == keyOffsets.size()) { + length = byteArray.size() - offset; + } else { + length = keyOffsets.get(originalPosition + 1) - offset; + } + byteArray.setText(result, offset, length); + } + /** * Get the size of the character data in the table. * @return the bytes used by the table Index: ql/src/java/org/apache/hadoop/hive/ql/io/orc/WriterImpl.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/io/orc/WriterImpl.java +++ ql/src/java/org/apache/hadoop/hive/ql/io/orc/WriterImpl.java @@ -27,11 +27,16 @@ import java.util.Map; import java.util.TreeMap; + +import com.google.protobuf.ByteString; +import com.google.protobuf.CodedOutputStream; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FSDataOutputStream; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.common.type.HiveDecimal; import org.apache.hadoop.hive.ql.io.orc.OrcProto.RowIndexEntry; import org.apache.hadoop.hive.serde2.io.DateWritable; @@ -55,9 +60,7 @@ import org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.primitive.TimestampObjectInspector; import org.apache.hadoop.io.BytesWritable; - -import com.google.protobuf.ByteString; -import com.google.protobuf.CodedOutputStream; +import org.apache.hadoop.io.Text; /** * An ORC file writer. 
The file is divided into stripes, which is the natural @@ -111,16 +114,20 @@ private final boolean buildIndex; private final MemoryManager memoryManager; + private final Configuration conf; + WriterImpl(FileSystem fs, Path path, + Configuration conf, ObjectInspector inspector, long stripeSize, CompressionKind compress, int bufferSize, int rowIndexStride, MemoryManager memoryManager) throws IOException { this.fs = fs; this.path = path; + this.conf = conf; this.stripeSize = stripeSize; this.compress = compress; this.bufferSize = bufferSize; @@ -344,6 +351,14 @@ public boolean isCompressed() { return codec != null; } + + /** + * Get the writer's configuration. + * @return configuration + */ + public Configuration getConfiguration() { + return conf; + } } /** @@ -760,16 +775,22 @@ private static class StringTreeWriter extends TreeWriter { private static final int INITIAL_DICTIONARY_SIZE = 4096; - private final PositionedOutputStream stringOutput; + private final OutStream stringOutput; private final RunLengthIntegerWriter lengthOutput; private final RunLengthIntegerWriter rowOutput; private final StringRedBlackTree dictionary = new StringRedBlackTree(INITIAL_DICTIONARY_SIZE); private final DynamicIntArray rows = new DynamicIntArray(); + private final PositionedOutputStream directStreamOutput; + private final RunLengthIntegerWriter directLengthOutput; private final List savedRowIndex = new ArrayList(); private final boolean buildIndex; private final List rowIndexValueCount = new ArrayList(); + // If the number of keys in a dictionary is greater than this fraction of + //the total number of non-null rows, turn off dictionary encoding + private final float dictionaryKeySizeThreshold; + private boolean useDictionaryEncoding = true; StringTreeWriter(int columnId, ObjectInspector inspector, @@ -785,6 +806,14 @@ recordPosition(rowIndexPosition); rowIndexValueCount.add(0L); buildIndex = writer.buildIndex(); + directStreamOutput = writer.createStream(id, OrcProto.Stream.Kind.DATA); + directLengthOutput = + new RunLengthIntegerWriter(writer.createStream + (id, OrcProto.Stream.Kind.LENGTH), false); + dictionaryKeySizeThreshold = writer.getConfiguration().getFloat( + HiveConf.ConfVars.HIVE_ORC_DICTIONARY_KEY_SIZE_THRESHOLD.varname, + HiveConf.ConfVars.HIVE_ORC_DICTIONARY_KEY_SIZE_THRESHOLD. + defaultFloatVal); } @Override @@ -801,22 +830,36 @@ @Override void writeStripe(OrcProto.StripeFooter.Builder builder, int requiredIndexEntries) throws IOException { - // Traverse the red-black tree writing out the bytes and lengths; and - // creating the map from the original order to the final sorted order. + // Set the flag indicating whether or not to use dictionary encoding + // based on whether or not the fraction of distinct keys over number of + // non-null rows is less than the configured threshold + useDictionaryEncoding = rows.size() > 0 && + (float)(dictionary.size()) / rows.size() <= + dictionaryKeySizeThreshold; final int[] dumpOrder = new int[dictionary.size()]; - dictionary.visit(new StringRedBlackTree.Visitor() { - private int currentId = 0; - @Override - public void visit(StringRedBlackTree.VisitorContext context - ) throws IOException { - context.writeBytes(stringOutput); - lengthOutput.write(context.getLength()); - dumpOrder[context.getOriginalPosition()] = currentId++; - } - }); + + if (useDictionaryEncoding) { + // Write the dictionary by traversing the red-black tree writing out + // the bytes and lengths; and creating the map from the original order + // to the final sorted order. 
+ dictionary.visit(new StringRedBlackTree.Visitor() { + private int currentId = 0; + @Override + public void visit(StringRedBlackTree.VisitorContext context + ) throws IOException { + context.writeBytes(stringOutput); + lengthOutput.write(context.getLength()); + dumpOrder[context.getOriginalPosition()] = currentId++; + } + }); + } else { + // for direct encoding, we don't want the dictionary data stream + stringOutput.suppress(); + } int length = rows.size(); int rowIndexEntry = 0; OrcProto.RowIndex.Builder rowIndex = getRowIndex(); + Text text = new Text(); // write the values translated into the dump order. for(int i = 0; i <= length; ++i) { // now that we are writing out the row values, we can finalize the @@ -826,20 +869,34 @@ rowIndexEntry < savedRowIndex.size()) { OrcProto.RowIndexEntry.Builder base = savedRowIndex.get(rowIndexEntry++).toBuilder(); - rowOutput.getPosition(new RowIndexPositionRecorder(base)); + if (useDictionaryEncoding) { + rowOutput.getPosition(new RowIndexPositionRecorder(base)); + } else { + PositionRecorder posn = new RowIndexPositionRecorder(base); + directStreamOutput.getPosition(posn); + directLengthOutput.getPosition(posn); + } rowIndex.addEntry(base.build()); } } if (i != length) { - rowOutput.write(dumpOrder[rows.get(i)]); + if (useDictionaryEncoding) { + rowOutput.write(dumpOrder[rows.get(i)]); + } else { + dictionary.getText(text, rows.get(i)); + directStreamOutput.write(text.getBytes(), 0, text.getLength()); + directLengthOutput.write(text.getLength()); + } } } // we need to build the rowindex before calling super, since it // writes it out. super.writeStripe(builder, requiredIndexEntries); stringOutput.flush(); lengthOutput.flush(); rowOutput.flush(); + directStreamOutput.flush(); + directLengthOutput.flush(); // reset all of the fields to be ready for the next stripe. dictionary.clear(); rows.clear(); @@ -849,11 +906,27 @@ rowIndexValueCount.add(0L); } + // Calls getPosition on the row output stream if dictionary encoding is used, and the direct + // output stream if direct encoding is used + private void recordOutputPosition(OrcProto.RowIndexEntry.Builder base) throws IOException { + if (useDictionaryEncoding) { + rowOutput.getPosition(new RowIndexPositionRecorder(base)); + } else { + directStreamOutput.getPosition(new RowIndexPositionRecorder(base)); + } + } + @Override OrcProto.ColumnEncoding getEncoding() { - return OrcProto.ColumnEncoding.newBuilder().setKind( - OrcProto.ColumnEncoding.Kind.DICTIONARY). - setDictionarySize(dictionary.size()).build(); + // Returns the encoding used for the last call to writeStripe + if (useDictionaryEncoding) { + return OrcProto.ColumnEncoding.newBuilder().setKind( + OrcProto.ColumnEncoding.Kind.DICTIONARY). 
+ setDictionarySize(dictionary.size()).build(); + } else { + return OrcProto.ColumnEncoding.newBuilder().setKind( + OrcProto.ColumnEncoding.Kind.DIRECT).build(); + } } /** Index: ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestFileDump.java =================================================================== --- ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestFileDump.java +++ ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestFileDump.java @@ -18,25 +18,24 @@ package org.apache.hadoop.hive.ql.io.orc; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; -import org.junit.Before; -import org.junit.Rule; -import org.junit.Test; -import org.junit.rules.TestName; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNull; import java.io.BufferedReader; import java.io.File; import java.io.FileOutputStream; import java.io.FileReader; import java.io.PrintStream; import java.util.Random; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertNull; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; +import org.junit.Before; +import org.junit.Test; public class TestFileDump { @@ -69,9 +68,6 @@ } } - private static final String outputFilename = - File.separator + "orc-file-dump.out"; - private static void checkOutput(String expected, String actual) throws Exception { BufferedReader eStream = @@ -114,8 +110,62 @@ } writer.close(); PrintStream origOut = System.out; - FileOutputStream myOut = new FileOutputStream(workDir + File.separator + - "orc-file-dump.out"); + String outputFilename = File.separator + "orc-file-dump.out"; + FileOutputStream myOut = new FileOutputStream(workDir + outputFilename); + + // replace stdout and run command + System.setOut(new PrintStream(myOut)); + FileDump.main(new String[]{testFilePath.toString()}); + System.out.flush(); + System.setOut(origOut); + + + checkOutput(resourceDir + outputFilename, workDir + outputFilename); + } + + // Test that if the fraction of rows that have distinct strings is greater than the configured + // threshold dictionary encoding is turned off. If dictionary encoding is turned off the length + // of the dictionary stream for the column will be 0 in the ORC file dump. 
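// Editor's note: a minimal, self-contained sketch (not Hive code; the class and
// method names are illustrative only) of the per-stripe decision described by
// hive.exec.orc.dictionary.key.size.threshold and exercised by the test below.
// Dictionary encoding is kept only while distinctKeys / nonNullRows stays at or
// below the threshold, so a threshold of 1 always keeps it and a negative
// threshold always forces the DIRECT encoding.
public class DictionaryThresholdSketch {
  static boolean useDictionary(int distinctKeys, int nonNullRows, float threshold) {
    // Mirrors the check added to StringTreeWriter.writeStripe() above.
    return nonNullRows > 0 && (float) distinctKeys / nonNullRows <= threshold;
  }

  public static void main(String[] args) {
    // Default threshold of 0.8: a column with mostly repeated values stays
    // dictionary encoded (1,000 distinct values in 21,000 rows is about 0.05).
    System.out.println(useDictionary(1000, 21000, 0.8f));   // true
    // testDictionaryThreshold lowers the threshold to 0.49 and writes each
    // string twice, so the distinct fraction is about 0.5 and the column is
    // written with the DIRECT encoding instead.
    System.out.println(useDictionary(10500, 21000, 0.49f)); // false
  }
}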
+ @Test + public void testDictionaryThreshold() throws Exception { + ObjectInspector inspector; + synchronized (TestOrcFile.class) { + inspector = ObjectInspectorFactory.getReflectionObjectInspector + (MyRecord.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA); + } + Configuration conf = new Configuration(); + conf.setFloat(HiveConf.ConfVars.HIVE_ORC_DICTIONARY_KEY_SIZE_THRESHOLD.varname, 0.49f); + Writer writer = OrcFile.createWriter(fs, testFilePath, conf, inspector, + 100000, CompressionKind.ZLIB, 10000, 10000); + Random r1 = new Random(1); + String[] words = new String[]{"It", "was", "the", "best", "of", "times,", + "it", "was", "the", "worst", "of", "times,", "it", "was", "the", "age", + "of", "wisdom,", "it", "was", "the", "age", "of", "foolishness,", "it", + "was", "the", "epoch", "of", "belief,", "it", "was", "the", "epoch", + "of", "incredulity,", "it", "was", "the", "season", "of", "Light,", + "it", "was", "the", "season", "of", "Darkness,", "it", "was", "the", + "spring", "of", "hope,", "it", "was", "the", "winter", "of", "despair,", + "we", "had", "everything", "before", "us,", "we", "had", "nothing", + "before", "us,", "we", "were", "all", "going", "direct", "to", + "Heaven,", "we", "were", "all", "going", "direct", "the", "other", + "way"}; + int nextInt = 0; + for(int i=0; i < 21000; ++i) { + // Write out the same string twice, this guarantees the fraction of rows with + // distinct strings is 0.5 + if (i % 2 == 0) { + nextInt = r1.nextInt(words.length); + // Append the value of i to the word, this guarantees when an index or word is repeated + // the actual string is unique. + words[nextInt] += "-" + i; + } + writer.addRow(new MyRecord(r1.nextInt(), r1.nextLong(), + words[nextInt])); + } + writer.close(); + PrintStream origOut = System.out; + String outputFilename = File.separator + "orc-file-dump-dictionary-threshold.out"; + FileOutputStream myOut = new FileOutputStream(workDir + outputFilename); // replace stdout and run command System.setOut(new PrintStream(myOut)); Index: ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestOrcFile.java =================================================================== --- ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestOrcFile.java +++ ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestOrcFile.java @@ -993,7 +993,7 @@ ObjectInspectorFactory.ObjectInspectorOptions.JAVA); } MyMemoryManager memory = new MyMemoryManager(conf, 10000, 0.1); - Writer writer = new WriterImpl(fs, testFilePath, inspector, + Writer writer = new WriterImpl(fs, testFilePath, conf, inspector, 50000, CompressionKind.NONE, 100, 0, memory); assertEquals(testFilePath, memory.path); for(int i=0; i < 2500; ++i) { Index: ql/src/test/queries/clientpositive/orc_dictionary_threshold.q =================================================================== --- /dev/null +++ ql/src/test/queries/clientpositive/orc_dictionary_threshold.q @@ -0,0 +1,60 @@ +set hive.exec.orc.dictionary.key.size.threshold=-1; + +-- Set the threshold to -1 to guarantee dictionary encoding is turned off +-- Tests that the data can be read back correctly when a string column is stored +-- without dictionary encoding + +CREATE TABLE test_orc (key STRING) +ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.orc.OrcSerde' +STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat' +OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat'; + +INSERT OVERWRITE TABLE test_orc SELECT key FROM src limit 10; + +-- Test reading the column back + +SELECT * FROM test_orc; + +ALTER TABLE test_orc 
SET SERDEPROPERTIES ('orc.stripe.size' = '1'); + +CREATE TABLE src_thousand(key STRING) STORED AS TEXTFILE; +LOAD DATA LOCAL INPATH '../data/files/kv1kv2.cogroup.txt' + INTO TABLE src_thousand; + +set hive.exec.orc.dictionary.key.size.threshold=0.5; + +-- Add data to the table in such a way that alternate stripes encode the column +-- differently. Setting orc.stripe.size = 1 guarantees the stripes each have +-- 5000 rows. The first stripe will have 5 * 630 distinct rows and thus be +-- above the cutoff of 50% and will be direct encoded. The second stripe +-- will have 5 * 1 distinct rows and thus be under the cutoff and will be +-- dictionary encoded. The final stripe will have 630 out of 1000 and be +-- direct encoded. + +INSERT OVERWRITE TABLE test_orc +SELECT key FROM ( +SELECT CONCAT("a", key) AS key FROM src_thousand +UNION ALL +SELECT CONCAT("b", key) AS key FROM src_thousand +UNION ALL +SELECT CONCAT("c", key) AS key FROM src_thousand +UNION ALL +SELECT CONCAT("d", key) AS key FROM src_thousand +UNION ALL +SELECT CONCAT("e", key) AS key FROM src_thousand +UNION ALL +SELECT CONCAT("f", 1) AS key FROM src_thousand +UNION ALL +SELECT CONCAT("g", 1) AS key FROM src_thousand +UNION ALL +SELECT CONCAT("h", 1) AS key FROM src_thousand +UNION ALL +SELECT CONCAT("i", 1) AS key FROM src_thousand +UNION ALL +SELECT CONCAT("j", 1) AS key FROM src_thousand +UNION ALL +SELECT CONCAT("k", key) AS key FROM src_thousand +) a ORDER BY key LIMIT 11000; + +SELECT SUM(HASH(key)) FROM test_orc; +! cp /Users/owen/work/code/hive/build/ql/test/data/warehouse/test_orc/000000_0 /tmp/owen.orc Index: ql/src/test/resources/orc-file-dump-dictionary-threshold.out =================================================================== --- /dev/null +++ ql/src/test/resources/orc-file-dump-dictionary-threshold.out @@ -0,0 +1,78 @@ +Structure for TestFileDump.testDump.orc +Rows: 21000 +Compression: ZLIB +Compression size: 10000 +Type: struct + +Statistics: + Column 0: count: 21000 + Column 1: count: 21000 min: -2147390285 max: 2147453086 sum: 109128518326 + Column 2: count: 21000 min: -9222731174895935707 max: 9222919052987871506 + Column 3: count: 21000 min: Darkness,-230 max: worst-54-290-346-648-908-996-1038-1080-1560-1584-1620-1744-1770-1798-1852-1966-2162-2244-2286-2296-2534-2660-3114-3676-3788-4068-4150-4706-4744-5350-5420-5582-5696-5726-6006-6020-6024-6098-6184-6568-6636-6802-6994-7004-7318-7498-7758-7780-7798-7920-7952-7960-7988-8232-8256-8390-8416-8478-8620-8840-8984-9038-9128-9236-9248-9344-9594-9650-9714-9928-9938-10178-10368-10414-10502-10732-10876-11008-11158-11410-11722-11836-11964-12054-12096-12126-12136-12202-12246-12298-12616-12774-12782-12790-12802-12976-13216-13246-13502-13766-14454-14974-15004-15124-15252-15294-15356-15530-15610-16316-16936-17024-17122-17214-17310-17528-17682-17742-17870-17878-18010-18410-18524-18788-19204-19254-19518-19596-19786-19874-19904-20390-20752-20936 + +Stripes: + Stripe: offset: 3 data: 107035 rows: 4000 tail: 65 index: 217 + Stream: column 0 section ROW_INDEX start: 3 length 10 + Stream: column 1 section ROW_INDEX start: 13 length 36 + Stream: column 2 section ROW_INDEX start: 49 length 39 + Stream: column 3 section ROW_INDEX start: 88 length 132 + Stream: column 1 section DATA start: 220 length 18043 + Stream: column 2 section DATA start: 18263 length 34740 + Stream: column 3 section DATA start: 53003 length 50887 + Stream: column 3 section LENGTH start: 103890 length 3365 + Encoding column 0: DIRECT + Encoding column 1: DIRECT + Encoding column 2: DIRECT + Encoding column 
3: DIRECT + Stripe: offset: 107320 data: 289727 rows: 5000 tail: 65 index: 349 + Stream: column 0 section ROW_INDEX start: 107320 length 10 + Stream: column 1 section ROW_INDEX start: 107330 length 36 + Stream: column 2 section ROW_INDEX start: 107366 length 39 + Stream: column 3 section ROW_INDEX start: 107405 length 264 + Stream: column 1 section DATA start: 107669 length 22581 + Stream: column 2 section DATA start: 130250 length 43426 + Stream: column 3 section DATA start: 173676 length 219588 + Stream: column 3 section LENGTH start: 393264 length 4132 + Encoding column 0: DIRECT + Encoding column 1: DIRECT + Encoding column 2: DIRECT + Encoding column 3: DIRECT + Stripe: offset: 397461 data: 496162 rows: 5000 tail: 66 index: 536 + Stream: column 0 section ROW_INDEX start: 397461 length 10 + Stream: column 1 section ROW_INDEX start: 397471 length 36 + Stream: column 2 section ROW_INDEX start: 397507 length 39 + Stream: column 3 section ROW_INDEX start: 397546 length 451 + Stream: column 1 section DATA start: 397997 length 22605 + Stream: column 2 section DATA start: 420602 length 43444 + Stream: column 3 section DATA start: 464046 length 425862 + Stream: column 3 section LENGTH start: 889908 length 4251 + Encoding column 0: DIRECT + Encoding column 1: DIRECT + Encoding column 2: DIRECT + Encoding column 3: DIRECT + Stripe: offset: 894225 data: 711982 rows: 5000 tail: 65 index: 677 + Stream: column 0 section ROW_INDEX start: 894225 length 10 + Stream: column 1 section ROW_INDEX start: 894235 length 36 + Stream: column 2 section ROW_INDEX start: 894271 length 39 + Stream: column 3 section ROW_INDEX start: 894310 length 592 + Stream: column 1 section DATA start: 894902 length 22591 + Stream: column 2 section DATA start: 917493 length 43414 + Stream: column 3 section DATA start: 960907 length 641580 + Stream: column 3 section LENGTH start: 1602487 length 4397 + Encoding column 0: DIRECT + Encoding column 1: DIRECT + Encoding column 2: DIRECT + Encoding column 3: DIRECT + Stripe: offset: 1606949 data: 350645 rows: 2000 tail: 66 index: 786 + Stream: column 0 section ROW_INDEX start: 1606949 length 10 + Stream: column 1 section ROW_INDEX start: 1606959 length 36 + Stream: column 2 section ROW_INDEX start: 1606995 length 39 + Stream: column 3 section ROW_INDEX start: 1607034 length 701 + Stream: column 1 section DATA start: 1607735 length 9027 + Stream: column 2 section DATA start: 1616762 length 17375 + Stream: column 3 section DATA start: 1634137 length 322259 + Stream: column 3 section LENGTH start: 1956396 length 1984 + Encoding column 0: DIRECT + Encoding column 1: DIRECT + Encoding column 2: DIRECT + Encoding column 3: DIRECT Index: ql/src/test/results/clientpositive/orc_dictionary_threshold.q.out =================================================================== --- /dev/null +++ ql/src/test/results/clientpositive/orc_dictionary_threshold.q.out @@ -0,0 +1,158 @@ +PREHOOK: query: -- Set the threshold to -1 to guarantee dictionary encoding is turned off +-- Tests that the data can be read back correctly when a string column is stored +-- without dictionary encoding + +CREATE TABLE test_orc (key STRING) +ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.orc.OrcSerde' +STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat' +OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat' +PREHOOK: type: CREATETABLE +POSTHOOK: query: -- Set the threshold to -1 to guarantee dictionary encoding is turned off +-- Tests that the data can be read back correctly when a string column 
is stored +-- without dictionary encoding + +CREATE TABLE test_orc (key STRING) +ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.orc.OrcSerde' +STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat' +OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat' +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@test_orc +PREHOOK: query: INSERT OVERWRITE TABLE test_orc SELECT key FROM src limit 10 +PREHOOK: type: QUERY +PREHOOK: Input: default@src +PREHOOK: Output: default@test_orc +POSTHOOK: query: INSERT OVERWRITE TABLE test_orc SELECT key FROM src limit 10 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +POSTHOOK: Output: default@test_orc +POSTHOOK: Lineage: test_orc.key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ] +PREHOOK: query: -- Test reading the column back + +SELECT * FROM test_orc +PREHOOK: type: QUERY +PREHOOK: Input: default@test_orc +#### A masked pattern was here #### +POSTHOOK: query: -- Test reading the column back + +SELECT * FROM test_orc +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_orc +#### A masked pattern was here #### +POSTHOOK: Lineage: test_orc.key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ] +238 +86 +311 +27 +165 +409 +255 +278 +98 +484 +PREHOOK: query: ALTER TABLE test_orc SET SERDEPROPERTIES ('orc.stripe.size' = '1') +PREHOOK: type: ALTERTABLE_SERDEPROPERTIES +PREHOOK: Input: default@test_orc +PREHOOK: Output: default@test_orc +POSTHOOK: query: ALTER TABLE test_orc SET SERDEPROPERTIES ('orc.stripe.size' = '1') +POSTHOOK: type: ALTERTABLE_SERDEPROPERTIES +POSTHOOK: Input: default@test_orc +POSTHOOK: Output: default@test_orc +POSTHOOK: Lineage: test_orc.key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ] +PREHOOK: query: CREATE TABLE src_thousand(key STRING) STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE src_thousand(key STRING) STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@src_thousand +POSTHOOK: Lineage: test_orc.key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ] +PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/kv1kv2.cogroup.txt' + INTO TABLE src_thousand +PREHOOK: type: LOAD +PREHOOK: Output: default@src_thousand +POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/kv1kv2.cogroup.txt' + INTO TABLE src_thousand +POSTHOOK: type: LOAD +POSTHOOK: Output: default@src_thousand +POSTHOOK: Lineage: test_orc.key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ] +PREHOOK: query: -- Add data to the table in such a way that alternate stripes encode the column +-- differently. Setting orc.stripe.size = 1 guarantees the stripes each have +-- 5000 rows. The first stripe will have 5 * 630 distinct rows and thus be +-- above the cutoff of 50% and will be direct encoded. The second stripe +-- will have 5 * 1 distinct rows and thus be under the cutoff and will be +-- dictionary encoded. The final stripe will have 630 out of 1000 and be +-- direct encoded. 
+ +INSERT OVERWRITE TABLE test_orc +SELECT key FROM ( +SELECT CONCAT("a", key) AS key FROM src_thousand +UNION ALL +SELECT CONCAT("b", key) AS key FROM src_thousand +UNION ALL +SELECT CONCAT("c", key) AS key FROM src_thousand +UNION ALL +SELECT CONCAT("d", key) AS key FROM src_thousand +UNION ALL +SELECT CONCAT("e", key) AS key FROM src_thousand +UNION ALL +SELECT CONCAT("f", 1) AS key FROM src_thousand +UNION ALL +SELECT CONCAT("g", 1) AS key FROM src_thousand +UNION ALL +SELECT CONCAT("h", 1) AS key FROM src_thousand +UNION ALL +SELECT CONCAT("i", 1) AS key FROM src_thousand +UNION ALL +SELECT CONCAT("j", 1) AS key FROM src_thousand +UNION ALL +SELECT CONCAT("k", key) AS key FROM src_thousand +) a ORDER BY key LIMIT 11000 +PREHOOK: type: QUERY +PREHOOK: Input: default@src_thousand +PREHOOK: Output: default@test_orc +POSTHOOK: query: -- Add data to the table in such a way that alternate stripes encode the column +-- differently. Setting orc.stripe.size = 1 guarantees the stripes each have +-- 5000 rows. The first stripe will have 5 * 630 distinct rows and thus be +-- above the cutoff of 50% and will be direct encoded. The second stripe +-- will have 5 * 1 distinct rows and thus be under the cutoff and will be +-- dictionary encoded. The final stripe will have 630 out of 1000 and be +-- direct encoded. + +INSERT OVERWRITE TABLE test_orc +SELECT key FROM ( +SELECT CONCAT("a", key) AS key FROM src_thousand +UNION ALL +SELECT CONCAT("b", key) AS key FROM src_thousand +UNION ALL +SELECT CONCAT("c", key) AS key FROM src_thousand +UNION ALL +SELECT CONCAT("d", key) AS key FROM src_thousand +UNION ALL +SELECT CONCAT("e", key) AS key FROM src_thousand +UNION ALL +SELECT CONCAT("f", 1) AS key FROM src_thousand +UNION ALL +SELECT CONCAT("g", 1) AS key FROM src_thousand +UNION ALL +SELECT CONCAT("h", 1) AS key FROM src_thousand +UNION ALL +SELECT CONCAT("i", 1) AS key FROM src_thousand +UNION ALL +SELECT CONCAT("j", 1) AS key FROM src_thousand +UNION ALL +SELECT CONCAT("k", key) AS key FROM src_thousand +) a ORDER BY key LIMIT 11000 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src_thousand +POSTHOOK: Output: default@test_orc +POSTHOOK: Lineage: test_orc.key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_orc.key EXPRESSION [(src_thousand)src_thousand.FieldSchema(name:key, type:string, comment:null), (src_thousand)src_thousand.FieldSchema(name:key, type:string, comment:null), (src_thousand)src_thousand.FieldSchema(name:key, type:string, comment:null), (src_thousand)src_thousand.FieldSchema(name:key, type:string, comment:null), (src_thousand)src_thousand.FieldSchema(name:key, type:string, comment:null), (src_thousand)src_thousand.FieldSchema(name:key, type:string, comment:null), ] +PREHOOK: query: SELECT SUM(HASH(key)) FROM test_orc +PREHOOK: type: QUERY +PREHOOK: Input: default@test_orc +#### A masked pattern was here #### +POSTHOOK: query: SELECT SUM(HASH(key)) FROM test_orc +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_orc +#### A masked pattern was here #### +POSTHOOK: Lineage: test_orc.key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_orc.key EXPRESSION [(src_thousand)src_thousand.FieldSchema(name:key, type:string, comment:null), (src_thousand)src_thousand.FieldSchema(name:key, type:string, comment:null), (src_thousand)src_thousand.FieldSchema(name:key, type:string, comment:null), (src_thousand)src_thousand.FieldSchema(name:key, type:string, comment:null), 
(src_thousand)src_thousand.FieldSchema(name:key, type:string, comment:null), (src_thousand)src_thousand.FieldSchema(name:key, type:string, comment:null), ] +1082202951192
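For readers unfamiliar with the DIRECT string encoding that the patch falls back to, the sketch below (an editor's illustration under assumed names, not Hive code) shows the layout the new StringDirectTreeReader consumes: a LENGTH stream with one entry per value and a DATA stream holding the concatenated UTF-8 bytes, which is why the file dump above shows DATA and LENGTH sections but no dictionary section for column 3. The real reader also handles nulls (the valuePresent check) and decodes LENGTH with RunLengthIntegerReader; both are omitted here.

import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;

public class DirectStringDecodeSketch {
  // Decode a DIRECT encoded string column: value i is the next lengths[i]
  // bytes of the DATA stream, consumed in order (as StringDirectTreeReader
  // does with stream.read() after lengths.next()).
  static List<String> decode(int[] lengths, byte[] data) {
    List<String> values = new ArrayList<String>();
    int offset = 0;
    for (int len : lengths) {
      values.add(new String(data, offset, len, StandardCharsets.UTF_8));
      offset += len;
    }
    return values;
  }

  public static void main(String[] args) {
    // "it" + "the" + "of" concatenated, with their byte lengths alongside.
    byte[] data = "ittheof".getBytes(StandardCharsets.UTF_8);
    int[] lengths = {2, 3, 2};
    System.out.println(decode(lengths, data)); // [it, the, of]
    // skipRows() in the patch works the same way: sum the skipped lengths and
    // skip that many bytes in the DATA stream.
  }
}

On the writer side, the matching change is stringOutput.suppress(), which drops the unused dictionary data stream when a stripe falls back to DIRECT.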