Index: common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
===================================================================
--- common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
+++ common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
@@ -502,6 +502,8 @@
// Maximum fraction of heap that can be used by ORC file writers
HIVE_ORC_FILE_MEMORY_POOL("hive.exec.orc.memory.pool", 0.5f), // 50%
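+    // If the fraction of distinct keys in a dictionary over the number of non-null rows
+    // exceeds this threshold, dictionary encoding is turned off for that stripe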
+ HIVE_ORC_DICTIONARY_KEY_SIZE_THRESHOLD("hive.exec.orc.dictionary.key.size.threshold", 0.8f),
+
HIVESKEWJOIN("hive.optimize.skewjoin", false),
HIVECONVERTJOIN("hive.auto.convert.join", true),
HIVECONVERTJOINNOCONDITIONALTASK("hive.auto.convert.join.noconditionaltask", true),
Index: conf/hive-default.xml.template
===================================================================
--- conf/hive-default.xml.template
+++ conf/hive-default.xml.template
@@ -1701,6 +1701,15 @@
+ <property>
+   <name>hive.exec.orc.dictionary.key.size.threshold</name>
+   <value>0.8</value>
+   <description>
+     If the number of keys in a dictionary is greater than this fraction of the total number of
+     non-null rows, turn off dictionary encoding. Use 1 to always use dictionary encoding.
+   </description>
+ </property>
+
<property>
  <name>hive.multi.insert.move.tasks.share.dependencies</name>
  <value>false</value>
Index: ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcFile.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcFile.java
+++ ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcFile.java
@@ -97,7 +97,7 @@
CompressionKind compress,
int bufferSize,
int rowIndexStride) throws IOException {
- return new WriterImpl(fs, path, inspector, stripeSize, compress,
+ return new WriterImpl(fs, path, conf, inspector, stripeSize, compress,
bufferSize, rowIndexStride, getMemoryManager(conf));
}
Index: ql/src/java/org/apache/hadoop/hive/ql/io/orc/OutStream.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/io/orc/OutStream.java
+++ ql/src/java/org/apache/hadoop/hive/ql/io/orc/OutStream.java
@@ -76,11 +76,7 @@
}
public void clear() throws IOException {
- uncompressedBytes = 0;
- compressedBytes = 0;
- compressed = null;
- overflow = null;
- current = null;
+ flush();
suppress = false;
}
@@ -246,7 +242,10 @@
receiver.output(compressed);
compressed = null;
}
- clear();
+ uncompressedBytes = 0;
+ compressedBytes = 0;
+ overflow = null;
+ current = null;
}
@Override
Index: ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReaderImpl.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReaderImpl.java
+++ ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReaderImpl.java
@@ -27,6 +27,7 @@
import java.util.List;
import java.util.Map;
+import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
@@ -713,18 +714,21 @@
}
}
+ /**
+ * A tree reader that will read string columns. At the start of the
+ * stripe, it creates an internal reader based on whether a direct or
+ * dictionary encoding was used.
+ */
private static class StringTreeReader extends TreeReader {
- private DynamicByteArray dictionaryBuffer = null;
- private int dictionarySize;
- private int[] dictionaryOffsets;
- private RunLengthIntegerReader reader;
+ private TreeReader reader;
StringTreeReader(Path path, int columnId) {
super(path, columnId);
}
void checkEncoding(OrcProto.ColumnEncoding encoding) throws IOException {
- if (encoding.getKind() != OrcProto.ColumnEncoding.Kind.DICTIONARY) {
+ if (encoding.getKind() != OrcProto.ColumnEncoding.Kind.DICTIONARY &&
+ encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT) {
throw new IOException("Unknown encoding " + encoding + " in column " +
columnId + " of " + path);
}
@@ -734,10 +738,138 @@
void startStripe(Map<StreamName, InStream> streams,
List<OrcProto.ColumnEncoding> encodings
) throws IOException {
+ // For each stripe, checks the encoding and initializes the appropriate
+ // reader
+ switch (encodings.get(columnId).getKind()) {
+ case DIRECT:
+ reader = new StringDirectTreeReader(path, columnId);
+ break;
+ case DICTIONARY:
+ reader = new StringDictionaryTreeReader(path, columnId);
+ break;
+ default:
+ throw new IllegalArgumentException("Unsupported encoding " +
+ encodings.get(columnId).getKind());
+ }
+ reader.startStripe(streams, encodings);
+ }
+
+ @Override
+ void seek(PositionProvider[] index) throws IOException {
+ reader.seek(index);
+ }
+
+ @Override
+ Object next(Object previous) throws IOException {
+ return reader.next(previous);
+ }
+
+ @Override
+ void skipRows(long items) throws IOException {
+ reader.skipRows(items);
+ }
+ }
+
+ /**
+ * A reader for string columns that are direct encoded in the current
+ * stripe.
+ */
+ private static class StringDirectTreeReader extends TreeReader {
+ private InStream stream;
+ private RunLengthIntegerReader lengths;
+
+ StringDirectTreeReader(Path path, int columnId) {
+ super(path, columnId);
+ }
+
+ @Override
+ void checkEncoding(OrcProto.ColumnEncoding encoding) throws IOException {
+ // PASS
+ }
+
+ @Override
+ void startStripe(Map<StreamName, InStream> streams,
+ List<OrcProto.ColumnEncoding> encodings
+ ) throws IOException {
+ super.startStripe(streams, encodings);
+ StreamName name = new StreamName(columnId,
+ OrcProto.Stream.Kind.DATA);
+ stream = streams.get(name);
+ lengths = new RunLengthIntegerReader(streams.get(new
+ StreamName(columnId, OrcProto.Stream.Kind.LENGTH)),
+ false);
+ }
+
+ @Override
+ void seek(PositionProvider[] index) throws IOException {
+ super.seek(index);
+ stream.seek(index[columnId]);
+ lengths.seek(index[columnId]);
+ }
+
+ @Override
+ Object next(Object previous) throws IOException {
+ super.next(previous);
+ Text result = null;
+ if (valuePresent) {
+ if (previous == null) {
+ result = new Text();
+ } else {
+ result = (Text) previous;
+ }
+ int len = (int) lengths.next();
+ int offset = 0;
+ byte[] bytes = new byte[len];
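+ // the stream may return fewer bytes than requested, so keep reading
+ // until the whole value has been copied into the buffer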
+ while (len > 0) {
+ int written = stream.read(bytes, offset, len);
+ if (written < 0) {
+ throw new EOFException("Can't finish byte read from " + stream);
+ }
+ len -= written;
+ offset += written;
+ }
+ result.set(bytes);
+ }
+ return result;
+ }
+
+ @Override
+ void skipRows(long items) throws IOException {
+ items = countNonNulls(items);
+ long lengthToSkip = 0;
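+ // sum the lengths of the skipped values so the data stream can be
+ // advanced with a single skip call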
+ for(int i=0; i < items; ++i) {
+ lengthToSkip += lengths.next();
+ }
+ stream.skip(lengthToSkip);
+ }
+ }
+
+ /**
+ * A reader for string columns that are dictionary encoded in the current
+ * stripe.
+ */
+ private static class StringDictionaryTreeReader extends TreeReader {
+ private DynamicByteArray dictionaryBuffer;
+ private int[] dictionaryOffsets;
+ private RunLengthIntegerReader reader;
+
+ StringDictionaryTreeReader(Path path, int columnId) {
+ super(path, columnId);
+ }
+
+ @Override
+ void checkEncoding(OrcProto.ColumnEncoding encoding) throws IOException {
+ // PASS
+ }
+
+ @Override
+ void startStripe(Map<StreamName, InStream> streams,
+ List<OrcProto.ColumnEncoding> encodings
+ ) throws IOException {
super.startStripe(streams, encodings);
// read the dictionary blob
- dictionarySize = encodings.get(columnId).getDictionarySize();
+ int dictionarySize = encodings.get(columnId).getDictionarySize();
StreamName name = new StreamName(columnId,
OrcProto.Stream.Kind.DICTIONARY_DATA);
InStream in = streams.get(name);
Index: ql/src/java/org/apache/hadoop/hive/ql/io/orc/StringRedBlackTree.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/io/orc/StringRedBlackTree.java
+++ ql/src/java/org/apache/hadoop/hive/ql/io/orc/StringRedBlackTree.java
@@ -17,11 +17,11 @@
*/
package org.apache.hadoop.hive.ql.io.orc;
-import org.apache.hadoop.io.Text;
-
import java.io.IOException;
import java.io.OutputStream;
+import org.apache.hadoop.io.Text;
+
/**
* A red-black tree that stores strings. The strings are stored as UTF-8 bytes
* and an offset for each entry.
@@ -147,7 +147,7 @@
/**
* Visit all of the nodes in the tree in sorted order.
- * @param visitor the action to be applied to each ndoe
+ * @param visitor the action to be applied to each node
* @throws IOException
*/
public void visit(Visitor visitor) throws IOException {
@@ -163,6 +163,17 @@
keyOffsets.clear();
}
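+ /**
+ * Get the string data for the key at the given position in the original
+ * insertion order.
+ * @param result the Text object to populate
+ * @param originalPosition the insertion-order position of the key
+ */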
+ public void getText(Text result, int originalPosition) {
+ int offset = keyOffsets.get(originalPosition);
+ int length;
+ if (originalPosition + 1 == keyOffsets.size()) {
+ length = byteArray.size() - offset;
+ } else {
+ length = keyOffsets.get(originalPosition + 1) - offset;
+ }
+ byteArray.setText(result, offset, length);
+ }
+
/**
* Get the size of the character data in the table.
* @return the bytes used by the table
Index: ql/src/java/org/apache/hadoop/hive/ql/io/orc/WriterImpl.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/io/orc/WriterImpl.java
+++ ql/src/java/org/apache/hadoop/hive/ql/io/orc/WriterImpl.java
@@ -27,11 +27,16 @@
import java.util.Map;
import java.util.TreeMap;
+
+import com.google.protobuf.ByteString;
+import com.google.protobuf.CodedOutputStream;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.common.type.HiveDecimal;
import org.apache.hadoop.hive.ql.io.orc.OrcProto.RowIndexEntry;
import org.apache.hadoop.hive.serde2.io.DateWritable;
@@ -55,9 +60,7 @@
import org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.TimestampObjectInspector;
import org.apache.hadoop.io.BytesWritable;
-
-import com.google.protobuf.ByteString;
-import com.google.protobuf.CodedOutputStream;
+import org.apache.hadoop.io.Text;
/**
* An ORC file writer. The file is divided into stripes, which is the natural
@@ -111,16 +114,20 @@
private final boolean buildIndex;
private final MemoryManager memoryManager;
+ private final Configuration conf;
+
WriterImpl(FileSystem fs,
Path path,
+ Configuration conf,
ObjectInspector inspector,
long stripeSize,
CompressionKind compress,
int bufferSize,
int rowIndexStride,
MemoryManager memoryManager) throws IOException {
this.fs = fs;
this.path = path;
+ this.conf = conf;
this.stripeSize = stripeSize;
this.compress = compress;
this.bufferSize = bufferSize;
@@ -344,6 +351,14 @@
public boolean isCompressed() {
return codec != null;
}
+
+ /**
+ * Get the writer's configuration.
+ * @return configuration
+ */
+ public Configuration getConfiguration() {
+ return conf;
+ }
}
/**
@@ -760,16 +775,22 @@
private static class StringTreeWriter extends TreeWriter {
private static final int INITIAL_DICTIONARY_SIZE = 4096;
- private final PositionedOutputStream stringOutput;
+ private final OutStream stringOutput;
private final RunLengthIntegerWriter lengthOutput;
private final RunLengthIntegerWriter rowOutput;
private final StringRedBlackTree dictionary =
new StringRedBlackTree(INITIAL_DICTIONARY_SIZE);
private final DynamicIntArray rows = new DynamicIntArray();
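+ // streams used when a stripe falls back to direct encoding: the raw string
+ // bytes and their lengths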
+ private final PositionedOutputStream directStreamOutput;
+ private final RunLengthIntegerWriter directLengthOutput;
private final List<OrcProto.RowIndexEntry> savedRowIndex =
new ArrayList<OrcProto.RowIndexEntry>();
private final boolean buildIndex;
private final List<Long> rowIndexValueCount = new ArrayList<Long>();
+ // If the number of keys in a dictionary is greater than this fraction of
+ // the total number of non-null rows, turn off dictionary encoding
+ private final float dictionaryKeySizeThreshold;
+ private boolean useDictionaryEncoding = true;
StringTreeWriter(int columnId,
ObjectInspector inspector,
@@ -785,6 +806,14 @@
recordPosition(rowIndexPosition);
rowIndexValueCount.add(0L);
buildIndex = writer.buildIndex();
+ directStreamOutput = writer.createStream(id, OrcProto.Stream.Kind.DATA);
+ directLengthOutput =
+ new RunLengthIntegerWriter(writer.createStream
+ (id, OrcProto.Stream.Kind.LENGTH), false);
+ dictionaryKeySizeThreshold = writer.getConfiguration().getFloat(
+ HiveConf.ConfVars.HIVE_ORC_DICTIONARY_KEY_SIZE_THRESHOLD.varname,
+ HiveConf.ConfVars.HIVE_ORC_DICTIONARY_KEY_SIZE_THRESHOLD.
+ defaultFloatVal);
}
@Override
@@ -801,22 +830,36 @@
@Override
void writeStripe(OrcProto.StripeFooter.Builder builder,
int requiredIndexEntries) throws IOException {
- // Traverse the red-black tree writing out the bytes and lengths; and
- // creating the map from the original order to the final sorted order.
+ // Set the flag indicating whether or not to use dictionary encoding
+ // based on whether or not the fraction of distinct keys over number of
+ // non-null rows is less than the configured threshold
+ useDictionaryEncoding = rows.size() > 0 &&
+ (float)(dictionary.size()) / rows.size() <=
+ dictionaryKeySizeThreshold;
final int[] dumpOrder = new int[dictionary.size()];
- dictionary.visit(new StringRedBlackTree.Visitor() {
- private int currentId = 0;
- @Override
- public void visit(StringRedBlackTree.VisitorContext context
- ) throws IOException {
- context.writeBytes(stringOutput);
- lengthOutput.write(context.getLength());
- dumpOrder[context.getOriginalPosition()] = currentId++;
- }
- });
+
+ if (useDictionaryEncoding) {
+ // Write the dictionary by traversing the red-black tree writing out
+ // the bytes and lengths; and creating the map from the original order
+ // to the final sorted order.
+ dictionary.visit(new StringRedBlackTree.Visitor() {
+ private int currentId = 0;
+ @Override
+ public void visit(StringRedBlackTree.VisitorContext context
+ ) throws IOException {
+ context.writeBytes(stringOutput);
+ lengthOutput.write(context.getLength());
+ dumpOrder[context.getOriginalPosition()] = currentId++;
+ }
+ });
+ } else {
+ // for direct encoding, we don't want the dictionary data stream
+ stringOutput.suppress();
+ }
int length = rows.size();
int rowIndexEntry = 0;
OrcProto.RowIndex.Builder rowIndex = getRowIndex();
+ Text text = new Text();
// write the values translated into the dump order.
for(int i = 0; i <= length; ++i) {
// now that we are writing out the row values, we can finalize the
@@ -826,20 +869,34 @@
rowIndexEntry < savedRowIndex.size()) {
OrcProto.RowIndexEntry.Builder base =
savedRowIndex.get(rowIndexEntry++).toBuilder();
- rowOutput.getPosition(new RowIndexPositionRecorder(base));
+ if (useDictionaryEncoding) {
+ rowOutput.getPosition(new RowIndexPositionRecorder(base));
+ } else {
+ PositionRecorder posn = new RowIndexPositionRecorder(base);
+ directStreamOutput.getPosition(posn);
+ directLengthOutput.getPosition(posn);
+ }
rowIndex.addEntry(base.build());
}
}
if (i != length) {
- rowOutput.write(dumpOrder[rows.get(i)]);
+ if (useDictionaryEncoding) {
+ rowOutput.write(dumpOrder[rows.get(i)]);
+ } else {
+ dictionary.getText(text, rows.get(i));
+ directStreamOutput.write(text.getBytes(), 0, text.getLength());
+ directLengthOutput.write(text.getLength());
+ }
}
}
// we need to build the rowindex before calling super, since it
// writes it out.
super.writeStripe(builder, requiredIndexEntries);
stringOutput.flush();
lengthOutput.flush();
rowOutput.flush();
+ directStreamOutput.flush();
+ directLengthOutput.flush();
// reset all of the fields to be ready for the next stripe.
dictionary.clear();
rows.clear();
@@ -849,11 +906,27 @@
rowIndexValueCount.add(0L);
}
+ // Calls getPosition on the row output stream if dictionary encoding is used, and the direct
+ // output stream if direct encoding is used
+ private void recordOutputPosition(OrcProto.RowIndexEntry.Builder base) throws IOException {
+ if (useDictionaryEncoding) {
+ rowOutput.getPosition(new RowIndexPositionRecorder(base));
+ } else {
+ directStreamOutput.getPosition(new RowIndexPositionRecorder(base));
+ }
+ }
+
@Override
OrcProto.ColumnEncoding getEncoding() {
- return OrcProto.ColumnEncoding.newBuilder().setKind(
- OrcProto.ColumnEncoding.Kind.DICTIONARY).
- setDictionarySize(dictionary.size()).build();
+ // Returns the encoding used for the last call to writeStripe
+ if (useDictionaryEncoding) {
+ return OrcProto.ColumnEncoding.newBuilder().setKind(
+ OrcProto.ColumnEncoding.Kind.DICTIONARY).
+ setDictionarySize(dictionary.size()).build();
+ } else {
+ return OrcProto.ColumnEncoding.newBuilder().setKind(
+ OrcProto.ColumnEncoding.Kind.DIRECT).build();
+ }
}
/**
Index: ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestFileDump.java
===================================================================
--- ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestFileDump.java
+++ ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestFileDump.java
@@ -18,25 +18,24 @@
package org.apache.hadoop.hive.ql.io.orc;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
-import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
-import org.junit.Before;
-import org.junit.Rule;
-import org.junit.Test;
-import org.junit.rules.TestName;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNull;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.PrintStream;
import java.util.Random;
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertNull;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.conf.HiveConf;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
+import org.junit.Before;
+import org.junit.Test;
public class TestFileDump {
@@ -69,9 +68,6 @@
}
}
- private static final String outputFilename =
- File.separator + "orc-file-dump.out";
-
private static void checkOutput(String expected,
String actual) throws Exception {
BufferedReader eStream =
@@ -114,8 +110,62 @@
}
writer.close();
PrintStream origOut = System.out;
- FileOutputStream myOut = new FileOutputStream(workDir + File.separator +
- "orc-file-dump.out");
+ String outputFilename = File.separator + "orc-file-dump.out";
+ FileOutputStream myOut = new FileOutputStream(workDir + outputFilename);
+
+ // replace stdout and run command
+ System.setOut(new PrintStream(myOut));
+ FileDump.main(new String[]{testFilePath.toString()});
+ System.out.flush();
+ System.setOut(origOut);
+
+
+ checkOutput(resourceDir + outputFilename, workDir + outputFilename);
+ }
+
+ // Test that if the fraction of rows that have distinct strings is greater than the configured
+ // threshold, dictionary encoding is turned off. If dictionary encoding is turned off, the length
+ // of the dictionary stream for the column will be 0 in the ORC file dump.
+ @Test
+ public void testDictionaryThreshold() throws Exception {
+ ObjectInspector inspector;
+ synchronized (TestOrcFile.class) {
+ inspector = ObjectInspectorFactory.getReflectionObjectInspector
+ (MyRecord.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
+ }
+ Configuration conf = new Configuration();
+ conf.setFloat(HiveConf.ConfVars.HIVE_ORC_DICTIONARY_KEY_SIZE_THRESHOLD.varname, 0.49f);
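+ // Half of the rows written below are distinct, so with a threshold of 0.49 the
+ // string column should fall back to direct encoding in every stripe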
+ Writer writer = OrcFile.createWriter(fs, testFilePath, conf, inspector,
+ 100000, CompressionKind.ZLIB, 10000, 10000);
+ Random r1 = new Random(1);
+ String[] words = new String[]{"It", "was", "the", "best", "of", "times,",
+ "it", "was", "the", "worst", "of", "times,", "it", "was", "the", "age",
+ "of", "wisdom,", "it", "was", "the", "age", "of", "foolishness,", "it",
+ "was", "the", "epoch", "of", "belief,", "it", "was", "the", "epoch",
+ "of", "incredulity,", "it", "was", "the", "season", "of", "Light,",
+ "it", "was", "the", "season", "of", "Darkness,", "it", "was", "the",
+ "spring", "of", "hope,", "it", "was", "the", "winter", "of", "despair,",
+ "we", "had", "everything", "before", "us,", "we", "had", "nothing",
+ "before", "us,", "we", "were", "all", "going", "direct", "to",
+ "Heaven,", "we", "were", "all", "going", "direct", "the", "other",
+ "way"};
+ int nextInt = 0;
+ for(int i=0; i < 21000; ++i) {
+ // Write out the same string twice; this guarantees the fraction of rows with
+ // distinct strings is 0.5
+ if (i % 2 == 0) {
+ nextInt = r1.nextInt(words.length);
+ // Append the value of i to the word; this guarantees that when an index or
+ // word is repeated, the actual string is unique.
+ words[nextInt] += "-" + i;
+ }
+ writer.addRow(new MyRecord(r1.nextInt(), r1.nextLong(),
+ words[nextInt]));
+ }
+ writer.close();
+ PrintStream origOut = System.out;
+ String outputFilename = File.separator + "orc-file-dump-dictionary-threshold.out";
+ FileOutputStream myOut = new FileOutputStream(workDir + outputFilename);
// replace stdout and run command
System.setOut(new PrintStream(myOut));
Index: ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestOrcFile.java
===================================================================
--- ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestOrcFile.java
+++ ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestOrcFile.java
@@ -993,7 +993,7 @@
ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
}
MyMemoryManager memory = new MyMemoryManager(conf, 10000, 0.1);
- Writer writer = new WriterImpl(fs, testFilePath, inspector,
+ Writer writer = new WriterImpl(fs, testFilePath, conf, inspector,
50000, CompressionKind.NONE, 100, 0, memory);
assertEquals(testFilePath, memory.path);
for(int i=0; i < 2500; ++i) {
Index: ql/src/test/queries/clientpositive/orc_dictionary_threshold.q
===================================================================
--- /dev/null
+++ ql/src/test/queries/clientpositive/orc_dictionary_threshold.q
@@ -0,0 +1,60 @@
+set hive.exec.orc.dictionary.key.size.threshold=-1;
+
+-- Set the threshold to -1 to guarantee dictionary encoding is turned off
+-- Tests that the data can be read back correctly when a string column is stored
+-- without dictionary encoding
+
+CREATE TABLE test_orc (key STRING)
+ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.orc.OrcSerde'
+STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat'
+OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat';
+
+INSERT OVERWRITE TABLE test_orc SELECT key FROM src limit 10;
+
+-- Test reading the column back
+
+SELECT * FROM test_orc;
+
+ALTER TABLE test_orc SET SERDEPROPERTIES ('orc.stripe.size' = '1');
+
+CREATE TABLE src_thousand(key STRING) STORED AS TEXTFILE;
+LOAD DATA LOCAL INPATH '../data/files/kv1kv2.cogroup.txt'
+ INTO TABLE src_thousand;
+
+set hive.exec.orc.dictionary.key.size.threshold=0.5;
+
+-- Add data to the table in such a way that alternate stripes encode the column
+-- differently. Setting orc.stripe.size = 1 guarantees the stripes each have
+-- 5000 rows. The first stripe will have 5 * 630 distinct rows and thus be
+-- above the cutoff of 50% and will be direct encoded. The second stripe
+-- will have 5 * 1 distinct rows and thus be under the cutoff and will be
+-- dictionary encoded. The final stripe will have 630 out of 1000 and be
+-- direct encoded.
+
+INSERT OVERWRITE TABLE test_orc
+SELECT key FROM (
+SELECT CONCAT("a", key) AS key FROM src_thousand
+UNION ALL
+SELECT CONCAT("b", key) AS key FROM src_thousand
+UNION ALL
+SELECT CONCAT("c", key) AS key FROM src_thousand
+UNION ALL
+SELECT CONCAT("d", key) AS key FROM src_thousand
+UNION ALL
+SELECT CONCAT("e", key) AS key FROM src_thousand
+UNION ALL
+SELECT CONCAT("f", 1) AS key FROM src_thousand
+UNION ALL
+SELECT CONCAT("g", 1) AS key FROM src_thousand
+UNION ALL
+SELECT CONCAT("h", 1) AS key FROM src_thousand
+UNION ALL
+SELECT CONCAT("i", 1) AS key FROM src_thousand
+UNION ALL
+SELECT CONCAT("j", 1) AS key FROM src_thousand
+UNION ALL
+SELECT CONCAT("k", key) AS key FROM src_thousand
+) a ORDER BY key LIMIT 11000;
+
+SELECT SUM(HASH(key)) FROM test_orc;
Index: ql/src/test/resources/orc-file-dump-dictionary-threshold.out
===================================================================
--- /dev/null
+++ ql/src/test/resources/orc-file-dump-dictionary-threshold.out
@@ -0,0 +1,78 @@
+Structure for TestFileDump.testDump.orc
+Rows: 21000
+Compression: ZLIB
+Compression size: 10000
+Type: struct<i:int,l:bigint,s:string>
+
+Statistics:
+ Column 0: count: 21000
+ Column 1: count: 21000 min: -2147390285 max: 2147453086 sum: 109128518326
+ Column 2: count: 21000 min: -9222731174895935707 max: 9222919052987871506
+ Column 3: count: 21000 min: Darkness,-230 max: worst-54-290-346-648-908-996-1038-1080-1560-1584-1620-1744-1770-1798-1852-1966-2162-2244-2286-2296-2534-2660-3114-3676-3788-4068-4150-4706-4744-5350-5420-5582-5696-5726-6006-6020-6024-6098-6184-6568-6636-6802-6994-7004-7318-7498-7758-7780-7798-7920-7952-7960-7988-8232-8256-8390-8416-8478-8620-8840-8984-9038-9128-9236-9248-9344-9594-9650-9714-9928-9938-10178-10368-10414-10502-10732-10876-11008-11158-11410-11722-11836-11964-12054-12096-12126-12136-12202-12246-12298-12616-12774-12782-12790-12802-12976-13216-13246-13502-13766-14454-14974-15004-15124-15252-15294-15356-15530-15610-16316-16936-17024-17122-17214-17310-17528-17682-17742-17870-17878-18010-18410-18524-18788-19204-19254-19518-19596-19786-19874-19904-20390-20752-20936
+
+Stripes:
+ Stripe: offset: 3 data: 107035 rows: 4000 tail: 65 index: 217
+ Stream: column 0 section ROW_INDEX start: 3 length 10
+ Stream: column 1 section ROW_INDEX start: 13 length 36
+ Stream: column 2 section ROW_INDEX start: 49 length 39
+ Stream: column 3 section ROW_INDEX start: 88 length 132
+ Stream: column 1 section DATA start: 220 length 18043
+ Stream: column 2 section DATA start: 18263 length 34740
+ Stream: column 3 section DATA start: 53003 length 50887
+ Stream: column 3 section LENGTH start: 103890 length 3365
+ Encoding column 0: DIRECT
+ Encoding column 1: DIRECT
+ Encoding column 2: DIRECT
+ Encoding column 3: DIRECT
+ Stripe: offset: 107320 data: 289727 rows: 5000 tail: 65 index: 349
+ Stream: column 0 section ROW_INDEX start: 107320 length 10
+ Stream: column 1 section ROW_INDEX start: 107330 length 36
+ Stream: column 2 section ROW_INDEX start: 107366 length 39
+ Stream: column 3 section ROW_INDEX start: 107405 length 264
+ Stream: column 1 section DATA start: 107669 length 22581
+ Stream: column 2 section DATA start: 130250 length 43426
+ Stream: column 3 section DATA start: 173676 length 219588
+ Stream: column 3 section LENGTH start: 393264 length 4132
+ Encoding column 0: DIRECT
+ Encoding column 1: DIRECT
+ Encoding column 2: DIRECT
+ Encoding column 3: DIRECT
+ Stripe: offset: 397461 data: 496162 rows: 5000 tail: 66 index: 536
+ Stream: column 0 section ROW_INDEX start: 397461 length 10
+ Stream: column 1 section ROW_INDEX start: 397471 length 36
+ Stream: column 2 section ROW_INDEX start: 397507 length 39
+ Stream: column 3 section ROW_INDEX start: 397546 length 451
+ Stream: column 1 section DATA start: 397997 length 22605
+ Stream: column 2 section DATA start: 420602 length 43444
+ Stream: column 3 section DATA start: 464046 length 425862
+ Stream: column 3 section LENGTH start: 889908 length 4251
+ Encoding column 0: DIRECT
+ Encoding column 1: DIRECT
+ Encoding column 2: DIRECT
+ Encoding column 3: DIRECT
+ Stripe: offset: 894225 data: 711982 rows: 5000 tail: 65 index: 677
+ Stream: column 0 section ROW_INDEX start: 894225 length 10
+ Stream: column 1 section ROW_INDEX start: 894235 length 36
+ Stream: column 2 section ROW_INDEX start: 894271 length 39
+ Stream: column 3 section ROW_INDEX start: 894310 length 592
+ Stream: column 1 section DATA start: 894902 length 22591
+ Stream: column 2 section DATA start: 917493 length 43414
+ Stream: column 3 section DATA start: 960907 length 641580
+ Stream: column 3 section LENGTH start: 1602487 length 4397
+ Encoding column 0: DIRECT
+ Encoding column 1: DIRECT
+ Encoding column 2: DIRECT
+ Encoding column 3: DIRECT
+ Stripe: offset: 1606949 data: 350645 rows: 2000 tail: 66 index: 786
+ Stream: column 0 section ROW_INDEX start: 1606949 length 10
+ Stream: column 1 section ROW_INDEX start: 1606959 length 36
+ Stream: column 2 section ROW_INDEX start: 1606995 length 39
+ Stream: column 3 section ROW_INDEX start: 1607034 length 701
+ Stream: column 1 section DATA start: 1607735 length 9027
+ Stream: column 2 section DATA start: 1616762 length 17375
+ Stream: column 3 section DATA start: 1634137 length 322259
+ Stream: column 3 section LENGTH start: 1956396 length 1984
+ Encoding column 0: DIRECT
+ Encoding column 1: DIRECT
+ Encoding column 2: DIRECT
+ Encoding column 3: DIRECT
Index: ql/src/test/results/clientpositive/orc_dictionary_threshold.q.out
===================================================================
--- /dev/null
+++ ql/src/test/results/clientpositive/orc_dictionary_threshold.q.out
@@ -0,0 +1,158 @@
+PREHOOK: query: -- Set the threshold to -1 to guarantee dictionary encoding is turned off
+-- Tests that the data can be read back correctly when a string column is stored
+-- without dictionary encoding
+
+CREATE TABLE test_orc (key STRING)
+ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.orc.OrcSerde'
+STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat'
+OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat'
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: -- Set the threshold to -1 to guarantee dictionary encoding is turned off
+-- Tests that the data can be read back correctly when a string column is stored
+-- without dictionary encoding
+
+CREATE TABLE test_orc (key STRING)
+ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.orc.OrcSerde'
+STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat'
+OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat'
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@test_orc
+PREHOOK: query: INSERT OVERWRITE TABLE test_orc SELECT key FROM src limit 10
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src
+PREHOOK: Output: default@test_orc
+POSTHOOK: query: INSERT OVERWRITE TABLE test_orc SELECT key FROM src limit 10
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src
+POSTHOOK: Output: default@test_orc
+POSTHOOK: Lineage: test_orc.key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+PREHOOK: query: -- Test reading the column back
+
+SELECT * FROM test_orc
+PREHOOK: type: QUERY
+PREHOOK: Input: default@test_orc
+#### A masked pattern was here ####
+POSTHOOK: query: -- Test reading the column back
+
+SELECT * FROM test_orc
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@test_orc
+#### A masked pattern was here ####
+POSTHOOK: Lineage: test_orc.key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+238
+86
+311
+27
+165
+409
+255
+278
+98
+484
+PREHOOK: query: ALTER TABLE test_orc SET SERDEPROPERTIES ('orc.stripe.size' = '1')
+PREHOOK: type: ALTERTABLE_SERDEPROPERTIES
+PREHOOK: Input: default@test_orc
+PREHOOK: Output: default@test_orc
+POSTHOOK: query: ALTER TABLE test_orc SET SERDEPROPERTIES ('orc.stripe.size' = '1')
+POSTHOOK: type: ALTERTABLE_SERDEPROPERTIES
+POSTHOOK: Input: default@test_orc
+POSTHOOK: Output: default@test_orc
+POSTHOOK: Lineage: test_orc.key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+PREHOOK: query: CREATE TABLE src_thousand(key STRING) STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: CREATE TABLE src_thousand(key STRING) STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@src_thousand
+POSTHOOK: Lineage: test_orc.key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/kv1kv2.cogroup.txt'
+ INTO TABLE src_thousand
+PREHOOK: type: LOAD
+PREHOOK: Output: default@src_thousand
+POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/kv1kv2.cogroup.txt'
+ INTO TABLE src_thousand
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@src_thousand
+POSTHOOK: Lineage: test_orc.key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+PREHOOK: query: -- Add data to the table in such a way that alternate stripes encode the column
+-- differently. Setting orc.stripe.size = 1 guarantees the stripes each have
+-- 5000 rows. The first stripe will have 5 * 630 distinct rows and thus be
+-- above the cutoff of 50% and will be direct encoded. The second stripe
+-- will have 5 * 1 distinct rows and thus be under the cutoff and will be
+-- dictionary encoded. The final stripe will have 630 out of 1000 and be
+-- direct encoded.
+
+INSERT OVERWRITE TABLE test_orc
+SELECT key FROM (
+SELECT CONCAT("a", key) AS key FROM src_thousand
+UNION ALL
+SELECT CONCAT("b", key) AS key FROM src_thousand
+UNION ALL
+SELECT CONCAT("c", key) AS key FROM src_thousand
+UNION ALL
+SELECT CONCAT("d", key) AS key FROM src_thousand
+UNION ALL
+SELECT CONCAT("e", key) AS key FROM src_thousand
+UNION ALL
+SELECT CONCAT("f", 1) AS key FROM src_thousand
+UNION ALL
+SELECT CONCAT("g", 1) AS key FROM src_thousand
+UNION ALL
+SELECT CONCAT("h", 1) AS key FROM src_thousand
+UNION ALL
+SELECT CONCAT("i", 1) AS key FROM src_thousand
+UNION ALL
+SELECT CONCAT("j", 1) AS key FROM src_thousand
+UNION ALL
+SELECT CONCAT("k", key) AS key FROM src_thousand
+) a ORDER BY key LIMIT 11000
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src_thousand
+PREHOOK: Output: default@test_orc
+POSTHOOK: query: -- Add data to the table in such a way that alternate stripes encode the column
+-- differently. Setting orc.stripe.size = 1 guarantees the stripes each have
+-- 5000 rows. The first stripe will have 5 * 630 distinct rows and thus be
+-- above the cutoff of 50% and will be direct encoded. The second stripe
+-- will have 5 * 1 distinct rows and thus be under the cutoff and will be
+-- dictionary encoded. The final stripe will have 630 out of 1000 and be
+-- direct encoded.
+
+INSERT OVERWRITE TABLE test_orc
+SELECT key FROM (
+SELECT CONCAT("a", key) AS key FROM src_thousand
+UNION ALL
+SELECT CONCAT("b", key) AS key FROM src_thousand
+UNION ALL
+SELECT CONCAT("c", key) AS key FROM src_thousand
+UNION ALL
+SELECT CONCAT("d", key) AS key FROM src_thousand
+UNION ALL
+SELECT CONCAT("e", key) AS key FROM src_thousand
+UNION ALL
+SELECT CONCAT("f", 1) AS key FROM src_thousand
+UNION ALL
+SELECT CONCAT("g", 1) AS key FROM src_thousand
+UNION ALL
+SELECT CONCAT("h", 1) AS key FROM src_thousand
+UNION ALL
+SELECT CONCAT("i", 1) AS key FROM src_thousand
+UNION ALL
+SELECT CONCAT("j", 1) AS key FROM src_thousand
+UNION ALL
+SELECT CONCAT("k", key) AS key FROM src_thousand
+) a ORDER BY key LIMIT 11000
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src_thousand
+POSTHOOK: Output: default@test_orc
+POSTHOOK: Lineage: test_orc.key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: test_orc.key EXPRESSION [(src_thousand)src_thousand.FieldSchema(name:key, type:string, comment:null), (src_thousand)src_thousand.FieldSchema(name:key, type:string, comment:null), (src_thousand)src_thousand.FieldSchema(name:key, type:string, comment:null), (src_thousand)src_thousand.FieldSchema(name:key, type:string, comment:null), (src_thousand)src_thousand.FieldSchema(name:key, type:string, comment:null), (src_thousand)src_thousand.FieldSchema(name:key, type:string, comment:null), ]
+PREHOOK: query: SELECT SUM(HASH(key)) FROM test_orc
+PREHOOK: type: QUERY
+PREHOOK: Input: default@test_orc
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT SUM(HASH(key)) FROM test_orc
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@test_orc
+#### A masked pattern was here ####
+POSTHOOK: Lineage: test_orc.key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: test_orc.key EXPRESSION [(src_thousand)src_thousand.FieldSchema(name:key, type:string, comment:null), (src_thousand)src_thousand.FieldSchema(name:key, type:string, comment:null), (src_thousand)src_thousand.FieldSchema(name:key, type:string, comment:null), (src_thousand)src_thousand.FieldSchema(name:key, type:string, comment:null), (src_thousand)src_thousand.FieldSchema(name:key, type:string, comment:null), (src_thousand)src_thousand.FieldSchema(name:key, type:string, comment:null), ]
+1082202951192