Index: common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
===================================================================
--- common/src/java/org/apache/hadoop/hive/conf/HiveConf.java (revision 1466174)
+++ common/src/java/org/apache/hadoop/hive/conf/HiveConf.java (working copy)
@@ -487,6 +487,8 @@
     HIVEUSEEXPLICITRCFILEHEADER("hive.exec.rcfile.use.explicit.header", true),
     HIVEUSERCFILESYNCCACHE("hive.exec.rcfile.use.sync.cache", true),
 
+    HIVE_ORC_DICTIONARY_KEY_SIZE_THRESHOLD("hive.exec.orc.dictionary.key.size.threshold", 0.8f),
+
     HIVESKEWJOIN("hive.optimize.skewjoin", false),
     HIVECONVERTJOIN("hive.auto.convert.join", true),
     HIVECONVERTJOINNOCONDITIONALTASK("hive.auto.convert.join.noconditionaltask", true),
Index: conf/hive-default.xml.template
===================================================================
--- conf/hive-default.xml.template (revision 1466174)
+++ conf/hive-default.xml.template (working copy)
@@ -1685,6 +1685,15 @@
 <property>
+  <name>hive.exec.orc.dictionary.key.size.threshold</name>
+  <value>0.8</value>
+  <description>
+    If the number of keys in a dictionary is greater than this fraction of the total number of
+    non-null rows, turn off dictionary encoding. Use 1 to always use dictionary encoding.
+  </description>
+</property>
+
+<property>
   <name>hive.multi.insert.move.tasks.share.dependencies</name>
   <value>false</value>
Index: ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcFile.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcFile.java (revision 1466174)
+++ ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcFile.java (working copy)
@@ -18,12 +18,13 @@
 package org.apache.hadoop.hive.ql.io.orc;
 
+import java.io.IOException;
+
+import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
 
-import java.io.IOException;
-
 /**
  * Contains factory methods to read or write ORC files.
*/ @@ -70,12 +71,13 @@ */ public static Writer createWriter(FileSystem fs, Path path, + Configuration conf, ObjectInspector inspector, long stripeSize, CompressionKind compress, int bufferSize, int rowIndexStride) throws IOException { - return new WriterImpl(fs, path, inspector, stripeSize, compress, + return new WriterImpl(fs, path, conf, inspector, stripeSize, compress, bufferSize, rowIndexStride); } Index: ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcOutputFormat.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcOutputFormat.java (revision 1466174) +++ ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcOutputFormat.java (working copy) @@ -71,8 +71,8 @@ public void write(NullWritable nullWritable, OrcSerdeRow row) throws IOException { if (writer == null) { - writer = OrcFile.createWriter(fs, path, row.getInspector(), stripeSize, - compress, compressionSize, rowIndexStride); + writer = OrcFile.createWriter(fs, path, this.conf, row.getInspector(), + stripeSize, compress, compressionSize, rowIndexStride); } writer.addRow(row.getRow()); } @@ -81,8 +81,9 @@ public void write(Writable row) throws IOException { OrcSerdeRow serdeRow = (OrcSerdeRow) row; if (writer == null) { - writer = OrcFile.createWriter(fs, path, serdeRow.getInspector(), - stripeSize, compress, compressionSize, rowIndexStride); + writer = OrcFile.createWriter(fs, path, this.conf, + serdeRow.getInspector(), stripeSize, compress, compressionSize, + rowIndexStride); } writer.addRow(serdeRow.getRow()); } @@ -101,8 +102,8 @@ ObjectInspector inspector = ObjectInspectorFactory. getStandardStructObjectInspector(new ArrayList(), new ArrayList()); - writer = OrcFile.createWriter(fs, path, inspector, stripeSize, - compress, compressionSize, rowIndexStride); + writer = OrcFile.createWriter(fs, path, this.conf, inspector, + stripeSize, compress, compressionSize, rowIndexStride); } writer.close(); } Index: ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReaderImpl.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReaderImpl.java (revision 1466174) +++ ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReaderImpl.java (working copy) @@ -611,10 +611,7 @@ } private static class StringTreeReader extends TreeReader { - private DynamicByteArray dictionaryBuffer = null; - private int dictionarySize; - private int[] dictionaryOffsets; - private RunLengthIntegerReader reader; + private TreeReader reader; StringTreeReader(int columnId) { super(columnId); @@ -624,82 +621,193 @@ void startStripe(Map streams, List encodings ) throws IOException { - super.startStripe(streams, encodings); - - // read the dictionary blob - dictionarySize = encodings.get(columnId).getDictionarySize(); - StreamName name = new StreamName(columnId, - OrcProto.Stream.Kind.DICTIONARY_DATA); - InStream in = streams.get(name); - if (in.available() > 0) { - dictionaryBuffer = new DynamicByteArray(64, in.available()); - dictionaryBuffer.readAll(in); - } else { - dictionaryBuffer = null; + // For each stripe, checks the encoding and initializes the appropriate reader + switch (encodings.get(columnId).getKind()) { + case DIRECT: + reader = new StringDirectTreeReader(columnId); + break; + case DICTIONARY: + reader = new StringDictionaryTreeReader(columnId); + break; + default: + throw new IllegalArgumentException("Unsupported encoding " + + encodings.get(columnId).getKind()); } - in.close(); - // read the lengths - name = new 
StreamName(columnId, OrcProto.Stream.Kind.LENGTH); - in = streams.get(name); - RunLengthIntegerReader lenReader = new RunLengthIntegerReader(in, false); - int offset = 0; - if (dictionaryOffsets == null || - dictionaryOffsets.length < dictionarySize + 1) { - dictionaryOffsets = new int[dictionarySize + 1]; - } - for(int i=0; i < dictionarySize; ++i) { - dictionaryOffsets[i] = offset; - offset += (int) lenReader.next(); - } - dictionaryOffsets[dictionarySize] = offset; - in.close(); - - // set up the row reader - name = new StreamName(columnId, OrcProto.Stream.Kind.DATA); - reader = new RunLengthIntegerReader(streams.get(name), false); + reader.startStripe(streams, encodings); } @Override void seek(PositionProvider[] index) throws IOException { - super.seek(index); - reader.seek(index[columnId]); + reader.seek(index); } @Override Object next(Object previous) throws IOException { - super.next(previous); - Text result = null; - if (valuePresent) { - int entry = (int) reader.next(); - if (previous == null) { - result = new Text(); - } else { - result = (Text) previous; + return reader.next(previous); + } + + @Override + void skipRows(long items) throws IOException { + reader.skipRows(items); + } + + private static class StringDirectTreeReader extends TreeReader { + private InStream stream; + private RunLengthIntegerReader lengths; + + StringDirectTreeReader(int columnId) { + super(columnId); + } + + @Override + void startStripe(Map streams, + List encodings + ) throws IOException { + super.startStripe(streams, encodings); + StreamName name = new StreamName(columnId, + OrcProto.Stream.Kind.DATA); + stream = streams.get(name); + lengths = new RunLengthIntegerReader(streams.get(new + StreamName(columnId, OrcProto.Stream.Kind.LENGTH)), + false); + } + + @Override + void seek(PositionProvider[] index) throws IOException { + super.seek(index); + stream.seek(index[columnId]); + lengths.seek(index[columnId]); + } + + @Override + Object next(Object previous) throws IOException { + super.next(previous); + Text result = null; + if (valuePresent) { + if (previous == null) { + result = new Text(); + } else { + result = (Text) previous; + } + int len = (int) lengths.next(); + int offset = 0; + byte[] bytes = new byte[len]; + while (len > 0) { + int written = stream.read(bytes, offset, len); + if (written < 0) { + throw new EOFException("Can't finish byte read from " + stream); + } + len -= written; + offset += written; + } + result.set(bytes); } - int offset = dictionaryOffsets[entry]; - int length; - // if it isn't the last entry, subtract the offsets otherwise use - // the buffer length. 
- if (entry < dictionaryOffsets.length - 1) { - length = dictionaryOffsets[entry + 1] - offset; - } else { - length = dictionaryBuffer.size() - offset; + return result; + } + + @Override + void skipRows(long items) throws IOException { + items = countNonNulls(items); + long lengthToSkip = 0; + for(int i=0; i < items; ++i) { + lengthToSkip += lengths.next(); } - // If the column is just empty strings, the size will be zero, so the buffer will be null, - // in that case just return result as it will default to empty - if (dictionaryBuffer != null) { - dictionaryBuffer.setText(result, offset, length); + stream.skip(lengthToSkip); + } + } + + private static class StringDictionaryTreeReader extends TreeReader { + private DynamicByteArray dictionaryBuffer = null; + private int dictionarySize; + private int[] dictionaryOffsets; + private RunLengthIntegerReader reader; + + StringDictionaryTreeReader(int columnId) { + super(columnId); + } + + @Override + void startStripe(Map streams, + List encodings + ) throws IOException { + super.startStripe(streams, encodings); + + // read the dictionary blob + dictionarySize = encodings.get(columnId).getDictionarySize(); + StreamName name = new StreamName(columnId, + OrcProto.Stream.Kind.DICTIONARY_DATA); + InStream in = streams.get(name); + if (in.available() > 0) { + dictionaryBuffer = new DynamicByteArray(64, in.available()); + dictionaryBuffer.readAll(in); } else { - result.clear(); + dictionaryBuffer = null; } + in.close(); + + // read the lengths + name = new StreamName(columnId, OrcProto.Stream.Kind.LENGTH); + in = streams.get(name); + RunLengthIntegerReader lenReader = new RunLengthIntegerReader(in, false); + int offset = 0; + if (dictionaryOffsets == null || + dictionaryOffsets.length < dictionarySize + 1) { + dictionaryOffsets = new int[dictionarySize + 1]; + } + for(int i=0; i < dictionarySize; ++i) { + dictionaryOffsets[i] = offset; + offset += (int) lenReader.next(); + } + dictionaryOffsets[dictionarySize] = offset; + in.close(); + + // set up the row reader + name = new StreamName(columnId, OrcProto.Stream.Kind.DATA); + reader = new RunLengthIntegerReader(streams.get(name), false); } - return result; - } - @Override - void skipRows(long items) throws IOException { - reader.skip(countNonNulls(items)); + @Override + void seek(PositionProvider[] index) throws IOException { + super.seek(index); + reader.seek(index[columnId]); + } + + @Override + Object next(Object previous) throws IOException { + super.next(previous); + Text result = null; + if (valuePresent) { + int entry = (int) reader.next(); + if (previous == null) { + result = new Text(); + } else { + result = (Text) previous; + } + int offset = dictionaryOffsets[entry]; + int length; + // if it isn't the last entry, subtract the offsets otherwise use + // the buffer length. 
+ if (entry < dictionaryOffsets.length - 1) { + length = dictionaryOffsets[entry + 1] - offset; + } else { + length = dictionaryBuffer.size() - offset; + } + // If the column is just empty strings, the size will be zero, so the buffer will be null, + // in that case just return result as it will default to empty + if (dictionaryBuffer != null) { + dictionaryBuffer.setText(result, offset, length); + } else { + result.clear(); + } + } + return result; + } + + @Override + void skipRows(long items) throws IOException { + reader.skip(countNonNulls(items)); + } } } Index: ql/src/java/org/apache/hadoop/hive/ql/io/orc/StringRedBlackTree.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/io/orc/StringRedBlackTree.java (revision 1466174) +++ ql/src/java/org/apache/hadoop/hive/ql/io/orc/StringRedBlackTree.java (working copy) @@ -17,11 +17,11 @@ */ package org.apache.hadoop.hive.ql.io.orc; -import org.apache.hadoop.io.Text; - import java.io.IOException; import java.io.OutputStream; +import org.apache.hadoop.io.Text; + /** * A red-black tree that stores strings. The strings are stored as UTF-8 bytes * and an offset/length for each entry. @@ -113,16 +113,16 @@ } public Text getText() { - byteArray.setText(text, keySizes.get(originalPosition * 2), getLength()); + StringRedBlackTree.this.getText(text, originalPosition); return text; } public void writeBytes(OutputStream out) throws IOException { - byteArray.write(out, keySizes.get(originalPosition * 2), getLength()); + byteArray.write(out, StringRedBlackTree.this.getOffset(originalPosition), getLength()); } public int getLength() { - return keySizes.get(originalPosition * 2 + 1); + return StringRedBlackTree.this.getLength(originalPosition); } public int getCount() { @@ -142,7 +142,7 @@ /** * Visit all of the nodes in the tree in sorted order. - * @param visitor the action to be applied to each ndoe + * @param visitor the action to be applied to each node * @throws IOException */ public void visit(Visitor visitor) throws IOException { @@ -158,6 +158,18 @@ keySizes.clear(); } + public void getText(Text result, int originalPosition) { + byteArray.setText(result, getOffset(originalPosition), getLength(originalPosition)); + } + + private int getOffset(int originalPosition) { + return keySizes.get(originalPosition * 2); + } + + private int getLength(int originalPosition) { + return keySizes.get(originalPosition * 2 + 1); + } + /** * Get the size of the character data in the table. 
* @return the bytes used by the table Index: ql/src/java/org/apache/hadoop/hive/ql/io/orc/WriterImpl.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/io/orc/WriterImpl.java (revision 1466174) +++ ql/src/java/org/apache/hadoop/hive/ql/io/orc/WriterImpl.java (working copy) @@ -18,11 +18,20 @@ package org.apache.hadoop.hive.ql.io.orc; -import com.google.protobuf.ByteString; -import com.google.protobuf.CodedOutputStream; +import java.io.IOException; +import java.io.OutputStream; +import java.nio.ByteBuffer; +import java.sql.Timestamp; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.TreeMap; + +import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FSDataOutputStream; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.MapObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; @@ -41,15 +50,10 @@ import org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.primitive.TimestampObjectInspector; import org.apache.hadoop.io.BytesWritable; +import org.apache.hadoop.io.Text; -import java.io.IOException; -import java.io.OutputStream; -import java.nio.ByteBuffer; -import java.sql.Timestamp; -import java.util.ArrayList; -import java.util.List; -import java.util.Map; -import java.util.TreeMap; +import com.google.protobuf.ByteString; +import com.google.protobuf.CodedOutputStream; /** * An ORC file writer. The file is divided into stripes, which is the natural @@ -96,8 +100,11 @@ OrcProto.RowIndex.newBuilder(); private final boolean buildIndex; + private final Configuration conf; + WriterImpl(FileSystem fs, Path path, + Configuration conf, ObjectInspector inspector, long stripeSize, CompressionKind compress, @@ -105,13 +112,14 @@ int rowIndexStride) throws IOException { this.fs = fs; this.path = path; + this.conf = conf; this.stripeSize = stripeSize; this.compress = compress; this.bufferSize = bufferSize; this.rowIndexStride = rowIndexStride; buildIndex = rowIndexStride > 0; codec = createCodec(compress); - treeWriter = createTreeWriter(inspector, streamFactory, false); + treeWriter = createTreeWriter(inspector, streamFactory, false, conf); if (buildIndex && rowIndexStride < MIN_ROW_INDEX_STRIDE) { throw new IllegalArgumentException("Row stride must be at least " + MIN_ROW_INDEX_STRIDE); @@ -304,6 +312,7 @@ private final OrcProto.RowIndex.Builder rowIndex; private final OrcProto.RowIndexEntry.Builder rowIndexEntry; private final PositionedOutputStream rowIndexStream; + private final Configuration conf; /** * Create a tree writer @@ -315,9 +324,10 @@ */ TreeWriter(int columnId, ObjectInspector inspector, StreamFactory streamFactory, - boolean nullable) throws IOException { + boolean nullable, Configuration conf) throws IOException { this.id = columnId; this.inspector = inspector; + this.conf = conf; if (nullable) { isPresent = new BitFieldWriter(streamFactory.createStream(id, OrcProto.Stream.Kind.PRESENT), 1); @@ -455,8 +465,8 @@ BooleanTreeWriter(int columnId, ObjectInspector inspector, StreamFactory writer, - boolean nullable) throws IOException { - super(columnId, inspector, writer, nullable); + boolean nullable, Configuration conf) throws IOException { + super(columnId, 
inspector, writer, nullable, conf); PositionedOutputStream out = writer.createStream(id, OrcProto.Stream.Kind.DATA); this.writer = new BitFieldWriter(out, 1); @@ -494,8 +504,8 @@ ByteTreeWriter(int columnId, ObjectInspector inspector, StreamFactory writer, - boolean nullable) throws IOException { - super(columnId, inspector, writer, nullable); + boolean nullable, Configuration conf) throws IOException { + super(columnId, inspector, writer, nullable, conf); this.writer = new RunLengthByteWriter(writer.createStream(id, OrcProto.Stream.Kind.DATA)); recordPosition(rowIndexPosition); @@ -535,8 +545,8 @@ IntegerTreeWriter(int columnId, ObjectInspector inspector, StreamFactory writer, - boolean nullable) throws IOException { - super(columnId, inspector, writer, nullable); + boolean nullable, Configuration conf) throws IOException { + super(columnId, inspector, writer, nullable, conf); PositionedOutputStream out = writer.createStream(id, OrcProto.Stream.Kind.DATA); this.writer = new RunLengthIntegerWriter(out, true); @@ -595,8 +605,8 @@ FloatTreeWriter(int columnId, ObjectInspector inspector, StreamFactory writer, - boolean nullable) throws IOException { - super(columnId, inspector, writer, nullable); + boolean nullable, Configuration conf) throws IOException { + super(columnId, inspector, writer, nullable, conf); this.stream = writer.createStream(id, OrcProto.Stream.Kind.DATA); recordPosition(rowIndexPosition); @@ -633,8 +643,8 @@ DoubleTreeWriter(int columnId, ObjectInspector inspector, StreamFactory writer, - boolean nullable) throws IOException { - super(columnId, inspector, writer, nullable); + boolean nullable, Configuration conf) throws IOException { + super(columnId, inspector, writer, nullable, conf); this.stream = writer.createStream(id, OrcProto.Stream.Kind.DATA); recordPosition(rowIndexPosition); @@ -672,28 +682,42 @@ private final RunLengthIntegerWriter countOutput; private final StringRedBlackTree dictionary = new StringRedBlackTree(); private final DynamicIntArray rows = new DynamicIntArray(); + private final PositionedOutputStream directStreamOutput; + private final RunLengthIntegerWriter directLengthOutput; private final List savedRowIndex = new ArrayList(); private final boolean buildIndex; private final List rowIndexValueCount = new ArrayList(); + // If the number of keys in a dictionary is greater than this fraction of the total number of + // non-null rows, turn off dictionary encoding + private final float dictionaryKeySizeThreshold; + private boolean useDictionaryEncoding = true; + StringTreeWriter(int columnId, ObjectInspector inspector, StreamFactory writer, - boolean nullable) throws IOException { - super(columnId, inspector, writer, nullable); + boolean nullable, Configuration conf) throws IOException { + super(columnId, inspector, writer, nullable, conf); stringOutput = writer.createStream(id, OrcProto.Stream.Kind.DICTIONARY_DATA); lengthOutput = new RunLengthIntegerWriter(writer.createStream(id, OrcProto.Stream.Kind.LENGTH), false); rowOutput = new RunLengthIntegerWriter(writer.createStream(id, OrcProto.Stream.Kind.DATA), false); + directStreamOutput = writer.createStream(id, + OrcProto.Stream.Kind.DATA); + directLengthOutput = new RunLengthIntegerWriter(writer.createStream(id, + OrcProto.Stream.Kind.LENGTH), false); if (writer.buildIndex()) { countOutput = new RunLengthIntegerWriter(writer.createStream(id, OrcProto.Stream.Kind.DICTIONARY_COUNT), false); } else { countOutput = null; } + dictionaryKeySizeThreshold = conf.getFloat( + 
HiveConf.ConfVars.HIVE_ORC_DICTIONARY_KEY_SIZE_THRESHOLD.varname, + HiveConf.ConfVars.HIVE_ORC_DICTIONARY_KEY_SIZE_THRESHOLD.defaultFloatVal); recordPosition(rowIndexPosition); rowIndexValueCount.add(0L); buildIndex = writer.buildIndex(); @@ -713,25 +737,40 @@ @Override void writeStripe(OrcProto.StripeFooter.Builder builder, int requiredIndexEntries) throws IOException { - // Traverse the red-black tree writing out the bytes and lengths; and - // creating the map from the original order to the final sorted order. + // Set the flag indicating whether or not to use dictionary encoding based on whether + // or not the fraction of distinct keys over number of non-null rows is less than the + // configured threshold + if (rows.size() > 0 && + (float)(dictionary.size()) / (float)rows.size() <= dictionaryKeySizeThreshold) { + useDictionaryEncoding = true; + } else { + useDictionaryEncoding = false; + } + final int[] dumpOrder = new int[dictionary.size()]; - dictionary.visit(new StringRedBlackTree.Visitor() { - private int currentId = 0; - @Override - public void visit(StringRedBlackTree.VisitorContext context - ) throws IOException { - context.writeBytes(stringOutput); - lengthOutput.write(context.getLength()); - dumpOrder[context.getOriginalPosition()] = currentId++; - if (countOutput != null) { - countOutput.write(context.getCount()); + + if (useDictionaryEncoding) { + // Traverse the red-black tree writing out the bytes and lengths; and + // creating the map from the original order to the final sorted order. + dictionary.visit(new StringRedBlackTree.Visitor() { + private int currentId = 0; + @Override + public void visit(StringRedBlackTree.VisitorContext context + ) throws IOException { + context.writeBytes(stringOutput); + lengthOutput.write(context.getLength()); + dumpOrder[context.getOriginalPosition()] = currentId++; + if (countOutput != null) { + countOutput.write(context.getCount()); + } } - } - }); + }); + } + int length = rows.size(); int rowIndexEntry = 0; OrcProto.RowIndex.Builder rowIndex = getRowIndex(); + // need to build the first index entry out here, to handle the case of // not having any values. if (buildIndex) { @@ -739,10 +778,12 @@ rowIndexEntry < savedRowIndex.size()) { OrcProto.RowIndexEntry.Builder base = savedRowIndex.get(rowIndexEntry++).toBuilder(); - rowOutput.getPosition(new RowIndexPositionRecorder(base)); + recordOutputPosition(base); rowIndex.addEntry(base.build()); } } + + Text text = new Text(); // write the values translated into the dump order. for(int i = 0; i < length; ++i) { // now that we are writing out the row values, we can finalize the @@ -752,11 +793,17 @@ rowIndexEntry < savedRowIndex.size()) { OrcProto.RowIndexEntry.Builder base = savedRowIndex.get(rowIndexEntry++).toBuilder(); - rowOutput.getPosition(new RowIndexPositionRecorder(base)); + recordOutputPosition(base); rowIndex.addEntry(base.build()); } } - rowOutput.write(dumpOrder[rows.get(i)]); + if (useDictionaryEncoding) { + rowOutput.write(dumpOrder[rows.get(i)]); + } else { + dictionary.getText(text, rows.get(i)); + directStreamOutput.write(text.getBytes(), 0, text.getLength()); + directLengthOutput.write(text.getLength()); + } } // we need to build the rowindex before calling super, since it // writes it out. 
@@ -764,6 +811,8 @@ stringOutput.flush(); lengthOutput.flush(); rowOutput.flush(); + directStreamOutput.flush(); + directLengthOutput.flush(); if (countOutput != null) { countOutput.flush(); } @@ -776,11 +825,27 @@ rowIndexValueCount.add(0L); } + // Calls getPosition on the row output stream if dictionary encoding is used, and the direct + // output stream if direct encoding is used + private void recordOutputPosition(OrcProto.RowIndexEntry.Builder base) throws IOException { + if (useDictionaryEncoding) { + rowOutput.getPosition(new RowIndexPositionRecorder(base)); + } else { + directStreamOutput.getPosition(new RowIndexPositionRecorder(base)); + } + } + @Override OrcProto.ColumnEncoding getEncoding() { - return OrcProto.ColumnEncoding.newBuilder().setKind( - OrcProto.ColumnEncoding.Kind.DICTIONARY). - setDictionarySize(dictionary.size()).build(); + // Returns the encoding used for the last call to writeStripe + if (useDictionaryEncoding) { + return OrcProto.ColumnEncoding.newBuilder().setKind( + OrcProto.ColumnEncoding.Kind.DICTIONARY). + setDictionarySize(dictionary.size()).build(); + } else { + return OrcProto.ColumnEncoding.newBuilder().setKind( + OrcProto.ColumnEncoding.Kind.DIRECT).build(); + } } /** @@ -814,8 +879,8 @@ BinaryTreeWriter(int columnId, ObjectInspector inspector, StreamFactory writer, - boolean nullable) throws IOException { - super(columnId, inspector, writer, nullable); + boolean nullable, Configuration conf) throws IOException { + super(columnId, inspector, writer, nullable, conf); this.stream = writer.createStream(id, OrcProto.Stream.Kind.DATA); this.length = new RunLengthIntegerWriter(writer.createStream(id, @@ -862,8 +927,8 @@ TimestampTreeWriter(int columnId, ObjectInspector inspector, StreamFactory writer, - boolean nullable) throws IOException { - super(columnId, inspector, writer, nullable); + boolean nullable, Configuration conf) throws IOException { + super(columnId, inspector, writer, nullable, conf); this.seconds = new RunLengthIntegerWriter(writer.createStream(id, OrcProto.Stream.Kind.DATA), true); this.nanos = new RunLengthIntegerWriter(writer.createStream(id, @@ -921,15 +986,15 @@ StructTreeWriter(int columnId, ObjectInspector inspector, StreamFactory writer, - boolean nullable) throws IOException { - super(columnId, inspector, writer, nullable); + boolean nullable, Configuration conf) throws IOException { + super(columnId, inspector, writer, nullable, conf); StructObjectInspector structObjectInspector = (StructObjectInspector) inspector; fields = structObjectInspector.getAllStructFieldRefs(); childrenWriters = new TreeWriter[fields.size()]; for(int i=0; i < childrenWriters.length; ++i) { childrenWriters[i] = createTreeWriter( - fields.get(i).getFieldObjectInspector(), writer, true); + fields.get(i).getFieldObjectInspector(), writer, true, conf); } recordPosition(rowIndexPosition); } @@ -964,13 +1029,13 @@ ListTreeWriter(int columnId, ObjectInspector inspector, StreamFactory writer, - boolean nullable) throws IOException { - super(columnId, inspector, writer, nullable); + boolean nullable, Configuration conf) throws IOException { + super(columnId, inspector, writer, nullable, conf); ListObjectInspector listObjectInspector = (ListObjectInspector) inspector; childrenWriters = new TreeWriter[1]; childrenWriters[0] = createTreeWriter(listObjectInspector.getListElementObjectInspector(), - writer, true); + writer, true, conf); lengths = new RunLengthIntegerWriter(writer.createStream(columnId, OrcProto.Stream.Kind.LENGTH), false); @@ -1014,14 +1079,14 @@ 
MapTreeWriter(int columnId, ObjectInspector inspector, StreamFactory writer, - boolean nullable) throws IOException { - super(columnId, inspector, writer, nullable); + boolean nullable, Configuration conf) throws IOException { + super(columnId, inspector, writer, nullable, conf); MapObjectInspector insp = (MapObjectInspector) inspector; childrenWriters = new TreeWriter[2]; childrenWriters[0] = - createTreeWriter(insp.getMapKeyObjectInspector(), writer, true); + createTreeWriter(insp.getMapKeyObjectInspector(), writer, true, conf); childrenWriters[1] = - createTreeWriter(insp.getMapValueObjectInspector(), writer, true); + createTreeWriter(insp.getMapValueObjectInspector(), writer, true, conf); lengths = new RunLengthIntegerWriter(writer.createStream(columnId, OrcProto.Stream.Kind.LENGTH), false); @@ -1069,13 +1134,13 @@ UnionTreeWriter(int columnId, ObjectInspector inspector, StreamFactory writer, - boolean nullable) throws IOException { - super(columnId, inspector, writer, nullable); + boolean nullable, Configuration conf) throws IOException { + super(columnId, inspector, writer, nullable, conf); UnionObjectInspector insp = (UnionObjectInspector) inspector; List choices = insp.getObjectInspectors(); childrenWriters = new TreeWriter[choices.size()]; for(int i=0; i < childrenWriters.length; ++i) { - childrenWriters[i] = createTreeWriter(choices.get(i), writer, true); + childrenWriters[i] = createTreeWriter(choices.get(i), writer, true, conf); } tags = new RunLengthByteWriter(writer.createStream(columnId, @@ -1114,53 +1179,53 @@ private static TreeWriter createTreeWriter(ObjectInspector inspector, StreamFactory streamFactory, - boolean nullable + boolean nullable, Configuration conf ) throws IOException { switch (inspector.getCategory()) { case PRIMITIVE: switch (((PrimitiveObjectInspector) inspector).getPrimitiveCategory()) { case BOOLEAN: return new BooleanTreeWriter(streamFactory.getNextColumnId(), - inspector, streamFactory, nullable); + inspector, streamFactory, nullable, conf); case BYTE: return new ByteTreeWriter(streamFactory.getNextColumnId(), - inspector, streamFactory, nullable); + inspector, streamFactory, nullable, conf); case SHORT: case INT: case LONG: return new IntegerTreeWriter(streamFactory.getNextColumnId(), - inspector, streamFactory, nullable); + inspector, streamFactory, nullable, conf); case FLOAT: return new FloatTreeWriter(streamFactory.getNextColumnId(), - inspector, streamFactory, nullable); + inspector, streamFactory, nullable, conf); case DOUBLE: return new DoubleTreeWriter(streamFactory.getNextColumnId(), - inspector, streamFactory, nullable); + inspector, streamFactory, nullable, conf); case STRING: return new StringTreeWriter(streamFactory.getNextColumnId(), - inspector, streamFactory, nullable); + inspector, streamFactory, nullable, conf); case BINARY: return new BinaryTreeWriter(streamFactory.getNextColumnId(), - inspector, streamFactory, nullable); + inspector, streamFactory, nullable, conf); case TIMESTAMP: return new TimestampTreeWriter(streamFactory.getNextColumnId(), - inspector, streamFactory, nullable); + inspector, streamFactory, nullable, conf); default: throw new IllegalArgumentException("Bad primitive category " + ((PrimitiveObjectInspector) inspector).getPrimitiveCategory()); } case STRUCT: return new StructTreeWriter(streamFactory.getNextColumnId(), inspector, - streamFactory, nullable); + streamFactory, nullable, conf); case MAP: return new MapTreeWriter(streamFactory.getNextColumnId(), inspector, - streamFactory, nullable); + streamFactory, 
nullable, conf); case LIST: return new ListTreeWriter(streamFactory.getNextColumnId(), inspector, - streamFactory, nullable); + streamFactory, nullable, conf); case UNION: return new UnionTreeWriter(streamFactory.getNextColumnId(), inspector, - streamFactory, nullable); + streamFactory, nullable, conf); default: throw new IllegalArgumentException("Bad category: " + inspector.getCategory()); Index: ql/src/test/org/apache/hadoop/hive/ql/QTestUtil.java =================================================================== --- ql/src/test/org/apache/hadoop/hive/ql/QTestUtil.java (revision 1466174) +++ ql/src/test/org/apache/hadoop/hive/ql/QTestUtil.java (working copy) @@ -198,7 +198,7 @@ public String getLogDirectory() { return logDir; } - + private String getHadoopMainVersion(String input) { if (input == null) { return null; Index: ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestFileDump.java =================================================================== --- ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestFileDump.java (revision 1466174) +++ ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestFileDump.java (working copy) @@ -18,15 +18,8 @@ package org.apache.hadoop.hive.ql.io.orc; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; -import org.junit.Before; -import org.junit.Rule; -import org.junit.Test; -import org.junit.rules.TestName; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNull; import java.io.BufferedReader; import java.io.File; @@ -35,8 +28,14 @@ import java.io.PrintStream; import java.util.Random; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertNull; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; +import org.junit.Before; +import org.junit.Test; public class TestFileDump { @@ -69,9 +68,6 @@ } } - private static final String outputFilename = - File.separator + "orc-file-dump.out"; - private static void checkOutput(String expected, String actual) throws Exception { BufferedReader eStream = @@ -94,7 +90,7 @@ inspector = ObjectInspectorFactory.getReflectionObjectInspector (MyRecord.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA); } - Writer writer = OrcFile.createWriter(fs, testFilePath, inspector, + Writer writer = OrcFile.createWriter(fs, testFilePath, conf, inspector, 100000, CompressionKind.ZLIB, 10000, 10000); Random r1 = new Random(1); String[] words = new String[]{"It", "was", "the", "best", "of", "times,", @@ -114,8 +110,8 @@ } writer.close(); PrintStream origOut = System.out; - FileOutputStream myOut = new FileOutputStream(workDir + File.separator + - "orc-file-dump.out"); + String outputFilename = File.separator + "orc-file-dump.out"; + FileOutputStream myOut = new FileOutputStream(workDir + outputFilename); // replace stdout and run command System.setOut(new PrintStream(myOut)); @@ -123,6 +119,60 @@ System.out.flush(); System.setOut(origOut); + checkOutput(resourceDir + outputFilename, workDir + outputFilename); } + + // Test that if the fraction of rows that have distinct strings is greater than the 
configured + // threshold dictionary encoding is turned off. If dictionary encoding is turned off the length + // of the dictionary stream for the column will be 0 in the ORC file dump. + @Test + public void testDictionaryThreshold() throws Exception { + ObjectInspector inspector; + synchronized (TestOrcFile.class) { + inspector = ObjectInspectorFactory.getReflectionObjectInspector + (MyRecord.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA); + } + Configuration conf = new Configuration(); + conf.setFloat(HiveConf.ConfVars.HIVE_ORC_DICTIONARY_KEY_SIZE_THRESHOLD.varname, 0.49f); + Writer writer = OrcFile.createWriter(fs, testFilePath, conf, inspector, + 100000, CompressionKind.ZLIB, 10000, 10000); + Random r1 = new Random(1); + String[] words = new String[]{"It", "was", "the", "best", "of", "times,", + "it", "was", "the", "worst", "of", "times,", "it", "was", "the", "age", + "of", "wisdom,", "it", "was", "the", "age", "of", "foolishness,", "it", + "was", "the", "epoch", "of", "belief,", "it", "was", "the", "epoch", + "of", "incredulity,", "it", "was", "the", "season", "of", "Light,", + "it", "was", "the", "season", "of", "Darkness,", "it", "was", "the", + "spring", "of", "hope,", "it", "was", "the", "winter", "of", "despair,", + "we", "had", "everything", "before", "us,", "we", "had", "nothing", + "before", "us,", "we", "were", "all", "going", "direct", "to", + "Heaven,", "we", "were", "all", "going", "direct", "the", "other", + "way"}; + int nextInt = 0; + for(int i=0; i < 21000; ++i) { + // Write out the same string twice, this guarantees the fraction of rows with + // distinct strings is 0.5 + if (i % 2 == 0) { + nextInt = r1.nextInt(words.length); + // Append the value of i to the word, this guarantees when an index or word is repeated + // the actual string is unique. 
+ words[nextInt] += "-" + i; + } + writer.addRow(new MyRecord(r1.nextInt(), r1.nextLong(), + words[nextInt])); + } + writer.close(); + PrintStream origOut = System.out; + String outputFilename = File.separator + "orc-file-dump-dictionary-threshold.out"; + FileOutputStream myOut = new FileOutputStream(workDir + outputFilename); + + // replace stdout and run command + System.setOut(new PrintStream(myOut)); + FileDump.main(new String[]{testFilePath.toString()}); + System.out.flush(); + System.setOut(origOut); + + checkOutput(resourceDir + outputFilename, workDir + outputFilename); + } } Index: ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestOrcFile.java =================================================================== --- ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestOrcFile.java (revision 1466174) +++ ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestOrcFile.java (working copy) @@ -187,7 +187,7 @@ inspector = ObjectInspectorFactory.getReflectionObjectInspector (BigRow.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA); } - Writer writer = OrcFile.createWriter(fs, testFilePath, inspector, + Writer writer = OrcFile.createWriter(fs, testFilePath, conf, inspector, 100000, CompressionKind.ZLIB, 10000, 10000); writer.addRow(new BigRow(false, (byte) 1, (short) 1024, 65536, Long.MAX_VALUE, (float) 1.0, -15.0, bytes(0,1,2,3,4), "hi", @@ -419,7 +419,7 @@ (InnerStruct.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA); } - Writer writer = OrcFile.createWriter(fs, testFilePath, inspector, + Writer writer = OrcFile.createWriter(fs, testFilePath, conf, inspector, 1000, CompressionKind.NONE, 100, 1000); Random r1 = new Random(1); Random r2 = new Random(2); @@ -502,7 +502,7 @@ inspector = ObjectInspectorFactory.getReflectionObjectInspector (BigRow.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA); } - Writer writer = OrcFile.createWriter(fs, testFilePath, inspector, + Writer writer = OrcFile.createWriter(fs, testFilePath, conf, inspector, 1000, CompressionKind.NONE, 100, 10000); writer.close(); Reader reader = OrcFile.createReader(fs, testFilePath); @@ -522,7 +522,7 @@ inspector = ObjectInspectorFactory.getReflectionObjectInspector (BigRow.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA); } - Writer writer = OrcFile.createWriter(fs, testFilePath, inspector, + Writer writer = OrcFile.createWriter(fs, testFilePath, conf, inspector, 1000, CompressionKind.NONE, 100, 10000); writer.addUserMetadata("my.meta", byteBuf(1, 2, 3, 4, 5, 6, 7, -1, -2, 127, -128)); writer.addUserMetadata("clobber", byteBuf(1,2,3)); @@ -585,7 +585,7 @@ synchronized (TestOrcFile.class) { inspector = OrcStruct.createObjectInspector(0, types); } - Writer writer = OrcFile.createWriter(fs, testFilePath, inspector, + Writer writer = OrcFile.createWriter(fs, testFilePath, conf, inspector, 1000, CompressionKind.NONE, 100, 10000); OrcStruct row = new OrcStruct(2); OrcUnion union = new OrcUnion(); @@ -724,7 +724,7 @@ (InnerStruct.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA); } - Writer writer = OrcFile.createWriter(fs, testFilePath, inspector, + Writer writer = OrcFile.createWriter(fs, testFilePath, conf, inspector, 1000, CompressionKind.SNAPPY, 100, 10000); Random rand = new Random(12); for(int i=0; i < 10000; ++i) { @@ -759,7 +759,7 @@ (InnerStruct.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA); } - Writer writer = OrcFile.createWriter(fs, testFilePath, inspector, + Writer writer = OrcFile.createWriter(fs, testFilePath, conf, inspector, 5000, CompressionKind.SNAPPY, 1000, 0); 
Random rand = new Random(24); for(int i=0; i < 10000; ++i) { @@ -800,7 +800,7 @@ inspector = ObjectInspectorFactory.getReflectionObjectInspector (BigRow.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA); } - Writer writer = OrcFile.createWriter(fs, testFilePath, inspector, + Writer writer = OrcFile.createWriter(fs, testFilePath, conf, inspector, 200000, CompressionKind.ZLIB, 65536, 1000); Random rand = new Random(42); final int COUNT=32768; Index: ql/src/test/queries/clientpositive/orc_dictionary_threshold.q =================================================================== --- ql/src/test/queries/clientpositive/orc_dictionary_threshold.q (revision 0) +++ ql/src/test/queries/clientpositive/orc_dictionary_threshold.q (working copy) @@ -0,0 +1,43 @@ +set hive.exec.orc.dictionary.key.size.threshold=-1; + +-- Set the threshold to -1 to guarantee dictionary encoding is turned off +-- Tests that the data can be read back correctly when a string column is stored +-- without dictionary encoding + +CREATE TABLE test_orc (key STRING) +ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.orc.OrcSerde' +STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat' +OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat'; + +INSERT OVERWRITE TABLE test_orc SELECT key FROM src limit 10; + +-- Test reading the column back + +SELECT * FROM test_orc; + +ALTER TABLE test_orc SET SERDEPROPERTIES ('orc.stripe.size' = '1'); + +CREATE TABLE src_thousand(key STRING) STORED AS TEXTFILE; +LOAD DATA LOCAL INPATH '../data/files/one_thousand' INTO TABLE src_thousand; + +set hive.exec.orc.dictionary.key.size.threshold=0.01; + +-- Add data to the table in such a way that alternate stripes encode the column +-- differently. Setting orc.stripe.size = 1 guarantees the stripes each have 1000 +-- rows. Setting the threshold to 0.01 guarantees that if there are 1000 distinct values +-- for the column in a stripe, it is direct encoded, and if there is only 1 distinct value +-- it is dictionary encoded. The preceding letters are just to guarantee that the order by +-- orders them such that the encodings alternate. 
+ +INSERT OVERWRITE TABLE test_orc +SELECT key FROM ( +SELECT CONCAT("a", key) AS key FROM src_thousand +UNION ALL +SELECT CONCAT("b", 1) AS key FROM src_thousand +UNION ALL +SELECT CONCAT("c", key) AS key FROM src_thousand +UNION ALL +SELECT CONCAT("d", 1) AS key FROM src_thousand +) a ORDER BY key LIMIT 4000; + +SELECT SUM(HASH(key)) FROM test_orc; Index: ql/src/test/resources/orc-file-dump-dictionary-threshold.out =================================================================== --- ql/src/test/resources/orc-file-dump-dictionary-threshold.out (revision 0) +++ ql/src/test/resources/orc-file-dump-dictionary-threshold.out (working copy) @@ -0,0 +1,337 @@ +Structure for TestFileDump.testDump.orc +Rows: 21000 +Compression: ZLIB +Compression size: 10000 +Type: struct + +Statistics: + Column 0: count: 21000 + Column 1: count: 21000 min: -2147390285 max: 2147453086 sum: 109128518326 + Column 2: count: 21000 min: -9222731174895935707 max: 9222919052987871506 + Column 3: count: 21000 min: Darkness,-230 max: worst-54-290-346-648-908-996-1038-1080-1560-1584-1620-1744-1770-1798-1852-1966-2162-2244-2286-2296-2534-2660-3114-3676-3788-4068-4150-4706-4744-5350-5420-5582-5696-5726-6006-6020-6024-6098-6184-6568-6636-6802-6994-7004-7318-7498-7758-7780-7798-7920-7952-7960-7988-8232-8256-8390-8416-8478-8620-8840-8984-9038-9128-9236-9248-9344-9594-9650-9714-9928-9938-10178-10368-10414-10502-10732-10876-11008-11158-11410-11722-11836-11964-12054-12096-12126-12136-12202-12246-12298-12616-12774-12782-12790-12802-12976-13216-13246-13502-13766-14454-14974-15004-15124-15252-15294-15356-15530-15610-16316-16936-17024-17122-17214-17310-17528-17682-17742-17870-17878-18010-18410-18524-18788-19204-19254-19518-19596-19786-19874-19904-20390-20752-20936 + +Stripes: + Stripe: offset: 3 data: 70427 rows: 3000 tail: 87 index: 214 + Stream: column 0 section ROW_INDEX start: 3 length 10 + Stream: column 1 section ROW_INDEX start: 13 length 39 + Stream: column 2 section ROW_INDEX start: 52 length 42 + Stream: column 3 section DICTIONARY_COUNT start: 94 length 0 + Stream: column 3 section ROW_INDEX start: 94 length 123 + Stream: column 1 section PRESENT start: 217 length 9 + Stream: column 1 section DATA start: 226 length 13559 + Stream: column 2 section PRESENT start: 13785 length 9 + Stream: column 2 section DATA start: 13794 length 26056 + Stream: column 3 section PRESENT start: 39850 length 9 + Stream: column 3 section DATA start: 39859 length 28410 + Stream: column 3 section LENGTH start: 68269 length 2375 + Stream: column 3 section DICTIONARY_DATA start: 70644 length 0 + Encoding column 0: DIRECT + Encoding column 1: DIRECT + Encoding column 2: DIRECT + Encoding column 3: DIRECT + Stripe: offset: 70731 data: 80428 rows: 2000 tail: 88 index: 261 + Stream: column 0 section ROW_INDEX start: 70731 length 10 + Stream: column 1 section ROW_INDEX start: 70741 length 39 + Stream: column 2 section ROW_INDEX start: 70780 length 43 + Stream: column 3 section DICTIONARY_COUNT start: 70823 length 0 + Stream: column 3 section ROW_INDEX start: 70823 length 169 + Stream: column 1 section PRESENT start: 70992 length 7 + Stream: column 1 section DATA start: 70999 length 9022 + Stream: column 2 section PRESENT start: 80021 length 7 + Stream: column 2 section DATA start: 80028 length 17376 + Stream: column 3 section PRESENT start: 97404 length 7 + Stream: column 3 section DATA start: 97411 length 52208 + Stream: column 3 section LENGTH start: 149619 length 1801 + Stream: column 3 section DICTIONARY_DATA start: 151420 length 0 + Encoding column 
0: DIRECT + Encoding column 1: DIRECT + Encoding column 2: DIRECT + Encoding column 3: DIRECT + Stripe: offset: 151508 data: 50612 rows: 1000 tail: 88 index: 324 + Stream: column 0 section ROW_INDEX start: 151508 length 10 + Stream: column 1 section ROW_INDEX start: 151518 length 39 + Stream: column 2 section ROW_INDEX start: 151557 length 42 + Stream: column 3 section DICTIONARY_COUNT start: 151599 length 0 + Stream: column 3 section ROW_INDEX start: 151599 length 233 + Stream: column 1 section PRESENT start: 151832 length 5 + Stream: column 1 section DATA start: 151837 length 4539 + Stream: column 2 section PRESENT start: 156376 length 5 + Stream: column 2 section DATA start: 156381 length 8695 + Stream: column 3 section PRESENT start: 165076 length 5 + Stream: column 3 section DATA start: 165081 length 36461 + Stream: column 3 section LENGTH start: 201542 length 902 + Stream: column 3 section DICTIONARY_DATA start: 202444 length 0 + Encoding column 0: DIRECT + Encoding column 1: DIRECT + Encoding column 2: DIRECT + Encoding column 3: DIRECT + Stripe: offset: 202532 data: 56944 rows: 1000 tail: 88 index: 361 + Stream: column 0 section ROW_INDEX start: 202532 length 10 + Stream: column 1 section ROW_INDEX start: 202542 length 39 + Stream: column 2 section ROW_INDEX start: 202581 length 43 + Stream: column 3 section DICTIONARY_COUNT start: 202624 length 0 + Stream: column 3 section ROW_INDEX start: 202624 length 269 + Stream: column 1 section PRESENT start: 202893 length 5 + Stream: column 1 section DATA start: 202898 length 4540 + Stream: column 2 section PRESENT start: 207438 length 5 + Stream: column 2 section DATA start: 207443 length 8679 + Stream: column 3 section PRESENT start: 216122 length 5 + Stream: column 3 section DATA start: 216127 length 42824 + Stream: column 3 section LENGTH start: 258951 length 886 + Stream: column 3 section DICTIONARY_DATA start: 259837 length 0 + Encoding column 0: DIRECT + Encoding column 1: DIRECT + Encoding column 2: DIRECT + Encoding column 3: DIRECT + Stripe: offset: 259925 data: 66088 rows: 1000 tail: 88 index: 405 + Stream: column 0 section ROW_INDEX start: 259925 length 10 + Stream: column 1 section ROW_INDEX start: 259935 length 39 + Stream: column 2 section ROW_INDEX start: 259974 length 42 + Stream: column 3 section DICTIONARY_COUNT start: 260016 length 0 + Stream: column 3 section ROW_INDEX start: 260016 length 314 + Stream: column 1 section PRESENT start: 260330 length 5 + Stream: column 1 section DATA start: 260335 length 4533 + Stream: column 2 section PRESENT start: 264868 length 5 + Stream: column 2 section DATA start: 264873 length 8683 + Stream: column 3 section PRESENT start: 273556 length 5 + Stream: column 3 section DATA start: 273561 length 51921 + Stream: column 3 section LENGTH start: 325482 length 936 + Stream: column 3 section DICTIONARY_DATA start: 326418 length 0 + Encoding column 0: DIRECT + Encoding column 1: DIRECT + Encoding column 2: DIRECT + Encoding column 3: DIRECT + Stripe: offset: 326506 data: 74066 rows: 1000 tail: 88 index: 432 + Stream: column 0 section ROW_INDEX start: 326506 length 10 + Stream: column 1 section ROW_INDEX start: 326516 length 39 + Stream: column 2 section ROW_INDEX start: 326555 length 42 + Stream: column 3 section DICTIONARY_COUNT start: 326597 length 0 + Stream: column 3 section ROW_INDEX start: 326597 length 341 + Stream: column 1 section PRESENT start: 326938 length 5 + Stream: column 1 section DATA start: 326943 length 4534 + Stream: column 2 section PRESENT start: 331477 length 5 + 
Stream: column 2 section DATA start: 331482 length 8688 + Stream: column 3 section PRESENT start: 340170 length 5 + Stream: column 3 section DATA start: 340175 length 59841 + Stream: column 3 section LENGTH start: 400016 length 988 + Stream: column 3 section DICTIONARY_DATA start: 401004 length 0 + Encoding column 0: DIRECT + Encoding column 1: DIRECT + Encoding column 2: DIRECT + Encoding column 3: DIRECT + Stripe: offset: 401092 data: 81272 rows: 1000 tail: 87 index: 467 + Stream: column 0 section ROW_INDEX start: 401092 length 10 + Stream: column 1 section ROW_INDEX start: 401102 length 39 + Stream: column 2 section ROW_INDEX start: 401141 length 42 + Stream: column 3 section DICTIONARY_COUNT start: 401183 length 0 + Stream: column 3 section ROW_INDEX start: 401183 length 376 + Stream: column 1 section PRESENT start: 401559 length 5 + Stream: column 1 section DATA start: 401564 length 4536 + Stream: column 2 section PRESENT start: 406100 length 5 + Stream: column 2 section DATA start: 406105 length 8699 + Stream: column 3 section PRESENT start: 414804 length 5 + Stream: column 3 section DATA start: 414809 length 67035 + Stream: column 3 section LENGTH start: 481844 length 987 + Stream: column 3 section DICTIONARY_DATA start: 482831 length 0 + Encoding column 0: DIRECT + Encoding column 1: DIRECT + Encoding column 2: DIRECT + Encoding column 3: DIRECT + Stripe: offset: 482918 data: 91489 rows: 1000 tail: 88 index: 504 + Stream: column 0 section ROW_INDEX start: 482918 length 10 + Stream: column 1 section ROW_INDEX start: 482928 length 39 + Stream: column 2 section ROW_INDEX start: 482967 length 43 + Stream: column 3 section DICTIONARY_COUNT start: 483010 length 0 + Stream: column 3 section ROW_INDEX start: 483010 length 412 + Stream: column 1 section PRESENT start: 483422 length 5 + Stream: column 1 section DATA start: 483427 length 4542 + Stream: column 2 section PRESENT start: 487969 length 5 + Stream: column 2 section DATA start: 487974 length 8687 + Stream: column 3 section PRESENT start: 496661 length 5 + Stream: column 3 section DATA start: 496666 length 77266 + Stream: column 3 section LENGTH start: 573932 length 979 + Stream: column 3 section DICTIONARY_DATA start: 574911 length 0 + Encoding column 0: DIRECT + Encoding column 1: DIRECT + Encoding column 2: DIRECT + Encoding column 3: DIRECT + Stripe: offset: 574999 data: 100263 rows: 1000 tail: 87 index: 536 + Stream: column 0 section ROW_INDEX start: 574999 length 10 + Stream: column 1 section ROW_INDEX start: 575009 length 38 + Stream: column 2 section ROW_INDEX start: 575047 length 42 + Stream: column 3 section DICTIONARY_COUNT start: 575089 length 0 + Stream: column 3 section ROW_INDEX start: 575089 length 446 + Stream: column 1 section PRESENT start: 575535 length 5 + Stream: column 1 section DATA start: 575540 length 4542 + Stream: column 2 section PRESENT start: 580082 length 5 + Stream: column 2 section DATA start: 580087 length 8687 + Stream: column 3 section PRESENT start: 588774 length 5 + Stream: column 3 section DATA start: 588779 length 86035 + Stream: column 3 section LENGTH start: 674814 length 984 + Stream: column 3 section DICTIONARY_DATA start: 675798 length 0 + Encoding column 0: DIRECT + Encoding column 1: DIRECT + Encoding column 2: DIRECT + Encoding column 3: DIRECT + Stripe: offset: 675885 data: 108783 rows: 1000 tail: 87 index: 586 + Stream: column 0 section ROW_INDEX start: 675885 length 10 + Stream: column 1 section ROW_INDEX start: 675895 length 39 + Stream: column 2 section ROW_INDEX start: 675934 
length 43
+ Stream: column 3 section DICTIONARY_COUNT start: 675977 length 0
+ Stream: column 3 section ROW_INDEX start: 675977 length 494
+ Stream: column 1 section PRESENT start: 676471 length 5
+ Stream: column 1 section DATA start: 676476 length 4545
+ Stream: column 2 section PRESENT start: 681021 length 5
+ Stream: column 2 section DATA start: 681026 length 8688
+ Stream: column 3 section PRESENT start: 689714 length 5
+ Stream: column 3 section DATA start: 689719 length 94539
+ Stream: column 3 section LENGTH start: 784258 length 996
+ Stream: column 3 section DICTIONARY_DATA start: 785254 length 0
+ Encoding column 0: DIRECT
+ Encoding column 1: DIRECT
+ Encoding column 2: DIRECT
+ Encoding column 3: DIRECT
+ Stripe: offset: 785341 data: 115818 rows: 1000 tail: 88 index: 607
+ Stream: column 0 section ROW_INDEX start: 785341 length 10
+ Stream: column 1 section ROW_INDEX start: 785351 length 39
+ Stream: column 2 section ROW_INDEX start: 785390 length 43
+ Stream: column 3 section DICTIONARY_COUNT start: 785433 length 0
+ Stream: column 3 section ROW_INDEX start: 785433 length 515
+ Stream: column 1 section PRESENT start: 785948 length 5
+ Stream: column 1 section DATA start: 785953 length 4546
+ Stream: column 2 section PRESENT start: 790499 length 5
+ Stream: column 2 section DATA start: 790504 length 8688
+ Stream: column 3 section PRESENT start: 799192 length 5
+ Stream: column 3 section DATA start: 799197 length 101544
+ Stream: column 3 section LENGTH start: 900741 length 1025
+ Stream: column 3 section DICTIONARY_DATA start: 901766 length 0
+ Encoding column 0: DIRECT
+ Encoding column 1: DIRECT
+ Encoding column 2: DIRECT
+ Encoding column 3: DIRECT
+ Stripe: offset: 901854 data: 122636 rows: 1000 tail: 88 index: 626
+ Stream: column 0 section ROW_INDEX start: 901854 length 10
+ Stream: column 1 section ROW_INDEX start: 901864 length 39
+ Stream: column 2 section ROW_INDEX start: 901903 length 43
+ Stream: column 3 section DICTIONARY_COUNT start: 901946 length 0
+ Stream: column 3 section ROW_INDEX start: 901946 length 534
+ Stream: column 1 section PRESENT start: 902480 length 5
+ Stream: column 1 section DATA start: 902485 length 4543
+ Stream: column 2 section PRESENT start: 907028 length 5
+ Stream: column 2 section DATA start: 907033 length 8686
+ Stream: column 3 section PRESENT start: 915719 length 5
+ Stream: column 3 section DATA start: 915724 length 108415
+ Stream: column 3 section LENGTH start: 1024139 length 977
+ Stream: column 3 section DICTIONARY_DATA start: 1025116 length 0
+ Encoding column 0: DIRECT
+ Encoding column 1: DIRECT
+ Encoding column 2: DIRECT
+ Encoding column 3: DIRECT
+ Stripe: offset: 1025204 data: 135122 rows: 1000 tail: 88 index: 665
+ Stream: column 0 section ROW_INDEX start: 1025204 length 10
+ Stream: column 1 section ROW_INDEX start: 1025214 length 39
+ Stream: column 2 section ROW_INDEX start: 1025253 length 43
+ Stream: column 3 section DICTIONARY_COUNT start: 1025296 length 0
+ Stream: column 3 section ROW_INDEX start: 1025296 length 573
+ Stream: column 1 section PRESENT start: 1025869 length 5
+ Stream: column 1 section DATA start: 1025874 length 4540
+ Stream: column 2 section PRESENT start: 1030414 length 5
+ Stream: column 2 section DATA start: 1030419 length 8683
+ Stream: column 3 section PRESENT start: 1039102 length 5
+ Stream: column 3 section DATA start: 1039107 length 120883
+ Stream: column 3 section LENGTH start: 1159990 length 1001
+ Stream: column 3 section DICTIONARY_DATA start: 1160991 length 0
+ Encoding column 0: DIRECT
+ Encoding column 1: DIRECT
+ Encoding column 2: DIRECT
+ Encoding column 3: DIRECT
+ Stripe: offset: 1161079 data: 143016 rows: 1000 tail: 88 index: 679
+ Stream: column 0 section ROW_INDEX start: 1161079 length 10
+ Stream: column 1 section ROW_INDEX start: 1161089 length 39
+ Stream: column 2 section ROW_INDEX start: 1161128 length 42
+ Stream: column 3 section DICTIONARY_COUNT start: 1161170 length 0
+ Stream: column 3 section ROW_INDEX start: 1161170 length 588
+ Stream: column 1 section PRESENT start: 1161758 length 5
+ Stream: column 1 section DATA start: 1161763 length 4542
+ Stream: column 2 section PRESENT start: 1166305 length 5
+ Stream: column 2 section DATA start: 1166310 length 8683
+ Stream: column 3 section PRESENT start: 1174993 length 5
+ Stream: column 3 section DATA start: 1174998 length 128729
+ Stream: column 3 section LENGTH start: 1303727 length 1047
+ Stream: column 3 section DICTIONARY_DATA start: 1304774 length 0
+ Encoding column 0: DIRECT
+ Encoding column 1: DIRECT
+ Encoding column 2: DIRECT
+ Encoding column 3: DIRECT
+ Stripe: offset: 1304862 data: 152284 rows: 1000 tail: 88 index: 727
+ Stream: column 0 section ROW_INDEX start: 1304862 length 10
+ Stream: column 1 section ROW_INDEX start: 1304872 length 39
+ Stream: column 2 section ROW_INDEX start: 1304911 length 42
+ Stream: column 3 section DICTIONARY_COUNT start: 1304953 length 0
+ Stream: column 3 section ROW_INDEX start: 1304953 length 636
+ Stream: column 1 section PRESENT start: 1305589 length 5
+ Stream: column 1 section DATA start: 1305594 length 4545
+ Stream: column 2 section PRESENT start: 1310139 length 5
+ Stream: column 2 section DATA start: 1310144 length 8684
+ Stream: column 3 section PRESENT start: 1318828 length 5
+ Stream: column 3 section DATA start: 1318833 length 137968
+ Stream: column 3 section LENGTH start: 1456801 length 1072
+ Stream: column 3 section DICTIONARY_DATA start: 1457873 length 0
+ Encoding column 0: DIRECT
+ Encoding column 1: DIRECT
+ Encoding column 2: DIRECT
+ Encoding column 3: DIRECT
+ Stripe: offset: 1457961 data: 158956 rows: 1000 tail: 88 index: 746
+ Stream: column 0 section ROW_INDEX start: 1457961 length 10
+ Stream: column 1 section ROW_INDEX start: 1457971 length 39
+ Stream: column 2 section ROW_INDEX start: 1458010 length 42
+ Stream: column 3 section DICTIONARY_COUNT start: 1458052 length 0
+ Stream: column 3 section ROW_INDEX start: 1458052 length 655
+ Stream: column 1 section PRESENT start: 1458707 length 5
+ Stream: column 1 section DATA start: 1458712 length 4544
+ Stream: column 2 section PRESENT start: 1463256 length 5
+ Stream: column 2 section DATA start: 1463261 length 8682
+ Stream: column 3 section PRESENT start: 1471943 length 5
+ Stream: column 3 section DATA start: 1471948 length 144638
+ Stream: column 3 section LENGTH start: 1616586 length 1077
+ Stream: column 3 section DICTIONARY_DATA start: 1617663 length 0
+ Encoding column 0: DIRECT
+ Encoding column 1: DIRECT
+ Encoding column 2: DIRECT
+ Encoding column 3: DIRECT
+ Stripe: offset: 1617751 data: 170658 rows: 1000 tail: 88 index: 782
+ Stream: column 0 section ROW_INDEX start: 1617751 length 10
+ Stream: column 1 section ROW_INDEX start: 1617761 length 39
+ Stream: column 2 section ROW_INDEX start: 1617800 length 42
+ Stream: column 3 section DICTIONARY_COUNT start: 1617842 length 0
+ Stream: column 3 section ROW_INDEX start: 1617842 length 691
+ Stream: column 1 section PRESENT start: 1618533 length 5
+ Stream: column 1 section DATA start: 1618538 length 4537
+ Stream: column 2 section PRESENT start: 1623075 length 5
+ Stream: column 2 section DATA start: 1623080 length 8691
+ Stream: column 3 section PRESENT start: 1631771 length 5
+ Stream: column 3 section DATA start: 1631776 length 156341
+ Stream: column 3 section LENGTH start: 1788117 length 1074
+ Stream: column 3 section DICTIONARY_DATA start: 1789191 length 0
+ Encoding column 0: DIRECT
+ Encoding column 1: DIRECT
+ Encoding column 2: DIRECT
+ Encoding column 3: DIRECT
+ Stripe: offset: 1789279 data: 179008 rows: 1000 tail: 88 index: 808
+ Stream: column 0 section ROW_INDEX start: 1789279 length 10
+ Stream: column 1 section ROW_INDEX start: 1789289 length 39
+ Stream: column 2 section ROW_INDEX start: 1789328 length 42
+ Stream: column 3 section DICTIONARY_COUNT start: 1789370 length 0
+ Stream: column 3 section ROW_INDEX start: 1789370 length 717
+ Stream: column 1 section PRESENT start: 1790087 length 5
+ Stream: column 1 section DATA start: 1790092 length 4542
+ Stream: column 2 section PRESENT start: 1794634 length 5
+ Stream: column 2 section DATA start: 1794639 length 8684
+ Stream: column 3 section PRESENT start: 1803323 length 5
+ Stream: column 3 section DATA start: 1803328 length 164661
+ Stream: column 3 section LENGTH start: 1967989 length 1106
+ Stream: column 3 section DICTIONARY_DATA start: 1969095 length 0
+ Encoding column 0: DIRECT
+ Encoding column 1: DIRECT
+ Encoding column 2: DIRECT
+ Encoding column 3: DIRECT
Index: ql/src/test/results/clientpositive/orc_dictionary_threshold.q.out
===================================================================
--- ql/src/test/results/clientpositive/orc_dictionary_threshold.q.out (revision 0)
+++ ql/src/test/results/clientpositive/orc_dictionary_threshold.q.out (working copy)
@@ -0,0 +1,100 @@
+PREHOOK: query: CREATE TABLE test_orc (key STRING)
+ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.orc.OrcSerde'
+STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat'
+OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat'
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: CREATE TABLE test_orc (key STRING)
+ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.orc.OrcSerde'
+STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat'
+OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat'
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@test_orc
+PREHOOK: query: INSERT OVERWRITE TABLE test_orc SELECT key FROM src limit 10
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src
+PREHOOK: Output: default@test_orc
+POSTHOOK: query: INSERT OVERWRITE TABLE test_orc SELECT key FROM src limit 10
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src
+POSTHOOK: Output: default@test_orc
+POSTHOOK: Lineage: test_orc.key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+PREHOOK: query: SELECT * FROM test_orc
+PREHOOK: type: QUERY
+PREHOOK: Input: default@test_orc
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT * FROM test_orc
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@test_orc
+#### A masked pattern was here ####
+POSTHOOK: Lineage: test_orc.key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+238
+86
+311
+27
+165
+409
+255
+278
+98
+484
+PREHOOK: query: ALTER TABLE test_orc SET SERDEPROPERTIES ('orc.stripe.size' = '1')
+PREHOOK: type: ALTERTABLE_SERDEPROPERTIES
+PREHOOK: Input: default@test_orc
+PREHOOK: Output: default@test_orc
+POSTHOOK: query: ALTER TABLE test_orc SET SERDEPROPERTIES ('orc.stripe.size' = '1')
+POSTHOOK: type: ALTERTABLE_SERDEPROPERTIES
+POSTHOOK: Input: default@test_orc
+POSTHOOK: Output: default@test_orc
+POSTHOOK: Lineage: test_orc.key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+PREHOOK: query: CREATE TABLE src_thousand(key STRING) STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: CREATE TABLE src_thousand(key STRING) STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@src_thousand
+POSTHOOK: Lineage: test_orc.key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/one_thousand' INTO TABLE src_thousand
+PREHOOK: type: LOAD
+PREHOOK: Output: default@src_thousand
+POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/one_thousand' INTO TABLE src_thousand
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@src_thousand
+POSTHOOK: Lineage: test_orc.key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+PREHOOK: query: INSERT OVERWRITE TABLE test_orc
+SELECT key FROM (
+SELECT CONCAT("a", key) AS key FROM src_thousand
+UNION ALL
+SELECT CONCAT("b", 1) AS key FROM src_thousand
+UNION ALL
+SELECT CONCAT("c", key) AS key FROM src_thousand
+UNION ALL
+SELECT CONCAT("d", 1) AS key FROM src_thousand
+) a ORDER BY key LIMIT 4000
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src_thousand
+PREHOOK: Output: default@test_orc
+POSTHOOK: query: INSERT OVERWRITE TABLE test_orc
+SELECT key FROM (
+SELECT CONCAT("a", key) AS key FROM src_thousand
+UNION ALL
+SELECT CONCAT("b", 1) AS key FROM src_thousand
+UNION ALL
+SELECT CONCAT("c", key) AS key FROM src_thousand
+UNION ALL
+SELECT CONCAT("d", 1) AS key FROM src_thousand
+) a ORDER BY key LIMIT 4000
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src_thousand
+POSTHOOK: Output: default@test_orc
+POSTHOOK: Lineage: test_orc.key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: test_orc.key EXPRESSION [(src_thousand)src_thousand.FieldSchema(name:key, type:string, comment:null), (src_thousand)src_thousand.FieldSchema(name:key, type:string, comment:null), ]
+PREHOOK: query: SELECT SUM(HASH(key)) FROM test_orc
+PREHOOK: type: QUERY
+PREHOOK: Input: default@test_orc
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT SUM(HASH(key)) FROM test_orc
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@test_orc
+#### A masked pattern was here ####
+POSTHOOK: Lineage: test_orc.key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: test_orc.key EXPRESSION [(src_thousand)src_thousand.FieldSchema(name:key, type:string, comment:null), (src_thousand)src_thousand.FieldSchema(name:key, type:string, comment:null), ]
+5557409630
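
Editor's note: the golden output above is what this patch is about. The zero-length DICTIONARY_DATA streams and the "Encoding column 3: DIRECT" lines show a string column for which dictionary encoding has been turned off, and the new orc_dictionary_threshold test builds a key column that is roughly half unique values and half constants so the distinct-key ratio sits near 0.5. The sketch below is illustrative only: the class name, method, and sample data are invented for this note and are not part of the patch (the real decision is made inside the ORC string column writer). It only mimics the ratio check implied by the hive.exec.orc.dictionary.key.size.threshold setting: if the number of distinct keys exceeds the configured fraction of non-null values, dictionary encoding is abandoned in favour of direct encoding.

// Illustrative sketch only -- hypothetical class and method names, not WriterImpl code.
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

public class DictionaryThresholdSketch {

  enum StringEncoding { DICTIONARY, DIRECT }

  /** Decide the string encoding for one stripe from its non-null values. */
  static StringEncoding chooseEncoding(List<String> nonNullValues, float threshold) {
    if (nonNullValues.isEmpty()) {
      return StringEncoding.DICTIONARY;   // nothing to write; either choice works
    }
    Set<String> distinct = new HashSet<String>(nonNullValues);
    double keyRatio = (double) distinct.size() / nonNullValues.size();
    // A threshold of 1.0 can never be exceeded, so dictionary encoding is always kept.
    return keyRatio > threshold ? StringEncoding.DIRECT : StringEncoding.DICTIONARY;
  }

  public static void main(String[] args) {
    // Keys shaped like the UNION ALL test above: half roughly unique, half constant,
    // giving a distinct-key ratio of about 0.5.
    List<String> keys = new ArrayList<String>();
    for (int i = 0; i < 1000; i++) {
      keys.add("a" + i);
      keys.add("b1");
      keys.add("c" + i);
      keys.add("d1");
    }
    System.out.println(chooseEncoding(keys, 0.8f)); // DICTIONARY (0.5 is below 0.8)
    System.out.println(chooseEncoding(keys, 0.4f)); // DIRECT (0.5 exceeds 0.4)
  }
}

With data of this shape, the default threshold keeps dictionary encoding, while lowering the threshold below the distinct-key ratio flips the column to direct encoding, which is the kind of switch the orcfiledump output above makes visible.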