Index: common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
===================================================================
--- common/src/java/org/apache/hadoop/hive/conf/HiveConf.java (revision 1466174)
+++ common/src/java/org/apache/hadoop/hive/conf/HiveConf.java (working copy)
@@ -487,6 +487,8 @@
HIVEUSEEXPLICITRCFILEHEADER("hive.exec.rcfile.use.explicit.header", true),
HIVEUSERCFILESYNCCACHE("hive.exec.rcfile.use.sync.cache", true),
+ HIVE_ORC_DICTIONARY_KEY_SIZE_THRESHOLD("hive.exec.orc.dictionary.key.size.threshold", 0.8f),
+
HIVESKEWJOIN("hive.optimize.skewjoin", false),
HIVECONVERTJOIN("hive.auto.convert.join", true),
HIVECONVERTJOINNOCONDITIONALTASK("hive.auto.convert.join.noconditionaltask", true),
Index: conf/hive-default.xml.template
===================================================================
--- conf/hive-default.xml.template (revision 1466174)
+++ conf/hive-default.xml.template (working copy)
@@ -1685,6 +1685,15 @@
+ <property>
+   <name>hive.exec.orc.dictionary.key.size.threshold</name>
+   <value>0.8</value>
+   <description>
+     If the number of keys in a dictionary is greater than this fraction of the total number of
+     non-null rows, turn off dictionary encoding. Use 1 to always use dictionary encoding.
+   </description>
+ </property>
+
<property>
  <name>hive.multi.insert.move.tasks.share.dependencies</name>
  <value>false</value>
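For reference, the check this property drives — the same one the patch adds to StringTreeWriter.writeStripe() in WriterImpl.java below — boils down to a fraction comparison. A minimal illustrative sketch (hypothetical helper, not part of the patch):

    // Illustrative only: mirrors the writeStripe() check added in WriterImpl.java below.
    // distinctKeys is the dictionary size; nonNullRows is the number of non-null values written.
    static boolean useDictionaryEncoding(int distinctKeys, int nonNullRows, float threshold) {
      if (nonNullRows == 0) {
        return false;                    // nothing written in the stripe: fall back to DIRECT
      }
      // A threshold of 1 always keeps the dictionary, since the ratio can never exceed 1;
      // a negative threshold (as in the new .q test) always forces DIRECT.
      return (float) distinctKeys / (float) nonNullRows <= threshold;
    }

With the default of 0.8, a stripe keeps dictionary encoding for a string column only while at most 80% of its non-null values are distinct.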
Index: ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcFile.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcFile.java (revision 1466174)
+++ ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcFile.java (working copy)
@@ -18,12 +18,13 @@
package org.apache.hadoop.hive.ql.io.orc;
+import java.io.IOException;
+
+import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
-import java.io.IOException;
-
/**
* Contains factory methods to read or write ORC files.
*/
@@ -70,12 +71,13 @@
*/
public static Writer createWriter(FileSystem fs,
Path path,
+ Configuration conf,
ObjectInspector inspector,
long stripeSize,
CompressionKind compress,
int bufferSize,
int rowIndexStride) throws IOException {
- return new WriterImpl(fs, path, inspector, stripeSize, compress,
+ return new WriterImpl(fs, path, conf, inspector, stripeSize, compress,
bufferSize, rowIndexStride);
}
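For callers, the only change is the extra Configuration argument. A usage sketch under assumed setup (the path, sizes, and the MyRecord row class — the same shape as the one in TestFileDump further down — are illustrative; imports and test scaffolding as in TestOrcFile/TestFileDump below):

    Configuration conf = new Configuration();
    // lower the threshold so columns with mostly distinct strings skip the dictionary
    conf.setFloat("hive.exec.orc.dictionary.key.size.threshold", 0.5f);
    FileSystem fs = FileSystem.getLocal(conf);
    Path path = new Path("/tmp/orc-createwriter-example.orc");
    ObjectInspector inspector = ObjectInspectorFactory.getReflectionObjectInspector(
        MyRecord.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
    Writer writer = OrcFile.createWriter(fs, path, conf, inspector,
        100000, CompressionKind.ZLIB, 10000, 10000);
    writer.addRow(new MyRecord(1, 2L, "example"));
    writer.close();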
Index: ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcOutputFormat.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcOutputFormat.java (revision 1466174)
+++ ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcOutputFormat.java (working copy)
@@ -71,8 +71,8 @@
public void write(NullWritable nullWritable,
OrcSerdeRow row) throws IOException {
if (writer == null) {
- writer = OrcFile.createWriter(fs, path, row.getInspector(), stripeSize,
- compress, compressionSize, rowIndexStride);
+ writer = OrcFile.createWriter(fs, path, this.conf, row.getInspector(),
+ stripeSize, compress, compressionSize, rowIndexStride);
}
writer.addRow(row.getRow());
}
@@ -81,8 +81,9 @@
public void write(Writable row) throws IOException {
OrcSerdeRow serdeRow = (OrcSerdeRow) row;
if (writer == null) {
- writer = OrcFile.createWriter(fs, path, serdeRow.getInspector(),
- stripeSize, compress, compressionSize, rowIndexStride);
+ writer = OrcFile.createWriter(fs, path, this.conf,
+ serdeRow.getInspector(), stripeSize, compress, compressionSize,
+ rowIndexStride);
}
writer.addRow(serdeRow.getRow());
}
@@ -101,8 +102,8 @@
ObjectInspector inspector = ObjectInspectorFactory.
getStandardStructObjectInspector(new ArrayList<String>(),
new ArrayList<ObjectInspector>());
- writer = OrcFile.createWriter(fs, path, inspector, stripeSize,
- compress, compressionSize, rowIndexStride);
+ writer = OrcFile.createWriter(fs, path, this.conf, inspector,
+ stripeSize, compress, compressionSize, rowIndexStride);
}
writer.close();
}
Index: ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReaderImpl.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReaderImpl.java (revision 1466174)
+++ ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReaderImpl.java (working copy)
@@ -611,10 +611,7 @@
}
private static class StringTreeReader extends TreeReader {
- private DynamicByteArray dictionaryBuffer = null;
- private int dictionarySize;
- private int[] dictionaryOffsets;
- private RunLengthIntegerReader reader;
+ private TreeReader reader;
StringTreeReader(int columnId) {
super(columnId);
@@ -624,82 +621,193 @@
void startStripe(Map<StreamName, InStream> streams,
List<OrcProto.ColumnEncoding> encodings
) throws IOException {
- super.startStripe(streams, encodings);
-
- // read the dictionary blob
- dictionarySize = encodings.get(columnId).getDictionarySize();
- StreamName name = new StreamName(columnId,
- OrcProto.Stream.Kind.DICTIONARY_DATA);
- InStream in = streams.get(name);
- if (in.available() > 0) {
- dictionaryBuffer = new DynamicByteArray(64, in.available());
- dictionaryBuffer.readAll(in);
- } else {
- dictionaryBuffer = null;
+ // For each stripe, check the encoding and initialize the appropriate reader
+ switch (encodings.get(columnId).getKind()) {
+ case DIRECT:
+ reader = new StringDirectTreeReader(columnId);
+ break;
+ case DICTIONARY:
+ reader = new StringDictionaryTreeReader(columnId);
+ break;
+ default:
+ throw new IllegalArgumentException("Unsupported encoding " +
+ encodings.get(columnId).getKind());
}
- in.close();
- // read the lengths
- name = new StreamName(columnId, OrcProto.Stream.Kind.LENGTH);
- in = streams.get(name);
- RunLengthIntegerReader lenReader = new RunLengthIntegerReader(in, false);
- int offset = 0;
- if (dictionaryOffsets == null ||
- dictionaryOffsets.length < dictionarySize + 1) {
- dictionaryOffsets = new int[dictionarySize + 1];
- }
- for(int i=0; i < dictionarySize; ++i) {
- dictionaryOffsets[i] = offset;
- offset += (int) lenReader.next();
- }
- dictionaryOffsets[dictionarySize] = offset;
- in.close();
-
- // set up the row reader
- name = new StreamName(columnId, OrcProto.Stream.Kind.DATA);
- reader = new RunLengthIntegerReader(streams.get(name), false);
+ reader.startStripe(streams, encodings);
}
@Override
void seek(PositionProvider[] index) throws IOException {
- super.seek(index);
- reader.seek(index[columnId]);
+ reader.seek(index);
}
@Override
Object next(Object previous) throws IOException {
- super.next(previous);
- Text result = null;
- if (valuePresent) {
- int entry = (int) reader.next();
- if (previous == null) {
- result = new Text();
- } else {
- result = (Text) previous;
+ return reader.next(previous);
+ }
+
+ @Override
+ void skipRows(long items) throws IOException {
+ reader.skipRows(items);
+ }
+
+ private static class StringDirectTreeReader extends TreeReader {
+ private InStream stream;
+ private RunLengthIntegerReader lengths;
+
+ StringDirectTreeReader(int columnId) {
+ super(columnId);
+ }
+
+ @Override
+ void startStripe(Map<StreamName, InStream> streams,
+ List<OrcProto.ColumnEncoding> encodings
+ ) throws IOException {
+ super.startStripe(streams, encodings);
+ StreamName name = new StreamName(columnId,
+ OrcProto.Stream.Kind.DATA);
+ stream = streams.get(name);
+ lengths = new RunLengthIntegerReader(streams.get(new
+ StreamName(columnId, OrcProto.Stream.Kind.LENGTH)),
+ false);
+ }
+
+ @Override
+ void seek(PositionProvider[] index) throws IOException {
+ super.seek(index);
+ stream.seek(index[columnId]);
+ lengths.seek(index[columnId]);
+ }
+
+ @Override
+ Object next(Object previous) throws IOException {
+ super.next(previous);
+ Text result = null;
+ if (valuePresent) {
+ if (previous == null) {
+ result = new Text();
+ } else {
+ result = (Text) previous;
+ }
+ int len = (int) lengths.next();
+ int offset = 0;
+ byte[] bytes = new byte[len];
+ while (len > 0) {
+ int written = stream.read(bytes, offset, len);
+ if (written < 0) {
+ throw new EOFException("Can't finish byte read from " + stream);
+ }
+ len -= written;
+ offset += written;
+ }
+ result.set(bytes);
}
- int offset = dictionaryOffsets[entry];
- int length;
- // if it isn't the last entry, subtract the offsets otherwise use
- // the buffer length.
- if (entry < dictionaryOffsets.length - 1) {
- length = dictionaryOffsets[entry + 1] - offset;
- } else {
- length = dictionaryBuffer.size() - offset;
+ return result;
+ }
+
+ @Override
+ void skipRows(long items) throws IOException {
+ items = countNonNulls(items);
+ long lengthToSkip = 0;
+ for(int i=0; i < items; ++i) {
+ lengthToSkip += lengths.next();
}
- // If the column is just empty strings, the size will be zero, so the buffer will be null,
- // in that case just return result as it will default to empty
- if (dictionaryBuffer != null) {
- dictionaryBuffer.setText(result, offset, length);
+ stream.skip(lengthToSkip);
+ }
+ }
+
+ private static class StringDictionaryTreeReader extends TreeReader {
+ private DynamicByteArray dictionaryBuffer = null;
+ private int dictionarySize;
+ private int[] dictionaryOffsets;
+ private RunLengthIntegerReader reader;
+
+ StringDictionaryTreeReader(int columnId) {
+ super(columnId);
+ }
+
+ @Override
+ void startStripe(Map<StreamName, InStream> streams,
+ List<OrcProto.ColumnEncoding> encodings
+ ) throws IOException {
+ super.startStripe(streams, encodings);
+
+ // read the dictionary blob
+ dictionarySize = encodings.get(columnId).getDictionarySize();
+ StreamName name = new StreamName(columnId,
+ OrcProto.Stream.Kind.DICTIONARY_DATA);
+ InStream in = streams.get(name);
+ if (in.available() > 0) {
+ dictionaryBuffer = new DynamicByteArray(64, in.available());
+ dictionaryBuffer.readAll(in);
} else {
- result.clear();
+ dictionaryBuffer = null;
}
+ in.close();
+
+ // read the lengths
+ name = new StreamName(columnId, OrcProto.Stream.Kind.LENGTH);
+ in = streams.get(name);
+ RunLengthIntegerReader lenReader = new RunLengthIntegerReader(in, false);
+ int offset = 0;
+ if (dictionaryOffsets == null ||
+ dictionaryOffsets.length < dictionarySize + 1) {
+ dictionaryOffsets = new int[dictionarySize + 1];
+ }
+ for(int i=0; i < dictionarySize; ++i) {
+ dictionaryOffsets[i] = offset;
+ offset += (int) lenReader.next();
+ }
+ dictionaryOffsets[dictionarySize] = offset;
+ in.close();
+
+ // set up the row reader
+ name = new StreamName(columnId, OrcProto.Stream.Kind.DATA);
+ reader = new RunLengthIntegerReader(streams.get(name), false);
}
- return result;
- }
- @Override
- void skipRows(long items) throws IOException {
- reader.skip(countNonNulls(items));
+ @Override
+ void seek(PositionProvider[] index) throws IOException {
+ super.seek(index);
+ reader.seek(index[columnId]);
+ }
+
+ @Override
+ Object next(Object previous) throws IOException {
+ super.next(previous);
+ Text result = null;
+ if (valuePresent) {
+ int entry = (int) reader.next();
+ if (previous == null) {
+ result = new Text();
+ } else {
+ result = (Text) previous;
+ }
+ int offset = dictionaryOffsets[entry];
+ int length;
+ // if it isn't the last entry, subtract the offsets otherwise use
+ // the buffer length.
+ if (entry < dictionaryOffsets.length - 1) {
+ length = dictionaryOffsets[entry + 1] - offset;
+ } else {
+ length = dictionaryBuffer.size() - offset;
+ }
+ // If the column is just empty strings, the size will be zero, so the buffer will be null,
+ // in that case just return result as it will default to empty
+ if (dictionaryBuffer != null) {
+ dictionaryBuffer.setText(result, offset, length);
+ } else {
+ result.clear();
+ }
+ }
+ return result;
+ }
+
+ @Override
+ void skipRows(long items) throws IOException {
+ reader.skip(countNonNulls(items));
+ }
}
}
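On the read side the split is internal: StringTreeReader inspects the stripe's column encoding and delegates to the direct or dictionary reader, so row iteration is unchanged whichever encoding the writer picked per stripe. A minimal reading sketch, assuming the Reader/RecordReader API already used elsewhere in this codebase (OrcFile.createReader appears in TestOrcFile below):

    Reader reader = OrcFile.createReader(fs, path);
    RecordReader rows = reader.rows(null);   // null include list = read all columns
    Object row = null;
    while (rows.hasNext()) {
      row = rows.next(row);                  // string columns come back as Text values
      // ... consume the row ...
    }
    rows.close();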
Index: ql/src/java/org/apache/hadoop/hive/ql/io/orc/StringRedBlackTree.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/io/orc/StringRedBlackTree.java (revision 1466174)
+++ ql/src/java/org/apache/hadoop/hive/ql/io/orc/StringRedBlackTree.java (working copy)
@@ -17,11 +17,11 @@
*/
package org.apache.hadoop.hive.ql.io.orc;
-import org.apache.hadoop.io.Text;
-
import java.io.IOException;
import java.io.OutputStream;
+import org.apache.hadoop.io.Text;
+
/**
* A red-black tree that stores strings. The strings are stored as UTF-8 bytes
* and an offset/length for each entry.
@@ -113,16 +113,16 @@
}
public Text getText() {
- byteArray.setText(text, keySizes.get(originalPosition * 2), getLength());
+ StringRedBlackTree.this.getText(text, originalPosition);
return text;
}
public void writeBytes(OutputStream out) throws IOException {
- byteArray.write(out, keySizes.get(originalPosition * 2), getLength());
+ byteArray.write(out, StringRedBlackTree.this.getOffset(originalPosition), getLength());
}
public int getLength() {
- return keySizes.get(originalPosition * 2 + 1);
+ return StringRedBlackTree.this.getLength(originalPosition);
}
public int getCount() {
@@ -142,7 +142,7 @@
/**
* Visit all of the nodes in the tree in sorted order.
- * @param visitor the action to be applied to each ndoe
+ * @param visitor the action to be applied to each node
* @throws IOException
*/
public void visit(Visitor visitor) throws IOException {
@@ -158,6 +158,18 @@
keySizes.clear();
}
+ public void getText(Text result, int originalPosition) {
+ byteArray.setText(result, getOffset(originalPosition), getLength(originalPosition));
+ }
+
+ private int getOffset(int originalPosition) {
+ return keySizes.get(originalPosition * 2);
+ }
+
+ private int getLength(int originalPosition) {
+ return keySizes.get(originalPosition * 2 + 1);
+ }
+
/**
* Get the size of the character data in the table.
* @return the bytes used by the table
Index: ql/src/java/org/apache/hadoop/hive/ql/io/orc/WriterImpl.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/io/orc/WriterImpl.java (revision 1466174)
+++ ql/src/java/org/apache/hadoop/hive/ql/io/orc/WriterImpl.java (working copy)
@@ -18,11 +18,20 @@
package org.apache.hadoop.hive.ql.io.orc;
-import com.google.protobuf.ByteString;
-import com.google.protobuf.CodedOutputStream;
+import java.io.IOException;
+import java.io.OutputStream;
+import java.nio.ByteBuffer;
+import java.sql.Timestamp;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+import java.util.TreeMap;
+
+import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.MapObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
@@ -41,15 +50,10 @@
import org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.TimestampObjectInspector;
import org.apache.hadoop.io.BytesWritable;
+import org.apache.hadoop.io.Text;
-import java.io.IOException;
-import java.io.OutputStream;
-import java.nio.ByteBuffer;
-import java.sql.Timestamp;
-import java.util.ArrayList;
-import java.util.List;
-import java.util.Map;
-import java.util.TreeMap;
+import com.google.protobuf.ByteString;
+import com.google.protobuf.CodedOutputStream;
/**
* An ORC file writer. The file is divided into stripes, which is the natural
@@ -96,8 +100,11 @@
OrcProto.RowIndex.newBuilder();
private final boolean buildIndex;
+ private final Configuration conf;
+
WriterImpl(FileSystem fs,
Path path,
+ Configuration conf,
ObjectInspector inspector,
long stripeSize,
CompressionKind compress,
@@ -105,13 +112,14 @@
int rowIndexStride) throws IOException {
this.fs = fs;
this.path = path;
+ this.conf = conf;
this.stripeSize = stripeSize;
this.compress = compress;
this.bufferSize = bufferSize;
this.rowIndexStride = rowIndexStride;
buildIndex = rowIndexStride > 0;
codec = createCodec(compress);
- treeWriter = createTreeWriter(inspector, streamFactory, false);
+ treeWriter = createTreeWriter(inspector, streamFactory, false, conf);
if (buildIndex && rowIndexStride < MIN_ROW_INDEX_STRIDE) {
throw new IllegalArgumentException("Row stride must be at least " +
MIN_ROW_INDEX_STRIDE);
@@ -304,6 +312,7 @@
private final OrcProto.RowIndex.Builder rowIndex;
private final OrcProto.RowIndexEntry.Builder rowIndexEntry;
private final PositionedOutputStream rowIndexStream;
+ private final Configuration conf;
/**
* Create a tree writer
@@ -315,9 +324,10 @@
*/
TreeWriter(int columnId, ObjectInspector inspector,
StreamFactory streamFactory,
- boolean nullable) throws IOException {
+ boolean nullable, Configuration conf) throws IOException {
this.id = columnId;
this.inspector = inspector;
+ this.conf = conf;
if (nullable) {
isPresent = new BitFieldWriter(streamFactory.createStream(id,
OrcProto.Stream.Kind.PRESENT), 1);
@@ -455,8 +465,8 @@
BooleanTreeWriter(int columnId,
ObjectInspector inspector,
StreamFactory writer,
- boolean nullable) throws IOException {
- super(columnId, inspector, writer, nullable);
+ boolean nullable, Configuration conf) throws IOException {
+ super(columnId, inspector, writer, nullable, conf);
PositionedOutputStream out = writer.createStream(id,
OrcProto.Stream.Kind.DATA);
this.writer = new BitFieldWriter(out, 1);
@@ -494,8 +504,8 @@
ByteTreeWriter(int columnId,
ObjectInspector inspector,
StreamFactory writer,
- boolean nullable) throws IOException {
- super(columnId, inspector, writer, nullable);
+ boolean nullable, Configuration conf) throws IOException {
+ super(columnId, inspector, writer, nullable, conf);
this.writer = new RunLengthByteWriter(writer.createStream(id,
OrcProto.Stream.Kind.DATA));
recordPosition(rowIndexPosition);
@@ -535,8 +545,8 @@
IntegerTreeWriter(int columnId,
ObjectInspector inspector,
StreamFactory writer,
- boolean nullable) throws IOException {
- super(columnId, inspector, writer, nullable);
+ boolean nullable, Configuration conf) throws IOException {
+ super(columnId, inspector, writer, nullable, conf);
PositionedOutputStream out = writer.createStream(id,
OrcProto.Stream.Kind.DATA);
this.writer = new RunLengthIntegerWriter(out, true);
@@ -595,8 +605,8 @@
FloatTreeWriter(int columnId,
ObjectInspector inspector,
StreamFactory writer,
- boolean nullable) throws IOException {
- super(columnId, inspector, writer, nullable);
+ boolean nullable, Configuration conf) throws IOException {
+ super(columnId, inspector, writer, nullable, conf);
this.stream = writer.createStream(id,
OrcProto.Stream.Kind.DATA);
recordPosition(rowIndexPosition);
@@ -633,8 +643,8 @@
DoubleTreeWriter(int columnId,
ObjectInspector inspector,
StreamFactory writer,
- boolean nullable) throws IOException {
- super(columnId, inspector, writer, nullable);
+ boolean nullable, Configuration conf) throws IOException {
+ super(columnId, inspector, writer, nullable, conf);
this.stream = writer.createStream(id,
OrcProto.Stream.Kind.DATA);
recordPosition(rowIndexPosition);
@@ -672,28 +682,42 @@
private final RunLengthIntegerWriter countOutput;
private final StringRedBlackTree dictionary = new StringRedBlackTree();
private final DynamicIntArray rows = new DynamicIntArray();
+ private final PositionedOutputStream directStreamOutput;
+ private final RunLengthIntegerWriter directLengthOutput;
private final List<OrcProto.RowIndexEntry> savedRowIndex =
new ArrayList<OrcProto.RowIndexEntry>();
private final boolean buildIndex;
private final List<Long> rowIndexValueCount = new ArrayList<Long>();
+ // If the number of keys in a dictionary is greater than this fraction of the total number of
+ // non-null rows, turn off dictionary encoding
+ private final float dictionaryKeySizeThreshold;
+ private boolean useDictionaryEncoding = true;
+
StringTreeWriter(int columnId,
ObjectInspector inspector,
StreamFactory writer,
- boolean nullable) throws IOException {
- super(columnId, inspector, writer, nullable);
+ boolean nullable, Configuration conf) throws IOException {
+ super(columnId, inspector, writer, nullable, conf);
stringOutput = writer.createStream(id,
OrcProto.Stream.Kind.DICTIONARY_DATA);
lengthOutput = new RunLengthIntegerWriter(writer.createStream(id,
OrcProto.Stream.Kind.LENGTH), false);
rowOutput = new RunLengthIntegerWriter(writer.createStream(id,
OrcProto.Stream.Kind.DATA), false);
+ directStreamOutput = writer.createStream(id,
+ OrcProto.Stream.Kind.DATA);
+ directLengthOutput = new RunLengthIntegerWriter(writer.createStream(id,
+ OrcProto.Stream.Kind.LENGTH), false);
if (writer.buildIndex()) {
countOutput = new RunLengthIntegerWriter(writer.createStream(id,
OrcProto.Stream.Kind.DICTIONARY_COUNT), false);
} else {
countOutput = null;
}
+ dictionaryKeySizeThreshold = conf.getFloat(
+ HiveConf.ConfVars.HIVE_ORC_DICTIONARY_KEY_SIZE_THRESHOLD.varname,
+ HiveConf.ConfVars.HIVE_ORC_DICTIONARY_KEY_SIZE_THRESHOLD.defaultFloatVal);
recordPosition(rowIndexPosition);
rowIndexValueCount.add(0L);
buildIndex = writer.buildIndex();
@@ -713,25 +737,40 @@
@Override
void writeStripe(OrcProto.StripeFooter.Builder builder,
int requiredIndexEntries) throws IOException {
- // Traverse the red-black tree writing out the bytes and lengths; and
- // creating the map from the original order to the final sorted order.
+ // Set the flag indicating whether to use dictionary encoding, based on whether the
+ // fraction of distinct keys over the number of non-null rows is at most the
+ // configured threshold
+ if (rows.size() > 0 &&
+ (float)(dictionary.size()) / (float)rows.size() <= dictionaryKeySizeThreshold) {
+ useDictionaryEncoding = true;
+ } else {
+ useDictionaryEncoding = false;
+ }
+
final int[] dumpOrder = new int[dictionary.size()];
- dictionary.visit(new StringRedBlackTree.Visitor() {
- private int currentId = 0;
- @Override
- public void visit(StringRedBlackTree.VisitorContext context
- ) throws IOException {
- context.writeBytes(stringOutput);
- lengthOutput.write(context.getLength());
- dumpOrder[context.getOriginalPosition()] = currentId++;
- if (countOutput != null) {
- countOutput.write(context.getCount());
+
+ if (useDictionaryEncoding) {
+ // Traverse the red-black tree writing out the bytes and lengths; and
+ // creating the map from the original order to the final sorted order.
+ dictionary.visit(new StringRedBlackTree.Visitor() {
+ private int currentId = 0;
+ @Override
+ public void visit(StringRedBlackTree.VisitorContext context
+ ) throws IOException {
+ context.writeBytes(stringOutput);
+ lengthOutput.write(context.getLength());
+ dumpOrder[context.getOriginalPosition()] = currentId++;
+ if (countOutput != null) {
+ countOutput.write(context.getCount());
+ }
}
- }
- });
+ });
+ }
+
int length = rows.size();
int rowIndexEntry = 0;
OrcProto.RowIndex.Builder rowIndex = getRowIndex();
+
// need to build the first index entry out here, to handle the case of
// not having any values.
if (buildIndex) {
@@ -739,10 +778,12 @@
rowIndexEntry < savedRowIndex.size()) {
OrcProto.RowIndexEntry.Builder base =
savedRowIndex.get(rowIndexEntry++).toBuilder();
- rowOutput.getPosition(new RowIndexPositionRecorder(base));
+ recordOutputPosition(base);
rowIndex.addEntry(base.build());
}
}
+
+ Text text = new Text();
// write the values translated into the dump order.
for(int i = 0; i < length; ++i) {
// now that we are writing out the row values, we can finalize the
@@ -752,11 +793,17 @@
rowIndexEntry < savedRowIndex.size()) {
OrcProto.RowIndexEntry.Builder base =
savedRowIndex.get(rowIndexEntry++).toBuilder();
- rowOutput.getPosition(new RowIndexPositionRecorder(base));
+ recordOutputPosition(base);
rowIndex.addEntry(base.build());
}
}
- rowOutput.write(dumpOrder[rows.get(i)]);
+ if (useDictionaryEncoding) {
+ rowOutput.write(dumpOrder[rows.get(i)]);
+ } else {
+ dictionary.getText(text, rows.get(i));
+ directStreamOutput.write(text.getBytes(), 0, text.getLength());
+ directLengthOutput.write(text.getLength());
+ }
}
// we need to build the rowindex before calling super, since it
// writes it out.
@@ -764,6 +811,8 @@
stringOutput.flush();
lengthOutput.flush();
rowOutput.flush();
+ directStreamOutput.flush();
+ directLengthOutput.flush();
if (countOutput != null) {
countOutput.flush();
}
@@ -776,11 +825,27 @@
rowIndexValueCount.add(0L);
}
+ // Calls getPosition on the row output stream if dictionary encoding is used, and the direct
+ // output stream if direct encoding is used
+ private void recordOutputPosition(OrcProto.RowIndexEntry.Builder base) throws IOException {
+ if (useDictionaryEncoding) {
+ rowOutput.getPosition(new RowIndexPositionRecorder(base));
+ } else {
+ directStreamOutput.getPosition(new RowIndexPositionRecorder(base));
+ }
+ }
+
@Override
OrcProto.ColumnEncoding getEncoding() {
- return OrcProto.ColumnEncoding.newBuilder().setKind(
- OrcProto.ColumnEncoding.Kind.DICTIONARY).
- setDictionarySize(dictionary.size()).build();
+ // Returns the encoding used for the last call to writeStripe
+ if (useDictionaryEncoding) {
+ return OrcProto.ColumnEncoding.newBuilder().setKind(
+ OrcProto.ColumnEncoding.Kind.DICTIONARY).
+ setDictionarySize(dictionary.size()).build();
+ } else {
+ return OrcProto.ColumnEncoding.newBuilder().setKind(
+ OrcProto.ColumnEncoding.Kind.DIRECT).build();
+ }
}
/**
@@ -814,8 +879,8 @@
BinaryTreeWriter(int columnId,
ObjectInspector inspector,
StreamFactory writer,
- boolean nullable) throws IOException {
- super(columnId, inspector, writer, nullable);
+ boolean nullable, Configuration conf) throws IOException {
+ super(columnId, inspector, writer, nullable, conf);
this.stream = writer.createStream(id,
OrcProto.Stream.Kind.DATA);
this.length = new RunLengthIntegerWriter(writer.createStream(id,
@@ -862,8 +927,8 @@
TimestampTreeWriter(int columnId,
ObjectInspector inspector,
StreamFactory writer,
- boolean nullable) throws IOException {
- super(columnId, inspector, writer, nullable);
+ boolean nullable, Configuration conf) throws IOException {
+ super(columnId, inspector, writer, nullable, conf);
this.seconds = new RunLengthIntegerWriter(writer.createStream(id,
OrcProto.Stream.Kind.DATA), true);
this.nanos = new RunLengthIntegerWriter(writer.createStream(id,
@@ -921,15 +986,15 @@
StructTreeWriter(int columnId,
ObjectInspector inspector,
StreamFactory writer,
- boolean nullable) throws IOException {
- super(columnId, inspector, writer, nullable);
+ boolean nullable, Configuration conf) throws IOException {
+ super(columnId, inspector, writer, nullable, conf);
StructObjectInspector structObjectInspector =
(StructObjectInspector) inspector;
fields = structObjectInspector.getAllStructFieldRefs();
childrenWriters = new TreeWriter[fields.size()];
for(int i=0; i < childrenWriters.length; ++i) {
childrenWriters[i] = createTreeWriter(
- fields.get(i).getFieldObjectInspector(), writer, true);
+ fields.get(i).getFieldObjectInspector(), writer, true, conf);
}
recordPosition(rowIndexPosition);
}
@@ -964,13 +1029,13 @@
ListTreeWriter(int columnId,
ObjectInspector inspector,
StreamFactory writer,
- boolean nullable) throws IOException {
- super(columnId, inspector, writer, nullable);
+ boolean nullable, Configuration conf) throws IOException {
+ super(columnId, inspector, writer, nullable, conf);
ListObjectInspector listObjectInspector = (ListObjectInspector) inspector;
childrenWriters = new TreeWriter[1];
childrenWriters[0] =
createTreeWriter(listObjectInspector.getListElementObjectInspector(),
- writer, true);
+ writer, true, conf);
lengths =
new RunLengthIntegerWriter(writer.createStream(columnId,
OrcProto.Stream.Kind.LENGTH), false);
@@ -1014,14 +1079,14 @@
MapTreeWriter(int columnId,
ObjectInspector inspector,
StreamFactory writer,
- boolean nullable) throws IOException {
- super(columnId, inspector, writer, nullable);
+ boolean nullable, Configuration conf) throws IOException {
+ super(columnId, inspector, writer, nullable, conf);
MapObjectInspector insp = (MapObjectInspector) inspector;
childrenWriters = new TreeWriter[2];
childrenWriters[0] =
- createTreeWriter(insp.getMapKeyObjectInspector(), writer, true);
+ createTreeWriter(insp.getMapKeyObjectInspector(), writer, true, conf);
childrenWriters[1] =
- createTreeWriter(insp.getMapValueObjectInspector(), writer, true);
+ createTreeWriter(insp.getMapValueObjectInspector(), writer, true, conf);
lengths =
new RunLengthIntegerWriter(writer.createStream(columnId,
OrcProto.Stream.Kind.LENGTH), false);
@@ -1069,13 +1134,13 @@
UnionTreeWriter(int columnId,
ObjectInspector inspector,
StreamFactory writer,
- boolean nullable) throws IOException {
- super(columnId, inspector, writer, nullable);
+ boolean nullable, Configuration conf) throws IOException {
+ super(columnId, inspector, writer, nullable, conf);
UnionObjectInspector insp = (UnionObjectInspector) inspector;
List<ObjectInspector> choices = insp.getObjectInspectors();
childrenWriters = new TreeWriter[choices.size()];
for(int i=0; i < childrenWriters.length; ++i) {
- childrenWriters[i] = createTreeWriter(choices.get(i), writer, true);
+ childrenWriters[i] = createTreeWriter(choices.get(i), writer, true, conf);
}
tags =
new RunLengthByteWriter(writer.createStream(columnId,
@@ -1114,53 +1179,53 @@
private static TreeWriter createTreeWriter(ObjectInspector inspector,
StreamFactory streamFactory,
- boolean nullable
+ boolean nullable, Configuration conf
) throws IOException {
switch (inspector.getCategory()) {
case PRIMITIVE:
switch (((PrimitiveObjectInspector) inspector).getPrimitiveCategory()) {
case BOOLEAN:
return new BooleanTreeWriter(streamFactory.getNextColumnId(),
- inspector, streamFactory, nullable);
+ inspector, streamFactory, nullable, conf);
case BYTE:
return new ByteTreeWriter(streamFactory.getNextColumnId(),
- inspector, streamFactory, nullable);
+ inspector, streamFactory, nullable, conf);
case SHORT:
case INT:
case LONG:
return new IntegerTreeWriter(streamFactory.getNextColumnId(),
- inspector, streamFactory, nullable);
+ inspector, streamFactory, nullable, conf);
case FLOAT:
return new FloatTreeWriter(streamFactory.getNextColumnId(),
- inspector, streamFactory, nullable);
+ inspector, streamFactory, nullable, conf);
case DOUBLE:
return new DoubleTreeWriter(streamFactory.getNextColumnId(),
- inspector, streamFactory, nullable);
+ inspector, streamFactory, nullable, conf);
case STRING:
return new StringTreeWriter(streamFactory.getNextColumnId(),
- inspector, streamFactory, nullable);
+ inspector, streamFactory, nullable, conf);
case BINARY:
return new BinaryTreeWriter(streamFactory.getNextColumnId(),
- inspector, streamFactory, nullable);
+ inspector, streamFactory, nullable, conf);
case TIMESTAMP:
return new TimestampTreeWriter(streamFactory.getNextColumnId(),
- inspector, streamFactory, nullable);
+ inspector, streamFactory, nullable, conf);
default:
throw new IllegalArgumentException("Bad primitive category " +
((PrimitiveObjectInspector) inspector).getPrimitiveCategory());
}
case STRUCT:
return new StructTreeWriter(streamFactory.getNextColumnId(), inspector,
- streamFactory, nullable);
+ streamFactory, nullable, conf);
case MAP:
return new MapTreeWriter(streamFactory.getNextColumnId(), inspector,
- streamFactory, nullable);
+ streamFactory, nullable, conf);
case LIST:
return new ListTreeWriter(streamFactory.getNextColumnId(), inspector,
- streamFactory, nullable);
+ streamFactory, nullable, conf);
case UNION:
return new UnionTreeWriter(streamFactory.getNextColumnId(), inspector,
- streamFactory, nullable);
+ streamFactory, nullable, conf);
default:
throw new IllegalArgumentException("Bad category: " +
inspector.getCategory());
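Because the dictionary-vs-direct decision is re-evaluated in writeStripe(), it is made per stripe, and getEncoding() simply reports whichever encoding the last stripe used. As a worked example under the default threshold of 0.8: a stripe with 3000 non-null string values keeps dictionary encoding only if the column has at most 2400 distinct values (2400 / 3000 = 0.8); at 2401 distinct values the ratio exceeds 0.8 and the stripe is written with DIRECT encoding instead.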
Index: ql/src/test/org/apache/hadoop/hive/ql/QTestUtil.java
===================================================================
--- ql/src/test/org/apache/hadoop/hive/ql/QTestUtil.java (revision 1466174)
+++ ql/src/test/org/apache/hadoop/hive/ql/QTestUtil.java (working copy)
@@ -198,7 +198,7 @@
public String getLogDirectory() {
return logDir;
}
-
+
private String getHadoopMainVersion(String input) {
if (input == null) {
return null;
Index: ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestFileDump.java
===================================================================
--- ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestFileDump.java (revision 1466174)
+++ ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestFileDump.java (working copy)
@@ -18,15 +18,8 @@
package org.apache.hadoop.hive.ql.io.orc;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
-import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
-import org.junit.Before;
-import org.junit.Rule;
-import org.junit.Test;
-import org.junit.rules.TestName;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNull;
import java.io.BufferedReader;
import java.io.File;
@@ -35,8 +28,14 @@
import java.io.PrintStream;
import java.util.Random;
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertNull;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.conf.HiveConf;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
+import org.junit.Before;
+import org.junit.Test;
public class TestFileDump {
@@ -69,9 +68,6 @@
}
}
- private static final String outputFilename =
- File.separator + "orc-file-dump.out";
-
private static void checkOutput(String expected,
String actual) throws Exception {
BufferedReader eStream =
@@ -94,7 +90,7 @@
inspector = ObjectInspectorFactory.getReflectionObjectInspector
(MyRecord.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
}
- Writer writer = OrcFile.createWriter(fs, testFilePath, inspector,
+ Writer writer = OrcFile.createWriter(fs, testFilePath, conf, inspector,
100000, CompressionKind.ZLIB, 10000, 10000);
Random r1 = new Random(1);
String[] words = new String[]{"It", "was", "the", "best", "of", "times,",
@@ -114,8 +110,8 @@
}
writer.close();
PrintStream origOut = System.out;
- FileOutputStream myOut = new FileOutputStream(workDir + File.separator +
- "orc-file-dump.out");
+ String outputFilename = File.separator + "orc-file-dump.out";
+ FileOutputStream myOut = new FileOutputStream(workDir + outputFilename);
// replace stdout and run command
System.setOut(new PrintStream(myOut));
@@ -123,6 +119,60 @@
System.out.flush();
System.setOut(origOut);
+
checkOutput(resourceDir + outputFilename, workDir + outputFilename);
}
+
+ // Test that dictionary encoding is turned off when the fraction of rows with distinct
+ // strings exceeds the configured threshold. When dictionary encoding is turned off, the
+ // length of the DICTIONARY_DATA stream for the column will be 0 in the ORC file dump.
+ @Test
+ public void testDictionaryThreshold() throws Exception {
+ ObjectInspector inspector;
+ synchronized (TestOrcFile.class) {
+ inspector = ObjectInspectorFactory.getReflectionObjectInspector
+ (MyRecord.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
+ }
+ Configuration conf = new Configuration();
+ conf.setFloat(HiveConf.ConfVars.HIVE_ORC_DICTIONARY_KEY_SIZE_THRESHOLD.varname, 0.49f);
+ Writer writer = OrcFile.createWriter(fs, testFilePath, conf, inspector,
+ 100000, CompressionKind.ZLIB, 10000, 10000);
+ Random r1 = new Random(1);
+ String[] words = new String[]{"It", "was", "the", "best", "of", "times,",
+ "it", "was", "the", "worst", "of", "times,", "it", "was", "the", "age",
+ "of", "wisdom,", "it", "was", "the", "age", "of", "foolishness,", "it",
+ "was", "the", "epoch", "of", "belief,", "it", "was", "the", "epoch",
+ "of", "incredulity,", "it", "was", "the", "season", "of", "Light,",
+ "it", "was", "the", "season", "of", "Darkness,", "it", "was", "the",
+ "spring", "of", "hope,", "it", "was", "the", "winter", "of", "despair,",
+ "we", "had", "everything", "before", "us,", "we", "had", "nothing",
+ "before", "us,", "we", "were", "all", "going", "direct", "to",
+ "Heaven,", "we", "were", "all", "going", "direct", "the", "other",
+ "way"};
+ int nextInt = 0;
+ for(int i=0; i < 21000; ++i) {
+ // Write out the same string twice; this guarantees the fraction of rows with
+ // distinct strings is 0.5
+ if (i % 2 == 0) {
+ nextInt = r1.nextInt(words.length);
+ // Append the value of i to the word; this guarantees that when an index or word is
+ // repeated, the actual string is unique.
+ words[nextInt] += "-" + i;
+ }
+ writer.addRow(new MyRecord(r1.nextInt(), r1.nextLong(),
+ words[nextInt]));
+ }
+ writer.close();
+ PrintStream origOut = System.out;
+ String outputFilename = File.separator + "orc-file-dump-dictionary-threshold.out";
+ FileOutputStream myOut = new FileOutputStream(workDir + outputFilename);
+
+ // replace stdout and run command
+ System.setOut(new PrintStream(myOut));
+ FileDump.main(new String[]{testFilePath.toString()});
+ System.out.flush();
+ System.setOut(origOut);
+
+ checkOutput(resourceDir + outputFilename, workDir + outputFilename);
+ }
}
Index: ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestOrcFile.java
===================================================================
--- ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestOrcFile.java (revision 1466174)
+++ ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestOrcFile.java (working copy)
@@ -187,7 +187,7 @@
inspector = ObjectInspectorFactory.getReflectionObjectInspector
(BigRow.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
}
- Writer writer = OrcFile.createWriter(fs, testFilePath, inspector,
+ Writer writer = OrcFile.createWriter(fs, testFilePath, conf, inspector,
100000, CompressionKind.ZLIB, 10000, 10000);
writer.addRow(new BigRow(false, (byte) 1, (short) 1024, 65536,
Long.MAX_VALUE, (float) 1.0, -15.0, bytes(0,1,2,3,4), "hi",
@@ -419,7 +419,7 @@
(InnerStruct.class,
ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
}
- Writer writer = OrcFile.createWriter(fs, testFilePath, inspector,
+ Writer writer = OrcFile.createWriter(fs, testFilePath, conf, inspector,
1000, CompressionKind.NONE, 100, 1000);
Random r1 = new Random(1);
Random r2 = new Random(2);
@@ -502,7 +502,7 @@
inspector = ObjectInspectorFactory.getReflectionObjectInspector
(BigRow.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
}
- Writer writer = OrcFile.createWriter(fs, testFilePath, inspector,
+ Writer writer = OrcFile.createWriter(fs, testFilePath, conf, inspector,
1000, CompressionKind.NONE, 100, 10000);
writer.close();
Reader reader = OrcFile.createReader(fs, testFilePath);
@@ -522,7 +522,7 @@
inspector = ObjectInspectorFactory.getReflectionObjectInspector
(BigRow.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
}
- Writer writer = OrcFile.createWriter(fs, testFilePath, inspector,
+ Writer writer = OrcFile.createWriter(fs, testFilePath, conf, inspector,
1000, CompressionKind.NONE, 100, 10000);
writer.addUserMetadata("my.meta", byteBuf(1, 2, 3, 4, 5, 6, 7, -1, -2, 127, -128));
writer.addUserMetadata("clobber", byteBuf(1,2,3));
@@ -585,7 +585,7 @@
synchronized (TestOrcFile.class) {
inspector = OrcStruct.createObjectInspector(0, types);
}
- Writer writer = OrcFile.createWriter(fs, testFilePath, inspector,
+ Writer writer = OrcFile.createWriter(fs, testFilePath, conf, inspector,
1000, CompressionKind.NONE, 100, 10000);
OrcStruct row = new OrcStruct(2);
OrcUnion union = new OrcUnion();
@@ -724,7 +724,7 @@
(InnerStruct.class,
ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
}
- Writer writer = OrcFile.createWriter(fs, testFilePath, inspector,
+ Writer writer = OrcFile.createWriter(fs, testFilePath, conf, inspector,
1000, CompressionKind.SNAPPY, 100, 10000);
Random rand = new Random(12);
for(int i=0; i < 10000; ++i) {
@@ -759,7 +759,7 @@
(InnerStruct.class,
ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
}
- Writer writer = OrcFile.createWriter(fs, testFilePath, inspector,
+ Writer writer = OrcFile.createWriter(fs, testFilePath, conf, inspector,
5000, CompressionKind.SNAPPY, 1000, 0);
Random rand = new Random(24);
for(int i=0; i < 10000; ++i) {
@@ -800,7 +800,7 @@
inspector = ObjectInspectorFactory.getReflectionObjectInspector
(BigRow.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
}
- Writer writer = OrcFile.createWriter(fs, testFilePath, inspector,
+ Writer writer = OrcFile.createWriter(fs, testFilePath, conf, inspector,
200000, CompressionKind.ZLIB, 65536, 1000);
Random rand = new Random(42);
final int COUNT=32768;
Index: ql/src/test/queries/clientpositive/orc_dictionary_threshold.q
===================================================================
--- ql/src/test/queries/clientpositive/orc_dictionary_threshold.q (revision 0)
+++ ql/src/test/queries/clientpositive/orc_dictionary_threshold.q (working copy)
@@ -0,0 +1,43 @@
+set hive.exec.orc.dictionary.key.size.threshold=-1;
+
+-- Set the threshold to -1 to guarantee dictionary encoding is turned off
+-- Tests that the data can be read back correctly when a string column is stored
+-- without dictionary encoding
+
+CREATE TABLE test_orc (key STRING)
+ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.orc.OrcSerde'
+STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat'
+OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat';
+
+INSERT OVERWRITE TABLE test_orc SELECT key FROM src limit 10;
+
+-- Test reading the column back
+
+SELECT * FROM test_orc;
+
+ALTER TABLE test_orc SET SERDEPROPERTIES ('orc.stripe.size' = '1');
+
+CREATE TABLE src_thousand(key STRING) STORED AS TEXTFILE;
+LOAD DATA LOCAL INPATH '../data/files/one_thousand' INTO TABLE src_thousand;
+
+set hive.exec.orc.dictionary.key.size.threshold=0.01;
+
+-- Add data to the table in such a way that alternate stripes encode the column
+-- differently. Setting orc.stripe.size = 1 guarantees the stripes each have 1000
+-- rows. Setting the threshold to 0.01 guarantees that if there are 1000 distinct values
+-- for the column in a stripe, it is direct encoded, and if there is only 1 distinct value
+-- it is dictionary encoded. The prepended letters just guarantee that the ORDER BY
+-- sorts the rows so that the encodings alternate.
+
+INSERT OVERWRITE TABLE test_orc
+SELECT key FROM (
+SELECT CONCAT("a", key) AS key FROM src_thousand
+UNION ALL
+SELECT CONCAT("b", 1) AS key FROM src_thousand
+UNION ALL
+SELECT CONCAT("c", key) AS key FROM src_thousand
+UNION ALL
+SELECT CONCAT("d", 1) AS key FROM src_thousand
+) a ORDER BY key LIMIT 4000;
+
+SELECT SUM(HASH(key)) FROM test_orc;
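Spelling out the arithmetic behind the comments in the query file above: with orc.stripe.size = 1 each stripe holds 1000 rows, so at a threshold of 0.01 any stripe with more than 10 distinct keys (10 / 1000 = 0.01) is written with DIRECT encoding, which covers the CONCAT("a", key) and CONCAT("c", key) stripes, while the CONCAT("b", 1) and CONCAT("d", 1) stripes contain a single distinct key (1 / 1000 = 0.001) and stay dictionary encoded. The final SUM(HASH(key)) reads every row back through both encodings and reduces them to a single checksum for the expected output.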
Index: ql/src/test/resources/orc-file-dump-dictionary-threshold.out
===================================================================
--- ql/src/test/resources/orc-file-dump-dictionary-threshold.out (revision 0)
+++ ql/src/test/resources/orc-file-dump-dictionary-threshold.out (working copy)
@@ -0,0 +1,337 @@
+Structure for TestFileDump.testDump.orc
+Rows: 21000
+Compression: ZLIB
+Compression size: 10000
+Type: struct<i:int,l:bigint,s:string>
+
+Statistics:
+ Column 0: count: 21000
+ Column 1: count: 21000 min: -2147390285 max: 2147453086 sum: 109128518326
+ Column 2: count: 21000 min: -9222731174895935707 max: 9222919052987871506
+ Column 3: count: 21000 min: Darkness,-230 max: worst-54-290-346-648-908-996-1038-1080-1560-1584-1620-1744-1770-1798-1852-1966-2162-2244-2286-2296-2534-2660-3114-3676-3788-4068-4150-4706-4744-5350-5420-5582-5696-5726-6006-6020-6024-6098-6184-6568-6636-6802-6994-7004-7318-7498-7758-7780-7798-7920-7952-7960-7988-8232-8256-8390-8416-8478-8620-8840-8984-9038-9128-9236-9248-9344-9594-9650-9714-9928-9938-10178-10368-10414-10502-10732-10876-11008-11158-11410-11722-11836-11964-12054-12096-12126-12136-12202-12246-12298-12616-12774-12782-12790-12802-12976-13216-13246-13502-13766-14454-14974-15004-15124-15252-15294-15356-15530-15610-16316-16936-17024-17122-17214-17310-17528-17682-17742-17870-17878-18010-18410-18524-18788-19204-19254-19518-19596-19786-19874-19904-20390-20752-20936
+
+Stripes:
+ Stripe: offset: 3 data: 70427 rows: 3000 tail: 87 index: 214
+ Stream: column 0 section ROW_INDEX start: 3 length 10
+ Stream: column 1 section ROW_INDEX start: 13 length 39
+ Stream: column 2 section ROW_INDEX start: 52 length 42
+ Stream: column 3 section DICTIONARY_COUNT start: 94 length 0
+ Stream: column 3 section ROW_INDEX start: 94 length 123
+ Stream: column 1 section PRESENT start: 217 length 9
+ Stream: column 1 section DATA start: 226 length 13559
+ Stream: column 2 section PRESENT start: 13785 length 9
+ Stream: column 2 section DATA start: 13794 length 26056
+ Stream: column 3 section PRESENT start: 39850 length 9
+ Stream: column 3 section DATA start: 39859 length 28410
+ Stream: column 3 section LENGTH start: 68269 length 2375
+ Stream: column 3 section DICTIONARY_DATA start: 70644 length 0
+ Encoding column 0: DIRECT
+ Encoding column 1: DIRECT
+ Encoding column 2: DIRECT
+ Encoding column 3: DIRECT
+ Stripe: offset: 70731 data: 80428 rows: 2000 tail: 88 index: 261
+ Stream: column 0 section ROW_INDEX start: 70731 length 10
+ Stream: column 1 section ROW_INDEX start: 70741 length 39
+ Stream: column 2 section ROW_INDEX start: 70780 length 43
+ Stream: column 3 section DICTIONARY_COUNT start: 70823 length 0
+ Stream: column 3 section ROW_INDEX start: 70823 length 169
+ Stream: column 1 section PRESENT start: 70992 length 7
+ Stream: column 1 section DATA start: 70999 length 9022
+ Stream: column 2 section PRESENT start: 80021 length 7
+ Stream: column 2 section DATA start: 80028 length 17376
+ Stream: column 3 section PRESENT start: 97404 length 7
+ Stream: column 3 section DATA start: 97411 length 52208
+ Stream: column 3 section LENGTH start: 149619 length 1801
+ Stream: column 3 section DICTIONARY_DATA start: 151420 length 0
+ Encoding column 0: DIRECT
+ Encoding column 1: DIRECT
+ Encoding column 2: DIRECT
+ Encoding column 3: DIRECT
+ Stripe: offset: 151508 data: 50612 rows: 1000 tail: 88 index: 324
+ Stream: column 0 section ROW_INDEX start: 151508 length 10
+ Stream: column 1 section ROW_INDEX start: 151518 length 39
+ Stream: column 2 section ROW_INDEX start: 151557 length 42
+ Stream: column 3 section DICTIONARY_COUNT start: 151599 length 0
+ Stream: column 3 section ROW_INDEX start: 151599 length 233
+ Stream: column 1 section PRESENT start: 151832 length 5
+ Stream: column 1 section DATA start: 151837 length 4539
+ Stream: column 2 section PRESENT start: 156376 length 5
+ Stream: column 2 section DATA start: 156381 length 8695
+ Stream: column 3 section PRESENT start: 165076 length 5
+ Stream: column 3 section DATA start: 165081 length 36461
+ Stream: column 3 section LENGTH start: 201542 length 902
+ Stream: column 3 section DICTIONARY_DATA start: 202444 length 0
+ Encoding column 0: DIRECT
+ Encoding column 1: DIRECT
+ Encoding column 2: DIRECT
+ Encoding column 3: DIRECT
+ Stripe: offset: 202532 data: 56944 rows: 1000 tail: 88 index: 361
+ Stream: column 0 section ROW_INDEX start: 202532 length 10
+ Stream: column 1 section ROW_INDEX start: 202542 length 39
+ Stream: column 2 section ROW_INDEX start: 202581 length 43
+ Stream: column 3 section DICTIONARY_COUNT start: 202624 length 0
+ Stream: column 3 section ROW_INDEX start: 202624 length 269
+ Stream: column 1 section PRESENT start: 202893 length 5
+ Stream: column 1 section DATA start: 202898 length 4540
+ Stream: column 2 section PRESENT start: 207438 length 5
+ Stream: column 2 section DATA start: 207443 length 8679
+ Stream: column 3 section PRESENT start: 216122 length 5
+ Stream: column 3 section DATA start: 216127 length 42824
+ Stream: column 3 section LENGTH start: 258951 length 886
+ Stream: column 3 section DICTIONARY_DATA start: 259837 length 0
+ Encoding column 0: DIRECT
+ Encoding column 1: DIRECT
+ Encoding column 2: DIRECT
+ Encoding column 3: DIRECT
+ Stripe: offset: 259925 data: 66088 rows: 1000 tail: 88 index: 405
+ Stream: column 0 section ROW_INDEX start: 259925 length 10
+ Stream: column 1 section ROW_INDEX start: 259935 length 39
+ Stream: column 2 section ROW_INDEX start: 259974 length 42
+ Stream: column 3 section DICTIONARY_COUNT start: 260016 length 0
+ Stream: column 3 section ROW_INDEX start: 260016 length 314
+ Stream: column 1 section PRESENT start: 260330 length 5
+ Stream: column 1 section DATA start: 260335 length 4533
+ Stream: column 2 section PRESENT start: 264868 length 5
+ Stream: column 2 section DATA start: 264873 length 8683
+ Stream: column 3 section PRESENT start: 273556 length 5
+ Stream: column 3 section DATA start: 273561 length 51921
+ Stream: column 3 section LENGTH start: 325482 length 936
+ Stream: column 3 section DICTIONARY_DATA start: 326418 length 0
+ Encoding column 0: DIRECT
+ Encoding column 1: DIRECT
+ Encoding column 2: DIRECT
+ Encoding column 3: DIRECT
+ Stripe: offset: 326506 data: 74066 rows: 1000 tail: 88 index: 432
+ Stream: column 0 section ROW_INDEX start: 326506 length 10
+ Stream: column 1 section ROW_INDEX start: 326516 length 39
+ Stream: column 2 section ROW_INDEX start: 326555 length 42
+ Stream: column 3 section DICTIONARY_COUNT start: 326597 length 0
+ Stream: column 3 section ROW_INDEX start: 326597 length 341
+ Stream: column 1 section PRESENT start: 326938 length 5
+ Stream: column 1 section DATA start: 326943 length 4534
+ Stream: column 2 section PRESENT start: 331477 length 5
+ Stream: column 2 section DATA start: 331482 length 8688
+ Stream: column 3 section PRESENT start: 340170 length 5
+ Stream: column 3 section DATA start: 340175 length 59841
+ Stream: column 3 section LENGTH start: 400016 length 988
+ Stream: column 3 section DICTIONARY_DATA start: 401004 length 0
+ Encoding column 0: DIRECT
+ Encoding column 1: DIRECT
+ Encoding column 2: DIRECT
+ Encoding column 3: DIRECT
+ Stripe: offset: 401092 data: 81272 rows: 1000 tail: 87 index: 467
+ Stream: column 0 section ROW_INDEX start: 401092 length 10
+ Stream: column 1 section ROW_INDEX start: 401102 length 39
+ Stream: column 2 section ROW_INDEX start: 401141 length 42
+ Stream: column 3 section DICTIONARY_COUNT start: 401183 length 0
+ Stream: column 3 section ROW_INDEX start: 401183 length 376
+ Stream: column 1 section PRESENT start: 401559 length 5
+ Stream: column 1 section DATA start: 401564 length 4536
+ Stream: column 2 section PRESENT start: 406100 length 5
+ Stream: column 2 section DATA start: 406105 length 8699
+ Stream: column 3 section PRESENT start: 414804 length 5
+ Stream: column 3 section DATA start: 414809 length 67035
+ Stream: column 3 section LENGTH start: 481844 length 987
+ Stream: column 3 section DICTIONARY_DATA start: 482831 length 0
+ Encoding column 0: DIRECT
+ Encoding column 1: DIRECT
+ Encoding column 2: DIRECT
+ Encoding column 3: DIRECT
+ Stripe: offset: 482918 data: 91489 rows: 1000 tail: 88 index: 504
+ Stream: column 0 section ROW_INDEX start: 482918 length 10
+ Stream: column 1 section ROW_INDEX start: 482928 length 39
+ Stream: column 2 section ROW_INDEX start: 482967 length 43
+ Stream: column 3 section DICTIONARY_COUNT start: 483010 length 0
+ Stream: column 3 section ROW_INDEX start: 483010 length 412
+ Stream: column 1 section PRESENT start: 483422 length 5
+ Stream: column 1 section DATA start: 483427 length 4542
+ Stream: column 2 section PRESENT start: 487969 length 5
+ Stream: column 2 section DATA start: 487974 length 8687
+ Stream: column 3 section PRESENT start: 496661 length 5
+ Stream: column 3 section DATA start: 496666 length 77266
+ Stream: column 3 section LENGTH start: 573932 length 979
+ Stream: column 3 section DICTIONARY_DATA start: 574911 length 0
+ Encoding column 0: DIRECT
+ Encoding column 1: DIRECT
+ Encoding column 2: DIRECT
+ Encoding column 3: DIRECT
+ Stripe: offset: 574999 data: 100263 rows: 1000 tail: 87 index: 536
+ Stream: column 0 section ROW_INDEX start: 574999 length 10
+ Stream: column 1 section ROW_INDEX start: 575009 length 38
+ Stream: column 2 section ROW_INDEX start: 575047 length 42
+ Stream: column 3 section DICTIONARY_COUNT start: 575089 length 0
+ Stream: column 3 section ROW_INDEX start: 575089 length 446
+ Stream: column 1 section PRESENT start: 575535 length 5
+ Stream: column 1 section DATA start: 575540 length 4542
+ Stream: column 2 section PRESENT start: 580082 length 5
+ Stream: column 2 section DATA start: 580087 length 8687
+ Stream: column 3 section PRESENT start: 588774 length 5
+ Stream: column 3 section DATA start: 588779 length 86035
+ Stream: column 3 section LENGTH start: 674814 length 984
+ Stream: column 3 section DICTIONARY_DATA start: 675798 length 0
+ Encoding column 0: DIRECT
+ Encoding column 1: DIRECT
+ Encoding column 2: DIRECT
+ Encoding column 3: DIRECT
+ Stripe: offset: 675885 data: 108783 rows: 1000 tail: 87 index: 586
+ Stream: column 0 section ROW_INDEX start: 675885 length 10
+ Stream: column 1 section ROW_INDEX start: 675895 length 39
+ Stream: column 2 section ROW_INDEX start: 675934 length 43
+ Stream: column 3 section DICTIONARY_COUNT start: 675977 length 0
+ Stream: column 3 section ROW_INDEX start: 675977 length 494
+ Stream: column 1 section PRESENT start: 676471 length 5
+ Stream: column 1 section DATA start: 676476 length 4545
+ Stream: column 2 section PRESENT start: 681021 length 5
+ Stream: column 2 section DATA start: 681026 length 8688
+ Stream: column 3 section PRESENT start: 689714 length 5
+ Stream: column 3 section DATA start: 689719 length 94539
+ Stream: column 3 section LENGTH start: 784258 length 996
+ Stream: column 3 section DICTIONARY_DATA start: 785254 length 0
+ Encoding column 0: DIRECT
+ Encoding column 1: DIRECT
+ Encoding column 2: DIRECT
+ Encoding column 3: DIRECT
+ Stripe: offset: 785341 data: 115818 rows: 1000 tail: 88 index: 607
+ Stream: column 0 section ROW_INDEX start: 785341 length 10
+ Stream: column 1 section ROW_INDEX start: 785351 length 39
+ Stream: column 2 section ROW_INDEX start: 785390 length 43
+ Stream: column 3 section DICTIONARY_COUNT start: 785433 length 0
+ Stream: column 3 section ROW_INDEX start: 785433 length 515
+ Stream: column 1 section PRESENT start: 785948 length 5
+ Stream: column 1 section DATA start: 785953 length 4546
+ Stream: column 2 section PRESENT start: 790499 length 5
+ Stream: column 2 section DATA start: 790504 length 8688
+ Stream: column 3 section PRESENT start: 799192 length 5
+ Stream: column 3 section DATA start: 799197 length 101544
+ Stream: column 3 section LENGTH start: 900741 length 1025
+ Stream: column 3 section DICTIONARY_DATA start: 901766 length 0
+ Encoding column 0: DIRECT
+ Encoding column 1: DIRECT
+ Encoding column 2: DIRECT
+ Encoding column 3: DIRECT
+ Stripe: offset: 901854 data: 122636 rows: 1000 tail: 88 index: 626
+ Stream: column 0 section ROW_INDEX start: 901854 length 10
+ Stream: column 1 section ROW_INDEX start: 901864 length 39
+ Stream: column 2 section ROW_INDEX start: 901903 length 43
+ Stream: column 3 section DICTIONARY_COUNT start: 901946 length 0
+ Stream: column 3 section ROW_INDEX start: 901946 length 534
+ Stream: column 1 section PRESENT start: 902480 length 5
+ Stream: column 1 section DATA start: 902485 length 4543
+ Stream: column 2 section PRESENT start: 907028 length 5
+ Stream: column 2 section DATA start: 907033 length 8686
+ Stream: column 3 section PRESENT start: 915719 length 5
+ Stream: column 3 section DATA start: 915724 length 108415
+ Stream: column 3 section LENGTH start: 1024139 length 977
+ Stream: column 3 section DICTIONARY_DATA start: 1025116 length 0
+ Encoding column 0: DIRECT
+ Encoding column 1: DIRECT
+ Encoding column 2: DIRECT
+ Encoding column 3: DIRECT
+ Stripe: offset: 1025204 data: 135122 rows: 1000 tail: 88 index: 665
+ Stream: column 0 section ROW_INDEX start: 1025204 length 10
+ Stream: column 1 section ROW_INDEX start: 1025214 length 39
+ Stream: column 2 section ROW_INDEX start: 1025253 length 43
+ Stream: column 3 section DICTIONARY_COUNT start: 1025296 length 0
+ Stream: column 3 section ROW_INDEX start: 1025296 length 573
+ Stream: column 1 section PRESENT start: 1025869 length 5
+ Stream: column 1 section DATA start: 1025874 length 4540
+ Stream: column 2 section PRESENT start: 1030414 length 5
+ Stream: column 2 section DATA start: 1030419 length 8683
+ Stream: column 3 section PRESENT start: 1039102 length 5
+ Stream: column 3 section DATA start: 1039107 length 120883
+ Stream: column 3 section LENGTH start: 1159990 length 1001
+ Stream: column 3 section DICTIONARY_DATA start: 1160991 length 0
+ Encoding column 0: DIRECT
+ Encoding column 1: DIRECT
+ Encoding column 2: DIRECT
+ Encoding column 3: DIRECT
+ Stripe: offset: 1161079 data: 143016 rows: 1000 tail: 88 index: 679
+ Stream: column 0 section ROW_INDEX start: 1161079 length 10
+ Stream: column 1 section ROW_INDEX start: 1161089 length 39
+ Stream: column 2 section ROW_INDEX start: 1161128 length 42
+ Stream: column 3 section DICTIONARY_COUNT start: 1161170 length 0
+ Stream: column 3 section ROW_INDEX start: 1161170 length 588
+ Stream: column 1 section PRESENT start: 1161758 length 5
+ Stream: column 1 section DATA start: 1161763 length 4542
+ Stream: column 2 section PRESENT start: 1166305 length 5
+ Stream: column 2 section DATA start: 1166310 length 8683
+ Stream: column 3 section PRESENT start: 1174993 length 5
+ Stream: column 3 section DATA start: 1174998 length 128729
+ Stream: column 3 section LENGTH start: 1303727 length 1047
+ Stream: column 3 section DICTIONARY_DATA start: 1304774 length 0
+ Encoding column 0: DIRECT
+ Encoding column 1: DIRECT
+ Encoding column 2: DIRECT
+ Encoding column 3: DIRECT
+ Stripe: offset: 1304862 data: 152284 rows: 1000 tail: 88 index: 727
+ Stream: column 0 section ROW_INDEX start: 1304862 length 10
+ Stream: column 1 section ROW_INDEX start: 1304872 length 39
+ Stream: column 2 section ROW_INDEX start: 1304911 length 42
+ Stream: column 3 section DICTIONARY_COUNT start: 1304953 length 0
+ Stream: column 3 section ROW_INDEX start: 1304953 length 636
+ Stream: column 1 section PRESENT start: 1305589 length 5
+ Stream: column 1 section DATA start: 1305594 length 4545
+ Stream: column 2 section PRESENT start: 1310139 length 5
+ Stream: column 2 section DATA start: 1310144 length 8684
+ Stream: column 3 section PRESENT start: 1318828 length 5
+ Stream: column 3 section DATA start: 1318833 length 137968
+ Stream: column 3 section LENGTH start: 1456801 length 1072
+ Stream: column 3 section DICTIONARY_DATA start: 1457873 length 0
+ Encoding column 0: DIRECT
+ Encoding column 1: DIRECT
+ Encoding column 2: DIRECT
+ Encoding column 3: DIRECT
+ Stripe: offset: 1457961 data: 158956 rows: 1000 tail: 88 index: 746
+ Stream: column 0 section ROW_INDEX start: 1457961 length 10
+ Stream: column 1 section ROW_INDEX start: 1457971 length 39
+ Stream: column 2 section ROW_INDEX start: 1458010 length 42
+ Stream: column 3 section DICTIONARY_COUNT start: 1458052 length 0
+ Stream: column 3 section ROW_INDEX start: 1458052 length 655
+ Stream: column 1 section PRESENT start: 1458707 length 5
+ Stream: column 1 section DATA start: 1458712 length 4544
+ Stream: column 2 section PRESENT start: 1463256 length 5
+ Stream: column 2 section DATA start: 1463261 length 8682
+ Stream: column 3 section PRESENT start: 1471943 length 5
+ Stream: column 3 section DATA start: 1471948 length 144638
+ Stream: column 3 section LENGTH start: 1616586 length 1077
+ Stream: column 3 section DICTIONARY_DATA start: 1617663 length 0
+ Encoding column 0: DIRECT
+ Encoding column 1: DIRECT
+ Encoding column 2: DIRECT
+ Encoding column 3: DIRECT
+ Stripe: offset: 1617751 data: 170658 rows: 1000 tail: 88 index: 782
+ Stream: column 0 section ROW_INDEX start: 1617751 length 10
+ Stream: column 1 section ROW_INDEX start: 1617761 length 39
+ Stream: column 2 section ROW_INDEX start: 1617800 length 42
+ Stream: column 3 section DICTIONARY_COUNT start: 1617842 length 0
+ Stream: column 3 section ROW_INDEX start: 1617842 length 691
+ Stream: column 1 section PRESENT start: 1618533 length 5
+ Stream: column 1 section DATA start: 1618538 length 4537
+ Stream: column 2 section PRESENT start: 1623075 length 5
+ Stream: column 2 section DATA start: 1623080 length 8691
+ Stream: column 3 section PRESENT start: 1631771 length 5
+ Stream: column 3 section DATA start: 1631776 length 156341
+ Stream: column 3 section LENGTH start: 1788117 length 1074
+ Stream: column 3 section DICTIONARY_DATA start: 1789191 length 0
+ Encoding column 0: DIRECT
+ Encoding column 1: DIRECT
+ Encoding column 2: DIRECT
+ Encoding column 3: DIRECT
+ Stripe: offset: 1789279 data: 179008 rows: 1000 tail: 88 index: 808
+ Stream: column 0 section ROW_INDEX start: 1789279 length 10
+ Stream: column 1 section ROW_INDEX start: 1789289 length 39
+ Stream: column 2 section ROW_INDEX start: 1789328 length 42
+ Stream: column 3 section DICTIONARY_COUNT start: 1789370 length 0
+ Stream: column 3 section ROW_INDEX start: 1789370 length 717
+ Stream: column 1 section PRESENT start: 1790087 length 5
+ Stream: column 1 section DATA start: 1790092 length 4542
+ Stream: column 2 section PRESENT start: 1794634 length 5
+ Stream: column 2 section DATA start: 1794639 length 8684
+ Stream: column 3 section PRESENT start: 1803323 length 5
+ Stream: column 3 section DATA start: 1803328 length 164661
+ Stream: column 3 section LENGTH start: 1967989 length 1106
+ Stream: column 3 section DICTIONARY_DATA start: 1969095 length 0
+ Encoding column 0: DIRECT
+ Encoding column 1: DIRECT
+ Encoding column 2: DIRECT
+ Encoding column 3: DIRECT
Index: ql/src/test/results/clientpositive/orc_dictionary_threshold.q.out
===================================================================
--- ql/src/test/results/clientpositive/orc_dictionary_threshold.q.out (revision 0)
+++ ql/src/test/results/clientpositive/orc_dictionary_threshold.q.out (working copy)
@@ -0,0 +1,100 @@
+PREHOOK: query: CREATE TABLE test_orc (key STRING)
+ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.orc.OrcSerde'
+STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat'
+OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat'
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: CREATE TABLE test_orc (key STRING)
+ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.orc.OrcSerde'
+STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat'
+OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat'
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@test_orc
+PREHOOK: query: INSERT OVERWRITE TABLE test_orc SELECT key FROM src limit 10
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src
+PREHOOK: Output: default@test_orc
+POSTHOOK: query: INSERT OVERWRITE TABLE test_orc SELECT key FROM src limit 10
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src
+POSTHOOK: Output: default@test_orc
+POSTHOOK: Lineage: test_orc.key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+PREHOOK: query: SELECT * FROM test_orc
+PREHOOK: type: QUERY
+PREHOOK: Input: default@test_orc
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT * FROM test_orc
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@test_orc
+#### A masked pattern was here ####
+POSTHOOK: Lineage: test_orc.key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+238
+86
+311
+27
+165
+409
+255
+278
+98
+484
+PREHOOK: query: ALTER TABLE test_orc SET SERDEPROPERTIES ('orc.stripe.size' = '1')
+PREHOOK: type: ALTERTABLE_SERDEPROPERTIES
+PREHOOK: Input: default@test_orc
+PREHOOK: Output: default@test_orc
+POSTHOOK: query: ALTER TABLE test_orc SET SERDEPROPERTIES ('orc.stripe.size' = '1')
+POSTHOOK: type: ALTERTABLE_SERDEPROPERTIES
+POSTHOOK: Input: default@test_orc
+POSTHOOK: Output: default@test_orc
+POSTHOOK: Lineage: test_orc.key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+PREHOOK: query: CREATE TABLE src_thousand(key STRING) STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: CREATE TABLE src_thousand(key STRING) STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@src_thousand
+POSTHOOK: Lineage: test_orc.key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/one_thousand' INTO TABLE src_thousand
+PREHOOK: type: LOAD
+PREHOOK: Output: default@src_thousand
+POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/one_thousand' INTO TABLE src_thousand
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@src_thousand
+POSTHOOK: Lineage: test_orc.key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+PREHOOK: query: INSERT OVERWRITE TABLE test_orc
+SELECT key FROM (
+SELECT CONCAT("a", key) AS key FROM src_thousand
+UNION ALL
+SELECT CONCAT("b", 1) AS key FROM src_thousand
+UNION ALL
+SELECT CONCAT("c", key) AS key FROM src_thousand
+UNION ALL
+SELECT CONCAT("d", 1) AS key FROM src_thousand
+) a ORDER BY key LIMIT 4000
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src_thousand
+PREHOOK: Output: default@test_orc
+POSTHOOK: query: INSERT OVERWRITE TABLE test_orc
+SELECT key FROM (
+SELECT CONCAT("a", key) AS key FROM src_thousand
+UNION ALL
+SELECT CONCAT("b", 1) AS key FROM src_thousand
+UNION ALL
+SELECT CONCAT("c", key) AS key FROM src_thousand
+UNION ALL
+SELECT CONCAT("d", 1) AS key FROM src_thousand
+) a ORDER BY key LIMIT 4000
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src_thousand
+POSTHOOK: Output: default@test_orc
+POSTHOOK: Lineage: test_orc.key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: test_orc.key EXPRESSION [(src_thousand)src_thousand.FieldSchema(name:key, type:string, comment:null), (src_thousand)src_thousand.FieldSchema(name:key, type:string, comment:null), ]
+PREHOOK: query: SELECT SUM(HASH(key)) FROM test_orc
+PREHOOK: type: QUERY
+PREHOOK: Input: default@test_orc
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT SUM(HASH(key)) FROM test_orc
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@test_orc
+#### A masked pattern was here ####
+POSTHOOK: Lineage: test_orc.key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: test_orc.key EXPRESSION [(src_thousand)src_thousand.FieldSchema(name:key, type:string, comment:null), (src_thousand)src_thousand.FieldSchema(name:key, type:string, comment:null), ]
+5557409630