Index: ql/src/gen/protobuf/gen-java/org/apache/hadoop/hive/ql/io/orc/OrcProto.java =================================================================== --- ql/src/gen/protobuf/gen-java/org/apache/hadoop/hive/ql/io/orc/OrcProto.java (revision 1471795) +++ ql/src/gen/protobuf/gen-java/org/apache/hadoop/hive/ql/io/orc/OrcProto.java (working copy) @@ -7157,6 +7157,10 @@ // optional uint64 numberOfRows = 5; boolean hasNumberOfRows(); long getNumberOfRows(); + + // optional uint64 rawDataSize = 6; + boolean hasRawDataSize(); + long getRawDataSize(); } public static final class StripeInformation extends com.google.protobuf.GeneratedMessage @@ -7237,12 +7241,23 @@ return numberOfRows_; } + // optional uint64 rawDataSize = 6; + public static final int RAWDATASIZE_FIELD_NUMBER = 6; + private long rawDataSize_; + public boolean hasRawDataSize() { + return ((bitField0_ & 0x00000020) == 0x00000020); + } + public long getRawDataSize() { + return rawDataSize_; + } + private void initFields() { offset_ = 0L; indexLength_ = 0L; dataLength_ = 0L; footerLength_ = 0L; numberOfRows_ = 0L; + rawDataSize_ = 0L; } private byte memoizedIsInitialized = -1; public final boolean isInitialized() { @@ -7271,6 +7286,9 @@ if (((bitField0_ & 0x00000010) == 0x00000010)) { output.writeUInt64(5, numberOfRows_); } + if (((bitField0_ & 0x00000020) == 0x00000020)) { + output.writeUInt64(6, rawDataSize_); + } getUnknownFields().writeTo(output); } @@ -7300,6 +7318,10 @@ size += com.google.protobuf.CodedOutputStream .computeUInt64Size(5, numberOfRows_); } + if (((bitField0_ & 0x00000020) == 0x00000020)) { + size += com.google.protobuf.CodedOutputStream + .computeUInt64Size(6, rawDataSize_); + } size += getUnknownFields().getSerializedSize(); memoizedSerializedSize = size; return size; @@ -7434,6 +7456,8 @@ bitField0_ = (bitField0_ & ~0x00000008); numberOfRows_ = 0L; bitField0_ = (bitField0_ & ~0x00000010); + rawDataSize_ = 0L; + bitField0_ = (bitField0_ & ~0x00000020); return this; } @@ -7492,6 +7516,10 @@ to_bitField0_ |= 0x00000010; } result.numberOfRows_ = numberOfRows_; + if (((from_bitField0_ & 0x00000020) == 0x00000020)) { + to_bitField0_ |= 0x00000020; + } + result.rawDataSize_ = rawDataSize_; result.bitField0_ = to_bitField0_; onBuilt(); return result; @@ -7523,6 +7551,9 @@ if (other.hasNumberOfRows()) { setNumberOfRows(other.getNumberOfRows()); } + if (other.hasRawDataSize()) { + setRawDataSize(other.getRawDataSize()); + } this.mergeUnknownFields(other.getUnknownFields()); return this; } @@ -7579,6 +7610,11 @@ numberOfRows_ = input.readUInt64(); break; } + case 48: { + bitField0_ |= 0x00000020; + rawDataSize_ = input.readUInt64(); + break; + } } } } @@ -7690,6 +7726,27 @@ return this; } + // optional uint64 rawDataSize = 6; + private long rawDataSize_ ; + public boolean hasRawDataSize() { + return ((bitField0_ & 0x00000020) == 0x00000020); + } + public long getRawDataSize() { + return rawDataSize_; + } + public Builder setRawDataSize(long value) { + bitField0_ |= 0x00000020; + rawDataSize_ = value; + onChanged(); + return this; + } + public Builder clearRawDataSize() { + bitField0_ = (bitField0_ & ~0x00000020); + rawDataSize_ = 0L; + onChanged(); + return this; + } + // @@protoc_insertion_point(builder_scope:org.apache.hadoop.hive.ql.io.orc.StripeInformation) } @@ -10371,25 +10428,26 @@ "T\020\003\022\010\n\004LONG\020\004\022\t\n\005FLOAT\020\005\022\n\n\006DOUBLE\020\006\022\n\n\006" + "STRING\020\007\022\n\n\006BINARY\020\010\022\r\n\tTIMESTAMP\020\t\022\010\n\004L" + 
"IST\020\n\022\007\n\003MAP\020\013\022\n\n\006STRUCT\020\014\022\t\n\005UNION\020\r\022\013\n" + - "\007DECIMAL\020\016\"x\n\021StripeInformation\022\016\n\006offse" + - "t\030\001 \001(\004\022\023\n\013indexLength\030\002 \001(\004\022\022\n\ndataLeng" + - "th\030\003 \001(\004\022\024\n\014footerLength\030\004 \001(\004\022\024\n\014number" + - "OfRows\030\005 \001(\004\"/\n\020UserMetadataItem\022\014\n\004name" + - "\030\001 \002(\t\022\r\n\005value\030\002 \002(\014\"\356\002\n\006Footer\022\024\n\014head" + - "erLength\030\001 \001(\004\022\025\n\rcontentLength\030\002 \001(\004\022D\n", - "\007stripes\030\003 \003(\01323.org.apache.hadoop.hive." + - "ql.io.orc.StripeInformation\0225\n\005types\030\004 \003" + - "(\0132&.org.apache.hadoop.hive.ql.io.orc.Ty" + - "pe\022D\n\010metadata\030\005 \003(\01322.org.apache.hadoop" + - ".hive.ql.io.orc.UserMetadataItem\022\024\n\014numb" + - "erOfRows\030\006 \001(\004\022F\n\nstatistics\030\007 \003(\01322.org" + - ".apache.hadoop.hive.ql.io.orc.ColumnStat" + - "istics\022\026\n\016rowIndexStride\030\010 \001(\r\"\210\001\n\nPostS" + - "cript\022\024\n\014footerLength\030\001 \001(\004\022F\n\013compressi" + - "on\030\002 \001(\01621.org.apache.hadoop.hive.ql.io.", - "orc.CompressionKind\022\034\n\024compressionBlockS" + - "ize\030\003 \001(\004*:\n\017CompressionKind\022\010\n\004NONE\020\000\022\010" + - "\n\004ZLIB\020\001\022\n\n\006SNAPPY\020\002\022\007\n\003LZO\020\003" + "\007DECIMAL\020\016\"\215\001\n\021StripeInformation\022\016\n\006offs" + + "et\030\001 \001(\004\022\023\n\013indexLength\030\002 \001(\004\022\022\n\ndataLen" + + "gth\030\003 \001(\004\022\024\n\014footerLength\030\004 \001(\004\022\024\n\014numbe" + + "rOfRows\030\005 \001(\004\022\023\n\013rawDataSize\030\006 \001(\004\"/\n\020Us" + + "erMetadataItem\022\014\n\004name\030\001 \002(\t\022\r\n\005value\030\002 " + + "\002(\014\"\356\002\n\006Footer\022\024\n\014headerLength\030\001 \001(\004\022\025\n\r", + "contentLength\030\002 \001(\004\022D\n\007stripes\030\003 \003(\01323.o" + + "rg.apache.hadoop.hive.ql.io.orc.StripeIn" + + "formation\0225\n\005types\030\004 \003(\0132&.org.apache.ha" + + "doop.hive.ql.io.orc.Type\022D\n\010metadata\030\005 \003" + + "(\01322.org.apache.hadoop.hive.ql.io.orc.Us" + + "erMetadataItem\022\024\n\014numberOfRows\030\006 \001(\004\022F\n\n" + + "statistics\030\007 \003(\01322.org.apache.hadoop.hiv" + + "e.ql.io.orc.ColumnStatistics\022\026\n\016rowIndex" + + "Stride\030\010 \001(\r\"\210\001\n\nPostScript\022\024\n\014footerLen" + + "gth\030\001 \001(\004\022F\n\013compression\030\002 \001(\01621.org.apa", + "che.hadoop.hive.ql.io.orc.CompressionKin" + + "d\022\034\n\024compressionBlockSize\030\003 \001(\004*:\n\017Compr" + + "essionKind\022\010\n\004NONE\020\000\022\010\n\004ZLIB\020\001\022\n\n\006SNAPPY" + + "\020\002\022\007\n\003LZO\020\003" }; com.google.protobuf.Descriptors.FileDescriptor.InternalDescriptorAssigner assigner = new com.google.protobuf.Descriptors.FileDescriptor.InternalDescriptorAssigner() { @@ -10497,7 +10555,7 @@ internal_static_org_apache_hadoop_hive_ql_io_orc_StripeInformation_fieldAccessorTable = new com.google.protobuf.GeneratedMessage.FieldAccessorTable( internal_static_org_apache_hadoop_hive_ql_io_orc_StripeInformation_descriptor, - new java.lang.String[] { "Offset", "IndexLength", "DataLength", "FooterLength", "NumberOfRows", }, + new java.lang.String[] { "Offset", "IndexLength", "DataLength", "FooterLength", "NumberOfRows", "RawDataSize", }, 
org.apache.hadoop.hive.ql.io.orc.OrcProto.StripeInformation.class, org.apache.hadoop.hive.ql.io.orc.OrcProto.StripeInformation.Builder.class); internal_static_org_apache_hadoop_hive_ql_io_orc_UserMetadataItem_descriptor = Index: ql/src/java/org/apache/hadoop/hive/ql/exec/FileSinkOperator.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/exec/FileSinkOperator.java (revision 1471795) +++ ql/src/java/org/apache/hadoop/hive/ql/exec/FileSinkOperator.java (working copy) @@ -89,6 +89,7 @@ protected transient ListBucketingCtx lbCtx; protected transient boolean isSkewedStoredAsSubDirectories; private transient boolean statsCollectRawDataSize; + private transient boolean statsCollectRawDataSizeFromRecordWriter = false; private static final transient String[] FATAL_ERR_MSG = { @@ -106,6 +107,10 @@ void close(boolean abort) throws IOException; } + public static interface StatsProvidingRecordWriter extends RecordWriter { + SerDeStats getStats(); + } + public class FSPaths implements Cloneable { Path tmpPath; Path taskOutputTempPath; @@ -516,6 +521,9 @@ fsp.outWriters[filesIdx] = HiveFileFormatUtils.getHiveRecordWriter( jc, conf.getTableInfo(), outputClass, conf, fsp.outPaths[filesIdx], reporter); + // If the record writer provides the raw data size, get it from there instead of the serde + statsCollectRawDataSizeFromRecordWriter = + fsp.outWriters[filesIdx] instanceof StatsProvidingRecordWriter; // increment the CREATED_FILES counter if (reporter != null) { reporter.incrCounter(ProgressCounter.CREATED_FILES, 1); @@ -620,7 +628,7 @@ rowOutWriters = fpaths.outWriters; if (conf.isGatherStats()) { - if (statsCollectRawDataSize) { + if (statsCollectRawDataSize && !statsCollectRawDataSizeFromRecordWriter) { SerDeStats stats = serializer.getSerDeStats(); if (stats != null) { fpaths.stat.addToStat(StatsSetupConst.RAW_DATA_SIZE, stats.getRawDataSize()); @@ -634,8 +642,10 @@ row_count.set(row_count.get() + 1); } + RecordWriter rowOutWriter = null; + if (!multiFileSpray) { - rowOutWriters[0].write(recordValue); + rowOutWriter = rowOutWriters[0]; } else { int keyHashCode = 0; for (int i = 0; i < partitionEval.length; i++) { @@ -646,8 +656,20 @@ key.setHashCode(keyHashCode); int bucketNum = prtner.getBucket(key, null, totalFiles); int idx = bucketMap.get(bucketNum); - rowOutWriters[idx].write(recordValue); + rowOutWriter = rowOutWriters[idx]; } + + rowOutWriter.write(recordValue); + + if (conf.isGatherStats() && statsCollectRawDataSize && + statsCollectRawDataSizeFromRecordWriter) { + + SerDeStats stats = ((StatsProvidingRecordWriter)rowOutWriter).getStats(); + if (stats != null) { + fpaths.stat.addToStat(StatsSetupConst.RAW_DATA_SIZE, stats.getRawDataSize()); + } + } + } catch (IOException e) { throw new HiveException(e); } catch (SerDeException e) { Index: ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcOutputFormat.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcOutputFormat.java (revision 1471795) +++ ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcOutputFormat.java (working copy) @@ -23,6 +23,7 @@ import org.apache.hadoop.hive.ql.exec.FileSinkOperator; import org.apache.hadoop.hive.ql.io.HiveOutputFormat; import org.apache.hadoop.hive.ql.io.orc.OrcSerde.OrcSerdeRow; +import org.apache.hadoop.hive.serde2.SerDeStats; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; import 
org.apache.hadoop.io.NullWritable; @@ -45,7 +46,7 @@ private static class OrcRecordWriter implements RecordWriter, - FileSinkOperator.RecordWriter { + FileSinkOperator.StatsProvidingRecordWriter { private Writer writer = null; private final FileSystem fs; private final Path path; @@ -54,6 +55,7 @@ private final int compressionSize; private final CompressionKind compress; private final int rowIndexStride; + private final SerDeStats stats; OrcRecordWriter(FileSystem fs, Path path, Configuration conf, String stripeSize, String compress, @@ -65,6 +67,7 @@ this.compress = CompressionKind.valueOf(compress); this.compressionSize = Integer.valueOf(compressionSize); this.rowIndexStride = Integer.valueOf(rowIndexStride); + this.stats = new SerDeStats(); } @Override @@ -107,6 +110,12 @@ } writer.close(); } + + @Override + public SerDeStats getStats() { + stats.setRawDataSize(writer.getRowRawDataSize()); + return stats; + } } @Override Index: ql/src/java/org/apache/hadoop/hive/ql/io/orc/ReaderImpl.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/io/orc/ReaderImpl.java (revision 1471795) +++ ql/src/java/org/apache/hadoop/hive/ql/io/orc/ReaderImpl.java (working copy) @@ -18,12 +18,6 @@ package org.apache.hadoop.hive.ql.io.orc; -import com.google.protobuf.CodedInputStream; -import org.apache.hadoop.fs.FSDataInputStream; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; - import java.io.IOException; import java.io.InputStream; import java.nio.ByteBuffer; @@ -31,6 +25,13 @@ import java.util.Iterator; import java.util.List; +import org.apache.hadoop.fs.FSDataInputStream; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; + +import com.google.protobuf.CodedInputStream; + final class ReaderImpl implements Reader { private static final int DIRECTORY_SIZE_GUESS = 16 * 1024; @@ -77,10 +78,15 @@ } @Override + public long getRawDataSize() { + return stripe.getRawDataSize(); + } + + @Override public String toString() { return "offset: " + getOffset() + " data: " + getDataLength() + " rows: " + getNumberOfRows() + " tail: " + getFooterLength() + - " index: " + getIndexLength(); + " index: " + getIndexLength() + " raw_data: " + getRawDataSize(); } } Index: ql/src/java/org/apache/hadoop/hive/ql/io/orc/StripeInformation.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/io/orc/StripeInformation.java (revision 1471795) +++ ql/src/java/org/apache/hadoop/hive/ql/io/orc/StripeInformation.java (working copy) @@ -50,4 +50,10 @@ * @return a count of the number of rows */ long getNumberOfRows(); + + /** + * Get the raw size of the data in the stripe. + * @return the number of bytes of raw data + */ + long getRawDataSize(); } Index: ql/src/java/org/apache/hadoop/hive/ql/io/orc/Writer.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/io/orc/Writer.java (revision 1471795) +++ ql/src/java/org/apache/hadoop/hive/ql/io/orc/Writer.java (working copy) @@ -42,6 +42,11 @@ void addRow(Object row) throws IOException; /** + * Get the raw data size of the last row added + */ + long getRowRawDataSize(); + + /** * Flush all of the buffers and close the file. No methods on this writer * should be called afterwards. 
* @throws IOException Index: ql/src/java/org/apache/hadoop/hive/ql/io/orc/WriterImpl.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/io/orc/WriterImpl.java (revision 1471795) +++ ql/src/java/org/apache/hadoop/hive/ql/io/orc/WriterImpl.java (working copy) @@ -319,6 +319,8 @@ private final OrcProto.RowIndex.Builder rowIndex; private final OrcProto.RowIndexEntry.Builder rowIndexEntry; private final PositionedOutputStream rowIndexStream; + protected long stripeRawDataSize = 0; + protected long rowRawDataSize = 0; /** * Create a tree writer. @@ -370,16 +372,33 @@ * @param obj * @throws IOException */ - void write(Object obj) throws IOException { + abstract void write(Object obj) throws IOException; + + void write(Object obj, long rawDataSize) throws IOException{ if (obj != null) { indexStatistics.increment(); + setRawDataSize(rawDataSize); + } else { + // Estimate the raw size of null as 1 byte + setRawDataSize(1); } + if (isPresent != null) { isPresent.write(obj == null ? 0 : 1); } } /** + * Sets the row raw data size and updates the stripe raw data size + * + * @param rawDataSize + */ + private void setRawDataSize(long rawDataSize) { + rowRawDataSize = rawDataSize; + stripeRawDataSize += rawDataSize; + } + + /** * Write the stripe out to the file. * @param builder the stripe footer that contains the information about the * layout of the stripe. The TreeWriter is required to update @@ -406,6 +425,7 @@ } rowIndex.clear(); rowIndexEntry.clear(); + stripeRawDataSize = 0; } TreeWriter[] getChildrenWriters() { @@ -462,6 +482,14 @@ } return result; } + + long getStripeRawDataSize() { + return stripeRawDataSize; + } + + long getRowRawDataSize() { + return rowRawDataSize; + } } private static class BooleanTreeWriter extends TreeWriter { @@ -480,7 +508,7 @@ @Override void write(Object obj) throws IOException { - super.write(obj); + super.write(obj, 1); if (obj != null) { boolean val = ((BooleanObjectInspector) inspector).get(obj); indexStatistics.updateBoolean(val); @@ -518,7 +546,7 @@ @Override void write(Object obj) throws IOException { - super.write(obj); + super.write(obj, 1); if (obj != null) { byte val = ((ByteObjectInspector) inspector).get(obj); indexStatistics.updateInteger(val); @@ -574,19 +602,23 @@ @Override void write(Object obj) throws IOException { - super.write(obj); + long rawDataSize = 0; if (obj != null) { long val; if (intInspector != null) { val = intInspector.get(obj); + rawDataSize = 4; } else if (longInspector != null) { val = longInspector.get(obj); + rawDataSize = 8; } else { val = shortInspector.get(obj); + rawDataSize = 2; } indexStatistics.updateInteger(val); writer.write(val); } + super.write(obj, rawDataSize); } @Override @@ -619,7 +651,7 @@ @Override void write(Object obj) throws IOException { - super.write(obj); + super.write(obj, 4); if (obj != null) { float val = ((FloatObjectInspector) inspector).get(obj); indexStatistics.updateDouble(val); @@ -657,7 +689,7 @@ @Override void write(Object obj) throws IOException { - super.write(obj); + super.write(obj, 8); if (obj != null) { double val = ((DoubleObjectInspector) inspector).get(obj); indexStatistics.updateDouble(val); @@ -716,13 +748,16 @@ @Override void write(Object obj) throws IOException { - super.write(obj); + long rawDataSize = 0; if (obj != null) { String val = ((StringObjectInspector) inspector) .getPrimitiveJavaObject(obj); rows.add(dictionary.add(val)); indexStatistics.updateString(val); + // Raw data size is 2 bytes for each character in the 
string + rawDataSize = val.length() * 2; } + super.write(obj, rawDataSize); } @Override @@ -831,13 +866,17 @@ @Override void write(Object obj) throws IOException { - super.write(obj); + long rawDataSize = 0; if (obj != null) { BytesWritable val = ((BinaryObjectInspector) inspector).getPrimitiveWritableObject(obj); stream.write(val.getBytes(), 0, val.getLength()); length.write(val.getLength()); + + // Raw data size is the length of the BytesWritable, i.e. the number of bytes + rawDataSize = val.getLength(); } + super.write(obj, rawDataSize); } @Override @@ -879,7 +918,12 @@ @Override void write(Object obj) throws IOException { - super.write(obj); + // Raw data size is: + // the number of bytes needed to store the milliseconds since the epoch + // (8 since it's a long) + // + + // the number of bytes needed to store the nanos field (4 since it's an int) + super.write(obj, 12); if (obj != null) { Timestamp val = ((TimestampObjectInspector) inspector). @@ -939,7 +983,7 @@ @Override void write(Object obj) throws IOException { - super.write(obj); + long rawDataSize = 0; if (obj != null) { HiveDecimal decimal = ((HiveDecimalObjectInspector) inspector). getPrimitiveJavaObject(obj); @@ -947,7 +991,9 @@ decimal.unscaledValue()); scaleStream.write(decimal.scale()); indexStatistics.updateDecimal(decimal); + rawDataSize = decimal.unscaledValue().bitLength() + 4; } + super.write(obj, rawDataSize); } @Override @@ -987,15 +1033,17 @@ @Override void write(Object obj) throws IOException { - super.write(obj); + long rawDataSize = 0; if (obj != null) { StructObjectInspector insp = (StructObjectInspector) inspector; for(int i = 0; i < fields.size(); ++i) { StructField field = fields.get(i); TreeWriter writer = childrenWriters[i]; writer.write(insp.getStructFieldData(obj, field)); + rawDataSize += writer.getRowRawDataSize(); } } + super.write(obj, rawDataSize); } @Override @@ -1030,15 +1078,17 @@ @Override void write(Object obj) throws IOException { - super.write(obj); + long rawDataSize = 0; if (obj != null) { ListObjectInspector insp = (ListObjectInspector) inspector; int len = insp.getListLength(obj); lengths.write(len); for(int i=0; i < len; ++i) { childrenWriters[0].write(insp.getListElement(obj, i)); + rawDataSize += childrenWriters[0].getRowRawDataSize(); } } + super.write(obj, rawDataSize); } @Override @@ -1081,7 +1131,7 @@ @Override void write(Object obj) throws IOException { - super.write(obj); + long rawDataSize = 0; if (obj != null) { MapObjectInspector insp = (MapObjectInspector) inspector; int len = insp.getMapSize(obj); @@ -1092,8 +1142,11 @@ for(Map.Entry entry: valueMap.entrySet()) { childrenWriters[0].write(entry.getKey()); childrenWriters[1].write(entry.getValue()); + rawDataSize += childrenWriters[0].getRowRawDataSize(); + rawDataSize += childrenWriters[1].getRowRawDataSize(); } } + super.write(obj, rawDataSize); } @Override @@ -1136,13 +1189,16 @@ @Override void write(Object obj) throws IOException { - super.write(obj); + long rawDataSize = 0; if (obj != null) { UnionObjectInspector insp = (UnionObjectInspector) inspector; byte tag = insp.getTag(obj); tags.write(tag); childrenWriters[tag].write(insp.getField(obj)); + // raw data size is size of tag (1) + size of value + rawDataSize = childrenWriters[tag].getRowRawDataSize() + 1; } + super.write(obj, rawDataSize); } @Override @@ -1329,6 +1385,7 @@ (int) ((rowsInStripe + rowIndexStride - 1) / rowIndexStride); OrcProto.StripeFooter.Builder builder = OrcProto.StripeFooter.newBuilder(); + long stripeRawDataSize = 
treeWriter.getStripeRawDataSize(); treeWriter.writeStripe(builder, requiredIndexEntries); long start = rawWriter.getPos(); long section = start; @@ -1359,7 +1416,8 @@ .setIndexLength(indexEnd - start) .setDataLength(section - indexEnd) .setNumberOfRows(rowsInStripe) - .setFooterLength(end - section).build(); + .setFooterLength(end - section) + .setRawDataSize(stripeRawDataSize).build(); stripes.add(dirEntry); rowCount += rowsInStripe; rowsInStripe = 0; @@ -1463,6 +1521,11 @@ } @Override + public long getRowRawDataSize() { + return treeWriter.getRowRawDataSize(); + } + + @Override public void close() throws IOException { flushStripe(); int footerLength = writeFooter(rawWriter.getPos()); Index: ql/src/protobuf/org/apache/hadoop/hive/ql/io/orc/orc_proto.proto =================================================================== --- ql/src/protobuf/org/apache/hadoop/hive/ql/io/orc/orc_proto.proto (revision 1471795) +++ ql/src/protobuf/org/apache/hadoop/hive/ql/io/orc/orc_proto.proto (working copy) @@ -105,6 +105,7 @@ optional uint64 dataLength = 3; optional uint64 footerLength = 4; optional uint64 numberOfRows = 5; + optional uint64 rawDataSize = 6; } message UserMetadataItem { Index: ql/src/test/queries/clientpositive/orc_stats.q =================================================================== --- ql/src/test/queries/clientpositive/orc_stats.q (revision 0) +++ ql/src/test/queries/clientpositive/orc_stats.q (working copy) @@ -0,0 +1,30 @@ +CREATE TABLE test_orc (key1 TINYINT, key2 SMALLINT, key3 INT, key4 BIGINT, key5 BOOLEAN, key6 FLOAT, key7 DOUBLE, key8 STRING, key9 BINARY, key10 TIMESTAMP, key11 ARRAY, key12 MAP, key13 STRUCT, key14 UNIONTYPE) +ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.orc.OrcSerde' +STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat' +OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat'; + +INSERT OVERWRITE TABLE test_orc SELECT 11, 11, 11, 11, 11, 11, 11, 11, "11", 11, ARRAY(11, 11), MAP(11, 11, 12, 12), NAMED_STRUCT("a", 11, "b", 11), CREATE_UNION(0, 11, 11) FROM src; + +-- The raw data size should be +-- ( +-- 1 (size of tinyint) +-- +2 (size of samllint) +-- +4 (size of int) +-- +8 (size of bigint) +-- +1 (size of boolean) +-- +4 (size of float) +-- +8 (size of double) +-- +(2 + 2) (size of string of length 2) +-- +(1 + 1) (size of binary with characters of size 1) +-- +(8 + 4) (size of timestamp) +-- +(4 + 4) (size of array of 2 integers) +-- +(4 + 4 + 4 + 4) (size of map with 2 integer keys and 2 integer values) +-- +(4 + 4) (size of struct of 2 integers) +-- +(4 + 1) (size of union of 2 integers) +-- ) * 500 (number of rows) +-- ---------- +-- 41500 + +DESC FORMATTED test_orc; + + Index: ql/src/test/resources/orc-file-dump.out =================================================================== --- ql/src/test/resources/orc-file-dump.out (revision 1471795) +++ ql/src/test/resources/orc-file-dump.out (working copy) @@ -11,7 +11,7 @@ Column 3: count: 21000 min: Darkness, max: worst Stripes: - Stripe: offset: 3 data: 83505 rows: 6000 tail: 91 index: 179 + Stripe: offset: 3 data: 83505 rows: 6000 tail: 91 index: 179 raw_data: 118406 Stream: column 0 section ROW_INDEX start: 3 length 10 Stream: column 1 section ROW_INDEX start: 13 length 38 Stream: column 2 section ROW_INDEX start: 51 length 42 @@ -29,7 +29,7 @@ Encoding column 1: DIRECT Encoding column 2: DIRECT Encoding column 3: DICTIONARY[35] - Stripe: offset: 83778 data: 83453 rows: 6000 tail: 91 index: 180 + Stripe: offset: 83778 data: 83453 rows: 6000 tail: 91 index: 180 
raw_data: 118812 Stream: column 0 section ROW_INDEX start: 83778 length 10 Stream: column 1 section ROW_INDEX start: 83788 length 39 Stream: column 2 section ROW_INDEX start: 83827 length 42 @@ -47,7 +47,7 @@ Encoding column 1: DIRECT Encoding column 2: DIRECT Encoding column 3: DICTIONARY[35] - Stripe: offset: 167502 data: 83456 rows: 6000 tail: 92 index: 182 + Stripe: offset: 167502 data: 83456 rows: 6000 tail: 92 index: 182 raw_data: 119052 Stream: column 0 section ROW_INDEX start: 167502 length 10 Stream: column 1 section ROW_INDEX start: 167512 length 39 Stream: column 2 section ROW_INDEX start: 167551 length 42 @@ -65,7 +65,7 @@ Encoding column 1: DIRECT Encoding column 2: DIRECT Encoding column 3: DICTIONARY[35] - Stripe: offset: 251232 data: 41842 rows: 3000 tail: 90 index: 172 + Stripe: offset: 251232 data: 41842 rows: 3000 tail: 90 index: 172 raw_data: 59252 Stream: column 0 section ROW_INDEX start: 251232 length 10 Stream: column 1 section ROW_INDEX start: 251242 length 39 Stream: column 2 section ROW_INDEX start: 251281 length 43 Index: ql/src/test/results/clientpositive/orc_stats.q.out =================================================================== --- ql/src/test/results/clientpositive/orc_stats.q.out (revision 0) +++ ql/src/test/results/clientpositive/orc_stats.q.out (working copy) @@ -0,0 +1,133 @@ +PREHOOK: query: CREATE TABLE test_orc (key1 TINYINT, key2 SMALLINT, key3 INT, key4 BIGINT, key5 BOOLEAN, key6 FLOAT, key7 DOUBLE, key8 STRING, key9 BINARY, key10 TIMESTAMP, key11 ARRAY, key12 MAP, key13 STRUCT, key14 UNIONTYPE) +ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.orc.OrcSerde' +STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat' +OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat' +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE test_orc (key1 TINYINT, key2 SMALLINT, key3 INT, key4 BIGINT, key5 BOOLEAN, key6 FLOAT, key7 DOUBLE, key8 STRING, key9 BINARY, key10 TIMESTAMP, key11 ARRAY, key12 MAP, key13 STRUCT, key14 UNIONTYPE) +ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.orc.OrcSerde' +STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat' +OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat' +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@test_orc +PREHOOK: query: INSERT OVERWRITE TABLE test_orc SELECT 11, 11, 11, 11, 11, 11, 11, 11, "11", 11, ARRAY(11, 11), MAP(11, 11, 12, 12), NAMED_STRUCT("a", 11, "b", 11), CREATE_UNION(0, 11, 11) FROM src +PREHOOK: type: QUERY +PREHOOK: Input: default@src +PREHOOK: Output: default@test_orc +POSTHOOK: query: INSERT OVERWRITE TABLE test_orc SELECT 11, 11, 11, 11, 11, 11, 11, 11, "11", 11, ARRAY(11, 11), MAP(11, 11, 12, 12), NAMED_STRUCT("a", 11, "b", 11), CREATE_UNION(0, 11, 11) FROM src +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +POSTHOOK: Output: default@test_orc +POSTHOOK: Lineage: test_orc.key1 EXPRESSION [] +POSTHOOK: Lineage: test_orc.key10 EXPRESSION [] +POSTHOOK: Lineage: test_orc.key11 EXPRESSION [] +POSTHOOK: Lineage: test_orc.key12 EXPRESSION [] +POSTHOOK: Lineage: test_orc.key13 EXPRESSION [] +POSTHOOK: Lineage: test_orc.key14 EXPRESSION [] +POSTHOOK: Lineage: test_orc.key2 EXPRESSION [] +POSTHOOK: Lineage: test_orc.key3 SIMPLE [] +POSTHOOK: Lineage: test_orc.key4 EXPRESSION [] +POSTHOOK: Lineage: test_orc.key5 EXPRESSION [] +POSTHOOK: Lineage: test_orc.key6 EXPRESSION [] +POSTHOOK: Lineage: test_orc.key7 EXPRESSION [] +POSTHOOK: Lineage: test_orc.key8 EXPRESSION [] +POSTHOOK: Lineage: test_orc.key9 EXPRESSION [] +PREHOOK: 
query: -- The raw data size should be +-- ( +-- 1 (size of tinyint) +-- +2 (size of samllint) +-- +4 (size of int) +-- +8 (size of bigint) +-- +1 (size of boolean) +-- +4 (size of float) +-- +8 (size of double) +-- +(2 + 2) (size of string of length 2) +-- +(1 + 1) (size of binary with characters of size 1) +-- +(8 + 4) (size of timestamp) +-- +(4 + 4) (size of array of 2 integers) +-- +(4 + 4 + 4 + 4) (size of map with 2 integer keys and 2 integer values) +-- +(4 + 4) (size of struct of 2 integers) +-- +(4 + 1) (size of union of 2 integers) +-- ) * 500 (number of rows) +-- ---------- +-- 41500 + +DESC FORMATTED test_orc +PREHOOK: type: DESCTABLE +POSTHOOK: query: -- The raw data size should be +-- ( +-- 1 (size of tinyint) +-- +2 (size of samllint) +-- +4 (size of int) +-- +8 (size of bigint) +-- +1 (size of boolean) +-- +4 (size of float) +-- +8 (size of double) +-- +(2 + 2) (size of string of length 2) +-- +(1 + 1) (size of binary with characters of size 1) +-- +(8 + 4) (size of timestamp) +-- +(4 + 4) (size of array of 2 integers) +-- +(4 + 4 + 4 + 4) (size of map with 2 integer keys and 2 integer values) +-- +(4 + 4) (size of struct of 2 integers) +-- +(4 + 1) (size of union of 2 integers) +-- ) * 500 (number of rows) +-- ---------- +-- 41500 + +DESC FORMATTED test_orc +POSTHOOK: type: DESCTABLE +POSTHOOK: Lineage: test_orc.key1 EXPRESSION [] +POSTHOOK: Lineage: test_orc.key10 EXPRESSION [] +POSTHOOK: Lineage: test_orc.key11 EXPRESSION [] +POSTHOOK: Lineage: test_orc.key12 EXPRESSION [] +POSTHOOK: Lineage: test_orc.key13 EXPRESSION [] +POSTHOOK: Lineage: test_orc.key14 EXPRESSION [] +POSTHOOK: Lineage: test_orc.key2 EXPRESSION [] +POSTHOOK: Lineage: test_orc.key3 SIMPLE [] +POSTHOOK: Lineage: test_orc.key4 EXPRESSION [] +POSTHOOK: Lineage: test_orc.key5 EXPRESSION [] +POSTHOOK: Lineage: test_orc.key6 EXPRESSION [] +POSTHOOK: Lineage: test_orc.key7 EXPRESSION [] +POSTHOOK: Lineage: test_orc.key8 EXPRESSION [] +POSTHOOK: Lineage: test_orc.key9 EXPRESSION [] +# col_name data_type comment + +key1 tinyint from deserializer +key2 smallint from deserializer +key3 int from deserializer +key4 bigint from deserializer +key5 boolean from deserializer +key6 float from deserializer +key7 double from deserializer +key8 string from deserializer +key9 binary from deserializer +key10 timestamp from deserializer +key11 array from deserializer +key12 map from deserializer +key13 struct from deserializer +key14 uniontype from deserializer + +# Detailed Table Information +Database: default +#### A masked pattern was here #### +Protect Mode: None +Retention: 0 +#### A masked pattern was here #### +Table Type: MANAGED_TABLE +Table Parameters: + numFiles 1 + numPartitions 0 + numRows 500 + rawDataSize 41500 + totalSize 1434 +#### A masked pattern was here #### + +# Storage Information +SerDe Library: org.apache.hadoop.hive.ql.io.orc.OrcSerde +InputFormat: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat +OutputFormat: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat +Compressed: No +Num Buckets: -1 +Bucket Columns: [] +Sort Columns: [] +Storage Desc Params: + serialization.format 1
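
For reference, a minimal standalone sketch (not part of the patch) of how the new per-stripe rawDataSize could be read back through the public ORC reader API, mirroring the "raw_data" column added to the orc-file-dump.out expected output above. OrcFile.createReader, Reader.getStripes(), and the StripeInformation accessors are existing APIs (getRawDataSize() being the one this patch adds); the class name PrintStripeRawDataSize and the argument handling are illustrative assumptions only.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.io.orc.OrcFile;
import org.apache.hadoop.hive.ql.io.orc.Reader;
import org.apache.hadoop.hive.ql.io.orc.StripeInformation;

public class PrintStripeRawDataSize {
  public static void main(String[] args) throws Exception {
    // Path to an ORC file written by a Writer that includes this patch,
    // so each stripe footer entry carries the optional rawDataSize field.
    Path path = new Path(args[0]);
    Configuration conf = new Configuration();
    FileSystem fs = path.getFileSystem(conf);
    Reader reader = OrcFile.createReader(fs, path);

    long totalRawDataSize = 0;
    for (StripeInformation stripe : reader.getStripes()) {
      // getRawDataSize() is the accessor added to StripeInformation above;
      // files written before this patch report 0, the protobuf default for
      // the unset optional uint64 field.
      System.out.println("rows: " + stripe.getNumberOfRows()
          + " raw_data: " + stripe.getRawDataSize());
      totalRawDataSize += stripe.getRawDataSize();
    }
    System.out.println("total raw data size: " + totalRawDataSize);
  }
}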