diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/FileSinkOperator.java ql/src/java/org/apache/hadoop/hive/ql/exec/FileSinkOperator.java index bcee201..73388c1 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/FileSinkOperator.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/FileSinkOperator.java @@ -38,8 +38,8 @@ import org.apache.hadoop.hive.ql.io.HiveFileFormatUtils; import org.apache.hadoop.hive.ql.io.HiveKey; import org.apache.hadoop.hive.ql.io.HiveOutputFormat; -import org.apache.hadoop.hive.ql.io.HivePassThroughOutputFormat; import org.apache.hadoop.hive.ql.io.HivePartitioner; +import org.apache.hadoop.hive.ql.io.HivePassThroughOutputFormat; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.plan.DynamicPartitionCtx; import org.apache.hadoop.hive.ql.plan.ExprNodeDesc; @@ -89,6 +89,7 @@ protected transient ListBucketingCtx lbCtx; protected transient boolean isSkewedStoredAsSubDirectories; private transient boolean statsCollectRawDataSize; + private transient boolean statsFromRecordWriter = false; private static final transient String[] FATAL_ERR_MSG = { @@ -106,6 +107,10 @@ void close(boolean abort) throws IOException; } + public static interface StatsProvidingRecordWriter extends RecordWriter { + SerDeStats getStats(); + } + public class FSPaths implements Cloneable { Path tmpPath; Path taskOutputTempPath; @@ -516,6 +521,9 @@ private void createBucketFiles(FSPaths fsp) throws HiveException { fsp.outWriters[filesIdx] = HiveFileFormatUtils.getHiveRecordWriter( jc, conf.getTableInfo(), outputClass, conf, fsp.outPaths[filesIdx], reporter); + // If the record writer provides stats, get it from there instead of the serde + statsFromRecordWriter = + fsp.outWriters[filesIdx] instanceof StatsProvidingRecordWriter; // increment the CREATED_FILES counter if (reporter != null) { reporter.incrCounter(ProgressCounter.CREATED_FILES, 1); @@ -619,7 +627,7 @@ public void processOp(Object row, int tag) throws HiveException { } rowOutWriters = fpaths.outWriters; - if (conf.isGatherStats()) { + if (conf.isGatherStats() && !statsFromRecordWriter) { if (statsCollectRawDataSize) { SerDeStats stats = serializer.getSerDeStats(); if (stats != null) { @@ -634,8 +642,10 @@ public void processOp(Object row, int tag) throws HiveException { row_count.set(row_count.get() + 1); } + RecordWriter rowOutWriter = null; + if (!multiFileSpray) { - rowOutWriters[0].write(recordValue); + rowOutWriter = rowOutWriters[0]; } else { int keyHashCode = 0; for (int i = 0; i < partitionEval.length; i++) { @@ -646,8 +656,9 @@ public void processOp(Object row, int tag) throws HiveException { key.setHashCode(keyHashCode); int bucketNum = prtner.getBucket(key, null, totalFiles); int idx = bucketMap.get(bucketNum); - rowOutWriters[idx].write(recordValue); + rowOutWriter = rowOutWriters[idx]; } + rowOutWriter.write(recordValue); } catch (IOException e) { throw new HiveException(e); } catch (SerDeException e) { @@ -864,6 +875,26 @@ public void closeOp(boolean abort) throws HiveException { if (!abort) { for (FSPaths fsp : valToPaths.values()) { fsp.closeWriters(abort); + // before closing the operator check if statistics gathering is requested + // and is provided by record writer. this is different from the statistics + // gathering done in processOp(). In processOp(), for each row added + // serde statistics about the row is gathered and accumulated in hashmap. + // this adds more overhead to the actual processing of row. 
But if the + // record writer already gathers the statistics, it can simply return the + // accumulated statistics which will be aggregated in case of spray writers + if (conf.isGatherStats() && statsFromRecordWriter) { + for (int idx = 0; idx < fsp.outWriters.length; idx++) { + RecordWriter outWriter = fsp.outWriters[idx]; + if (outWriter != null) { + SerDeStats stats = ((StatsProvidingRecordWriter) outWriter).getStats(); + if (stats != null) { + fsp.stat.addToStat(StatsSetupConst.RAW_DATA_SIZE, stats.getRawDataSize()); + fsp.stat.addToStat(StatsSetupConst.ROW_COUNT, stats.getRowCount()); + } + } + } + } + if (isNativeTable) { fsp.commit(fs); } @@ -934,7 +965,7 @@ public void checkOutputSpecs(FileSystem ignored, JobConf job) throws IOException hiveOutputFormat = ReflectionUtils.newInstance(conf.getTableInfo().getOutputFileFormatClass(),job); } else { - hiveOutputFormat = conf.getTableInfo().getOutputFileFormatClass().newInstance(); + hiveOutputFormat = conf.getTableInfo().getOutputFileFormatClass().newInstance(); } } else { diff --git ql/src/java/org/apache/hadoop/hive/ql/io/orc/BinaryColumnStatistics.java ql/src/java/org/apache/hadoop/hive/ql/io/orc/BinaryColumnStatistics.java new file mode 100644 index 0000000..23030a3 --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/io/orc/BinaryColumnStatistics.java @@ -0,0 +1,25 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.ql.io.orc; + +/** + * Statistics for binary columns. 
+ */ +public interface BinaryColumnStatistics extends ColumnStatistics { + long getSum(); +} diff --git ql/src/java/org/apache/hadoop/hive/ql/io/orc/ColumnStatisticsImpl.java ql/src/java/org/apache/hadoop/hive/ql/io/orc/ColumnStatisticsImpl.java index 6268617..42d897c 100644 --- ql/src/java/org/apache/hadoop/hive/ql/io/orc/ColumnStatisticsImpl.java +++ ql/src/java/org/apache/hadoop/hive/ql/io/orc/ColumnStatisticsImpl.java @@ -21,6 +21,7 @@ import org.apache.hadoop.hive.serde2.io.DateWritable; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector; +import org.apache.hadoop.io.BytesWritable; class ColumnStatisticsImpl implements ColumnStatistics { @@ -332,10 +333,11 @@ public String toString() { } } - private static final class StringStatisticsImpl extends ColumnStatisticsImpl + protected static final class StringStatisticsImpl extends ColumnStatisticsImpl implements StringColumnStatistics { private String minimum = null; private String maximum = null; + private long sum = 0; StringStatisticsImpl() { } @@ -349,6 +351,9 @@ public String toString() { if (str.hasMinimum()) { minimum = str.getMinimum(); } + if(str.hasSum()) { + sum = str.getSum(); + } } @Override @@ -356,6 +361,7 @@ void reset() { super.reset(); minimum = null; maximum = null; + sum = 0; } @Override @@ -368,6 +374,7 @@ void updateString(String value) { } else if (maximum.compareTo(value) < 0) { maximum = value; } + sum += value.length(); } @Override @@ -384,6 +391,7 @@ void merge(ColumnStatisticsImpl other) { maximum = str.maximum; } } + sum += str.sum; } @Override @@ -394,6 +402,7 @@ void merge(ColumnStatisticsImpl other) { if (getNumberOfValues() != 0) { str.setMinimum(minimum); str.setMaximum(maximum); + str.setSum(sum); } result.setStringStatistics(str); return result; @@ -410,6 +419,11 @@ public String getMaximum() { } @Override + public long getSum() { + return sum; + } + + @Override public String toString() { StringBuilder buf = new StringBuilder(super.toString()); if (getNumberOfValues() != 0) { @@ -417,6 +431,67 @@ public String toString() { buf.append(minimum); buf.append(" max: "); buf.append(maximum); + buf.append(" sum: "); + buf.append(sum); + } + return buf.toString(); + } + } + + protected static final class BinaryStatisticsImpl extends ColumnStatisticsImpl implements + BinaryColumnStatistics { + + private long sum = 0; + + BinaryStatisticsImpl() { + } + + BinaryStatisticsImpl(OrcProto.ColumnStatistics stats) { + super(stats); + OrcProto.BinaryStatistics binStats = stats.getBinaryStatistics(); + if (binStats.hasSum()) { + sum = binStats.getSum(); + } + } + + @Override + void reset() { + super.reset(); + sum = 0; + } + + @Override + void updateBinary(BytesWritable value) { + sum += value.getLength(); + } + + @Override + void merge(ColumnStatisticsImpl other) { + super.merge(other); + BinaryStatisticsImpl bin = (BinaryStatisticsImpl) other; + sum += bin.sum; + } + + @Override + public long getSum() { + return sum; + } + + @Override + OrcProto.ColumnStatistics.Builder serialize() { + OrcProto.ColumnStatistics.Builder result = super.serialize(); + OrcProto.BinaryStatistics.Builder bin = OrcProto.BinaryStatistics.newBuilder(); + bin.setSum(sum); + result.setBinaryStatistics(bin); + return result; + } + + @Override + public String toString() { + StringBuilder buf = new StringBuilder(super.toString()); + if (getNumberOfValues() != 0) { + buf.append(" sum: "); + buf.append(sum); } return buf.toString(); } @@ -666,6 +741,10 @@ 
void updateString(String value) { throw new UnsupportedOperationException("Can't update string"); } + void updateBinary(BytesWritable value) { + throw new UnsupportedOperationException("Can't update binary"); + } + void updateDecimal(HiveDecimal value) { throw new UnsupportedOperationException("Can't update decimal"); } @@ -720,6 +799,8 @@ static ColumnStatisticsImpl create(ObjectInspector inspector) { return new DecimalStatisticsImpl(); case DATE: return new DateStatisticsImpl(); + case BINARY: + return new BinaryStatisticsImpl(); default: return new ColumnStatisticsImpl(); } @@ -741,6 +822,8 @@ static ColumnStatisticsImpl deserialize(OrcProto.ColumnStatistics stats) { return new DecimalStatisticsImpl(stats); } else if (stats.hasDateStatistics()) { return new DateStatisticsImpl(stats); + } else if(stats.hasBinaryStatistics()) { + return new BinaryStatisticsImpl(stats); } else { return new ColumnStatisticsImpl(stats); } diff --git ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcOutputFormat.java ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcOutputFormat.java index c80fb02..0454b91 100644 --- ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcOutputFormat.java +++ ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcOutputFormat.java @@ -17,12 +17,16 @@ */ package org.apache.hadoop.hive.ql.io.orc; -import org.apache.hadoop.conf.Configuration; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Properties; + import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hive.ql.exec.FileSinkOperator; import org.apache.hadoop.hive.ql.io.HiveOutputFormat; import org.apache.hadoop.hive.ql.io.orc.OrcSerde.OrcSerdeRow; +import org.apache.hadoop.hive.serde2.SerDeStats; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; import org.apache.hadoop.io.NullWritable; @@ -33,10 +37,6 @@ import org.apache.hadoop.mapred.Reporter; import org.apache.hadoop.util.Progressable; -import java.io.IOException; -import java.util.ArrayList; -import java.util.Properties; - /** * A Hive OutputFormat for ORC files. 
*/ @@ -45,14 +45,17 @@ private static class OrcRecordWriter implements RecordWriter, - FileSinkOperator.RecordWriter { + FileSinkOperator.RecordWriter, + FileSinkOperator.StatsProvidingRecordWriter { private Writer writer = null; private final Path path; private final OrcFile.WriterOptions options; + private final SerDeStats stats; OrcRecordWriter(Path path, OrcFile.WriterOptions options) { this.path = path; this.options = options; + this.stats = new SerDeStats(); } @Override @@ -94,6 +97,13 @@ public void close(boolean b) throws IOException { } writer.close(); } + + @Override + public SerDeStats getStats() { + stats.setRawDataSize(writer.getRawDataSize()); + stats.setRowCount(writer.getNumberOfRows()); + return stats; + } } @Override diff --git ql/src/java/org/apache/hadoop/hive/ql/io/orc/Reader.java ql/src/java/org/apache/hadoop/hive/ql/io/orc/Reader.java index 90260fd..8743b41 100644 --- ql/src/java/org/apache/hadoop/hive/ql/io/orc/Reader.java +++ ql/src/java/org/apache/hadoop/hive/ql/io/orc/Reader.java @@ -18,13 +18,13 @@ package org.apache.hadoop.hive.ql.io.orc; -import org.apache.hadoop.hive.ql.io.sarg.SearchArgument; -import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; - import java.io.IOException; import java.nio.ByteBuffer; import java.util.List; +import org.apache.hadoop.hive.ql.io.sarg.SearchArgument; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; + /** * The interface for reading ORC files. * @@ -39,6 +39,19 @@ long getNumberOfRows(); /** + * Get the deserialized data size of the file + * @return raw data size + */ + long getRawDataSize(); + + /** + * Get the deserialized data size of the specified columns + * @param colIndices + * @return raw data size of columns + */ + long getRawDataSizeOfColumns(int[] colIndices); + + /** * Get the user metadata keys. 
* @return the set of metadata keys */ @@ -121,6 +134,7 @@ * @throws IOException * @deprecated */ + @Deprecated RecordReader rows(long offset, long length, boolean[] include) throws IOException; @@ -141,4 +155,5 @@ RecordReader rows(long offset, long length, boolean[] include, SearchArgument sarg, String[] neededColumns) throws IOException; + } diff --git ql/src/java/org/apache/hadoop/hive/ql/io/orc/ReaderImpl.java ql/src/java/org/apache/hadoop/hive/ql/io/orc/ReaderImpl.java index c454f32..81f7c4a 100644 --- ql/src/java/org/apache/hadoop/hive/ql/io/orc/ReaderImpl.java +++ ql/src/java/org/apache/hadoop/hive/ql/io/orc/ReaderImpl.java @@ -18,23 +18,24 @@ package org.apache.hadoop.hive.ql.io.orc; -import com.google.protobuf.CodedInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; + import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.ql.io.orc.OrcProto.Type; import org.apache.hadoop.hive.ql.io.sarg.SearchArgument; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; import org.apache.hadoop.io.Text; -import org.apache.hadoop.util.StringUtils; -import java.io.IOException; -import java.io.InputStream; -import java.nio.ByteBuffer; -import java.util.ArrayList; -import java.util.Iterator; -import java.util.List; +import com.google.protobuf.CodedInputStream; final class ReaderImpl implements Reader { @@ -49,6 +50,7 @@ private final int bufferSize; private final OrcProto.Footer footer; private final ObjectInspector inspector; + private long deserializedSize = -1; private static class StripeInformationImpl implements StripeInformation { @@ -102,6 +104,69 @@ public long getNumberOfRows() { } @Override + public long getRawDataSize() { + if (deserializedSize == -1) { + List stats = footer.getStatisticsList(); + int[] indices = new int[stats.size()]; + for (int i = 0; i < stats.size(); ++i) { + indices[i] = i; + } + deserializedSize = getRawDataSizeOfColumns(indices); + } + return deserializedSize; + } + + @Override + public long getRawDataSizeOfColumns(int[] colIndices) { + long result = 0; + for (int colIdx : colIndices) { + result += getRawDataSizeOfColumn(colIdx); + } + return result; + } + + private long getRawDataSizeOfColumn(int colIdx) { + OrcProto.ColumnStatistics colStat = footer.getStatistics(colIdx); + long numVals = colStat.getNumberOfValues(); + Type type = footer.getTypes(colIdx); + + switch (type.getKind()) { + case BINARY: + // old orc format doesn't support binary statistics. checking for binary + // statistics is not required as protocol buffers takes care of it. + return colStat.getBinaryStatistics().getSum(); + case DATE: + return numVals * 8; + case DECIMAL: + return numVals * 36; + case DOUBLE: + return numVals * 8; + case FLOAT: + return numVals * 4; + case INT: + return numVals * 4; + case LONG: + return numVals * 8; + case SHORT: + return numVals * 2; + case STRING: + // old orc format doesn't support sum for string statistics. checking for + // existence is not required as protocol buffers takes care of it. 
+ return colStat.getStringStatistics().getSum(); + case TIMESTAMP: + return numVals * 12; + case BOOLEAN: + return numVals / 8; + case BYTE: + return numVals; + default: + break; + } + + return 0; + } + + @Override public Iterable getMetadataKeys() { List result = new ArrayList(); for(OrcProto.UserMetadataItem item: footer.getMetadataList()) { diff --git ql/src/java/org/apache/hadoop/hive/ql/io/orc/StringColumnStatistics.java ql/src/java/org/apache/hadoop/hive/ql/io/orc/StringColumnStatistics.java index 72e779a..3a49269 100644 --- ql/src/java/org/apache/hadoop/hive/ql/io/orc/StringColumnStatistics.java +++ ql/src/java/org/apache/hadoop/hive/ql/io/orc/StringColumnStatistics.java @@ -32,4 +32,10 @@ * @return the maximum */ String getMaximum(); + + /** + * Get the total length of all strings + * @return the sum (total length) + */ + long getSum(); } diff --git ql/src/java/org/apache/hadoop/hive/ql/io/orc/Writer.java ql/src/java/org/apache/hadoop/hive/ql/io/orc/Writer.java index 8e74b91..591a238 100644 --- ql/src/java/org/apache/hadoop/hive/ql/io/orc/Writer.java +++ ql/src/java/org/apache/hadoop/hive/ql/io/orc/Writer.java @@ -47,4 +47,22 @@ * @throws IOException */ void close() throws IOException; + + /** + * Return the deserialized data size. Raw data size will be compute when + * writing the file footer. Hence raw data size value will be available only + * after closing the writer. + * + * @return raw data size + */ + long getRawDataSize(); + + /** + * Return the number of rows in file. Row count gets updated when flushing + * the stripes. To get accurate row count this method should be called after + * closing the writer. + * + * @return row count + */ + long getNumberOfRows(); } diff --git ql/src/java/org/apache/hadoop/hive/ql/io/orc/WriterImpl.java ql/src/java/org/apache/hadoop/hive/ql/io/orc/WriterImpl.java index 44961ce..e7176d2 100644 --- ql/src/java/org/apache/hadoop/hive/ql/io/orc/WriterImpl.java +++ ql/src/java/org/apache/hadoop/hive/ql/io/orc/WriterImpl.java @@ -27,17 +27,14 @@ import java.util.Map; import java.util.TreeMap; - -import com.google.protobuf.ByteString; -import com.google.protobuf.CodedOutputStream; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FSDataOutputStream; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; -import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.common.type.HiveDecimal; +import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.ql.io.orc.OrcProto.RowIndexEntry; import org.apache.hadoop.hive.ql.io.orc.OrcProto.Type; import org.apache.hadoop.hive.serde2.io.DateWritable; @@ -66,6 +63,9 @@ import org.apache.hadoop.io.BytesWritable; import org.apache.hadoop.io.Text; +import com.google.protobuf.ByteString; +import com.google.protobuf.CodedOutputStream; + /** * An ORC file writer. The file is divided into stripes, which is the natural * unit of work when reading. 
Each stripe is buffered in memory until the @@ -111,6 +111,7 @@ private int columnCount; private long rowCount = 0; private long rowsInStripe = 0; + private long rawDataSize = 0; private int rowsInIndex = 0; private final List stripes = new ArrayList(); @@ -1085,6 +1086,7 @@ void write(Object obj) throws IOException { ((BinaryObjectInspector) inspector).getPrimitiveWritableObject(obj); stream.write(val.getBytes(), 0, val.getLength()); length.write(val.getLength()); + indexStatistics.updateBinary(val); } } @@ -1760,6 +1762,110 @@ private void flushStripe() throws IOException { } } + private long computeRawDataSize() { + long result = 0; + for (TreeWriter child : treeWriter.getChildrenWriters()) { + result += getRawDataSizeFromInspectors(child, child.inspector); + } + return result; + } + + private long getRawDataSizeFromInspectors(TreeWriter child, ObjectInspector oi) { + long total = 0; + switch (oi.getCategory()) { + case PRIMITIVE: + total += getRawDataSizeFromPrimitives(child, oi); + break; + case LIST: + case MAP: + case UNION: + case STRUCT: + for (TreeWriter tw : child.childrenWriters) { + total += getRawDataSizeFromInspectors(tw, tw.inspector); + } + break; + default: + break; + } + return total; + } + + private long getRawDataSizeFromPrimitives(TreeWriter child, ObjectInspector oi) { + long result = 0; + switch (((PrimitiveObjectInspector) oi).getPrimitiveCategory()) { + case BOOLEAN: + child = (BooleanTreeWriter) child; + // booleans are written to bit streams, so assume raw data + // size equal to uncompressed byte size + result += child.fileStatistics.getNumberOfValues() / 8; + break; + case BYTE: + child = (ByteTreeWriter) child; + // byte type = 1 byte + result += child.fileStatistics.getNumberOfValues(); + break; + case SHORT: + // short = 2 bytes + child = (IntegerTreeWriter) child; + result += child.fileStatistics.getNumberOfValues() * 2; + break; + case INT: + // int = 4 bytes + child = (IntegerTreeWriter) child; + result += child.fileStatistics.getNumberOfValues() * 4; + break; + case LONG: + // long = 8 bytes + child = (IntegerTreeWriter) child; + result += child.fileStatistics.getNumberOfValues() * 8; + break; + case FLOAT: + // float = 4 bytes + child = (FloatTreeWriter) child; + result += child.fileStatistics.getNumberOfValues() * 4; + break; + case DOUBLE: + // double = 8 bytes + child = (DoubleTreeWriter) child; + result += child.fileStatistics.getNumberOfValues() * 8; + break; + case STRING: + // get total length of string + child = (StringTreeWriter) child; + StringColumnStatistics scs = (StringColumnStatistics) child.fileStatistics; + result += scs.getSum(); + break; + case DECIMAL: + child = (DecimalTreeWriter) child; + // HiveDecimal in ORC is stored using BigInteger (value) and int (scale). + // BigInteger internally uses integer array to store the values. Hence size + // of each decimal values cannot be estimated properly. 
We will + // assume 32 bytes for BigInteger and 4 for int + result += child.fileStatistics.getNumberOfValues() * 36; + break; + case DATE: + // date constructor accepts long, we will assume 8 bytes for date + child = (DateTreeWriter) child; + result += child.fileStatistics.getNumberOfValues() * 8; + break; + case BINARY: + // get total length of binary blob + child = (BinaryTreeWriter) child; + BinaryColumnStatistics bcs = (BinaryColumnStatistics) child.fileStatistics; + result += bcs.getSum(); + break; + case TIMESTAMP: + child = (TimestampTreeWriter) child; + // seconds used long and nanos uses int + result += child.fileStatistics.getNumberOfValues() * 12; + break; + default: + break; + } + + return result; + } + private OrcProto.CompressionKind writeCompressionKind(CompressionKind kind) { switch (kind) { case NONE: return OrcProto.CompressionKind.NONE; @@ -1786,6 +1892,8 @@ private int writeFooter(long bodyLength) throws IOException { builder.setHeaderLength(headerLength); builder.setNumberOfRows(rowCount); builder.setRowIndexStride(rowIndexStride); + // populate raw data size + rawDataSize = computeRawDataSize(); // serialize the types writeTypes(builder, treeWriter); // add the stripe information @@ -1871,4 +1979,22 @@ public void close() throws IOException { rawWriter.close(); } } + + /** + * Raw data size will be compute when writing the file footer. Hence raw data + * size value will be available only after closing the writer. + */ + @Override + public long getRawDataSize() { + return rawDataSize; + } + + /** + * Row count gets updated when flushing the stripes. To get accurate row + * count call this method after writer is closed. + */ + @Override + public long getNumberOfRows() { + return rowCount; + } } diff --git ql/src/protobuf/org/apache/hadoop/hive/ql/io/orc/orc_proto.proto ql/src/protobuf/org/apache/hadoop/hive/ql/io/orc/orc_proto.proto index edbf822..cee08fd 100644 --- ql/src/protobuf/org/apache/hadoop/hive/ql/io/orc/orc_proto.proto +++ ql/src/protobuf/org/apache/hadoop/hive/ql/io/orc/orc_proto.proto @@ -15,6 +15,8 @@ message DoubleStatistics { message StringStatistics { optional string minimum = 1; optional string maximum = 2; + // sum will store the total length of all strings in a stripe + optional sint64 sum = 3; } message BucketStatistics { @@ -33,6 +35,11 @@ message DateStatistics { optional sint32 maximum = 2; } +message BinaryStatistics { + // sum will store the total binary blob length in a stripe + optional sint64 sum = 1; +} + message ColumnStatistics { optional uint64 numberOfValues = 1; optional IntegerStatistics intStatistics = 2; @@ -41,6 +48,7 @@ message ColumnStatistics { optional BucketStatistics bucketStatistics = 5; optional DecimalStatistics decimalStatistics = 6; optional DateStatistics dateStatistics = 7; + optional BinaryStatistics binaryStatistics = 8; } message RowIndexEntry { diff --git ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestOrcFile.java ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestOrcFile.java index e6569f4..1dfcb56 100644 --- ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestOrcFile.java +++ ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestOrcFile.java @@ -18,12 +18,28 @@ package org.apache.hadoop.hive.ql.io.orc; +import static junit.framework.Assert.assertEquals; +import static junit.framework.Assert.assertNotNull; +import static junit.framework.Assert.assertNull; +import static junit.framework.Assert.assertTrue; + +import java.io.File; +import java.io.IOException; +import java.math.BigInteger; +import java.nio.ByteBuffer; +import 
java.sql.Timestamp; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Random; + import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hive.common.type.HiveDecimal; import org.apache.hadoop.hive.ql.io.sarg.SearchArgument; -import org.apache.hadoop.hive.ql.io.sarg.TestSearchArgumentImpl; import org.apache.hadoop.hive.serde2.io.ByteWritable; import org.apache.hadoop.hive.serde2.io.DoubleWritable; import org.apache.hadoop.hive.serde2.io.ShortWritable; @@ -55,26 +71,25 @@ import org.junit.Test; import org.junit.rules.TestName; -import java.io.File; -import java.io.IOException; -import java.math.BigInteger; -import java.nio.ByteBuffer; -import java.sql.Timestamp; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.Iterator; -import java.util.List; -import java.util.Map; -import java.util.Random; - -import static junit.framework.Assert.*; -import static junit.framework.Assert.assertEquals; - /** * Tests for the top level reader/streamFactory of ORC files. */ public class TestOrcFile { + public static class SimpleStruct { + BytesWritable bytes1; + Text string1; + + SimpleStruct(BytesWritable b1, String s1) { + this.bytes1 = b1; + if(s1 == null) { + this.string1 = null; + } else { + this.string1 = new Text(s1); + } + } + } + public static class InnerStruct { int int1; Text string1 = new Text(); @@ -132,48 +147,6 @@ } } - public static class AllTypesRow { - Boolean boolean1; - Byte byte1; - Short short1; - Integer int1; - Long long1; - Float float1; - Double double1; - BytesWritable bytes1; - Text string1; - MiddleStruct middle; - List list = new ArrayList(); - Map map = new HashMap(); - Timestamp ts; - HiveDecimal decimal1; - - AllTypesRow(Boolean b1, Byte b2, Short s1, Integer i1, Long l1, Float f1, - Double d1, - BytesWritable b3, String s2, MiddleStruct m1, - List l2, Map m2, - Timestamp ts1, HiveDecimal decimal) { - this.boolean1 = b1; - this.byte1 = b2; - this.short1 = s1; - this.int1 = i1; - this.long1 = l1; - this.float1 = f1; - this.double1 = d1; - this.bytes1 = b3; - if (s2 == null) { - this.string1 = null; - } else { - this.string1 = new Text(s2); - } - this.middle = m1; - this.list = l2; - this.map = m2; - this.ts = ts1; - this.decimal1 = decimal; - } - } - private static InnerStruct inner(int i, String s) { return new InnerStruct(i, s); } @@ -231,39 +204,6 @@ public void openFileSystem () throws Exception { } @Test - public void testWriteFormat_0_11() throws Exception { - ObjectInspector inspector; - synchronized (TestOrcFile.class) { - inspector = ObjectInspectorFactory - .getReflectionObjectInspector(AllTypesRow.class, - ObjectInspectorFactory.ObjectInspectorOptions.JAVA); - } - conf.set("hive.exec.orc.write.format", "0.11"); - Writer writer = OrcFile.createWriter(fs, testFilePath, conf, inspector, - 100000, CompressionKind.NONE, 10000, 10000); - for(int i = 0; i < 7500; i++) { - if (i % 2 == 0) { - writer.addRow(new AllTypesRow(false, (byte) 1, (short) 1024, 65536, - Long.MAX_VALUE, (float) 1.0, -15.0, bytes(0, 1, 2, 3, 4), "hi", - new MiddleStruct(inner(1, "bye"), inner(2, "sigh")), list( - inner(3, "good"), inner(4, "bad")), map(), Timestamp - .valueOf("2000-03-12 15:00:00"), new HiveDecimal( - "12345678.6547456"))); - } else { - writer.addRow(new AllTypesRow(true, (byte) 100, (short) 2048, 65536, - Long.MAX_VALUE, (float) 2.0, -5.0, bytes(), "bye", - new 
MiddleStruct(inner(1, "bye"), inner(2, "sigh")), list( - inner(100000000, "cat"), inner(-100000, "in"), - inner(1234, "hat")), - map(inner(5, "chani"), inner(1, "mauddib")), Timestamp - .valueOf("2000-03-12 15:00:01"), new HiveDecimal( - "12345678.6547457"))); - } - } - writer.close(); - } - - @Test public void testReadFormat_0_11() throws Exception { Path resourceDir = new Path(System.getProperty("test.build.resources", "ql" + File.separator + "src" + File.separator + "test" + File.separator @@ -319,7 +259,7 @@ public void testReadFormat_0_11() throws Exception { assertEquals("count: 7500 min: -15.0 max: -5.0 sum: -75000.0", stats[7].toString()); - assertEquals("count: 7500 min: bye max: hi", stats[9].toString()); + assertEquals("count: 7500 min: bye max: hi sum: 0", stats[9].toString()); // check the inspectors StructObjectInspector readerInspector = (StructObjectInspector) reader @@ -515,6 +455,93 @@ public void testReadFormat_0_11() throws Exception { } @Test + public void testStringAndBinaryStatistics() throws Exception { + + ObjectInspector inspector; + synchronized (TestOrcFile.class) { + inspector = ObjectInspectorFactory.getReflectionObjectInspector + (SimpleStruct.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA); + } + Writer writer = OrcFile.createWriter(testFilePath, + OrcFile.writerOptions(conf) + .inspector(inspector) + .stripeSize(100000) + .bufferSize(10000)); + writer.addRow(new SimpleStruct(bytes(0,1,2,3,4), "foo")); + writer.addRow(new SimpleStruct(bytes(0,1,2,3), "bar")); + writer.addRow(new SimpleStruct(bytes(0,1,2,3,4,5), null)); + writer.addRow(new SimpleStruct(null, "hi")); + writer.close(); + Reader reader = OrcFile.createReader(fs, testFilePath); + + // check the stats + ColumnStatistics[] stats = reader.getStatistics(); + assertEquals(4, stats[0].getNumberOfValues()); + assertEquals("count: 4", stats[0].toString()); + + assertEquals(3, stats[1].getNumberOfValues()); + assertEquals(15, ((BinaryColumnStatistics) stats[1]).getSum()); + assertEquals("count: 3 sum: 15", stats[1].toString()); + + assertEquals(3, stats[2].getNumberOfValues()); + assertEquals("bar", ((StringColumnStatistics) stats[2]).getMinimum()); + assertEquals("hi", ((StringColumnStatistics) stats[2]).getMaximum()); + assertEquals(8, ((StringColumnStatistics) stats[2]).getSum()); + assertEquals("count: 3 min: bar max: hi sum: 8", + stats[2].toString()); + + // check the inspectors + StructObjectInspector readerInspector = + (StructObjectInspector) reader.getObjectInspector(); + assertEquals(ObjectInspector.Category.STRUCT, + readerInspector.getCategory()); + assertEquals("struct", + readerInspector.getTypeName()); + List fields = + readerInspector.getAllStructFieldRefs(); + BinaryObjectInspector bi = (BinaryObjectInspector) readerInspector. + getStructFieldRef("bytes1").getFieldObjectInspector(); + StringObjectInspector st = (StringObjectInspector) readerInspector. + getStructFieldRef("string1").getFieldObjectInspector(); + RecordReader rows = reader.rows(null); + Object row = rows.next(null); + assertNotNull(row); + // check the contents of the first row + assertEquals(bytes(0,1,2,3,4), bi.getPrimitiveWritableObject( + readerInspector.getStructFieldData(row, fields.get(0)))); + assertEquals("foo", st.getPrimitiveJavaObject(readerInspector. 
+ getStructFieldData(row, fields.get(1)))); + + // check the contents of second row + assertEquals(true, rows.hasNext()); + row = rows.next(row); + assertEquals(bytes(0,1,2,3), bi.getPrimitiveWritableObject( + readerInspector.getStructFieldData(row, fields.get(0)))); + assertEquals("bar", st.getPrimitiveJavaObject(readerInspector. + getStructFieldData(row, fields.get(1)))); + + // check the contents of second row + assertEquals(true, rows.hasNext()); + row = rows.next(row); + assertEquals(bytes(0,1,2,3,4,5), bi.getPrimitiveWritableObject( + readerInspector.getStructFieldData(row, fields.get(0)))); + assertNull(st.getPrimitiveJavaObject(readerInspector. + getStructFieldData(row, fields.get(1)))); + + // check the contents of second row + assertEquals(true, rows.hasNext()); + row = rows.next(row); + assertNull(bi.getPrimitiveWritableObject( + readerInspector.getStructFieldData(row, fields.get(0)))); + assertEquals("hi", st.getPrimitiveJavaObject(readerInspector. + getStructFieldData(row, fields.get(1)))); + + // handle the close up + assertEquals(false, rows.hasNext()); + rows.close(); + } + + @Test public void test1() throws Exception { ObjectInspector inspector; synchronized (TestOrcFile.class) { @@ -567,7 +594,7 @@ public void test1() throws Exception { assertEquals("count: 2 min: -15.0 max: -5.0 sum: -20.0", stats[7].toString()); - assertEquals("count: 2 min: bye max: hi", stats[9].toString()); + assertEquals("count: 2 min: bye max: hi sum: 5", stats[9].toString()); // check the inspectors StructObjectInspector readerInspector = diff --git ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestOrcNullOptimization.java ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestOrcNullOptimization.java index b93db84..492bb00 100644 --- ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestOrcNullOptimization.java +++ ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestOrcNullOptimization.java @@ -125,7 +125,7 @@ public void testMultiStripeWithNull() throws Exception { assertEquals("a", ((StringColumnStatistics) stats[2]).getMinimum()); assertEquals(19998, ((StringColumnStatistics) stats[2]).getNumberOfValues()); - assertEquals("count: 19998 min: a max: a", + assertEquals("count: 19998 min: a max: a sum: 19998", stats[2].toString()); // check the inspectors @@ -229,7 +229,7 @@ public void testMultiStripeWithoutNull() throws Exception { assertEquals("a", ((StringColumnStatistics) stats[2]).getMinimum()); assertEquals(20000, ((StringColumnStatistics) stats[2]).getNumberOfValues()); - assertEquals("count: 20000 min: a max: b", + assertEquals("count: 20000 min: a max: b sum: 20000", stats[2].toString()); // check the inspectors @@ -329,7 +329,7 @@ public void testColumnsWithNullAndCompression() throws Exception { assertEquals("h", ((StringColumnStatistics) stats[2]).getMaximum()); assertEquals("a", ((StringColumnStatistics) stats[2]).getMinimum()); assertEquals(7, ((StringColumnStatistics) stats[2]).getNumberOfValues()); - assertEquals("count: 7 min: a max: h", + assertEquals("count: 7 min: a max: h sum: 7", stats[2].toString()); // check the inspectors diff --git ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestOrcSerDeStats.java ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestOrcSerDeStats.java new file mode 100644 index 0000000..a6b8f5c --- /dev/null +++ ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestOrcSerDeStats.java @@ -0,0 +1,599 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.io.orc; + +import static junit.framework.Assert.assertEquals; +import static junit.framework.Assert.assertNotNull; +import static junit.framework.Assert.assertNull; + +import java.io.File; +import java.sql.Timestamp; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.common.type.HiveDecimal; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; +import org.apache.hadoop.hive.serde2.objectinspector.StructField; +import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.BinaryObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector; +import org.apache.hadoop.io.BytesWritable; +import org.apache.hadoop.io.Text; +import org.junit.Before; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TestName; + +public class TestOrcSerDeStats { + + public static class ListStruct { + List list1; + + public ListStruct(List l1) { + this.list1 = l1; + } + } + + public static class MapStruct { + Map map1; + + public MapStruct(Map m1) { + this.map1 = m1; + } + } + + public static class SimpleStruct { + BytesWritable bytes1; + Text string1; + + SimpleStruct(BytesWritable b1, String s1) { + this.bytes1 = b1; + if (s1 == null) { + this.string1 = null; + } else { + this.string1 = new Text(s1); + } + } + } + + public static class InnerStruct { + int int1; + Text string1 = new Text(); + + InnerStruct(int int1, String string1) { + this.int1 = int1; + this.string1.set(string1); + } + } + + public static class MiddleStruct { + List list = new ArrayList(); + + MiddleStruct(InnerStruct... 
items) { + list.clear(); + for (InnerStruct item : items) { + list.add(item); + } + } + } + + public static class BigRow { + Boolean boolean1; + Byte byte1; + Short short1; + Integer int1; + Long long1; + Float float1; + Double double1; + BytesWritable bytes1; + Text string1; + MiddleStruct middle; + List list = new ArrayList(); + Map map = new HashMap(); + Timestamp ts; + HiveDecimal decimal1; + + BigRow(Boolean b1, Byte b2, Short s1, Integer i1, Long l1, Float f1, + Double d1, + BytesWritable b3, String s2, MiddleStruct m1, + List l2, Map m2, Timestamp ts1, + HiveDecimal dec1) { + this.boolean1 = b1; + this.byte1 = b2; + this.short1 = s1; + this.int1 = i1; + this.long1 = l1; + this.float1 = f1; + this.double1 = d1; + this.bytes1 = b3; + if (s2 == null) { + this.string1 = null; + } else { + this.string1 = new Text(s2); + } + this.middle = m1; + this.list = l2; + this.map = m2; + this.ts = ts1; + this.decimal1 = dec1; + } + } + + private static InnerStruct inner(int i, String s) { + return new InnerStruct(i, s); + } + + private static Map map(InnerStruct... items) { + Map result = new HashMap(); + for (InnerStruct i : items) { + result.put(new Text(i.string1), i); + } + return result; + } + + private static List list(InnerStruct... items) { + List result = new ArrayList(); + for (InnerStruct s : items) { + result.add(s); + } + return result; + } + + private static BytesWritable bytes(int... items) { + BytesWritable result = new BytesWritable(); + result.setSize(items.length); + for (int i = 0; i < items.length; ++i) { + result.getBytes()[i] = (byte) items[i]; + } + return result; + } + + Path workDir = new Path(System.getProperty("test.tmp.dir", + "target" + File.separator + "test" + File.separator + "tmp")); + + Configuration conf; + FileSystem fs; + Path testFilePath; + + @Rule + public TestName testCaseName = new TestName(); + + @Before + public void openFileSystem() throws Exception { + conf = new Configuration(); + fs = FileSystem.getLocal(conf); + testFilePath = new Path(workDir, "TestOrcSerDeStats." 
+ + testCaseName.getMethodName() + ".orc"); + fs.delete(testFilePath, false); + } + + @Test + public void testStringAndBinaryStatistics() throws Exception { + + ObjectInspector inspector; + synchronized (TestOrcSerDeStats.class) { + inspector = ObjectInspectorFactory.getReflectionObjectInspector + (SimpleStruct.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA); + } + Writer writer = OrcFile.createWriter(testFilePath, + OrcFile.writerOptions(conf) + .inspector(inspector) + .stripeSize(100000) + .bufferSize(10000)); + writer.addRow(new SimpleStruct(bytes(0, 1, 2, 3, 4), "foo")); + writer.addRow(new SimpleStruct(bytes(0, 1, 2, 3), "bar")); + writer.addRow(new SimpleStruct(bytes(0, 1, 2, 3, 4, 5), null)); + writer.addRow(new SimpleStruct(null, "hi")); + writer.close(); + assertEquals(4, writer.getNumberOfRows()); + assertEquals(23, writer.getRawDataSize()); + Reader reader = OrcFile.createReader(fs, testFilePath); + assertEquals(4, reader.getNumberOfRows()); + assertEquals(23, reader.getRawDataSize()); + assertEquals(15, reader.getRawDataSizeOfColumns(new int[] {1})); + assertEquals(8, reader.getRawDataSizeOfColumns(new int[] {2})); + assertEquals(23, reader.getRawDataSizeOfColumns(new int[] {1, 2})); + + // check the stats + ColumnStatistics[] stats = reader.getStatistics(); + assertEquals(4, stats[0].getNumberOfValues()); + assertEquals("count: 4", stats[0].toString()); + + assertEquals(3, stats[1].getNumberOfValues()); + assertEquals(15, ((BinaryColumnStatistics) stats[1]).getSum()); + assertEquals("count: 3 sum: 15", stats[1].toString()); + + assertEquals(3, stats[2].getNumberOfValues()); + assertEquals("bar", ((StringColumnStatistics) stats[2]).getMinimum()); + assertEquals("hi", ((StringColumnStatistics) stats[2]).getMaximum()); + assertEquals(8, ((StringColumnStatistics) stats[2]).getSum()); + assertEquals("count: 3 min: bar max: hi sum: 8", + stats[2].toString()); + + // check the inspectors + StructObjectInspector readerInspector = + (StructObjectInspector) reader.getObjectInspector(); + assertEquals(ObjectInspector.Category.STRUCT, + readerInspector.getCategory()); + assertEquals("struct", + readerInspector.getTypeName()); + List fields = + readerInspector.getAllStructFieldRefs(); + BinaryObjectInspector bi = (BinaryObjectInspector) readerInspector. + getStructFieldRef("bytes1").getFieldObjectInspector(); + StringObjectInspector st = (StringObjectInspector) readerInspector. + getStructFieldRef("string1").getFieldObjectInspector(); + RecordReader rows = reader.rows(null); + Object row = rows.next(null); + assertNotNull(row); + // check the contents of the first row + assertEquals(bytes(0, 1, 2, 3, 4), bi.getPrimitiveWritableObject( + readerInspector.getStructFieldData(row, fields.get(0)))); + assertEquals("foo", st.getPrimitiveJavaObject(readerInspector. + getStructFieldData(row, fields.get(1)))); + + // check the contents of second row + assertEquals(true, rows.hasNext()); + row = rows.next(row); + assertEquals(bytes(0, 1, 2, 3), bi.getPrimitiveWritableObject( + readerInspector.getStructFieldData(row, fields.get(0)))); + assertEquals("bar", st.getPrimitiveJavaObject(readerInspector. + getStructFieldData(row, fields.get(1)))); + + // check the contents of second row + assertEquals(true, rows.hasNext()); + row = rows.next(row); + assertEquals(bytes(0, 1, 2, 3, 4, 5), bi.getPrimitiveWritableObject( + readerInspector.getStructFieldData(row, fields.get(0)))); + assertNull(st.getPrimitiveJavaObject(readerInspector. 
+ getStructFieldData(row, fields.get(1)))); + + // check the contents of second row + assertEquals(true, rows.hasNext()); + row = rows.next(row); + assertNull(bi.getPrimitiveWritableObject( + readerInspector.getStructFieldData(row, fields.get(0)))); + assertEquals("hi", st.getPrimitiveJavaObject(readerInspector. + getStructFieldData(row, fields.get(1)))); + + // handle the close up + assertEquals(false, rows.hasNext()); + rows.close(); + } + + + @Test + public void testOrcSerDeStatsList() throws Exception { + ObjectInspector inspector; + synchronized (TestOrcSerDeStats.class) { + inspector = ObjectInspectorFactory.getReflectionObjectInspector + (ListStruct.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA); + } + Writer writer = OrcFile.createWriter(testFilePath, + OrcFile.writerOptions(conf) + .inspector(inspector) + .stripeSize(10000) + .bufferSize(10000)); + for (int row = 0; row < 5000; row++) { + List test = new ArrayList(); + for (int i = 0; i < 1000; i++) { + test.add("hi"); + } + writer.addRow(new ListStruct(test)); + } + writer.close(); + assertEquals(5000, writer.getNumberOfRows()); + assertEquals(10000000, writer.getRawDataSize()); + + Reader reader = OrcFile.createReader(fs, testFilePath); + // stats from reader + assertEquals(5000, reader.getNumberOfRows()); + assertEquals(10000000, reader.getRawDataSize()); + assertEquals(10000000, reader.getRawDataSizeOfColumns(new int[] {2})); + } + + @Test + public void testOrcSerDeStatsMap() throws Exception { + ObjectInspector inspector; + synchronized (TestOrcSerDeStats.class) { + inspector = ObjectInspectorFactory.getReflectionObjectInspector + (MapStruct.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA); + } + Writer writer = OrcFile.createWriter(testFilePath, + OrcFile.writerOptions(conf) + .inspector(inspector) + .stripeSize(10000) + .bufferSize(10000)); + for (int row = 0; row < 1000; row++) { + Map test = new HashMap(); + for (int i = 0; i < 10; i++) { + test.put("hi" + i, 2.0); + } + writer.addRow(new MapStruct(test)); + } + writer.close(); + // stats from writer + assertEquals(1000, writer.getNumberOfRows()); + assertEquals(110000, writer.getRawDataSize()); + + Reader reader = OrcFile.createReader(fs, testFilePath); + // stats from reader + assertEquals(1000, reader.getNumberOfRows()); + assertEquals(110000, reader.getRawDataSize()); + assertEquals(30000, reader.getRawDataSizeOfColumns(new int[] {2})); + assertEquals(80000, reader.getRawDataSizeOfColumns(new int[] {3})); + } + + @Test + public void testOrcSerDeStatsSimpleWithNulls() throws Exception { + ObjectInspector inspector; + synchronized (TestOrcSerDeStats.class) { + inspector = ObjectInspectorFactory.getReflectionObjectInspector + (SimpleStruct.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA); + } + Writer writer = OrcFile.createWriter(testFilePath, + OrcFile.writerOptions(conf) + .inspector(inspector) + .stripeSize(10000) + .bufferSize(10000)); + for (int row = 0; row < 1000; row++) { + if (row % 2 == 0) { + writer.addRow(new SimpleStruct(new BytesWritable(new byte[] {1, 2, 3}), "hi")); + } else { + writer.addRow(null); + } + } + writer.close(); + // stats from writer + assertEquals(1000, writer.getNumberOfRows()); + assertEquals(2500, writer.getRawDataSize()); + + Reader reader = OrcFile.createReader(fs, testFilePath); + // stats from reader + assertEquals(1000, reader.getNumberOfRows()); + assertEquals(2500, reader.getRawDataSize()); + assertEquals(1500, reader.getRawDataSizeOfColumns(new int[] {1})); + assertEquals(1000, 
reader.getRawDataSizeOfColumns(new int[] {2})); + assertEquals(2500, reader.getRawDataSizeOfColumns(new int[] {1, 2})); + } + + @Test + public void testOrcSerDeStatsComplex() throws Exception { + ObjectInspector inspector; + synchronized (TestOrcSerDeStats.class) { + inspector = ObjectInspectorFactory.getReflectionObjectInspector + (BigRow.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA); + } + Writer writer = OrcFile.createWriter(testFilePath, + OrcFile.writerOptions(conf) + .inspector(inspector) + .stripeSize(100000) + .bufferSize(10000)); + // 1 + 2 + 4 + 8 + 4 + 8 + 5 + 2 + 4 + 3 + 4 + 4 + 4 + 4 + 4 + 3 = 64 + writer.addRow(new BigRow(false, (byte) 1, (short) 1024, 65536, + Long.MAX_VALUE, (float) 1.0, -15.0, bytes(0, 1, 2, 3, 4), "hi", + new MiddleStruct(inner(1, "bye"), inner(2, "sigh")), + list(inner(3, "good"), inner(4, "bad")), + map(), Timestamp.valueOf("2000-03-12 15:00:00"), new HiveDecimal( + "12345678.6547456"))); + // 1 + 2 + 4 + 8 + 4 + 8 + 3 + 4 + 3 + 4 + 4 + 4 + 3 + 4 + 2 + 4 + 3 + 5 + 4 + 5 + 7 + 4 + 7 = + // 97 + writer.addRow(new BigRow(true, (byte) 100, (short) 2048, 65536, + Long.MAX_VALUE, (float) 2.0, -5.0, bytes(), "bye", + new MiddleStruct(inner(1, "bye"), inner(2, "sigh")), + list(inner(100000000, "cat"), inner(-100000, "in"), inner(1234, "hat")), + map(inner(5, "chani"), inner(1, "mauddib")), Timestamp.valueOf("2000-03-11 15:00:00"), + new HiveDecimal("12345678.6547452"))); + writer.close(); + long rowCount = writer.getNumberOfRows(); + long rawDataSize = writer.getRawDataSize(); + assertEquals(2, rowCount); + assertEquals(257, rawDataSize); + Reader reader = OrcFile.createReader(fs, testFilePath); + + assertEquals(2, reader.getNumberOfRows()); + assertEquals(257, reader.getRawDataSize()); + assertEquals(28, reader.getRawDataSizeOfColumns(new int[] {3, 4, 5})); + + // check the stats + ColumnStatistics[] stats = reader.getStatistics(); + assertEquals(2, stats[1].getNumberOfValues()); + assertEquals(1, ((BooleanColumnStatistics) stats[1]).getFalseCount()); + assertEquals(1, ((BooleanColumnStatistics) stats[1]).getTrueCount()); + assertEquals("count: 2 true: 1", stats[1].toString()); + + assertEquals(2048, ((IntegerColumnStatistics) stats[3]).getMaximum()); + assertEquals(1024, ((IntegerColumnStatistics) stats[3]).getMinimum()); + assertEquals(true, ((IntegerColumnStatistics) stats[3]).isSumDefined()); + assertEquals(3072, ((IntegerColumnStatistics) stats[3]).getSum()); + assertEquals("count: 2 min: 1024 max: 2048 sum: 3072", + stats[3].toString()); + + assertEquals(Long.MAX_VALUE, + ((IntegerColumnStatistics) stats[5]).getMaximum()); + assertEquals(Long.MAX_VALUE, + ((IntegerColumnStatistics) stats[5]).getMinimum()); + assertEquals(false, ((IntegerColumnStatistics) stats[5]).isSumDefined()); + assertEquals("count: 2 min: 9223372036854775807 max: 9223372036854775807", + stats[5].toString()); + + assertEquals(-15.0, ((DoubleColumnStatistics) stats[7]).getMinimum()); + assertEquals(-5.0, ((DoubleColumnStatistics) stats[7]).getMaximum()); + assertEquals(-20.0, ((DoubleColumnStatistics) stats[7]).getSum(), 0.00001); + assertEquals("count: 2 min: -15.0 max: -5.0 sum: -20.0", + stats[7].toString()); + + assertEquals("count: 2 min: bye max: hi sum: 5", stats[9].toString()); + } + + @Test + public void testOrcSerDeStatsComplexOldFormat() throws Exception { + ObjectInspector inspector; + synchronized (TestOrcSerDeStats.class) { + inspector = ObjectInspectorFactory.getReflectionObjectInspector + (BigRow.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA); 
+ } + + Writer writer = OrcFile.createWriter(testFilePath, + OrcFile.writerOptions(conf) + .inspector(inspector) + .stripeSize(100000) + .version(OrcFile.Version.V_0_11) + .bufferSize(10000)); + // 1 + 2 + 4 + 8 + 4 + 8 + 5 + 2 + 4 + 3 + 4 + 4 + 4 + 4 + 4 + 3 = 64 + writer.addRow(new BigRow(false, (byte) 1, (short) 1024, 65536, + Long.MAX_VALUE, (float) 1.0, -15.0, bytes(0, 1, 2, 3, 4), "hi", + new MiddleStruct(inner(1, "bye"), inner(2, "sigh")), + list(inner(3, "good"), inner(4, "bad")), + map(), Timestamp.valueOf("2000-03-12 15:00:00"), new HiveDecimal( + "12345678.6547456"))); + // 1 + 2 + 4 + 8 + 4 + 8 + 3 + 4 + 3 + 4 + 4 + 4 + 3 + 4 + 2 + 4 + 3 + 5 + 4 + 5 + 7 + 4 + 7 = + // 97 + writer.addRow(new BigRow(true, (byte) 100, (short) 2048, 65536, + Long.MAX_VALUE, (float) 2.0, -5.0, bytes(), "bye", + new MiddleStruct(inner(1, "bye"), inner(2, "sigh")), + list(inner(100000000, "cat"), inner(-100000, "in"), inner(1234, "hat")), + map(inner(5, "chani"), inner(1, "mauddib")), Timestamp.valueOf("2000-03-11 15:00:00"), + new HiveDecimal("12345678.6547452"))); + writer.close(); + long rowCount = writer.getNumberOfRows(); + long rawDataSize = writer.getRawDataSize(); + assertEquals(2, rowCount); + assertEquals(257, rawDataSize); + Reader reader = OrcFile.createReader(fs, testFilePath); + + assertEquals(2, reader.getNumberOfRows()); + assertEquals(257, reader.getRawDataSize()); + assertEquals(28, reader.getRawDataSizeOfColumns(new int[] {3, 4, 5})); + + // check the stats + ColumnStatistics[] stats = reader.getStatistics(); + assertEquals(2, stats[1].getNumberOfValues()); + assertEquals(1, ((BooleanColumnStatistics) stats[1]).getFalseCount()); + assertEquals(1, ((BooleanColumnStatistics) stats[1]).getTrueCount()); + assertEquals("count: 2 true: 1", stats[1].toString()); + + assertEquals(2048, ((IntegerColumnStatistics) stats[3]).getMaximum()); + assertEquals(1024, ((IntegerColumnStatistics) stats[3]).getMinimum()); + assertEquals(true, ((IntegerColumnStatistics) stats[3]).isSumDefined()); + assertEquals(3072, ((IntegerColumnStatistics) stats[3]).getSum()); + assertEquals("count: 2 min: 1024 max: 2048 sum: 3072", + stats[3].toString()); + + assertEquals(Long.MAX_VALUE, + ((IntegerColumnStatistics) stats[5]).getMaximum()); + assertEquals(Long.MAX_VALUE, + ((IntegerColumnStatistics) stats[5]).getMinimum()); + assertEquals(false, ((IntegerColumnStatistics) stats[5]).isSumDefined()); + assertEquals("count: 2 min: 9223372036854775807 max: 9223372036854775807", + stats[5].toString()); + + assertEquals(-15.0, ((DoubleColumnStatistics) stats[7]).getMinimum()); + assertEquals(-5.0, ((DoubleColumnStatistics) stats[7]).getMaximum()); + assertEquals(-20.0, ((DoubleColumnStatistics) stats[7]).getSum(), 0.00001); + assertEquals("count: 2 min: -15.0 max: -5.0 sum: -20.0", + stats[7].toString()); + + assertEquals(5, ((BinaryColumnStatistics) stats[8]).getSum()); + assertEquals("count: 2 sum: 5", stats[8].toString()); + + assertEquals("bye", ((StringColumnStatistics) stats[9]).getMinimum()); + assertEquals("hi", ((StringColumnStatistics) stats[9]).getMaximum()); + assertEquals(5, ((StringColumnStatistics) stats[9]).getSum()); + assertEquals("count: 2 min: bye max: hi sum: 5", stats[9].toString()); + } + + @Test(expected = ClassCastException.class) + public void testSerdeStatsOldFormat() throws Exception { + Path resourceDir = new Path(System.getProperty("test.build.resources", "ql" + + File.separator + "src" + File.separator + "test" + File.separator + + "resources")); + Path oldFilePath = new Path(resourceDir, 
"orc-file-11-format.orc"); + Reader reader = OrcFile.createReader(fs, oldFilePath); + + int stripeCount = 0; + int rowCount = 0; + long currentOffset = -1; + for (StripeInformation stripe : reader.getStripes()) { + stripeCount += 1; + rowCount += stripe.getNumberOfRows(); + if (currentOffset < 0) { + currentOffset = stripe.getOffset() + stripe.getIndexLength() + + stripe.getDataLength() + stripe.getFooterLength(); + } else { + assertEquals(currentOffset, stripe.getOffset()); + currentOffset += stripe.getIndexLength() + stripe.getDataLength() + + stripe.getFooterLength(); + } + } + assertEquals(reader.getNumberOfRows(), rowCount); + assertEquals(728437, reader.getRawDataSize()); + assertEquals(2, stripeCount); + + // check the stats + ColumnStatistics[] stats = reader.getStatistics(); + assertEquals(7500, stats[1].getNumberOfValues()); + assertEquals(3750, ((BooleanColumnStatistics) stats[1]).getFalseCount()); + assertEquals(3750, ((BooleanColumnStatistics) stats[1]).getTrueCount()); + assertEquals("count: 7500 true: 3750", stats[1].toString()); + + assertEquals(2048, ((IntegerColumnStatistics) stats[3]).getMaximum()); + assertEquals(1024, ((IntegerColumnStatistics) stats[3]).getMinimum()); + assertEquals(true, ((IntegerColumnStatistics) stats[3]).isSumDefined()); + assertEquals(11520000, ((IntegerColumnStatistics) stats[3]).getSum()); + assertEquals("count: 7500 min: 1024 max: 2048 sum: 11520000", + stats[3].toString()); + + assertEquals(Long.MAX_VALUE, + ((IntegerColumnStatistics) stats[5]).getMaximum()); + assertEquals(Long.MAX_VALUE, + ((IntegerColumnStatistics) stats[5]).getMinimum()); + assertEquals(false, ((IntegerColumnStatistics) stats[5]).isSumDefined()); + assertEquals( + "count: 7500 min: 9223372036854775807 max: 9223372036854775807", + stats[5].toString()); + + assertEquals(-15.0, ((DoubleColumnStatistics) stats[7]).getMinimum()); + assertEquals(-5.0, ((DoubleColumnStatistics) stats[7]).getMaximum()); + assertEquals(-75000.0, ((DoubleColumnStatistics) stats[7]).getSum(), + 0.00001); + assertEquals("count: 7500 min: -15.0 max: -5.0 sum: -75000.0", + stats[7].toString()); + + assertEquals("bye", ((StringColumnStatistics) stats[9]).getMinimum()); + assertEquals("hi", ((StringColumnStatistics) stats[9]).getMaximum()); + assertEquals(0, ((StringColumnStatistics) stats[9]).getSum()); + assertEquals("count: 7500 min: bye max: hi sum: 0", stats[9].toString()); + + // old orc format will not have binary statistics. 
toString() will show only + // the general column statistics + assertEquals("count: 7500", stats[8].toString()); + // since old orc format doesn't support binary statistics, + // this should throw ClassCastException + assertEquals(5, ((BinaryColumnStatistics) stats[8]).getSum()); + + } + +} diff --git ql/src/test/resources/orc-file-dump-dictionary-threshold.out ql/src/test/resources/orc-file-dump-dictionary-threshold.out index 003c132..bac7465 100644 --- ql/src/test/resources/orc-file-dump-dictionary-threshold.out +++ ql/src/test/resources/orc-file-dump-dictionary-threshold.out @@ -8,71 +8,71 @@ Statistics: Column 0: count: 21000 Column 1: count: 21000 min: -2147390285 max: 2147453086 sum: 109128518326 Column 2: count: 21000 min: -9222731174895935707 max: 9222919052987871506 - Column 3: count: 21000 min: Darkness,-230 max: worst-54-290-346-648-908-996-1038-1080-1560-1584-1620-1744-1770-1798-1852-1966-2162-2244-2286-2296-2534-2660-3114-3676-3788-4068-4150-4706-4744-5350-5420-5582-5696-5726-6006-6020-6024-6098-6184-6568-6636-6802-6994-7004-7318-7498-7758-7780-7798-7920-7952-7960-7988-8232-8256-8390-8416-8478-8620-8840-8984-9038-9128-9236-9248-9344-9594-9650-9714-9928-9938-10178-10368-10414-10502-10732-10876-11008-11158-11410-11722-11836-11964-12054-12096-12126-12136-12202-12246-12298-12616-12774-12782-12790-12802-12976-13216-13246-13502-13766-14454-14974-15004-15124-15252-15294-15356-15530-15610-16316-16936-17024-17122-17214-17310-17528-17682-17742-17870-17878-18010-18410-18524-18788-19204-19254-19518-19596-19786-19874-19904-20390-20752-20936 + Column 3: count: 21000 min: Darkness,-230 max: worst-54-290-346-648-908-996-1038-1080-1560-1584-1620-1744-1770-1798-1852-1966-2162-2244-2286-2296-2534-2660-3114-3676-3788-4068-4150-4706-4744-5350-5420-5582-5696-5726-6006-6020-6024-6098-6184-6568-6636-6802-6994-7004-7318-7498-7758-7780-7798-7920-7952-7960-7988-8232-8256-8390-8416-8478-8620-8840-8984-9038-9128-9236-9248-9344-9594-9650-9714-9928-9938-10178-10368-10414-10502-10732-10876-11008-11158-11410-11722-11836-11964-12054-12096-12126-12136-12202-12246-12298-12616-12774-12782-12790-12802-12976-13216-13246-13502-13766-14454-14974-15004-15124-15252-15294-15356-15530-15610-16316-16936-17024-17122-17214-17310-17528-17682-17742-17870-17878-18010-18410-18524-18788-19204-19254-19518-19596-19786-19874-19904-20390-20752-20936 sum: 6910238 Stripes: - Stripe: offset: 3 data: 102311 rows: 4000 tail: 68 index: 217 + Stripe: offset: 3 data: 102311 rows: 4000 tail: 68 index: 224 Stream: column 0 section ROW_INDEX start: 3 length 10 Stream: column 1 section ROW_INDEX start: 13 length 36 Stream: column 2 section ROW_INDEX start: 49 length 39 - Stream: column 3 section ROW_INDEX start: 88 length 132 - Stream: column 1 section DATA start: 220 length 16022 - Stream: column 2 section DATA start: 16242 length 32028 - Stream: column 3 section DATA start: 48270 length 50887 - Stream: column 3 section LENGTH start: 99157 length 3374 + Stream: column 3 section ROW_INDEX start: 88 length 139 + Stream: column 1 section DATA start: 227 length 16022 + Stream: column 2 section DATA start: 16249 length 32028 + Stream: column 3 section DATA start: 48277 length 50887 + Stream: column 3 section LENGTH start: 99164 length 3374 Encoding column 0: DIRECT Encoding column 1: DIRECT_V2 Encoding column 2: DIRECT_V2 Encoding column 3: DIRECT_V2 - Stripe: offset: 102599 data: 284999 rows: 5000 tail: 68 index: 349 - Stream: column 0 section ROW_INDEX start: 102599 length 10 - Stream: column 1 section ROW_INDEX start: 102609 length 36 - Stream: 
column 2 section ROW_INDEX start: 102645 length 39 - Stream: column 3 section ROW_INDEX start: 102684 length 264 - Stream: column 1 section DATA start: 102948 length 20029 - Stream: column 2 section DATA start: 122977 length 40035 - Stream: column 3 section DATA start: 163012 length 219588 - Stream: column 3 section LENGTH start: 382600 length 5347 + Stripe: offset: 102606 data: 284999 rows: 5000 tail: 68 index: 356 + Stream: column 0 section ROW_INDEX start: 102606 length 10 + Stream: column 1 section ROW_INDEX start: 102616 length 36 + Stream: column 2 section ROW_INDEX start: 102652 length 39 + Stream: column 3 section ROW_INDEX start: 102691 length 271 + Stream: column 1 section DATA start: 102962 length 20029 + Stream: column 2 section DATA start: 122991 length 40035 + Stream: column 3 section DATA start: 163026 length 219588 + Stream: column 3 section LENGTH start: 382614 length 5347 Encoding column 0: DIRECT Encoding column 1: DIRECT_V2 Encoding column 2: DIRECT_V2 Encoding column 3: DIRECT_V2 - Stripe: offset: 388015 data: 491655 rows: 5000 tail: 69 index: 536 - Stream: column 0 section ROW_INDEX start: 388015 length 10 - Stream: column 1 section ROW_INDEX start: 388025 length 36 - Stream: column 2 section ROW_INDEX start: 388061 length 39 - Stream: column 3 section ROW_INDEX start: 388100 length 451 - Stream: column 1 section DATA start: 388551 length 20029 - Stream: column 2 section DATA start: 408580 length 40035 - Stream: column 3 section DATA start: 448615 length 425862 - Stream: column 3 section LENGTH start: 874477 length 5729 + Stripe: offset: 388029 data: 491655 rows: 5000 tail: 69 index: 544 + Stream: column 0 section ROW_INDEX start: 388029 length 10 + Stream: column 1 section ROW_INDEX start: 388039 length 36 + Stream: column 2 section ROW_INDEX start: 388075 length 39 + Stream: column 3 section ROW_INDEX start: 388114 length 459 + Stream: column 1 section DATA start: 388573 length 20029 + Stream: column 2 section DATA start: 408602 length 40035 + Stream: column 3 section DATA start: 448637 length 425862 + Stream: column 3 section LENGTH start: 874499 length 5729 Encoding column 0: DIRECT Encoding column 1: DIRECT_V2 Encoding column 2: DIRECT_V2 Encoding column 3: DIRECT_V2 - Stripe: offset: 880275 data: 707368 rows: 5000 tail: 68 index: 677 - Stream: column 0 section ROW_INDEX start: 880275 length 10 - Stream: column 1 section ROW_INDEX start: 880285 length 36 - Stream: column 2 section ROW_INDEX start: 880321 length 39 - Stream: column 3 section ROW_INDEX start: 880360 length 592 - Stream: column 1 section DATA start: 880952 length 20029 - Stream: column 2 section DATA start: 900981 length 40035 - Stream: column 3 section DATA start: 941016 length 641580 - Stream: column 3 section LENGTH start: 1582596 length 5724 + Stripe: offset: 880297 data: 707368 rows: 5000 tail: 68 index: 691 + Stream: column 0 section ROW_INDEX start: 880297 length 10 + Stream: column 1 section ROW_INDEX start: 880307 length 36 + Stream: column 2 section ROW_INDEX start: 880343 length 39 + Stream: column 3 section ROW_INDEX start: 880382 length 606 + Stream: column 1 section DATA start: 880988 length 20029 + Stream: column 2 section DATA start: 901017 length 40035 + Stream: column 3 section DATA start: 941052 length 641580 + Stream: column 3 section LENGTH start: 1582632 length 5724 Encoding column 0: DIRECT Encoding column 1: DIRECT_V2 Encoding column 2: DIRECT_V2 Encoding column 3: DIRECT_V2 - Stripe: offset: 1588388 data: 348697 rows: 2000 tail: 67 index: 786 - Stream: column 0 section 
ROW_INDEX start: 1588388 length 10 - Stream: column 1 section ROW_INDEX start: 1588398 length 36 - Stream: column 2 section ROW_INDEX start: 1588434 length 39 - Stream: column 3 section ROW_INDEX start: 1588473 length 701 - Stream: column 1 section DATA start: 1589174 length 8011 - Stream: column 2 section DATA start: 1597185 length 16014 - Stream: column 3 section DATA start: 1613199 length 322259 - Stream: column 3 section LENGTH start: 1935458 length 2413 + Stripe: offset: 1588424 data: 348697 rows: 2000 tail: 67 index: 797 + Stream: column 0 section ROW_INDEX start: 1588424 length 10 + Stream: column 1 section ROW_INDEX start: 1588434 length 36 + Stream: column 2 section ROW_INDEX start: 1588470 length 39 + Stream: column 3 section ROW_INDEX start: 1588509 length 712 + Stream: column 1 section DATA start: 1589221 length 8011 + Stream: column 2 section DATA start: 1597232 length 16014 + Stream: column 3 section DATA start: 1613246 length 322259 + Stream: column 3 section LENGTH start: 1935505 length 2413 Encoding column 0: DIRECT Encoding column 1: DIRECT_V2 Encoding column 2: DIRECT_V2 - Encoding column 3: DIRECT_V2 \ No newline at end of file + Encoding column 3: DIRECT_V2 diff --git ql/src/test/resources/orc-file-dump.out ql/src/test/resources/orc-file-dump.out index fac5326..5b5eb2c 100644 --- ql/src/test/resources/orc-file-dump.out +++ ql/src/test/resources/orc-file-dump.out @@ -8,75 +8,75 @@ Statistics: Column 0: count: 21000 Column 1: count: 21000 min: -2146993718 max: 2147378179 sum: 193017464403 Column 2: count: 21000 min: -9222758097219661129 max: 9222303228623055266 - Column 3: count: 21000 min: Darkness, max: worst + Column 3: count: 21000 min: Darkness, max: worst sum: 81761 Stripes: - Stripe: offset: 3 data: 63766 rows: 5000 tail: 74 index: 119 + Stripe: offset: 3 data: 63766 rows: 5000 tail: 74 index: 123 Stream: column 0 section ROW_INDEX start: 3 length 10 Stream: column 1 section ROW_INDEX start: 13 length 35 Stream: column 2 section ROW_INDEX start: 48 length 39 - Stream: column 3 section ROW_INDEX start: 87 length 35 - Stream: column 1 section DATA start: 122 length 20029 - Stream: column 2 section DATA start: 20151 length 40035 - Stream: column 3 section DATA start: 60186 length 3544 - Stream: column 3 section LENGTH start: 63730 length 25 - Stream: column 3 section DICTIONARY_DATA start: 63755 length 133 + Stream: column 3 section ROW_INDEX start: 87 length 39 + Stream: column 1 section DATA start: 126 length 20029 + Stream: column 2 section DATA start: 20155 length 40035 + Stream: column 3 section DATA start: 60190 length 3544 + Stream: column 3 section LENGTH start: 63734 length 25 + Stream: column 3 section DICTIONARY_DATA start: 63759 length 133 Encoding column 0: DIRECT Encoding column 1: DIRECT_V2 Encoding column 2: DIRECT_V2 Encoding column 3: DICTIONARY_V2 - Stripe: offset: 63962 data: 63755 rows: 5000 tail: 76 index: 118 - Stream: column 0 section ROW_INDEX start: 63962 length 10 - Stream: column 1 section ROW_INDEX start: 63972 length 34 - Stream: column 2 section ROW_INDEX start: 64006 length 39 - Stream: column 3 section ROW_INDEX start: 64045 length 35 - Stream: column 1 section DATA start: 64080 length 20029 - Stream: column 2 section DATA start: 84109 length 40035 - Stream: column 3 section DATA start: 124144 length 3533 - Stream: column 3 section LENGTH start: 127677 length 25 - Stream: column 3 section DICTIONARY_DATA start: 127702 length 133 + Stripe: offset: 63966 data: 63755 rows: 5000 tail: 74 index: 122 + Stream: column 0 section ROW_INDEX 
start: 63966 length 10 + Stream: column 1 section ROW_INDEX start: 63976 length 34 + Stream: column 2 section ROW_INDEX start: 64010 length 39 + Stream: column 3 section ROW_INDEX start: 64049 length 39 + Stream: column 1 section DATA start: 64088 length 20029 + Stream: column 2 section DATA start: 84117 length 40035 + Stream: column 3 section DATA start: 124152 length 3533 + Stream: column 3 section LENGTH start: 127685 length 25 + Stream: column 3 section DICTIONARY_DATA start: 127710 length 133 Encoding column 0: DIRECT Encoding column 1: DIRECT_V2 Encoding column 2: DIRECT_V2 Encoding column 3: DICTIONARY_V2 - Stripe: offset: 127911 data: 63766 rows: 5000 tail: 76 index: 120 - Stream: column 0 section ROW_INDEX start: 127911 length 10 - Stream: column 1 section ROW_INDEX start: 127921 length 36 - Stream: column 2 section ROW_INDEX start: 127957 length 39 - Stream: column 3 section ROW_INDEX start: 127996 length 35 - Stream: column 1 section DATA start: 128031 length 20029 - Stream: column 2 section DATA start: 148060 length 40035 - Stream: column 3 section DATA start: 188095 length 3544 - Stream: column 3 section LENGTH start: 191639 length 25 - Stream: column 3 section DICTIONARY_DATA start: 191664 length 133 + Stripe: offset: 127917 data: 63766 rows: 5000 tail: 74 index: 124 + Stream: column 0 section ROW_INDEX start: 127917 length 10 + Stream: column 1 section ROW_INDEX start: 127927 length 36 + Stream: column 2 section ROW_INDEX start: 127963 length 39 + Stream: column 3 section ROW_INDEX start: 128002 length 39 + Stream: column 1 section DATA start: 128041 length 20029 + Stream: column 2 section DATA start: 148070 length 40035 + Stream: column 3 section DATA start: 188105 length 3544 + Stream: column 3 section LENGTH start: 191649 length 25 + Stream: column 3 section DICTIONARY_DATA start: 191674 length 133 Encoding column 0: DIRECT Encoding column 1: DIRECT_V2 Encoding column 2: DIRECT_V2 Encoding column 3: DICTIONARY_V2 - Stripe: offset: 200000 data: 63796 rows: 5000 tail: 74 index: 119 + Stripe: offset: 200000 data: 63796 rows: 5000 tail: 74 index: 123 Stream: column 0 section ROW_INDEX start: 200000 length 10 Stream: column 1 section ROW_INDEX start: 200010 length 35 Stream: column 2 section ROW_INDEX start: 200045 length 39 - Stream: column 3 section ROW_INDEX start: 200084 length 35 - Stream: column 1 section DATA start: 200119 length 20029 - Stream: column 2 section DATA start: 220148 length 40035 - Stream: column 3 section DATA start: 260183 length 3574 - Stream: column 3 section LENGTH start: 263757 length 25 - Stream: column 3 section DICTIONARY_DATA start: 263782 length 133 + Stream: column 3 section ROW_INDEX start: 200084 length 39 + Stream: column 1 section DATA start: 200123 length 20029 + Stream: column 2 section DATA start: 220152 length 40035 + Stream: column 3 section DATA start: 260187 length 3574 + Stream: column 3 section LENGTH start: 263761 length 25 + Stream: column 3 section DICTIONARY_DATA start: 263786 length 133 Encoding column 0: DIRECT Encoding column 1: DIRECT_V2 Encoding column 2: DIRECT_V2 Encoding column 3: DICTIONARY_V2 - Stripe: offset: 263989 data: 12940 rows: 1000 tail: 71 index: 120 - Stream: column 0 section ROW_INDEX start: 263989 length 10 - Stream: column 1 section ROW_INDEX start: 263999 length 36 - Stream: column 2 section ROW_INDEX start: 264035 length 39 - Stream: column 3 section ROW_INDEX start: 264074 length 35 - Stream: column 1 section DATA start: 264109 length 4007 - Stream: column 2 section DATA start: 268116 length 8007 - 
Stream: column 3 section DATA start: 276123 length 768 - Stream: column 3 section LENGTH start: 276891 length 25 - Stream: column 3 section DICTIONARY_DATA start: 276916 length 133 + Stripe: offset: 263993 data: 12940 rows: 1000 tail: 71 index: 123 + Stream: column 0 section ROW_INDEX start: 263993 length 10 + Stream: column 1 section ROW_INDEX start: 264003 length 36 + Stream: column 2 section ROW_INDEX start: 264039 length 39 + Stream: column 3 section ROW_INDEX start: 264078 length 38 + Stream: column 1 section DATA start: 264116 length 4007 + Stream: column 2 section DATA start: 268123 length 8007 + Stream: column 3 section DATA start: 276130 length 768 + Stream: column 3 section LENGTH start: 276898 length 25 + Stream: column 3 section DICTIONARY_DATA start: 276923 length 133 Encoding column 0: DIRECT Encoding column 1: DIRECT_V2 Encoding column 2: DIRECT_V2 diff --git serde/src/java/org/apache/hadoop/hive/serde2/SerDeStats.java serde/src/java/org/apache/hadoop/hive/serde2/SerDeStats.java index 1c09dc3..6cf2ccd 100644 --- serde/src/java/org/apache/hadoop/hive/serde2/SerDeStats.java +++ serde/src/java/org/apache/hadoop/hive/serde2/SerDeStats.java @@ -27,9 +27,11 @@ // currently we support only raw data size stat private long rawDataSize; + private long rowCount; public SerDeStats() { rawDataSize = 0; + rowCount = 0; } /** @@ -48,4 +50,20 @@ public void setRawDataSize(long uSize) { rawDataSize = uSize; } + /** + * Return the row count + * @return row count + */ + public long getRowCount() { + return rowCount; + } + + /** + * Set the row count + * @param rowCount - count of rows + */ + public void setRowCount(long rowCount) { + this.rowCount = rowCount; + } + }
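
Taken together, the row count added to SerDeStats above and the StatsProvidingRecordWriter interface this patch introduces in FileSinkOperator let a file format report its statistics once, when the writer is closed, rather than having per-row serde statistics gathered while rows are processed. The sketch below is a minimal, hypothetical illustration of that pattern; the class name StatsTrackingRecordWriter and the per-row size estimate are assumptions made for the example and are not part of this patch.

import java.io.IOException;

import org.apache.hadoop.hive.ql.exec.FileSinkOperator.StatsProvidingRecordWriter;
import org.apache.hadoop.hive.serde2.SerDeStats;
import org.apache.hadoop.io.Writable;

/**
 * Hypothetical writer that accumulates its own statistics while writing rows
 * and returns them as a single SerDeStats object when asked.
 */
public class StatsTrackingRecordWriter implements StatsProvidingRecordWriter {

  private long rowCount = 0;
  private long rawDataSize = 0;

  @Override
  public void write(Writable row) throws IOException {
    // A real writer (e.g. the ORC writer) would serialize the row here and
    // derive the raw data size from the column values it writes out.
    rawDataSize += estimateRawDataSize(row);
    rowCount += 1;
  }

  @Override
  public void close(boolean abort) throws IOException {
    // flush and close the underlying file
  }

  @Override
  public SerDeStats getStats() {
    // Populate the new SerDeStats fields once, with the accumulated totals.
    SerDeStats stats = new SerDeStats();
    stats.setRowCount(rowCount);
    stats.setRawDataSize(rawDataSize);
    return stats;
  }

  // Illustrative placeholder only; this estimate is an assumption of the sketch.
  private long estimateRawDataSize(Writable row) {
    return row == null ? 0 : 1;
  }
}

A format-specific writer such as ORC's would replace the placeholder estimate with sizes computed during serialization, so the totals returned by getStats() reflect what was actually written to the file.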