diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/FileSinkOperator.java ql/src/java/org/apache/hadoop/hive/ql/exec/FileSinkOperator.java index bcee201..560abfc 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/FileSinkOperator.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/FileSinkOperator.java @@ -106,6 +106,10 @@ void close(boolean abort) throws IOException; } + public static interface StatsProvidingRecordWriter extends RecordWriter { + SerDeStats getStats(); + } + public class FSPaths implements Cloneable { Path tmpPath; Path taskOutputTempPath; diff --git ql/src/java/org/apache/hadoop/hive/ql/io/orc/Reader.java ql/src/java/org/apache/hadoop/hive/ql/io/orc/Reader.java index 90260fd..ac991d9 100644 --- ql/src/java/org/apache/hadoop/hive/ql/io/orc/Reader.java +++ ql/src/java/org/apache/hadoop/hive/ql/io/orc/Reader.java @@ -39,6 +39,19 @@ long getNumberOfRows(); /** + * Get the deserialized data size of the file + * @return raw data size + */ + long getRawDataSize(); + + /** + * Get the deserialized data size of the specified columns + * @param colIndices - indices of the specified columns (TODO confirm index base) + * @return raw data size of columns + */ + long getRawDataSizeOfColumns(int[] colIndices); + + /** * Get the user metadata keys. * @return the set of metadata keys */ diff --git ql/src/java/org/apache/hadoop/hive/ql/io/orc/Writer.java ql/src/java/org/apache/hadoop/hive/ql/io/orc/Writer.java index 8e74b91..591a238 100644 --- ql/src/java/org/apache/hadoop/hive/ql/io/orc/Writer.java +++ ql/src/java/org/apache/hadoop/hive/ql/io/orc/Writer.java @@ -47,4 +47,22 @@ * @throws IOException */ void close() throws IOException; + + /** + * Return the deserialized data size. Raw data size will be computed when + * writing the file footer. Hence raw data size value will be available only + * after closing the writer. + * + * @return raw data size + */ + long getRawDataSize(); + + /** + * Return the number of rows in file. Row count gets updated when flushing + * the stripes.
To get an accurate row count, this method should be called after + * closing the writer. + * + * @return row count + */ + long getNumberOfRows(); } diff --git serde/src/java/org/apache/hadoop/hive/serde2/SerDeStats.java serde/src/java/org/apache/hadoop/hive/serde2/SerDeStats.java index 1c09dc3..6cf2ccd 100644 --- serde/src/java/org/apache/hadoop/hive/serde2/SerDeStats.java +++ serde/src/java/org/apache/hadoop/hive/serde2/SerDeStats.java @@ -27,9 +27,11 @@ // currently we support only raw data size stat private long rawDataSize; + private long rowCount; public SerDeStats() { rawDataSize = 0; + rowCount = 0; } /** @@ -48,4 +50,20 @@ public void setRawDataSize(long uSize) { rawDataSize = uSize; } + /** + * Return the row count + * @return row count + */ + public long getRowCount() { + return rowCount; + } + + /** + * Set the row count + * @param rowCount - count of rows + */ + public void setRowCount(long rowCount) { + this.rowCount = rowCount; + } + }