diff --git ql/src/java/org/apache/hadoop/hive/ql/io/orc/FileDump.java ql/src/java/org/apache/hadoop/hive/ql/io/orc/FileDump.java
index 96cbef6..20e3e7c 100644
--- ql/src/java/org/apache/hadoop/hive/ql/io/orc/FileDump.java
+++ ql/src/java/org/apache/hadoop/hive/ql/io/orc/FileDump.java
@@ -20,7 +20,12 @@
 import java.util.ArrayList;
 import java.util.List;
 
+import java.io.IOException;
+import java.text.DecimalFormat;
+import java.util.List;
+
 import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.hive.ql.io.orc.OrcProto.RowIndex;
 import org.apache.hadoop.hive.ql.io.orc.OrcProto.RowIndexEntry;
@@ -147,7 +152,38 @@ public static void main(String[] args) throws Exception {
           }
         }
       }
+
+      FileSystem fs = path.getFileSystem(conf);
+      long fileLen = fs.getContentSummary(path).getLength();
+      long paddedBytes = getTotalPaddingSize(reader);
+      // Even an empty ORC file is ~45 bytes, so fileLen is assumed to be > 0 here.
+      double percentPadding = ((double) paddedBytes / (double) fileLen) * 100;
+      DecimalFormat format = new DecimalFormat("##.##");
+      System.out.println("\nFile length: " + fileLen + " bytes");
+      System.out.println("Padding length: " + paddedBytes + " bytes");
+      System.out.println("Padding ratio: " + format.format(percentPadding) + "%");
       rows.close();
     }
   }
+
+  /**
+   * Sums the gaps between consecutive stripes of an ORC file.
+   *
+   * For each stripe after the first, the gap is that stripe's offset minus
+   * the end (offset + length) of the stripe before it.
+   *
+   * @param reader reader whose stripe list is inspected
+   * @return total number of bytes lying between consecutive stripes
+   * @throws IOException declared for the reader access; not raised by this method itself
+   */
+  private static long getTotalPaddingSize(Reader reader) throws IOException {
+    long paddedBytes = 0;
+    List<StripeInformation> stripes = reader.getStripes();
+    for (int i = 1; i < stripes.size(); i++) {
+      long prevStripeOffset = stripes.get(i - 1).getOffset();
+      long prevStripeLen = stripes.get(i - 1).getLength();
+      paddedBytes += stripes.get(i).getOffset() - (prevStripeOffset + prevStripeLen);
+    }
+    return paddedBytes;
+  }
 }
diff --git ql/src/test/resources/orc-file-dump-dictionary-threshold.out ql/src/test/resources/orc-file-dump-dictionary-threshold.out
index f6e7a50..32ba5dc 100644
--- ql/src/test/resources/orc-file-dump-dictionary-threshold.out
+++ ql/src/test/resources/orc-file-dump-dictionary-threshold.out
@@ -103,3 +103,7 @@ Stripes:
     Encoding column 1: DIRECT_V2
     Encoding column 2: DIRECT_V2
     Encoding column 3: DIRECT_V2
+
+File length: 1932446 bytes
+Padding length: 0 bytes
+Padding ratio: 0%
diff --git ql/src/test/resources/orc-file-dump.out ql/src/test/resources/orc-file-dump.out
index f46a89b..7b95170 100644
--- ql/src/test/resources/orc-file-dump.out
+++ ql/src/test/resources/orc-file-dump.out
@@ -108,3 +108,7 @@ Stripes:
     Encoding column 1: DIRECT_V2
     Encoding column 2: DIRECT_V2
     Encoding column 3: DICTIONARY_V2
+
+File length: 269529 bytes
+Padding length: 0 bytes
+Padding ratio: 0%