diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/FileDump.java b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/FileDump.java index 09a6360..10f1c4d 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/FileDump.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/FileDump.java @@ -23,8 +23,10 @@ import java.text.DecimalFormat; import java.util.ArrayList; import java.util.Collection; +import java.util.HashSet; import java.util.List; import java.util.Map; +import java.util.Set; import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.GnuParser; @@ -32,6 +34,7 @@ import org.apache.commons.cli.OptionBuilder; import org.apache.commons.cli.Options; import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; @@ -93,26 +96,34 @@ public static void main(String[] args) throws Exception { } // if the specified path is directory, iterate through all files and print the file dump - List<String> filesInPath = Lists.newArrayList(); + Set<String> filesInPath = new HashSet<String>(); + Set<String> flushLengthFiles = new HashSet<String>(); for (String filename : files) { Path path = new Path(filename); - filesInPath.addAll(getAllFilesInPath(path, conf)); + Collection<String> dirFiles = getAllFilesInPath(path, conf); + for (String dirFile : dirFiles) { + if (dirFile.contains(AcidUtils.DELTA_SIDE_FILE_SUFFIX)) { + flushLengthFiles.add(dirFile); + } else { + filesInPath.add(dirFile); + } + } } if (dumpData) { - printData(filesInPath, conf); + printData(filesInPath, flushLengthFiles, conf); } else { if (jsonFormat) { boolean prettyPrint = cli.hasOption('p'); - JsonFileDump.printJsonMetaData(filesInPath, conf, rowIndexCols, prettyPrint, + JsonFileDump.printJsonMetaData(filesInPath, flushLengthFiles, conf, rowIndexCols, prettyPrint, printTimeZone); } else { - printMetaData(filesInPath, 
flushLengthFiles, conf, rowIndexCols, printTimeZone); } } } - private static Collection<String> getAllFilesInPath(final Path path, + private static Collection<String> getAllFilesInPath(final Path path, final Configuration conf) throws IOException { List<String> filesInPath = Lists.newArrayList(); FileSystem fs = path.getFileSystem(conf); @@ -133,11 +144,12 @@ public static void main(String[] args) throws Exception { return filesInPath; } - private static void printData(List<String> files, Configuration conf) throws IOException, + private static void printData(Set<String> files, final Set<String> flushLengthFiles, + Configuration conf) throws IOException, JSONException { for (String file : files) { try { - printJsonData(conf, file); + printJsonData(conf, file, flushLengthFiles); if (files.size() > 1) { System.out.println(Strings.repeat("=", 80) + "\n"); } @@ -150,13 +162,35 @@ private static void printData(List<String> files, Configuration conf) throws IOE } } - private static void printMetaData(List<String> files, Configuration conf, + private static void printMetaData(Set<String> files, final Set<String> flushLengthFiles, + Configuration conf, List<Integer> rowIndexCols, boolean printTimeZone) throws IOException { for (String filename : files) { try { Path path = new Path(filename); - Reader reader = OrcFile.createReader(path, OrcFile.readerOptions(conf)); + Path sideFile = OrcRecordUpdater.getSideFile(path); + boolean sideFileExists = flushLengthFiles != null && + flushLengthFiles.contains(sideFile.toString()); + long maxLen = Long.MAX_VALUE; + Reader reader; + if (sideFileExists) { + ReaderInfo readerInfo = getReaderInfo(conf, sideFile, path); + reader = readerInfo.reader; + maxLen = readerInfo.maxLen; + if (reader == null) { + // side file does not have any readable offsets + continue; + } + System.err.println("Dumping metadata from " + filename + + ". 
Footer offset: " + maxLen + "\n"); } else { + reader = OrcFile.createReader(path, OrcFile.readerOptions(conf).maxLength(maxLen)); + System.err.println("Dumping metadata from " + filename + "\n"); + } System.out.println("Structure for " + filename); + if (maxLen != Long.MAX_VALUE) { + System.out.println("Footer Offset: " + maxLen); + } System.out.println("File Version: " + reader.getFileVersion().getName() + " with " + reader.getWriterVersion()); RecordReaderImpl rows = (RecordReaderImpl) reader.rows(); @@ -498,9 +532,26 @@ static void printObject(JSONWriter writer, } static void printJsonData(Configuration conf, - String filename) throws IOException, JSONException { + String filename, final Set<String> flushLengthFiles) throws IOException, JSONException { Path path = new Path(filename); - Reader reader = OrcFile.createReader(path.getFileSystem(conf), path); + Path sideFile = OrcRecordUpdater.getSideFile(path); + boolean sideFileExists = flushLengthFiles != null && + flushLengthFiles.contains(sideFile.toString()); + long maxLen = Long.MAX_VALUE; + Reader reader; + if (sideFileExists) { + ReaderInfo readerInfo = getReaderInfo(conf, sideFile, path); + reader = readerInfo.reader; + maxLen = readerInfo.maxLen; + if (reader == null) { + // side file does not have any readable offsets + return; + } + System.err.println("Dumping data from " + filename + ". 
Footer offset: " + maxLen + "\n"); + } else { + reader = OrcFile.createReader(path, OrcFile.readerOptions(conf).maxLength(maxLen)); + System.err.println("Dumping data from " + filename + "\n"); + } PrintStream printStream = System.out; OutputStreamWriter out = new OutputStreamWriter(printStream, "UTF-8"); RecordReader rows = reader.rows(null); @@ -517,4 +568,56 @@ static void printJsonData(Configuration conf, } } } + + static ReaderInfo getReaderInfo(final Configuration conf, final Path sideFile, final Path path) + throws IOException { + FileSystem fs = sideFile.getFileSystem(conf); + FileStatus sideFileStatus = fs.getFileStatus(sideFile); + final long sideFileLen = sideFileStatus.getLen(); + System.err.println("\nReading side file " + sideFile + " [length " + sideFileLen + "]"); + long readPosStart; + long maxLen = Long.MAX_VALUE; + Reader reader; + ReaderInfo readerInfo = new ReaderInfo(); + if (sideFileLen >= 8) { + if (sideFileLen % 8 == 0) { + // flush length file is flushed properly and probably not corrupted + readPosStart = sideFileLen - 8; + } else { + // last entry could have been corrupted, ignore the last entry as it is incomplete + readPosStart = (sideFileLen - sideFileLen % 8) - 8; + } + FSDataInputStream fdis = fs.open(sideFile); + fdis.seek(readPosStart); + maxLen = fdis.readLong(); + boolean readable = false; + do { + try { + reader = OrcFile.createReader(path, OrcFile.readerOptions(conf).maxLength(maxLen)); + readerInfo.reader = reader; + readerInfo.maxLen = maxLen; + readable = true; + } catch (Exception e) { + final long prevOffset = maxLen; + readPosStart = Math.max(0, readPosStart - 8); + fdis.seek(readPosStart); + maxLen = fdis.readLong(); + System.err.println("Unable to read data from footer offset: " + prevOffset + + ". Trying previous offset: " + maxLen); + } + } while(!readable); + } else { + System.err.println("Unable to read any data from side file." 
+ + " Skipping corresponding data file " + path); + readerInfo.reader = null; + readerInfo.maxLen = maxLen; + } + + return readerInfo; + } + + public static class ReaderInfo { + Reader reader; + long maxLen; + } } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/JsonFileDump.java b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/JsonFileDump.java index 7f673dc..50b2885 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/JsonFileDump.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/JsonFileDump.java @@ -19,6 +19,7 @@ import java.io.IOException; import java.util.List; +import java.util.Set; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; @@ -35,8 +36,10 @@ */ public class JsonFileDump { - public static void printJsonMetaData(List<String> files, Configuration conf, - List<Integer> rowIndexCols, boolean prettyPrint, boolean printTimeZone) throws JSONException, IOException { + public static void printJsonMetaData(Set<String> files, final Set<String> flushLengthFiles, + Configuration conf, + List<Integer> rowIndexCols, boolean prettyPrint, boolean printTimeZone) + throws JSONException, IOException { JSONStringer writer = new JSONStringer(); boolean multiFile = files.size() > 1; if (multiFile) { @@ -51,7 +54,25 @@ public static void printJsonMetaData(List<String> files, Configuration conf, } writer.key("fileName").value(filename); Path path = new Path(filename); - Reader reader = OrcFile.createReader(path, OrcFile.readerOptions(conf)); + Path sideFile = OrcRecordUpdater.getSideFile(path); + boolean sideFileExists = flushLengthFiles != null && + flushLengthFiles.contains(sideFile.toString()); + long maxLen = Long.MAX_VALUE; + Reader reader; + if (sideFileExists) { + FileDump.ReaderInfo readerInfo = FileDump.getReaderInfo(conf, sideFile, path); + reader = readerInfo.reader; + maxLen = readerInfo.maxLen; + if (reader == null) { + // side file does not have any readable offsets + continue; + } + } else { + reader = OrcFile.createReader(path, 
OrcFile.readerOptions(conf).maxLength(maxLen)); + } + if (maxLen != Long.MAX_VALUE) { + writer.key("footerOffset").value(maxLen); + } writer.key("fileVersion").value(reader.getFileVersion().getName()); writer.key("writerVersion").value(reader.getWriterVersion()); RecordReaderImpl rows = (RecordReaderImpl) reader.rows();