diff --git ql/src/java/org/apache/hadoop/hive/ql/io/orc/FileDump.java ql/src/java/org/apache/hadoop/hive/ql/io/orc/FileDump.java index 6b230e9..9e4b0ac 100644 --- ql/src/java/org/apache/hadoop/hive/ql/io/orc/FileDump.java +++ ql/src/java/org/apache/hadoop/hive/ql/io/orc/FileDump.java @@ -17,20 +17,34 @@ */ package org.apache.hadoop.hive.ql.io.orc; +import java.io.OutputStreamWriter; import java.util.ArrayList; +import java.util.Arrays; import java.util.List; import java.io.IOException; import java.text.DecimalFormat; -import java.util.List; +import java.util.Map; +import org.apache.commons.cli.CommandLine; +import org.apache.commons.cli.GnuParser; +import org.apache.commons.cli.HelpFormatter; +import org.apache.commons.cli.OptionBuilder; +import org.apache.commons.cli.Options; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hive.ql.io.orc.OrcProto.RowIndex; import org.apache.hadoop.hive.ql.io.orc.OrcProto.RowIndexEntry; -import org.apache.hadoop.hive.ql.io.sarg.PredicateLeaf; -import org.apache.hadoop.hive.ql.io.sarg.SearchArgument.TruthValue; +import org.apache.hadoop.hive.serde2.io.ByteWritable; +import org.apache.hadoop.hive.serde2.io.DoubleWritable; +import org.apache.hadoop.hive.serde2.io.ShortWritable; +import org.apache.hadoop.io.BooleanWritable; +import org.apache.hadoop.io.FloatWritable; +import org.apache.hadoop.io.IntWritable; +import org.apache.hadoop.io.LongWritable; +import org.codehaus.jettison.json.JSONException; +import org.codehaus.jettison.json.JSONWriter; /** * A tool for printing out the file structure of ORC files. @@ -43,24 +57,40 @@ private FileDump() {} public static void main(String[] args) throws Exception { Configuration conf = new Configuration(); - List files = new ArrayList(); + List rowIndexCols = null; - for (String arg : args) { - if (arg.startsWith("--")) { - if (arg.startsWith(ROWINDEX_PREFIX)) { - String[] colStrs = arg.substring(ROWINDEX_PREFIX.length()).split(","); - rowIndexCols = new ArrayList(colStrs.length); - for (String colStr : colStrs) { - rowIndexCols.add(Integer.parseInt(colStr)); - } - } else { - System.err.println("Unknown argument " + arg); - } - } else { - files.add(arg); + Options opts = createOptions(); + CommandLine cli = new GnuParser().parse(opts, args); + + if (cli.hasOption('h')) { + HelpFormatter formatter = new HelpFormatter(); + formatter.printHelp("orcfiledump", opts); + return; + } + + boolean dumpData = cli.hasOption('d'); + if (cli.hasOption("rowindex")) { + String[] colStrs = cli.getOptionValue("rowindex").split(","); + rowIndexCols = new ArrayList(colStrs.length); + for (String colStr : colStrs) { + rowIndexCols.add(Integer.parseInt(colStr)); } } + String[] files = cli.getArgs(); + if (dumpData) printData(Arrays.asList(files), conf); + else printMetaData(Arrays.asList(files), conf, rowIndexCols); + } + + private static void printData(List files, Configuration conf) throws IOException, + JSONException { + for (String file : files) { + printJsonData(conf, file); + } + } + + private static void printMetaData(List files, Configuration conf, + List rowIndexCols) throws IOException { for (String filename : files) { System.out.println("Structure for " + filename); Path path = new Path(filename); @@ -179,4 +209,149 @@ private static long getTotalPaddingSize(Reader reader) throws IOException { } return paddedBytes; } + + static Options createOptions() { + Options result = new Options(); + + // add -d and --data to print the rows + result.addOption(OptionBuilder + .withLongOpt("data") + .withDescription("Should the data be printed") + .create('d')); + + result.addOption(OptionBuilder + .withLongOpt("help") + .withDescription("print help message") + .create('h')); + + result.addOption(OptionBuilder + .withLongOpt("rowindex") + .withArgName("comma separated list of row indices") + .withDescription("Dump stats for column number(s)") + .hasArg() + .create()); + + + return result; + } + + private static void printMap(JSONWriter writer, + Map obj, + List types, + OrcProto.Type type + ) throws IOException, JSONException { + writer.array(); + int keyType = type.getSubtypes(0); + int valueType = type.getSubtypes(1); + for(Map.Entry item: obj.entrySet()) { + writer.object(); + writer.key("_key"); + printObject(writer, item.getKey(), types, keyType); + writer.value("_value"); + printObject(writer, item.getValue(), types, valueType); + writer.endObject(); + } + writer.endArray(); + } + + private static void printList(JSONWriter writer, + List obj, + List types, + OrcProto.Type type + ) throws IOException, JSONException { + int subtype = type.getSubtypes(0); + writer.array(); + for(Object item: obj) { + printObject(writer, item, types, subtype); + } + writer.endArray(); + } + + private static void printUnion(JSONWriter writer, + OrcUnion obj, + List types, + OrcProto.Type type + ) throws IOException, JSONException { + int subtype = type.getSubtypes(obj.getTag()); + printObject(writer, obj.getObject(), types, subtype); + } + + static void printStruct(JSONWriter writer, + OrcStruct obj, + List types, + OrcProto.Type type) throws IOException, JSONException { + writer.object(); + List fieldTypes = type.getSubtypesList(); + for(int i=0; i < fieldTypes.size(); ++i) { + writer.key(type.getFieldNames(i)); + printObject(writer, obj.getFieldValue(i), types, fieldTypes.get(i)); + } + writer.endObject(); + } + + static void printObject(JSONWriter writer, + Object obj, + List types, + int typeId) throws IOException, JSONException { + OrcProto.Type type = types.get(typeId); + if (obj == null) { + writer.value(null); + } else { + switch (type.getKind()) { + case STRUCT: + printStruct(writer, (OrcStruct) obj, types, type); + break; + case UNION: + printUnion(writer, (OrcUnion) obj, types, type); + break; + case LIST: + printList(writer, (List) obj, types, type); + break; + case MAP: + printMap(writer, (Map) obj, types, type); + break; + case BYTE: + writer.value(((ByteWritable) obj).get()); + break; + case SHORT: + writer.value(((ShortWritable) obj).get()); + break; + case INT: + writer.value(((IntWritable) obj).get()); + break; + case LONG: + writer.value(((LongWritable) obj).get()); + break; + case FLOAT: + writer.value(((FloatWritable) obj).get()); + break; + case DOUBLE: + writer.value(((DoubleWritable) obj).get()); + break; + case BOOLEAN: + writer.value(((BooleanWritable) obj).get()); + break; + default: + writer.value(obj.toString()); + break; + } + } + } + + static void printJsonData(Configuration conf, + String filename) throws IOException, JSONException { + Path path = new Path(filename); + Reader reader = OrcFile.createReader(path.getFileSystem(conf), path); + OutputStreamWriter out = new OutputStreamWriter(System.out, "UTF-8"); + RecordReader rows = reader.rows(null); + Object row = null; + List types = reader.getTypes(); + while (rows.hasNext()) { + row = rows.next(row); + JSONWriter writer = new JSONWriter(out); + printObject(writer, row, types, 0); + out.write("\n"); + out.flush(); + } + } }