diff --git a/data/files/parquet_create.txt b/data/files/parquet_create.txt new file mode 100644 index 0000000..ccd48ee --- /dev/null +++ b/data/files/parquet_create.txt @@ -0,0 +1,3 @@ +1|foo line1|key11:value11,key12:value12,key13:value13|a,b,c|one,two +2|bar line2|key21:value21,key22:value22,key23:value23|d,e,f|three,four +3|baz line3|key31:value31,key32:value32,key33:value33|g,h,i|five,six diff --git a/data/files/parquet_partitioned.txt b/data/files/parquet_partitioned.txt new file mode 100644 index 0000000..8f322f3 --- /dev/null +++ b/data/files/parquet_partitioned.txt @@ -0,0 +1,3 @@ +1|foo|part1 +2|bar|part2 +3|baz|part2 diff --git a/pom.xml b/pom.xml index 41f5337..fbb21df 100644 --- a/pom.xml +++ b/pom.xml @@ -127,6 +127,7 @@ requires netty < 3.6.0 we force hadoops version --> 3.4.0.Final + 1.3.2 0.10.1 2.5.0 1.0.1 @@ -222,6 +223,17 @@ ${bonecp.version} + com.twitter + parquet-hadoop-bundle + ${parquet.version} + + + com.twitter + parquet-column + ${parquet.version} + tests + + com.sun.jersey jersey-core ${jersey.version} diff --git a/ql/pom.xml b/ql/pom.xml index 7087a4c..53d0b9e 100644 --- a/ql/pom.xml +++ b/ql/pom.xml @@ -67,6 +67,10 @@ ${kryo.version} + com.twitter + parquet-hadoop-bundle + + commons-codec commons-codec ${commons-codec.version} @@ -204,6 +208,12 @@ + com.twitter + parquet-column + tests + test + + junit junit ${junit.version} @@ -476,6 +486,7 @@ org.apache.hive:hive-exec org.apache.hive:hive-serde com.esotericsoftware.kryo:kryo + com.twiter:parquet-hadoop-bundle org.apache.thrift:libthrift commons-lang:commons-lang org.json:json diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/MapredParquetInputFormat.java b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/MapredParquetInputFormat.java new file mode 100644 index 0000000..13924e1 --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/MapredParquetInputFormat.java @@ -0,0 +1,47 @@ +package org.apache.hadoop.hive.ql.io.parquet; + +import java.io.IOException; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.hive.ql.io.parquet.read.DataWritableReadSupport; +import org.apache.hadoop.hive.ql.io.parquet.read.ParquetRecordReaderWrapper; +import org.apache.hadoop.io.ArrayWritable; +import org.apache.hadoop.mapred.FileInputFormat; +import org.apache.hadoop.mapred.RecordReader; + +import parquet.hadoop.ParquetInputFormat; + + +/** + * + * A Parquet InputFormat for Hive (with the deprecated package mapred) + * + */ +public class MapredParquetInputFormat extends FileInputFormat { + + public static final Log LOG = LogFactory.getLog(MapredParquetInputFormat.class); + + private final ParquetInputFormat realInput; + + public MapredParquetInputFormat() { + this(new ParquetInputFormat(DataWritableReadSupport.class)); + } + + protected MapredParquetInputFormat(final ParquetInputFormat inputFormat) { + this.realInput = inputFormat; + } + + @Override + public org.apache.hadoop.mapred.RecordReader getRecordReader( + final org.apache.hadoop.mapred.InputSplit split, + final org.apache.hadoop.mapred.JobConf job, + final org.apache.hadoop.mapred.Reporter reporter + ) throws IOException { + try { + return (RecordReader) new ParquetRecordReaderWrapper(realInput, split, job, reporter); + } catch (final InterruptedException e) { + throw new RuntimeException("Cannot create a RecordReaderWrapper", e); + } + } +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/MapredParquetOutputFormat.java 
b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/MapredParquetOutputFormat.java new file mode 100644 index 0000000..7023082 --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/MapredParquetOutputFormat.java @@ -0,0 +1,115 @@ +package org.apache.hadoop.hive.ql.io.parquet; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Properties; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.ql.exec.FileSinkOperator; +import org.apache.hadoop.hive.ql.io.HiveOutputFormat; +import org.apache.hadoop.hive.ql.io.parquet.convert.HiveSchemaConverter; +import org.apache.hadoop.hive.ql.io.parquet.write.DataWritableWriteSupport; +import org.apache.hadoop.hive.ql.io.parquet.write.ParquetRecordWriterWrapper; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils; +import org.apache.hadoop.hive.shims.ShimLoader; +import org.apache.hadoop.io.ArrayWritable; +import org.apache.hadoop.io.Writable; +import org.apache.hadoop.mapred.FileOutputFormat; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.RecordWriter; +import org.apache.hadoop.mapreduce.JobContext; +import org.apache.hadoop.mapreduce.OutputFormat; +import org.apache.hadoop.util.Progressable; +import org.apache.hadoop.hive.ql.io.FSRecordWriter; + +import parquet.hadoop.ParquetOutputFormat; + +/** + * + * A Parquet OutputFormat for Hive (with the deprecated package mapred) + * + */ +public class MapredParquetOutputFormat extends FileOutputFormat implements + HiveOutputFormat { + + public static final Log LOG = LogFactory.getLog(MapredParquetOutputFormat.class); + + protected ParquetOutputFormat realOutputFormat; + + public MapredParquetOutputFormat() { + realOutputFormat = new ParquetOutputFormat(new DataWritableWriteSupport()); + } + + public MapredParquetOutputFormat(final OutputFormat mapreduceOutputFormat) { + realOutputFormat = (ParquetOutputFormat) mapreduceOutputFormat; + } + + @Override + public void checkOutputSpecs(final FileSystem ignored, final JobConf job) throws IOException { + realOutputFormat.checkOutputSpecs(ShimLoader.getHadoopShims().getHCatShim().createJobContext(job, null)); + } + + @Override + public RecordWriter getRecordWriter( + final FileSystem ignored, + final JobConf job, + final String name, + final Progressable progress + ) throws IOException { + throw new RuntimeException("Should never be used"); + } + + /** + * + * Create the parquet schema from the hive schema, and return the RecordWriterWrapper which + * contains the real output format + */ + @Override + public FSRecordWriter getHiveRecordWriter( + final JobConf jobConf, + final Path finalOutPath, + final Class valueClass, + final boolean isCompressed, + final Properties tableProperties, + final Progressable progress) throws IOException { + + LOG.info("getHiveRecordWriter " + this); + LOG.info("creating new record writer..."); + + // Seriously? Hard coded property names? 
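+      // Illustrative sketch only; the column names and types below are made up, not part of this patch.
+      // Hive hands the table schema to the writer through two table properties:
+      //   columns       = "id,name,tags"           (comma separated names)
+      //   columns.types = "int:string:array<string>" (colon separated type string)
+      // HiveSchemaConverter would turn such a schema into a Parquet message roughly like
+      //   message hive_schema {
+      //     optional int32 id;
+      //     optional binary name;
+      //     optional group tags (LIST) { repeated group bag { optional binary array_element; } }
+      //   }
+      // which is then registered through DataWritableWriteSupport.setSchema before the
+      // real ParquetOutputFormat creates the record writer.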
+ final String columnNameProperty = tableProperties.getProperty("columns"); + final String columnTypeProperty = tableProperties.getProperty("columns.types"); + List columnNames; + List columnTypes; + + if (columnNameProperty.length() == 0) { + columnNames = new ArrayList(); + } else { + columnNames = Arrays.asList(columnNameProperty.split(",")); + } + + if (columnTypeProperty.length() == 0) { + columnTypes = new ArrayList(); + } else { + columnTypes = TypeInfoUtils.getTypeInfosFromTypeString(columnTypeProperty); + } + + DataWritableWriteSupport.setSchema(HiveSchemaConverter.convert(columnNames, columnTypes), jobConf); + return getParquerRecordWriterWrapper(realOutputFormat, jobConf, finalOutPath.toString(), progress); + } + + protected ParquetRecordWriterWrapper getParquerRecordWriterWrapper( + ParquetOutputFormat realOutputFormat, + JobConf jobConf, + String finalOutPath, + Progressable progress + ) throws IOException { + return new ParquetRecordWriterWrapper(realOutputFormat, jobConf, finalOutPath.toString(), progress); + } +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/ProjectionPusher.java b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/ProjectionPusher.java new file mode 100644 index 0000000..5ece1ba --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/ProjectionPusher.java @@ -0,0 +1,161 @@ +package org.apache.hadoop.hive.ql.io.parquet; + +import java.io.IOException; +import java.io.Serializable; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Iterator; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.Map.Entry; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.ql.exec.Operator; +import org.apache.hadoop.hive.ql.exec.TableScanOperator; +import org.apache.hadoop.hive.ql.exec.Utilities; +import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc; +import org.apache.hadoop.hive.ql.plan.MapWork; +import org.apache.hadoop.hive.ql.plan.PartitionDesc; +import org.apache.hadoop.hive.ql.plan.TableScanDesc; +import org.apache.hadoop.hive.serde2.ColumnProjectionUtils; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.util.StringUtils; + +public class ProjectionPusher { + + public static final Log LOG = LogFactory.getLog(ProjectionPusher.class); + + private final Map pathToPartitionInfo = + new LinkedHashMap(); + /** + * MapWork is the Hive object which describes input files, + * columns projections, and filters. + */ + private MapWork mapWork; + + private static final List virtualColumns; + + static { + List vcols = new ArrayList(); + vcols.add("INPUT__FILE__NAME"); + vcols.add("BLOCK__OFFSET__INSIDE__FILE"); + vcols.add("ROW__OFFSET__INSIDE__BLOCK"); + vcols.add("RAW__DATA__SIZE"); + virtualColumns = Collections.unmodifiableList(vcols); + } + + public List getColumns(final String columns) { + final List result = (List) StringUtils.getStringCollection(columns); + result.removeAll(virtualColumns); + return result; + } + + /** + * Sets the mapWork variable based on the current JobConf in order to get all partitions. 
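+   * The deserialized plan is cached (mapWork is only fetched while it is null), so splits of the
+   * same job pay the deserialization cost once; partition descriptors are keyed by the
+   * scheme-less path so they can be looked up by the split's parent directory later on.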
+ * + * @param job + */ + private void updateMrWork(final JobConf job) { + final String plan = HiveConf.getVar(job, HiveConf.ConfVars.PLAN); + if (mapWork == null && plan != null && plan.length() > 0) { + mapWork = Utilities.getMapWork(job); + pathToPartitionInfo.clear(); + for (final Map.Entry entry : mapWork.getPathToPartitionInfo().entrySet()) { + pathToPartitionInfo.put(new Path(entry.getKey()).toUri().getPath().toString(), entry.getValue()); + } + } + } + + private void pushProjectionsAndFilters(final JobConf jobConf, + final String splitPath, final String splitPathWithNoSchema) { + + if (mapWork == null) { + //LOG.debug("Not pushing projections and filters because MapWork is null"); + return; + } else if (mapWork.getPathToAliases() == null) { + //LOG.debug("Not pushing projections and filters because pathToAliases is null"); + return; + } + + final ArrayList aliases = new ArrayList(); + final Iterator>> iterator = mapWork.getPathToAliases().entrySet().iterator(); + + while (iterator.hasNext()) { + final Entry> entry = iterator.next(); + final String key = new Path(entry.getKey()).toUri().getPath(); + + if (splitPath.equals(key) || splitPathWithNoSchema.equals(key)) { + final ArrayList list = entry.getValue(); + for (final String val : list) { + aliases.add(val); + } + } + } + + for (final String alias : aliases) { + final Operator op = mapWork.getAliasToWork().get( + alias); + if (op != null && op instanceof TableScanOperator) { + final TableScanOperator tableScan = (TableScanOperator) op; + + // push down projections + final List list = tableScan.getNeededColumnIDs(); + + if (list != null) { + ColumnProjectionUtils.appendReadColumnIDs(jobConf, list); + } else { + ColumnProjectionUtils.setFullyReadColumns(jobConf); + } + + pushFilters(jobConf, tableScan); + } + } + } + + private void pushFilters(final JobConf jobConf, final TableScanOperator tableScan) { + + final TableScanDesc scanDesc = tableScan.getConf(); + if (scanDesc == null) { + LOG.debug("Not pushing filters because TableScanDesc is null"); + return; + } + + // construct column name list for reference by filter push down + Utilities.setColumnNameList(jobConf, tableScan); + + // push down filters + final ExprNodeGenericFuncDesc filterExpr = scanDesc.getFilterExpr(); + if (filterExpr == null) { + LOG.debug("Not pushing filters because FilterExpr is null"); + return; + } + + final String filterText = filterExpr.getExprString(); + final String filterExprSerialized = Utilities.serializeExpression(filterExpr); + jobConf.set( + TableScanDesc.FILTER_TEXT_CONF_STR, + filterText); + jobConf.set( + TableScanDesc.FILTER_EXPR_CONF_STR, + filterExprSerialized); + } + + + public JobConf pushProjectionsAndFilters(JobConf jobConf, Path path) + throws IOException { + updateMrWork(jobConf); // TODO: refactor this + final JobConf cloneJobConf = new JobConf(jobConf); + final PartitionDesc part = pathToPartitionInfo.get(path.toString()); + + if ((part != null) && (part.getTableDesc() != null)) { + Utilities.copyTableJobPropertiesToConf(part.getTableDesc(), cloneJobConf); + } + + pushProjectionsAndFilters(cloneJobConf, path.toString(), path.toUri().toString()); + return cloneJobConf; + } +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/ArrayWritableGroupConverter.java b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/ArrayWritableGroupConverter.java new file mode 100644 index 0000000..ce39cee --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/ArrayWritableGroupConverter.java @@ -0,0 +1,80 
@@ +package org.apache.hadoop.hive.ql.io.parquet.convert; + +import org.apache.hadoop.io.ArrayWritable; +import org.apache.hadoop.io.Writable; + +import parquet.io.ParquetDecodingException; +import parquet.io.api.Converter; +import parquet.schema.GroupType; + +/** + * + * A ArrayWritableGroupConverter + * + */ +public class ArrayWritableGroupConverter extends HiveGroupConverter { + + private final Converter[] converters; + private final HiveGroupConverter parent; + private final int index; + private final boolean isMap; + private Writable currentValue; + private Writable[] mapPairContainer; + + public ArrayWritableGroupConverter(final GroupType groupType, final HiveGroupConverter parent, final int index) { + this.parent = parent; + this.index = index; + + if (groupType.getFieldCount() == 2) { + converters = new Converter[2]; + converters[0] = getConverterFromDescription(groupType.getType(0), 0, this); + converters[1] = getConverterFromDescription(groupType.getType(1), 1, this); + isMap = true; + } else if (groupType.getFieldCount() == 1) { + converters = new Converter[1]; + converters[0] = getConverterFromDescription(groupType.getType(0), 0, this); + isMap = false; + } else { + throw new RuntimeException("Invalid parquet hive schema: " + groupType); + } + + } + + @Override + public Converter getConverter(final int fieldIndex) { + return converters[fieldIndex]; + } + + @Override + public void start() { + if (isMap) { + mapPairContainer = new Writable[2]; + } + } + + @Override + public void end() { + if (isMap) { + currentValue = new ArrayWritable(Writable.class, mapPairContainer); + } + parent.add(index, currentValue); + } + + @Override + protected void set(final int index, final Writable value) { + if (index != 0 && mapPairContainer == null || index > 1) { + throw new ParquetDecodingException("Repeated group can only have one or two fields for maps. Not allowed to set for the index : " + index); + } + + if (isMap) { + mapPairContainer[index] = value; + } else { + currentValue = value; + } + } + + @Override + protected void add(final int index, final Writable value) { + set(index, value); + } +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/DataWritableGroupConverter.java b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/DataWritableGroupConverter.java new file mode 100644 index 0000000..873a9cb --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/DataWritableGroupConverter.java @@ -0,0 +1,125 @@ +package org.apache.hadoop.hive.ql.io.parquet.convert; + +import java.util.ArrayList; +import java.util.List; + +import org.apache.hadoop.io.ArrayWritable; +import org.apache.hadoop.io.Writable; + +import parquet.io.api.Converter; +import parquet.schema.GroupType; +import parquet.schema.Type; + +/** + * + * A MapWritableGroupConverter, real converter between hive and parquet types recursively for complex types. 
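+ * Each field of the containing group is materialized into one slot of a Writable array;
+ * repeated fields are buffered in a List and only wrapped into an ArrayWritable when
+ * getCurrentArray() is called at the end of the record.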
+ * + */ +public class DataWritableGroupConverter extends HiveGroupConverter { + + private final Converter[] converters; + private final HiveGroupConverter parent; + private final int index; + private final Object[] currentArr; + private Writable[] rootMap; + + public DataWritableGroupConverter(final GroupType requestedSchema, final GroupType tableSchema) { + this(requestedSchema, null, 0, tableSchema); + final int fieldCount = tableSchema.getFieldCount(); + this.rootMap = new Writable[fieldCount]; + } + + public DataWritableGroupConverter(final GroupType groupType, final HiveGroupConverter parent, final int index) { + this(groupType, parent, index, groupType); + } + + public DataWritableGroupConverter(final GroupType selectedGroupType, final HiveGroupConverter parent, final int index, final GroupType containingGroupType) { + this.parent = parent; + this.index = index; + final int totalFieldCount = containingGroupType.getFieldCount(); + final int selectedFieldCount = selectedGroupType.getFieldCount(); + + currentArr = new Object[totalFieldCount]; + converters = new Converter[selectedFieldCount]; + + int i = 0; + for (final Type subtype : selectedGroupType.getFields()) { + if (containingGroupType.getFields().contains(subtype)) { + converters[i] = getConverterFromDescription(subtype, containingGroupType.getFieldIndex(subtype.getName()), this); + } else { + throw new RuntimeException("Group type [" + containingGroupType + "] does not contain requested field: " + subtype); + } + ++i; + } + } + + final public ArrayWritable getCurrentArray() { + final Writable[] writableArr; + if (this.rootMap != null) { // We're at the root : we can safely re-use the same map to save perf + writableArr = this.rootMap; + } else { + writableArr = new Writable[currentArr.length]; + } + + for (int i = 0; i < currentArr.length; i++) { + final Object obj = currentArr[i]; + if (obj instanceof List) { + final List objList = (List)obj; + final ArrayWritable arr = new ArrayWritable(Writable.class, objList.toArray(new Writable[objList.size()])); + writableArr[i] = arr; + } else { + writableArr[i] = (Writable) obj; + } + } + return new ArrayWritable(Writable.class, writableArr); + } + + @Override + final protected void set(final int index, final Writable value) { + currentArr[index] = value; + } + + @Override + public Converter getConverter(final int fieldIndex) { + return converters[fieldIndex]; + } + + @Override + public void start() { + for (int i = 0; i < currentArr.length; i++) { + currentArr[i] = null; + } + } + + @Override + public void end() { + if (parent != null) { + parent.set(index, getCurrentArray()); + } + } + + @Override + protected void add(final int index, final Writable value) { + + if (currentArr[index] != null) { + + final Object obj = currentArr[index]; + if (obj instanceof List) { + final List list = (List) obj; + list.add(value); + } else { + throw new RuntimeException("This should be a List: " + obj); + } + + } else { + // create a list here because we don't know the final length of the object + // and it is more flexible than ArrayWritable. + // + // converted to ArrayWritable by getCurrentArray(). 
+ final List buffer = new ArrayList(); + buffer.add(value); + currentArr[index] = (Object) buffer; + } + + } +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/DataWritableRecordConverter.java b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/DataWritableRecordConverter.java new file mode 100644 index 0000000..b537e15 --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/DataWritableRecordConverter.java @@ -0,0 +1,31 @@ +package org.apache.hadoop.hive.ql.io.parquet.convert; + +import org.apache.hadoop.io.ArrayWritable; + +import parquet.io.api.GroupConverter; +import parquet.io.api.RecordMaterializer; +import parquet.schema.GroupType; + +/** + * + * A MapWritableReadSupport, encapsulates the tuples + * + */ +public class DataWritableRecordConverter extends RecordMaterializer { + + private final DataWritableGroupConverter root; + + public DataWritableRecordConverter(final GroupType requestedSchema, final GroupType tableSchema) { + this.root = new DataWritableGroupConverter(requestedSchema, tableSchema); + } + + @Override + public ArrayWritable getCurrentRecord() { + return root.getCurrentArray(); + } + + @Override + public GroupConverter getRootConverter() { + return root; + } +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/ETypeConverter.java b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/ETypeConverter.java new file mode 100644 index 0000000..028256a --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/ETypeConverter.java @@ -0,0 +1,147 @@ +package org.apache.hadoop.hive.ql.io.parquet.convert; + +import java.math.BigDecimal; + +import org.apache.hadoop.hive.ql.io.parquet.writable.BinaryWritable; +import org.apache.hadoop.hive.ql.io.parquet.writable.BinaryWritable.DicBinaryWritable; +import org.apache.hadoop.hive.serde2.io.DoubleWritable; +import org.apache.hadoop.io.BooleanWritable; +import org.apache.hadoop.io.FloatWritable; +import org.apache.hadoop.io.IntWritable; +import org.apache.hadoop.io.LongWritable; + +import parquet.column.Dictionary; +import parquet.io.api.Binary; +import parquet.io.api.Converter; +import parquet.io.api.PrimitiveConverter; + +/** + * + * ETypeConverter is an easy way to set the converter for the right type. 
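+ * Each enum constant binds a Parquet primitive java type to the Hadoop Writable used on the
+ * Hive side, roughly: double -> DoubleWritable, boolean -> BooleanWritable, float -> FloatWritable,
+ * int32 -> IntWritable, int64 -> LongWritable and binary -> BinaryWritable
+ * (or DicBinaryWritable when the column is dictionary encoded).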
+ * + */ +public enum ETypeConverter { + + EDOUBLE_CONVERTER(Double.TYPE) { + @Override + Converter getConverter(final Class type, final int index, final HiveGroupConverter parent) { + return new PrimitiveConverter() { + @Override + final public void addDouble(final double value) { + parent.set(index, new DoubleWritable(value)); + } + }; + } + }, + EBOOLEAN_CONVERTER(Boolean.TYPE) { + @Override + Converter getConverter(final Class type, final int index, final HiveGroupConverter parent) { + return new PrimitiveConverter() { + @Override + final public void addBoolean(final boolean value) { + parent.set(index, new BooleanWritable(value)); + } + }; + } + }, + EFLOAT_CONVERTER(Float.TYPE) { + @Override + Converter getConverter(final Class type, final int index, final HiveGroupConverter parent) { + return new PrimitiveConverter() { + @Override + final public void addFloat(final float value) { + parent.set(index, new FloatWritable(value)); + } + }; + } + }, + EINT32_CONVERTER(Integer.TYPE) { + @Override + Converter getConverter(final Class type, final int index, final HiveGroupConverter parent) { + return new PrimitiveConverter() { + @Override + final public void addInt(final int value) { + parent.set(index, new IntWritable(value)); + } + }; + } + }, + EINT64_CONVERTER(Long.TYPE) { + @Override + Converter getConverter(final Class type, final int index, final HiveGroupConverter parent) { + return new PrimitiveConverter() { + @Override + final public void addLong(final long value) { + parent.set(index, new LongWritable(value)); + } + }; + } + }, + EINT96_CONVERTER(BigDecimal.class) { + @Override + Converter getConverter(final Class type, final int index, final HiveGroupConverter parent) { + return new PrimitiveConverter() { + @Override + final public void addDouble(final double value) { + parent.set(index, new DoubleWritable(value)); + } + }; + } + }, + EBINARY_CONVERTER(Binary.class) { + @Override + Converter getConverter(final Class type, final int index, final HiveGroupConverter parent) { + return new PrimitiveConverter() { + private Binary[] dictBinary; + private String[] dict; + + @Override + public boolean hasDictionarySupport() { + return true; + } + + @Override + public void setDictionary(Dictionary dictionary) { + dictBinary = new Binary[dictionary.getMaxId() + 1]; + dict = new String[dictionary.getMaxId() + 1]; + for (int i = 0; i <= dictionary.getMaxId(); i++) { + Binary binary = dictionary.decodeToBinary(i); + dictBinary[i] = binary; + dict[i] = binary.toStringUsingUTF8(); + } + } + + @Override + public void addValueFromDictionary(int dictionaryId) { + parent.set(index, new DicBinaryWritable(dictBinary[dictionaryId], dict[dictionaryId])); + } + + @Override + final public void addBinary(Binary value) { + parent.set(index, new BinaryWritable(value)); + } + }; + } + }; + final Class _type; + + private ETypeConverter(final Class type) { + this._type = type; + } + + private Class getType() { + return _type; + } + + abstract Converter getConverter(final Class type, final int index, final HiveGroupConverter parent); + + static public Converter getNewConverter(final Class type, final int index, final HiveGroupConverter parent) { + for (final ETypeConverter eConverter : values()) { + if (eConverter.getType() == type) { + return eConverter.getConverter(type, index, parent); + } + } + throw new RuntimeException("Converter not found ... 
for type : " + type); + } + +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/HiveGroupConverter.java b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/HiveGroupConverter.java new file mode 100644 index 0000000..7bf01e1 --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/HiveGroupConverter.java @@ -0,0 +1,32 @@ +package org.apache.hadoop.hive.ql.io.parquet.convert; + +import org.apache.hadoop.io.Writable; + +import parquet.io.api.Converter; +import parquet.io.api.GroupConverter; +import parquet.schema.Type; +import parquet.schema.Type.Repetition; + +public abstract class HiveGroupConverter extends GroupConverter { + + static protected Converter getConverterFromDescription(final Type type, final int index, final HiveGroupConverter parent) { + if (type == null) { + return null; + } + + if (type.isPrimitive()) { + return ETypeConverter.getNewConverter(type.asPrimitiveType().getPrimitiveTypeName().javaType, index, parent); + } else { + if (type.asGroupType().getRepetition() == Repetition.REPEATED) { + return new ArrayWritableGroupConverter(type.asGroupType(), parent, index); + } else { + return new DataWritableGroupConverter(type.asGroupType(), parent, index); + } + } + } + + abstract protected void set(int index, Writable value); + + abstract protected void add(int index, Writable value); + +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/HiveSchemaConverter.java b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/HiveSchemaConverter.java new file mode 100644 index 0000000..7bd1a10 --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/HiveSchemaConverter.java @@ -0,0 +1,120 @@ +package org.apache.hadoop.hive.ql.io.parquet.convert; + +import java.util.List; + +import org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category; +import org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.MapTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; + +import parquet.Log; +import parquet.schema.GroupType; +import parquet.schema.MessageType; +import parquet.schema.OriginalType; +import parquet.schema.PrimitiveType; +import parquet.schema.PrimitiveType.PrimitiveTypeName; +import parquet.schema.Type; +import parquet.schema.Type.Repetition; + +/** + * + * A HiveSchemaConverter + * + */ +public class HiveSchemaConverter { + + private static final Log LOG = Log.getLog(HiveSchemaConverter.class); + + static public MessageType convert(final List columnNames, final List columnTypes) { + final MessageType schema = new MessageType("hive_schema", convertTypes(columnNames, columnTypes)); + return schema; + } + + static private Type[] convertTypes(final List columnNames, final List columnTypes) { + if (columnNames.size() != columnTypes.size()) { + throw new RuntimeException("Mismatched Hive columns and types. Hive columns names found : " + columnNames + + " . 
And Hive types found : " + columnTypes); + } + + final Type[] types = new Type[columnNames.size()]; + + for (int i = 0; i < columnNames.size(); ++i) { + types[i] = convertType(columnNames.get(i), columnTypes.get(i)); + } + + return types; + } + + static private Type convertType(final String name, final TypeInfo typeInfo) { + return convertType(name, typeInfo, Repetition.OPTIONAL); + } + + static private Type convertType(final String name, final TypeInfo typeInfo, final Repetition repetition) { + if (typeInfo.getCategory().equals(Category.PRIMITIVE)) { + if (typeInfo.equals(TypeInfoFactory.stringTypeInfo)) { + return new PrimitiveType(repetition, PrimitiveTypeName.BINARY, name); + } else if (typeInfo.equals(TypeInfoFactory.intTypeInfo) || typeInfo.equals(TypeInfoFactory.shortTypeInfo) || typeInfo.equals(TypeInfoFactory.byteTypeInfo)) { + return new PrimitiveType(repetition, PrimitiveTypeName.INT32, name); + } else if (typeInfo.equals(TypeInfoFactory.longTypeInfo)) { + return new PrimitiveType(repetition, PrimitiveTypeName.INT64, name); + } else if (typeInfo.equals(TypeInfoFactory.doubleTypeInfo)) { + return new PrimitiveType(repetition, PrimitiveTypeName.DOUBLE, name); + } else if (typeInfo.equals(TypeInfoFactory.floatTypeInfo)) { + return new PrimitiveType(repetition, PrimitiveTypeName.FLOAT, name); + } else if (typeInfo.equals(TypeInfoFactory.booleanTypeInfo)) { + return new PrimitiveType(repetition, PrimitiveTypeName.BOOLEAN, name); + } else if (typeInfo.equals(TypeInfoFactory.binaryTypeInfo)) { + // TODO : binaryTypeInfo is a byte array. Need to map it + throw new UnsupportedOperationException("Binary type not implemented"); + } else if (typeInfo.equals(TypeInfoFactory.timestampTypeInfo)) { + throw new UnsupportedOperationException("Timestamp type not implemented"); + } else if (typeInfo.equals(TypeInfoFactory.voidTypeInfo)) { + throw new UnsupportedOperationException("Void type not implemented"); + } else if (typeInfo.equals(TypeInfoFactory.unknownTypeInfo)) { + throw new UnsupportedOperationException("Unknown type not implemented"); + } else { + throw new RuntimeException("Unknown type: " + typeInfo); + } + } else if (typeInfo.getCategory().equals(Category.LIST)) { + return convertArrayType(name, (ListTypeInfo) typeInfo); + } else if (typeInfo.getCategory().equals(Category.STRUCT)) { + return convertStructType(name, (StructTypeInfo) typeInfo); + } else if (typeInfo.getCategory().equals(Category.MAP)) { + return convertMapType(name, (MapTypeInfo) typeInfo); + } else if (typeInfo.getCategory().equals(Category.UNION)) { + throw new UnsupportedOperationException("Union type not implemented"); + } else { + throw new RuntimeException("Unknown type: " + typeInfo); + } + } + + // An optional group containing a repeated anonymous group "bag", containing + // 1 anonymous element "array_element" + static private GroupType convertArrayType(final String name, final ListTypeInfo typeInfo) { + final TypeInfo subType = typeInfo.getListElementTypeInfo(); + return listWrapper(name, OriginalType.LIST, new GroupType(Repetition.REPEATED, ParquetHiveSerDe.ARRAY.toString(), convertType("array_element", subType))); + } + + // An optional group containing multiple elements + static private GroupType convertStructType(final String name, final StructTypeInfo typeInfo) { + final List columnNames = typeInfo.getAllStructFieldNames(); + final List columnTypes = typeInfo.getAllStructFieldTypeInfos(); + return new GroupType(Repetition.OPTIONAL, name, convertTypes(columnNames, columnTypes)); + + } + + // An optional 
group containing a repeated anonymous group "map", containing + // 2 elements: "key", "value" + static private GroupType convertMapType(final String name, final MapTypeInfo typeInfo) { + final Type keyType = convertType(ParquetHiveSerDe.MAP_KEY.toString(), typeInfo.getMapKeyTypeInfo(), Repetition.REQUIRED); + final Type valueType = convertType(ParquetHiveSerDe.MAP_VALUE.toString(), typeInfo.getMapValueTypeInfo()); + return listWrapper(name, OriginalType.MAP_KEY_VALUE, new GroupType(Repetition.REPEATED, ParquetHiveSerDe.MAP.toString(), keyType, valueType)); + } + + static private GroupType listWrapper(final String name, final OriginalType originalType, final GroupType groupType) { + return new GroupType(Repetition.OPTIONAL, name, originalType, groupType); + } +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/read/DataWritableReadSupport.java b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/read/DataWritableReadSupport.java new file mode 100644 index 0000000..7e416e8 --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/read/DataWritableReadSupport.java @@ -0,0 +1,120 @@ +package org.apache.hadoop.hive.ql.io.parquet.read; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hive.ql.io.parquet.convert.DataWritableRecordConverter; +import org.apache.hadoop.hive.serde2.ColumnProjectionUtils; +import org.apache.hadoop.io.ArrayWritable; +import org.apache.hadoop.util.StringUtils; + +import parquet.hadoop.api.ReadSupport; +import parquet.io.api.RecordMaterializer; +import parquet.schema.MessageType; +import parquet.schema.MessageTypeParser; +import parquet.schema.PrimitiveType; +import parquet.schema.PrimitiveType.PrimitiveTypeName; +import parquet.schema.Type; +import parquet.schema.Type.Repetition; + +/** + * + * A MapWritableReadSupport + * + * Manages the translation between Hive and Parquet + * + */ +public class DataWritableReadSupport extends ReadSupport { + + public static final String HIVE_SCHEMA_KEY = "HIVE_TABLE_SCHEMA"; + private static final List virtualColumns; + + static { + List vcols = new ArrayList(); + vcols.add("INPUT__FILE__NAME"); + vcols.add("BLOCK__OFFSET__INSIDE__FILE"); + vcols.add("ROW__OFFSET__INSIDE__BLOCK"); + vcols.add("RAW__DATA__SIZE"); + virtualColumns = Collections.unmodifiableList(vcols); + } + + /** + * From a string which columns names (including hive column), return a list + * of string columns + * + * @param comma separated list of columns + * @return list with virtual columns removed + */ + private static List getColumns(final String columns) { + final List result = (List) StringUtils.getStringCollection(columns); + result.removeAll(virtualColumns); + return result; + } + /** + * + * It creates the readContext for Parquet side with the requested schema during the init phase. 
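+   * The full table schema (rebuilt from the "columns" property, falling back to a dummy binary
+   * type for columns missing from the file) is stashed in the read support metadata under
+   * HIVE_TABLE_SCHEMA, while the schema handed back to Parquet only keeps the columns listed by
+   * ColumnProjectionUtils. For example, with columns a,b,c and only c projected, table_schema
+   * keeps all three types but the requested schema contains just c (sketch only; the names here
+   * are illustrative).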
+ * + * @param configuration needed to get the wanted columns + * @param keyValueMetaData // unused + * @param fileSchema parquet file schema + * @return the parquet ReadContext + */ + @Override + public parquet.hadoop.api.ReadSupport.ReadContext init(final Configuration configuration, final Map keyValueMetaData, final MessageType fileSchema) { + final String columns = configuration.get("columns"); + final Map contextMetadata = new HashMap(); + if (columns != null) { + final List listColumns = getColumns(columns); + + final List typeListTable = new ArrayList(); + for (final String col : listColumns) { + if (fileSchema.containsField(col)) { + typeListTable.add(fileSchema.getType(col)); + } else { // dummy type, should not be called + typeListTable.add(new PrimitiveType(Repetition.OPTIONAL, PrimitiveTypeName.BINARY, col)); + } + } + MessageType tableSchema = new MessageType("table_schema", typeListTable); + contextMetadata.put(HIVE_SCHEMA_KEY, tableSchema.toString()); + + MessageType requestedSchemaByUser = tableSchema; + final List indexColumnsWanted = ColumnProjectionUtils.getReadColumnIDs(configuration); + + final List typeListWanted = new ArrayList(); + for (final Integer idx : indexColumnsWanted) { + typeListWanted.add(tableSchema.getType(listColumns.get(idx))); + } + requestedSchemaByUser = new MessageType(fileSchema.getName(), typeListWanted); + + return new ReadContext(requestedSchemaByUser, contextMetadata); + } else { + contextMetadata.put(HIVE_SCHEMA_KEY, fileSchema.toString()); + return new ReadContext(fileSchema, contextMetadata); + } + } + + /** + * + * It creates the hive read support to interpret data from parquet to hive + * + * @param configuration // unused + * @param keyValueMetaData + * @param fileSchema // unused + * @param readContext containing the requested schema and the schema of the hive table + * @return Record Materialize for Hive + */ + @Override + public RecordMaterializer prepareForRead(final Configuration configuration, final Map keyValueMetaData, final MessageType fileSchema, + final parquet.hadoop.api.ReadSupport.ReadContext readContext) { + final Map metadata = readContext.getReadSupportMetadata(); + if (metadata == null) { + throw new RuntimeException("ReadContext not initialized properly. 
Don't know the Hive Schema."); + } + final MessageType tableSchema = MessageTypeParser.parseMessageType(metadata.get(HIVE_SCHEMA_KEY)); + return new DataWritableRecordConverter(readContext.getRequestedSchema(), tableSchema); + } +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/read/ParquetRecordReaderWrapper.java b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/read/ParquetRecordReaderWrapper.java new file mode 100644 index 0000000..9eb2f5a --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/read/ParquetRecordReaderWrapper.java @@ -0,0 +1,228 @@ +package org.apache.hadoop.hive.ql.io.parquet.read; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.ql.io.parquet.ProjectionPusher; +import org.apache.hadoop.io.ArrayWritable; +import org.apache.hadoop.io.Writable; +import org.apache.hadoop.mapred.FileSplit; +import org.apache.hadoop.mapred.InputSplit; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.RecordReader; +import org.apache.hadoop.mapred.Reporter; +import org.apache.hadoop.mapreduce.TaskAttemptContext; +import org.apache.hadoop.mapreduce.TaskAttemptID; + +import parquet.hadoop.ParquetFileReader; +import parquet.hadoop.ParquetInputFormat; +import parquet.hadoop.ParquetInputSplit; +import parquet.hadoop.api.ReadSupport.ReadContext; +import parquet.hadoop.metadata.BlockMetaData; +import parquet.hadoop.metadata.FileMetaData; +import parquet.hadoop.metadata.ParquetMetadata; +import parquet.hadoop.util.ContextUtil; +import parquet.schema.MessageTypeParser; + +public class ParquetRecordReaderWrapper implements RecordReader { + public static final Log LOG = LogFactory.getLog(ParquetRecordReaderWrapper.class); + + private final long splitLen; // for getPos() + + private org.apache.hadoop.mapreduce.RecordReader realReader; + // expect readReader return same Key & Value objects (common case) + // this avoids extra serialization & deserialization of these objects + private ArrayWritable valueObj = null; + private boolean firstRecord = false; + private boolean eof = false; + private int schemaSize; + + private final ProjectionPusher projectionPusher; + + public ParquetRecordReaderWrapper( + final ParquetInputFormat newInputFormat, + final InputSplit oldSplit, + final JobConf oldJobConf, + final Reporter reporter) + throws IOException, InterruptedException { + this(newInputFormat, oldSplit, oldJobConf, reporter, new ProjectionPusher()); + } + + public ParquetRecordReaderWrapper( + final ParquetInputFormat newInputFormat, + final InputSplit oldSplit, + final JobConf oldJobConf, + final Reporter reporter, + final ProjectionPusher pusher) + throws IOException, InterruptedException { + this.splitLen = oldSplit.getLength(); + this.projectionPusher = pusher; + + final ParquetInputSplit split = getSplit(oldSplit, oldJobConf); + + TaskAttemptID taskAttemptID = TaskAttemptID.forName(oldJobConf.get("mapred.task.id")); + if (taskAttemptID == null) { + taskAttemptID = new TaskAttemptID(); + } + + // create a TaskInputOutputContext + final TaskAttemptContext taskContext = ContextUtil.newTaskAttemptContext(oldJobConf, taskAttemptID); + + if (split != null) { + try { + realReader = newInputFormat.createRecordReader(split, taskContext); + realReader.initialize(split, taskContext); + + // read once to gain access to key and value objects + if (realReader.nextKeyValue()) { 
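+          // Buffer the first record so that createValue() can hand out the same reused
+          // ArrayWritable instance that next() will keep refilling.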
+ firstRecord = true; + valueObj = realReader.getCurrentValue(); + } else { + eof = true; + } + } catch (final InterruptedException e) { + throw new IOException(e); + } + } else { + realReader = null; + eof = true; + if (valueObj == null) { // Should initialize the value for createValue + valueObj = new ArrayWritable(Writable.class, new Writable[schemaSize]); + } + } + } + + @Override + public void close() throws IOException { + if (realReader != null) { + realReader.close(); + } + } + + @Override + public Void createKey() { + return null; + } + + @Override + public ArrayWritable createValue() { + return valueObj; + } + + @Override + public long getPos() throws IOException { + return (long) (splitLen * getProgress()); + } + + @Override + public float getProgress() throws IOException { + if (realReader == null) { + return 1f; + } else { + try { + return realReader.getProgress(); + } catch (final InterruptedException e) { + throw new IOException(e); + } + } + } + + @Override + public boolean next(final Void key, final ArrayWritable value) throws IOException { + if (eof) { + return false; + } + + try { + if (firstRecord) { // key & value are already read. + firstRecord = false; + } else if (!realReader.nextKeyValue()) { + eof = true; // strictly not required, just for consistency + return false; + } + + final ArrayWritable tmpCurValue = realReader.getCurrentValue(); + + if (value != tmpCurValue) { + final Writable[] arrValue = value.get(); + final Writable[] arrCurrent = tmpCurValue.get(); + if (value != null && arrValue.length == arrCurrent.length) { + System.arraycopy(arrCurrent, 0, arrValue, 0, arrCurrent.length); + } else { + if (arrValue.length != arrCurrent.length) { + throw new IOException("DeprecatedParquetHiveInput : size of object differs. Value size : " + arrValue.length + ", Current Object size : " + + arrCurrent.length); + } else { + throw new IOException("DeprecatedParquetHiveInput can not support RecordReaders that don't return same key & value & value is null"); + } + } + } + return true; + + } catch (final InterruptedException e) { + throw new IOException(e); + } + } + + /** + * gets a ParquetInputSplit corresponding to a split given by Hive + * + * @param oldSplit The split given by Hive + * @param conf The JobConf of the Hive job + * @return a ParquetInputSplit corresponding to the oldSplit + * @throws IOException if the config cannot be enhanced or if the footer cannot be read from the file + */ + protected ParquetInputSplit getSplit( + final InputSplit oldSplit, + final JobConf conf + ) throws IOException { + + ParquetInputSplit split; + + if (oldSplit instanceof FileSplit) { + final Path finalPath = ((FileSplit) oldSplit).getPath(); + final JobConf cloneJob = projectionPusher.pushProjectionsAndFilters(conf, finalPath.getParent()); + + final ParquetMetadata parquetMetadata = ParquetFileReader.readFooter(cloneJob, finalPath); + final List blocks = parquetMetadata.getBlocks(); + final FileMetaData fileMetaData = parquetMetadata.getFileMetaData(); + + final ReadContext readContext = new DataWritableReadSupport().init(cloneJob, fileMetaData.getKeyValueMetaData(), fileMetaData.getSchema()); + schemaSize = MessageTypeParser.parseMessageType(readContext.getReadSupportMetadata().get(DataWritableReadSupport.HIVE_SCHEMA_KEY)).getFieldCount(); + + final List splitGroup = new ArrayList(); + final long splitStart = ((FileSplit) oldSplit).getStart(); + final long splitLength = ((FileSplit) oldSplit).getLength(); + for (final BlockMetaData block : blocks) { + final long firstDataPage = 
block.getColumns().get(0).getFirstDataPageOffset(); + if (firstDataPage >= splitStart && firstDataPage < splitStart + splitLength) { + splitGroup.add(block); + } + } + + if (splitGroup.isEmpty()) { + LOG.warn("Skipping split, could not find row group in: " + (FileSplit) oldSplit); + split = null; + } else { + split = new ParquetInputSplit(finalPath, + splitStart, + splitLength, + ((FileSplit) oldSplit).getLocations(), + splitGroup, + readContext.getRequestedSchema().toString(), + fileMetaData.getSchema().toString(), + fileMetaData.getKeyValueMetaData(), + readContext.getReadSupportMetadata()); + } + + } else { + throw new IllegalArgumentException("Unknown split type: " + oldSplit); + } + + return split; + } +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/serde/AbstractParquetMapInspector.java b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/serde/AbstractParquetMapInspector.java new file mode 100644 index 0000000..4684cec --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/serde/AbstractParquetMapInspector.java @@ -0,0 +1,148 @@ +package org.apache.hadoop.hive.ql.io.parquet.serde; + +import java.util.HashMap; +import java.util.Map; + +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.SettableMapObjectInspector; +import org.apache.hadoop.io.ArrayWritable; +import org.apache.hadoop.io.Writable; + +public abstract class AbstractParquetMapInspector implements SettableMapObjectInspector { + + protected final ObjectInspector keyInspector; + protected final ObjectInspector valueInspector; + + public AbstractParquetMapInspector(final ObjectInspector keyInspector, final ObjectInspector valueInspector) { + this.keyInspector = keyInspector; + this.valueInspector = valueInspector; + } + + @Override + public String getTypeName() { + return "map<" + keyInspector.getTypeName() + "," + valueInspector.getTypeName() + ">"; + } + + @Override + public Category getCategory() { + return Category.MAP; + } + + @Override + public ObjectInspector getMapKeyObjectInspector() { + return keyInspector; + } + + @Override + public ObjectInspector getMapValueObjectInspector() { + return valueInspector; + } + + @Override + public Map getMap(final Object data) { + if (data == null) { + return null; + } + + if (data instanceof ArrayWritable) { + final Writable[] mapContainer = ((ArrayWritable) data).get(); + + if (mapContainer == null || mapContainer.length == 0) { + return null; + } + + final Writable[] mapArray = ((ArrayWritable) mapContainer[0]).get(); + final Map map = new HashMap(); + + for (final Writable obj : mapArray) { + final ArrayWritable mapObj = (ArrayWritable) obj; + final Writable[] arr = mapObj.get(); + map.put(arr[0], arr[1]); + } + + return map; + } + + if (data instanceof Map) { + return (Map) data; + } + + throw new UnsupportedOperationException("Cannot inspect " + data.getClass().getCanonicalName()); + } + + @Override + public int getMapSize(final Object data) { + if (data == null) { + return -1; + } + + if (data instanceof ArrayWritable) { + final Writable[] mapContainer = ((ArrayWritable) data).get(); + + if (mapContainer == null || mapContainer.length == 0) { + return -1; + } else { + return ((ArrayWritable) mapContainer[0]).get().length; + } + } + + if (data instanceof Map) { + return ((Map) data).size(); + } + + throw new UnsupportedOperationException("Cannot inspect " + data.getClass().getCanonicalName()); + } + + @Override + public Object create() { + Map m = new HashMap(); + return m; + } 
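+  // The settable half of this inspector (create/put/remove/clear) works on plain java.util.Map
+  // instances, whereas getMap/getMapSize above understand the on-disk ArrayWritable layout in
+  // which element 0 wraps the repeated key/value pairs.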
+ + @Override + public Object put(Object map, Object key, Object value) { + Map m = (HashMap) map; + m.put(key, value); + return m; + } + + @Override + public Object remove(Object map, Object key) { + Map m = (HashMap) map; + m.remove(key); + return m; + } + + @Override + public Object clear(Object map) { + Map m = (HashMap) map; + m.clear(); + return m; + } + + @Override + public boolean equals(Object obj) { + if (obj == null) { + return false; + } + if (getClass() != obj.getClass()) { + return false; + } + final StandardParquetHiveMapInspector other = (StandardParquetHiveMapInspector) obj; + if (this.keyInspector != other.keyInspector && (this.keyInspector == null || !this.keyInspector.equals(other.keyInspector))) { + return false; + } + if (this.valueInspector != other.valueInspector && (this.valueInspector == null || !this.valueInspector.equals(other.valueInspector))) { + return false; + } + return true; + } + + @Override + public int hashCode() { + int hash = 7; + hash = 59 * hash + (this.keyInspector != null ? this.keyInspector.hashCode() : 0); + hash = 59 * hash + (this.valueInspector != null ? this.valueInspector.hashCode() : 0); + return hash; + } +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/serde/ArrayWritableObjectInspector.java b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/serde/ArrayWritableObjectInspector.java new file mode 100644 index 0000000..9193eeb --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/serde/ArrayWritableObjectInspector.java @@ -0,0 +1,209 @@ +package org.apache.hadoop.hive.ql.io.parquet.serde; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; + +import org.apache.hadoop.hive.ql.io.parquet.serde.primitive.ParquetPrimitiveInspectorFactory; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.SettableStructObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.StructField; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; +import org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.MapTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; +import org.apache.hadoop.io.ArrayWritable; + +/** + * + * The ArrayWritableObjectInspector will inspect an ArrayWritable, considering it as a Hive struct.
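+ * Each struct field is resolved to a nested inspector derived from its TypeInfo
+ * (primitive, struct, list or map), so arbitrarily nested Parquet groups can be walked.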
+ * It can also inspect a List if Hive decides to inspect the result of an inspection. + * + */ +public class ArrayWritableObjectInspector extends SettableStructObjectInspector { + + private final TypeInfo typeInfo; + private final List fieldInfos; + private final List fieldNames; + private final List fields; + private final HashMap fieldsByName; + + public ArrayWritableObjectInspector(final StructTypeInfo rowTypeInfo) { + + typeInfo = rowTypeInfo; + fieldNames = rowTypeInfo.getAllStructFieldNames(); + fieldInfos = rowTypeInfo.getAllStructFieldTypeInfos(); + fields = new ArrayList(fieldNames.size()); + fieldsByName = new HashMap(); + + for (int i = 0; i < fieldNames.size(); ++i) { + final String name = fieldNames.get(i); + final TypeInfo fieldInfo = fieldInfos.get(i); + + final StructFieldImpl field = new StructFieldImpl(name, getObjectInspector(fieldInfo), i); + fields.add(field); + fieldsByName.put(name, field); + } + } + + private ObjectInspector getObjectInspector(final TypeInfo typeInfo) { + if (typeInfo.equals(TypeInfoFactory.doubleTypeInfo)) { + return PrimitiveObjectInspectorFactory.writableDoubleObjectInspector; + } else if (typeInfo.equals(TypeInfoFactory.booleanTypeInfo)) { + return PrimitiveObjectInspectorFactory.writableBooleanObjectInspector; + } else if (typeInfo.equals(TypeInfoFactory.floatTypeInfo)) { + return PrimitiveObjectInspectorFactory.writableFloatObjectInspector; + } else if (typeInfo.equals(TypeInfoFactory.intTypeInfo)) { + return PrimitiveObjectInspectorFactory.writableIntObjectInspector; + } else if (typeInfo.equals(TypeInfoFactory.longTypeInfo)) { + return PrimitiveObjectInspectorFactory.writableLongObjectInspector; + } else if (typeInfo.equals(TypeInfoFactory.stringTypeInfo)) { + return ParquetPrimitiveInspectorFactory.parquetStringInspector; + } else if (typeInfo.getCategory().equals(Category.STRUCT)) { + return new ArrayWritableObjectInspector((StructTypeInfo) typeInfo); + } else if (typeInfo.getCategory().equals(Category.LIST)) { + final TypeInfo subTypeInfo = ((ListTypeInfo) typeInfo).getListElementTypeInfo(); + return new ParquetHiveArrayInspector(getObjectInspector(subTypeInfo)); + } else if (typeInfo.getCategory().equals(Category.MAP)) { + final TypeInfo keyTypeInfo = ((MapTypeInfo) typeInfo).getMapKeyTypeInfo(); + final TypeInfo valueTypeInfo = ((MapTypeInfo) typeInfo).getMapValueTypeInfo(); + if (keyTypeInfo.equals(TypeInfoFactory.stringTypeInfo) || keyTypeInfo.equals(TypeInfoFactory.byteTypeInfo) + || keyTypeInfo.equals(TypeInfoFactory.shortTypeInfo)) { + return new DeepParquetHiveMapInspector(getObjectInspector(keyTypeInfo), getObjectInspector(valueTypeInfo)); + } else { + return new StandardParquetHiveMapInspector(getObjectInspector(keyTypeInfo), getObjectInspector(valueTypeInfo)); + } + } else if (typeInfo.equals(TypeInfoFactory.timestampTypeInfo)) { + throw new UnsupportedOperationException("timestamp not implemented yet"); + } else if (typeInfo.equals(TypeInfoFactory.byteTypeInfo)) { + return ParquetPrimitiveInspectorFactory.parquetByteInspector; + } else if (typeInfo.equals(TypeInfoFactory.shortTypeInfo)) { + return ParquetPrimitiveInspectorFactory.parquetShortInspector; + } else { + throw new RuntimeException("Unknown field info: " + typeInfo); + } + + } + + @Override + public Category getCategory() { + return Category.STRUCT; + } + + @Override + public String getTypeName() { + return typeInfo.getTypeName(); + } + + @Override + public List getAllStructFieldRefs() { + return fields; + } + + @Override + public Object getStructFieldData(final 
Object data, final StructField fieldRef) { + if (data == null) { + return null; + } + + if (data instanceof ArrayWritable) { + final ArrayWritable arr = (ArrayWritable) data; + return arr.get()[((StructFieldImpl) fieldRef).getIndex()]; + } + + throw new UnsupportedOperationException("Cannot inspect " + data.getClass().getCanonicalName()); + } + + @Override + public StructField getStructFieldRef(final String name) { + return fieldsByName.get(name); + } + + @Override + public List getStructFieldsDataAsList(final Object data) { + if (data == null) { + return null; + } + + if (data instanceof ArrayWritable) { + final ArrayWritable arr = (ArrayWritable) data; + final Object[] arrWritable = arr.get(); + return new ArrayList(Arrays.asList(arrWritable)); + } + + throw new UnsupportedOperationException("Cannot inspect " + data.getClass().getCanonicalName()); + } + + @Override + public Object create() { + final ArrayList list = new ArrayList(fields.size()); + for (int i = 0; i < fields.size(); ++i) { + list.add(null); + } + return list; + } + + @Override + public Object setStructFieldData(Object struct, StructField field, Object fieldValue) { + final ArrayList list = (ArrayList) struct; + list.set(((StructFieldImpl) field).getIndex(), fieldValue); + return list; + } + + @Override + public boolean equals(Object obj) { + if (obj == null) { + return false; + } + if (getClass() != obj.getClass()) { + return false; + } + final ArrayWritableObjectInspector other = (ArrayWritableObjectInspector) obj; + if (this.typeInfo != other.typeInfo && (this.typeInfo == null || !this.typeInfo.equals(other.typeInfo))) { + return false; + } + return true; + } + + @Override + public int hashCode() { + int hash = 5; + hash = 29 * hash + (this.typeInfo != null ? this.typeInfo.hashCode() : 0); + return hash; + } + + class StructFieldImpl implements StructField { + + private final String name; + private final ObjectInspector inspector; + private final int index; + + public StructFieldImpl(final String name, final ObjectInspector inspector, final int index) { + this.name = name; + this.inspector = inspector; + this.index = index; + } + + @Override + public String getFieldComment() { + return ""; + } + + @Override + public String getFieldName() { + return name; + } + + public int getIndex() { + return index; + } + + @Override + public ObjectInspector getFieldObjectInspector() { + return inspector; + } + } +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/serde/DeepParquetHiveMapInspector.java b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/serde/DeepParquetHiveMapInspector.java new file mode 100644 index 0000000..5dd3535 --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/serde/DeepParquetHiveMapInspector.java @@ -0,0 +1,69 @@ +package org.apache.hadoop.hive.ql.io.parquet.serde; + +import java.util.Map; + +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector; +import org.apache.hadoop.io.ArrayWritable; +import org.apache.hadoop.io.Writable; + +/** + * The DeepParquetHiveMapInspector will inspect an ArrayWritable, considering it as a Hive map.
+ * It can also inspect a Map if Hive decides to inspect the result of an inspection.
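+ * The backing ArrayWritable is expected to wrap a single ArrayWritable of key/value pairs, each pair itself an ArrayWritable whose element 0 is the key and element 1 is the value.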
+ * When trying to access elements from the map it will iterate over all keys, inspecting them and comparing them to the + * desired key. + * + */ +public class DeepParquetHiveMapInspector extends AbstractParquetMapInspector { + + public DeepParquetHiveMapInspector(final ObjectInspector keyInspector, final ObjectInspector valueInspector) { + super(keyInspector, valueInspector); + } + + @Override + public Object getMapValueElement(final Object data, final Object key) { + if (data == null || key == null) { + return null; + } + + if (data instanceof ArrayWritable) { + final Writable[] mapContainer = ((ArrayWritable) data).get(); + + if (mapContainer == null || mapContainer.length == 0) { + return null; + } + + final Writable[] mapArray = ((ArrayWritable) mapContainer[0]).get(); + + for (final Writable obj : mapArray) { + final ArrayWritable mapObj = (ArrayWritable) obj; + final Writable[] arr = mapObj.get(); + if (key.equals(arr[0]) || key.equals(((PrimitiveObjectInspector) keyInspector).getPrimitiveJavaObject(arr[0])) + || key.equals(((PrimitiveObjectInspector) keyInspector).getPrimitiveWritableObject(arr[0]))) { + return arr[1]; + } + } + + return null; + } + + if (data instanceof Map) { + final Map map = (Map) data; + + if (map.containsKey(key)) { + return map.get(key); + } + + for (final Map.Entry entry : map.entrySet()) { + if (key.equals(((PrimitiveObjectInspector) keyInspector).getPrimitiveJavaObject(entry.getKey())) + || key.equals(((PrimitiveObjectInspector) keyInspector).getPrimitiveWritableObject(entry.getKey()))) { + return entry.getValue(); + } + } + + return null; + } + + throw new UnsupportedOperationException("Cannot inspect " + data.getClass().getCanonicalName()); + } +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/serde/ParquetHiveArrayInspector.java b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/serde/ParquetHiveArrayInspector.java new file mode 100644 index 0000000..5d02b33 --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/serde/ParquetHiveArrayInspector.java @@ -0,0 +1,172 @@ +package org.apache.hadoop.hive.ql.io.parquet.serde; + +import java.util.ArrayList; +import java.util.List; + +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.SettableListObjectInspector; +import org.apache.hadoop.io.ArrayWritable; +import org.apache.hadoop.io.Writable; + +/** + * The ParquetHiveArrayInspector will inspect an ArrayWritable, considering it as an Hive array.
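+ * The backing ArrayWritable is expected to wrap a single ArrayWritable whose elements are the actual list items.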
+ * It can also inspect a List if Hive decides to inspect the result of an inspection. + * + */ +public class ParquetHiveArrayInspector implements SettableListObjectInspector { + + ObjectInspector arrayElementInspector; + + public ParquetHiveArrayInspector(final ObjectInspector arrayElementInspector) { + this.arrayElementInspector = arrayElementInspector; + } + + @Override + public String getTypeName() { + return "array<" + arrayElementInspector.getTypeName() + ">"; + } + + @Override + public Category getCategory() { + return Category.LIST; + } + + @Override + public ObjectInspector getListElementObjectInspector() { + return arrayElementInspector; + } + + @Override + public Object getListElement(final Object data, final int index) { + if (data == null) { + return null; + } + + if (data instanceof ArrayWritable) { + final Writable[] listContainer = ((ArrayWritable) data).get(); + + if (listContainer == null || listContainer.length == 0) { + return null; + } + + final Writable subObj = listContainer[0]; + + if (subObj == null) { + return null; + } + + if (index >= 0 && index < ((ArrayWritable) subObj).get().length) { + return ((ArrayWritable) subObj).get()[index]; + } else { + return null; + } + } + + throw new UnsupportedOperationException("Cannot inspect " + data.getClass().getCanonicalName()); + } + + @Override + public int getListLength(final Object data) { + if (data == null) { + return -1; + } + + if (data instanceof ArrayWritable) { + final Writable[] listContainer = ((ArrayWritable) data).get(); + + if (listContainer == null || listContainer.length == 0) { + return -1; + } + + final Writable subObj = listContainer[0]; + + if (subObj == null) { + return 0; + } + + return ((ArrayWritable) subObj).get().length; + } + + throw new UnsupportedOperationException("Cannot inspect " + data.getClass().getCanonicalName()); + } + + @Override + public List getList(final Object data) { + if (data == null) { + return null; + } + + if (data instanceof ArrayWritable) { + final Writable[] listContainer = ((ArrayWritable) data).get(); + + if (listContainer == null || listContainer.length == 0) { + return null; + } + + final Writable subObj = listContainer[0]; + + if (subObj == null) { + return null; + } + + final Writable[] array = ((ArrayWritable) subObj).get(); + final List list = new ArrayList(); + + for (final Writable obj : array) { + list.add(obj); + } + + return list; + } + + throw new UnsupportedOperationException("Cannot inspect " + data.getClass().getCanonicalName()); + } + + @Override + public Object create(final int size) { + final ArrayList result = new ArrayList(size); + for (int i = 0; i < size; ++i) { + result.add(null); + } + return result; + } + + @Override + public Object set(final Object list, final int index, final Object element) { + final ArrayList l = (ArrayList) list; + l.set(index, element); + return list; + } + + @Override + public Object resize(final Object list, final int newSize) { + final ArrayList l = (ArrayList) list; + l.ensureCapacity(newSize); + while (l.size() < newSize) { + l.add(null); + } + while (l.size() > newSize) { + l.remove(l.size() - 1); + } + return list; + } + + @Override + public boolean equals(final Object o) { + if (o == null || o.getClass() != getClass()) { + return false; + } else if (o == this) { + return true; + } else { + final ObjectInspector other = ((ParquetHiveArrayInspector) o).arrayElementInspector; + return other.equals(arrayElementInspector); + } + } + + @Override + public int hashCode() { + int hash = 3; + hash = 29 * hash + 
(this.arrayElementInspector != null ? this.arrayElementInspector.hashCode() : 0); + return hash; + } +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/serde/ParquetHiveSerDe.java b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/serde/ParquetHiveSerDe.java new file mode 100644 index 0000000..9647d67 --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/serde/ParquetHiveSerDe.java @@ -0,0 +1,272 @@ +package org.apache.hadoop.hive.ql.io.parquet.serde; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Map; +import java.util.Map.Entry; +import java.util.Properties; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hive.ql.io.parquet.writable.BinaryWritable; +import org.apache.hadoop.hive.serde2.SerDe; +import org.apache.hadoop.hive.serde2.SerDeException; +import org.apache.hadoop.hive.serde2.SerDeStats; +import org.apache.hadoop.hive.serde2.io.ByteWritable; +import org.apache.hadoop.hive.serde2.io.DoubleWritable; +import org.apache.hadoop.hive.serde2.io.ShortWritable; +import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.MapObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category; +import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.StructField; +import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.BooleanObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.ByteObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.DoubleObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.FloatObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.IntObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.LongObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.ShortObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector; +import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils; +import org.apache.hadoop.io.ArrayWritable; +import org.apache.hadoop.io.BooleanWritable; +import org.apache.hadoop.io.FloatWritable; +import org.apache.hadoop.io.IntWritable; +import org.apache.hadoop.io.LongWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.Writable; + +import parquet.io.api.Binary; + +/** + * + * A ParquetHiveSerDe for Hive (with the deprecated package mapred) + * + */ +public class ParquetHiveSerDe implements SerDe { + + public static Text MAP_KEY = new Text("key"); + public static Text MAP_VALUE = new Text("value"); + public static Text MAP = new Text("map"); + public static Text ARRAY = new Text("bag"); + private SerDeStats stats; + ObjectInspector objInspector; + + private enum LAST_OPERATION { + + SERIALIZE, + DESERIALIZE, + UNKNOWN + } + LAST_OPERATION status; + private long serializedSize; + private long deserializedSize; + + @Override + final public void initialize(final Configuration conf, final Properties tbl) throws SerDeException { + + final 
TypeInfo rowTypeInfo; + final List columnNames; + final List columnTypes; + // Get column names and sort order + final String columnNameProperty = tbl.getProperty("columns"); + final String columnTypeProperty = tbl.getProperty("columns.types"); + + if (columnNameProperty.length() == 0) { + columnNames = new ArrayList(); + } else { + columnNames = Arrays.asList(columnNameProperty.split(",")); + } + + if (columnTypeProperty.length() == 0) { + columnTypes = new ArrayList(); + } else { + columnTypes = TypeInfoUtils.getTypeInfosFromTypeString(columnTypeProperty); + } + + if (columnNames.size() != columnTypes.size()) { + throw new RuntimeException("ParquetHiveSerde initialization failed. Number of column name and column type differs."); + } + + // Create row related objects + rowTypeInfo = TypeInfoFactory.getStructTypeInfo(columnNames, columnTypes); + this.objInspector = new ArrayWritableObjectInspector((StructTypeInfo) rowTypeInfo); + + // Stats part + stats = new SerDeStats(); + serializedSize = 0; + deserializedSize = 0; + status = LAST_OPERATION.UNKNOWN; + } + + @Override + public Object deserialize(final Writable blob) throws SerDeException { + status = LAST_OPERATION.DESERIALIZE; + deserializedSize = 0; + if (blob instanceof ArrayWritable) { + deserializedSize = ((ArrayWritable) blob).get().length; + return blob; + } else { + return null; + } + } + + @Override + public ObjectInspector getObjectInspector() throws SerDeException { + return objInspector; + } + + @Override + public Class getSerializedClass() { + return ArrayWritable.class; + } + + @Override + public Writable serialize(final Object obj, final ObjectInspector objInspector) throws SerDeException { + if (!objInspector.getCategory().equals(Category.STRUCT)) { + throw new SerDeException("Cannot serialize " + objInspector.getCategory() + ". 
Can only serialize a struct"); + } + + final ArrayWritable serializeData = createStruct(obj, (StructObjectInspector) objInspector); + + serializedSize = serializeData.get().length; + status = LAST_OPERATION.SERIALIZE; + + return serializeData; + } + + private ArrayWritable createStruct(final Object obj, final StructObjectInspector inspector) throws SerDeException { + + final List fields = inspector.getAllStructFieldRefs(); + final Writable[] arr = new Writable[fields.size()]; + + int i = 0; + + for (final StructField field : fields) { + final Object subObj = inspector.getStructFieldData(obj, field); + final ObjectInspector subInspector = field.getFieldObjectInspector(); + + arr[i] = createObject(subObj, subInspector); + ++i; + } + + return new ArrayWritable(Writable.class, arr); + + } + + private Writable createMap(final Object obj, final MapObjectInspector inspector) throws SerDeException { + final Map sourceMap = inspector.getMap(obj); + final ObjectInspector keyInspector = inspector.getMapKeyObjectInspector(); + final ObjectInspector valueInspector = inspector.getMapValueObjectInspector(); + final List array = new ArrayList(); + + if (sourceMap != null) { + for (final Entry keyValue : sourceMap.entrySet()) { + final Writable key = createObject(keyValue.getKey(), keyInspector); + final Writable value = createObject(keyValue.getValue(), valueInspector); + + if (key != null) { + Writable[] arr = new Writable[2]; + arr[0] = key; + arr[1] = value; + array.add(new ArrayWritable(Writable.class, arr)); + } + + } + } + + if (array.size() > 0) { + final ArrayWritable subArray = new ArrayWritable(ArrayWritable.class, array.toArray(new ArrayWritable[array.size()])); + return new ArrayWritable(Writable.class, new Writable[] {subArray}); + } else { + return null; + } + } + + private ArrayWritable createArray(final Object obj, final ListObjectInspector inspector) throws SerDeException { + final List sourceArray = inspector.getList(obj); + final ObjectInspector subInspector = inspector.getListElementObjectInspector(); + final List array = new ArrayList(); + + if (sourceArray != null) { + for (final Object curObj : sourceArray) { + final Writable newObj = createObject(curObj, subInspector); + if (newObj != null) { + array.add(newObj); + } + } + } + + if (array.size() > 0) { + final ArrayWritable subArray = new ArrayWritable(array.get(0).getClass(), array.toArray(new Writable[array.size()])); + return new ArrayWritable(Writable.class, new Writable[] {subArray}); + } else { + return null; + } + } + + private Writable createPrimitive(final Object obj, final PrimitiveObjectInspector inspector) throws SerDeException { + + if (obj == null) { + return null; + } + + switch (inspector.getPrimitiveCategory()) { + case VOID: + return null; + case BOOLEAN: + return new BooleanWritable(((BooleanObjectInspector) inspector).get(obj) ? 
Boolean.TRUE : Boolean.FALSE); + case BYTE: + return new ByteWritable((byte) ((ByteObjectInspector) inspector).get(obj)); + case DOUBLE: + return new DoubleWritable(((DoubleObjectInspector) inspector).get(obj)); + case FLOAT: + return new FloatWritable(((FloatObjectInspector) inspector).get(obj)); + case INT: + return new IntWritable(((IntObjectInspector) inspector).get(obj)); + case LONG: + return new LongWritable(((LongObjectInspector) inspector).get(obj)); + case SHORT: + return new ShortWritable((short) ((ShortObjectInspector) inspector).get(obj)); + case STRING: + return new BinaryWritable(Binary.fromString(((StringObjectInspector) inspector).getPrimitiveJavaObject(obj))); + default: + throw new SerDeException("Unknown primitive : " + inspector.getPrimitiveCategory()); + } + } + + private Writable createObject(final Object obj, final ObjectInspector inspector) throws SerDeException { + switch (inspector.getCategory()) { + case STRUCT: + return createStruct(obj, (StructObjectInspector) inspector); + case LIST: + return createArray(obj, (ListObjectInspector) inspector); + case MAP: + return createMap(obj, (MapObjectInspector) inspector); + case PRIMITIVE: + return createPrimitive(obj, (PrimitiveObjectInspector) inspector); + default: + throw new SerDeException("Unknown data type" + inspector.getCategory()); + } + } + + // + @Override + public SerDeStats getSerDeStats() { + // must be different + assert (status != LAST_OPERATION.UNKNOWN); + + if (status == LAST_OPERATION.SERIALIZE) { + stats.setRawDataSize(serializedSize); + } else { + stats.setRawDataSize(deserializedSize); + } + return stats; + } +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/serde/StandardParquetHiveMapInspector.java b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/serde/StandardParquetHiveMapInspector.java new file mode 100644 index 0000000..995662d --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/serde/StandardParquetHiveMapInspector.java @@ -0,0 +1,52 @@ +package org.apache.hadoop.hive.ql.io.parquet.serde; + +import java.util.Map; + +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.io.ArrayWritable; +import org.apache.hadoop.io.Writable; + +/** + * The StandardParquetHiveMapInspector will inspect an ArrayWritable, considering it as a Hive map.
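+ * Unlike DeepParquetHiveMapInspector, keys are matched with a plain equals() against the stored key Writable, without converting them through the key inspector.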
+ * It can also inspect a Map if Hive decides to inspect the result of an inspection. + * + */ +public class StandardParquetHiveMapInspector extends AbstractParquetMapInspector { + + public StandardParquetHiveMapInspector(final ObjectInspector keyInspector, final ObjectInspector valueInspector) { + super(keyInspector, valueInspector); + } + + @Override + public Object getMapValueElement(final Object data, final Object key) { + if (data == null || key == null) { + return null; + } + + if (data instanceof ArrayWritable) { + final Writable[] mapContainer = ((ArrayWritable) data).get(); + + if (mapContainer == null || mapContainer.length == 0) { + return null; + } + + final Writable[] mapArray = ((ArrayWritable) mapContainer[0]).get(); + + for (final Writable obj : mapArray) { + final ArrayWritable mapObj = (ArrayWritable) obj; + final Writable[] arr = mapObj.get(); + if (key.equals(arr[0])) { + return arr[1]; + } + } + + return null; + } + + if (data instanceof Map) { + return ((Map) data).get(key); + } + + throw new UnsupportedOperationException("Cannot inspect " + data.getClass().getCanonicalName()); + } +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/serde/primitive/ParquetByteInspector.java b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/serde/primitive/ParquetByteInspector.java new file mode 100644 index 0000000..c567102 --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/serde/primitive/ParquetByteInspector.java @@ -0,0 +1,43 @@ +package org.apache.hadoop.hive.ql.io.parquet.serde.primitive; + +import org.apache.hadoop.hive.serde2.io.ByteWritable; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.AbstractPrimitiveJavaObjectInspector; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.SettableByteObjectInspector; +import org.apache.hadoop.io.IntWritable; + +/** + * The ParquetByteInspector can inspect both ByteWritables and IntWritables into bytes. + * + */ +public class ParquetByteInspector extends AbstractPrimitiveJavaObjectInspector implements SettableByteObjectInspector { + + ParquetByteInspector() { + super(TypeInfoFactory.byteTypeInfo); + } + + @Override + public Object getPrimitiveWritableObject(final Object o) { + return o == null ? null : new ByteWritable(get(o)); + } + + @Override + public Object create(final byte val) { + return new ByteWritable(val); + } + + @Override + public Object set(final Object o, final byte val) { + ((ByteWritable) o).set(val); + return o; + } + + @Override + public byte get(Object o) { + // Accept int writables and convert them. + if (o instanceof IntWritable) { + return (byte) ((IntWritable) o).get(); + } + return ((ByteWritable) o).get(); + } +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/serde/primitive/ParquetPrimitiveInspectorFactory.java b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/serde/primitive/ParquetPrimitiveInspectorFactory.java new file mode 100644 index 0000000..4973b95 --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/serde/primitive/ParquetPrimitiveInspectorFactory.java @@ -0,0 +1,16 @@ +package org.apache.hadoop.hive.ql.io.parquet.serde.primitive; + +/** + * The ParquetPrimitiveInspectorFactory allows us to be sure that the same object is inspected by the same inspector. 
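+ * All inspectors held here are stateless, so sharing a single instance per primitive type is safe.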
+ * + */ +public class ParquetPrimitiveInspectorFactory { + + public static final ParquetByteInspector parquetByteInspector = new ParquetByteInspector(); + public static final ParquetShortInspector parquetShortInspector = new ParquetShortInspector(); + public static final ParquetStringInspector parquetStringInspector = new ParquetStringInspector(); + + private ParquetPrimitiveInspectorFactory() { + // prevent instantiation + } +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/serde/primitive/ParquetShortInspector.java b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/serde/primitive/ParquetShortInspector.java new file mode 100644 index 0000000..f2b1176 --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/serde/primitive/ParquetShortInspector.java @@ -0,0 +1,43 @@ +package org.apache.hadoop.hive.ql.io.parquet.serde.primitive; + +import org.apache.hadoop.hive.serde2.io.ShortWritable; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.AbstractPrimitiveJavaObjectInspector; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.SettableShortObjectInspector; +import org.apache.hadoop.io.IntWritable; + +/** + * The ParquetShortInspector can inspect both ShortWritables and IntWritables into shorts. + * + */ +public class ParquetShortInspector extends AbstractPrimitiveJavaObjectInspector implements SettableShortObjectInspector { + + ParquetShortInspector() { + super(TypeInfoFactory.shortTypeInfo); + } + + @Override + public Object getPrimitiveWritableObject(final Object o) { + return o == null ? null : new ShortWritable(get(o)); + } + + @Override + public Object create(final short val) { + return new ShortWritable(val); + } + + @Override + public Object set(final Object o, final short val) { + ((ShortWritable) o).set(val); + return o; + } + + @Override + public short get(Object o) { + // Accept int writables and convert them. + if (o instanceof IntWritable) { + return (short) ((IntWritable) o).get(); + } + return ((ShortWritable) o).get(); + } +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/serde/primitive/ParquetStringInspector.java b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/serde/primitive/ParquetStringInspector.java new file mode 100644 index 0000000..869958e --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/serde/primitive/ParquetStringInspector.java @@ -0,0 +1,85 @@ +package org.apache.hadoop.hive.ql.io.parquet.serde.primitive; + +import org.apache.hadoop.hive.ql.io.parquet.writable.BinaryWritable; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.AbstractPrimitiveJavaObjectInspector; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.SettableStringObjectInspector; +import org.apache.hadoop.io.Text; + +import parquet.io.api.Binary; + +/** + * The ParquetStringInspector inspects a BinaryWritable to give a Text or String. 
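+ * It also accepts Text and String values directly, so already-materialized data can be re-inspected.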
+ * + */ +public class ParquetStringInspector extends AbstractPrimitiveJavaObjectInspector implements SettableStringObjectInspector { + + ParquetStringInspector() { + super(TypeInfoFactory.stringTypeInfo); + } + + @Override + public Text getPrimitiveWritableObject(final Object o) { + if (o == null) { + return null; + } + + if (o instanceof BinaryWritable) { + return new Text(((BinaryWritable) o).getBytes()); + } + + if (o instanceof Text) { + return (Text) o; + } + + if (o instanceof String) { + return new Text((String) o); + } + + throw new UnsupportedOperationException("Cannot inspect " + o.getClass().getCanonicalName()); + } + + @Override + public String getPrimitiveJavaObject(final Object o) { + if (o == null) { + return null; + } + + if (o instanceof BinaryWritable) { + return ((BinaryWritable) o).getString(); + } + + if (o instanceof Text) { + return ((Text) o).toString(); + } + + if (o instanceof String) { + return (String) o; + } + + throw new UnsupportedOperationException("Cannot inspect " + o.getClass().getCanonicalName()); + } + + @Override + public Object set(final Object o, final Text text) { + return new BinaryWritable(text == null ? null : Binary.fromByteArray(text.getBytes())); + } + + @Override + public Object set(final Object o, final String string) { + return new BinaryWritable(string == null ? null : Binary.fromString(string)); + } + + @Override + public Object create(final Text text) { + if (text == null) { + return null; + } + return text.toString(); + } + + @Override + public Object create(final String string) { + return string; + } +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/writable/BigDecimalWritable.java b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/writable/BigDecimalWritable.java new file mode 100644 index 0000000..d4498d3 --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/writable/BigDecimalWritable.java @@ -0,0 +1,130 @@ +package org.apache.hadoop.hive.ql.io.parquet.writable; + +import java.io.DataInput; +import java.io.DataOutput; +import java.io.IOException; +import java.math.BigDecimal; +import java.math.BigInteger; + +import org.apache.hadoop.hive.serde2.ByteStream.Output; +import org.apache.hadoop.hive.serde2.lazybinary.LazyBinaryUtils; +import org.apache.hadoop.hive.serde2.lazybinary.LazyBinaryUtils.VInt; +import org.apache.hadoop.io.WritableComparable; +import org.apache.hadoop.io.WritableUtils; + +/** + * This file is taken from a patch to hive 0.11 + * Issue : https://issues.apache.org/jira/browse/HIVE-2693 + * + */ +public class BigDecimalWritable implements WritableComparable { + + private byte[] internalStorage = new byte[0]; + private int scale; + + private final VInt vInt = new VInt(); // reusable integer + + public BigDecimalWritable() { + } + + public BigDecimalWritable(final byte[] bytes, final int scale) { + set(bytes, scale); + } + + public BigDecimalWritable(final BigDecimalWritable writable) { + set(writable.getBigDecimal()); + } + + public BigDecimalWritable(final BigDecimal value) { + set(value); + } + + public void set(BigDecimal value) { + value = value.stripTrailingZeros(); + if (value.compareTo(BigDecimal.ZERO) == 0) { + // Special case for 0, because java doesn't strip zeros correctly on + // that number. 
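+ // (stripTrailingZeros() can leave a non-zero scale on zero values such as 0.00, so zero is normalized to BigDecimal.ZERO explicitly.)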
+ value = BigDecimal.ZERO; + } + set(value.unscaledValue().toByteArray(), value.scale()); + } + + public void set(final BigDecimalWritable writable) { + set(writable.getBigDecimal()); + } + + public void set(final byte[] bytes, final int scale) { + this.internalStorage = bytes; + this.scale = scale; + } + + public void setFromBytes(final byte[] bytes, int offset, final int length) { + LazyBinaryUtils.readVInt(bytes, offset, vInt); + scale = vInt.value; + offset += vInt.length; + LazyBinaryUtils.readVInt(bytes, offset, vInt); + offset += vInt.length; + if (internalStorage.length != vInt.value) { + internalStorage = new byte[vInt.value]; + } + System.arraycopy(bytes, offset, internalStorage, 0, vInt.value); + } + + public BigDecimal getBigDecimal() { + return new BigDecimal(new BigInteger(internalStorage), scale); + } + + @Override + public void readFields(final DataInput in) throws IOException { + scale = WritableUtils.readVInt(in); + final int byteArrayLen = WritableUtils.readVInt(in); + if (internalStorage.length != byteArrayLen) { + internalStorage = new byte[byteArrayLen]; + } + in.readFully(internalStorage); + } + + @Override + public void write(final DataOutput out) throws IOException { + WritableUtils.writeVInt(out, scale); + WritableUtils.writeVInt(out, internalStorage.length); + out.write(internalStorage); + } + + @Override + public int compareTo(final BigDecimalWritable that) { + return getBigDecimal().compareTo(that.getBigDecimal()); + } + + public void writeToByteStream(final Output byteStream) { + LazyBinaryUtils.writeVInt(byteStream, scale); + LazyBinaryUtils.writeVInt(byteStream, internalStorage.length); + byteStream.write(internalStorage, 0, internalStorage.length); + } + + @Override + public String toString() { + return getBigDecimal().toString(); + } + + @Override + public boolean equals(final Object other) { + if (other == null || !(other instanceof BigDecimalWritable)) { + return false; + } + final BigDecimalWritable bdw = (BigDecimalWritable) other; + + // 'equals' and 'compareTo' are not compatible with BigDecimals. We want + // compareTo which returns true iff the numbers are equal (e.g.: 3.14 is + // the same as 3.140). 
'Equals' returns true iff equal and the same + // scale + // is set in the decimals (e.g.: 3.14 is not the same as 3.140) + return getBigDecimal().compareTo(bdw.getBigDecimal()) == 0; + } + + @Override + public int hashCode() { + return getBigDecimal().hashCode(); + } + +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/writable/BinaryWritable.java b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/writable/BinaryWritable.java new file mode 100644 index 0000000..dd92c81 --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/writable/BinaryWritable.java @@ -0,0 +1,80 @@ +package org.apache.hadoop.hive.ql.io.parquet.writable; + +import java.io.DataInput; +import java.io.DataOutput; +import java.io.IOException; + +import org.apache.hadoop.io.Writable; + +import parquet.io.api.Binary; + +/** + * + * A Wrapper to support constructor with Binary and String + * + * TODO : remove it, and call BytesWritable with the getBytes() + * + */ +public class BinaryWritable implements Writable { + + private Binary binary; + + public BinaryWritable(final Binary binary) { + this.binary = binary; + } + + public Binary getBinary() { + return binary; + } + + public byte[] getBytes() { + return binary.getBytes(); + } + + public String getString() { + return binary.toStringUsingUTF8(); + } + + @Override + public void readFields(DataInput input) throws IOException { + byte[] bytes = new byte[input.readInt()]; + input.readFully(bytes); + binary = Binary.fromByteArray(bytes); + } + + @Override + public void write(DataOutput output) throws IOException { + output.writeInt(binary.length()); + binary.writeTo(output); + } + + @Override + public int hashCode() { + return binary == null ? 0 : binary.hashCode(); + } + + @Override + public boolean equals(Object obj) { + if (obj instanceof BinaryWritable) { + final BinaryWritable other = (BinaryWritable)obj; + return binary.equals(other.binary); + } + return false; + } + + public static class DicBinaryWritable extends BinaryWritable { + + private final String string; + + public DicBinaryWritable(Binary binary, String string) { + super(binary); + this.string = string; + } + + @Override + public String getString() { + return string; + } + } + +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/write/DataWritableWriteSupport.java b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/write/DataWritableWriteSupport.java new file mode 100644 index 0000000..00ec229 --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/write/DataWritableWriteSupport.java @@ -0,0 +1,47 @@ +package org.apache.hadoop.hive.ql.io.parquet.write; + +import java.util.HashMap; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.ArrayWritable; + +import parquet.hadoop.api.WriteSupport; +import parquet.io.api.RecordConsumer; +import parquet.schema.MessageType; +import parquet.schema.MessageTypeParser; + +/** + * + * DataWritableWriteSupport is a WriteSupport for the DataWritableWriter + * + */ +public class DataWritableWriteSupport extends WriteSupport { + + public static final String PARQUET_HIVE_SCHEMA = "parquet.hive.schema"; + + public static void setSchema(final MessageType schema, final Configuration configuration) { + configuration.set(PARQUET_HIVE_SCHEMA, schema.toString()); + } + + public static MessageType getSchema(final Configuration configuration) { + return MessageTypeParser.parseMessageType(configuration.get(PARQUET_HIVE_SCHEMA)); + } + private DataWritableWriter writer; + private MessageType schema; + + @Override + 
public WriteContext init(final Configuration configuration) { + schema = getSchema(configuration); + return new WriteContext(schema, new HashMap()); + } + + @Override + public void prepareForWrite(final RecordConsumer recordConsumer) { + writer = new DataWritableWriter(recordConsumer, schema); + } + + @Override + public void write(final ArrayWritable record) { + writer.write(record); + } +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/write/DataWritableWriter.java b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/write/DataWritableWriter.java new file mode 100644 index 0000000..4bc2379 --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/write/DataWritableWriter.java @@ -0,0 +1,147 @@ +package org.apache.hadoop.hive.ql.io.parquet.write; + +import org.apache.hadoop.hive.ql.io.parquet.writable.BigDecimalWritable; +import org.apache.hadoop.hive.ql.io.parquet.writable.BinaryWritable; +import org.apache.hadoop.hive.serde2.io.ByteWritable; +import org.apache.hadoop.hive.serde2.io.DoubleWritable; +import org.apache.hadoop.hive.serde2.io.ShortWritable; +import org.apache.hadoop.io.ArrayWritable; +import org.apache.hadoop.io.BooleanWritable; +import org.apache.hadoop.io.FloatWritable; +import org.apache.hadoop.io.IntWritable; +import org.apache.hadoop.io.LongWritable; +import org.apache.hadoop.io.Writable; + +import parquet.io.ParquetEncodingException; +import parquet.io.api.RecordConsumer; +import parquet.schema.GroupType; +import parquet.schema.Type; + +/** + * + * DataWritableWriter is a writer, + * that will read an ArrayWritable and give the data to parquet + * with the expected schema + * + */ +public class DataWritableWriter { + + private final RecordConsumer recordConsumer; + private final GroupType schema; + + public DataWritableWriter(final RecordConsumer recordConsumer, final GroupType schema) { + this.recordConsumer = recordConsumer; + this.schema = schema; + } + + public void write(final ArrayWritable arr) { + + if (arr == null) { + return; + } + recordConsumer.startMessage(); + writeData(arr, schema); + recordConsumer.endMessage(); + } + + private void writeData(final ArrayWritable arr, final GroupType type) { + + if (arr == null) { + return; + } + + final int fieldCount = type.getFieldCount(); + Writable[] values = arr.get(); + for (int field = 0; field < fieldCount; ++field) { + final Type fieldType = type.getType(field); + final String fieldName = fieldType.getName(); + final Writable value = values[field]; + if (value == null) { + continue; + } + recordConsumer.startField(fieldName, field); + + if (fieldType.isPrimitive()) { + writePrimitive(value); + } else { + recordConsumer.startGroup(); + if (value instanceof ArrayWritable) { + if (fieldType.asGroupType().getRepetition().equals(Type.Repetition.REPEATED)) { + writeArray((ArrayWritable) value, fieldType.asGroupType()); + } else { + writeData((ArrayWritable) value, fieldType.asGroupType()); + } + } else if (value != null) { + throw new ParquetEncodingException("This should be an ArrayWritable or MapWritable: " + value); + } + + recordConsumer.endGroup(); + } + + recordConsumer.endField(fieldName, field); + } + } + + private void writeArray(final ArrayWritable array, final GroupType type) { + if (array == null) { + return; + } + + final Writable[] subValues = array.get(); + + final int fieldCount = type.getFieldCount(); + for (int field = 0; field < fieldCount; ++field) { + final Type subType = type.getType(field); + recordConsumer.startField(subType.getName(), field); + for (int i = 0; i < 
subValues.length; ++i) { + final Writable subValue = subValues[i]; + if (subValue != null) { + if (subType.isPrimitive()) { + if (subValue instanceof ArrayWritable) { + writePrimitive(((ArrayWritable) subValue).get()[field]);// 0 ? + } else { + writePrimitive(subValue); + } + } else { + if (!(subValue instanceof ArrayWritable)) { + throw new RuntimeException("This should be a ArrayWritable: " + subValue); + } else { + recordConsumer.startGroup(); + writeData((ArrayWritable) subValue, subType.asGroupType()); + recordConsumer.endGroup(); + } + } + } + } + recordConsumer.endField(subType.getName(), field); + } + + } + + private void writePrimitive(final Writable value) { + if (value == null) { + return; + } + if (value instanceof DoubleWritable) { + recordConsumer.addDouble(((DoubleWritable) value).get()); + } else if (value instanceof BooleanWritable) { + recordConsumer.addBoolean(((BooleanWritable) value).get()); + } else if (value instanceof FloatWritable) { + recordConsumer.addFloat(((FloatWritable) value).get()); + } else if (value instanceof IntWritable) { + recordConsumer.addInteger(((IntWritable) value).get()); + } else if (value instanceof LongWritable) { + recordConsumer.addLong(((LongWritable) value).get()); + } else if (value instanceof ShortWritable) { + recordConsumer.addInteger(((ShortWritable) value).get()); + } else if (value instanceof ByteWritable) { + recordConsumer.addInteger(((ByteWritable) value).get()); + } else if (value instanceof BigDecimalWritable) { + throw new UnsupportedOperationException("BigDecimal writing not implemented"); + } else if (value instanceof BinaryWritable) { + recordConsumer.addBinary(((BinaryWritable) value).getBinary()); + } else { + throw new RuntimeException("Unknown value type: " + value + " " + value.getClass()); + } + } +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/write/ParquetRecordWriterWrapper.java b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/write/ParquetRecordWriterWrapper.java new file mode 100644 index 0000000..1219975 --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/write/ParquetRecordWriterWrapper.java @@ -0,0 +1,81 @@ +package org.apache.hadoop.hive.ql.io.parquet.write; + +import java.io.IOException; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.ql.exec.FileSinkOperator; +import org.apache.hadoop.io.ArrayWritable; +import org.apache.hadoop.io.Writable; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.RecordWriter; +import org.apache.hadoop.mapred.Reporter; +import org.apache.hadoop.mapreduce.OutputFormat; +import org.apache.hadoop.mapreduce.TaskAttemptContext; +import org.apache.hadoop.mapreduce.TaskAttemptID; +import org.apache.hadoop.util.Progressable; +import org.apache.hadoop.hive.ql.io.FSRecordWriter; + +import parquet.hadoop.ParquetOutputFormat; +import parquet.hadoop.util.ContextUtil; + +public class ParquetRecordWriterWrapper implements RecordWriter, + FSRecordWriter { + + public static final Log LOG = LogFactory.getLog(ParquetRecordWriterWrapper.class); + + private final org.apache.hadoop.mapreduce.RecordWriter realWriter; + private TaskAttemptContext taskContext; + + public ParquetRecordWriterWrapper( + final OutputFormat realOutputFormat, + final JobConf jobConf, + final String name, + final Progressable progress) throws IOException { + try { + // create a TaskInputOutputContext + TaskAttemptID taskAttemptID = 
TaskAttemptID.forName(jobConf.get("mapred.task.id")); + if (taskAttemptID == null) { + taskAttemptID = new TaskAttemptID(); + } + taskContext = ContextUtil.newTaskAttemptContext(jobConf, taskAttemptID); + + LOG.info("creating real writer to write at " + name); + realWriter = (org.apache.hadoop.mapreduce.RecordWriter) ((ParquetOutputFormat) realOutputFormat) + .getRecordWriter(taskContext, new Path(name)); + LOG.info("real writer: " + realWriter); + } catch (final InterruptedException e) { + throw new IOException(e); + } + } + + @Override + public void close(final Reporter reporter) throws IOException { + try { + realWriter.close(taskContext); + } catch (final InterruptedException e) { + throw new IOException(e); + } + } + + @Override + public void write(final Void key, final ArrayWritable value) throws IOException { + try { + realWriter.write(key, value); + } catch (final InterruptedException e) { + throw new IOException(e); + } + } + + @Override + public void close(final boolean abort) throws IOException { + close(null); + } + + @Override + public void write(final Writable w) throws IOException { + write(null, (ArrayWritable) w); + } + +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/BaseSemanticAnalyzer.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/BaseSemanticAnalyzer.java index 13d0a56..fa136fb 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/parse/BaseSemanticAnalyzer.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/BaseSemanticAnalyzer.java @@ -59,6 +59,9 @@ import org.apache.hadoop.hive.ql.io.orc.OrcInputFormat; import org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat; import org.apache.hadoop.hive.ql.io.orc.OrcSerde; +import org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat; +import org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat; +import org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe; import org.apache.hadoop.hive.ql.lib.Node; import org.apache.hadoop.hive.ql.metadata.Hive; import org.apache.hadoop.hive.ql.metadata.HiveException; @@ -138,6 +141,10 @@ protected static final String ORCFILE_SERDE = OrcSerde.class .getName(); + protected static final String PARQUETFILE_INPUT = MapredParquetInputFormat.class.getName(); + protected static final String PARQUETFILE_OUTPUT = MapredParquetOutputFormat.class.getName(); + protected static final String PARQUETFILE_SERDE = ParquetHiveSerDe.class.getName(); + class RowFormatParams { String fieldDelim = null; String fieldEscape = null; @@ -225,6 +232,12 @@ protected boolean fillStorageFormat(ASTNode child, AnalyzeCreateCommonVars share shared.serde = ORCFILE_SERDE; storageFormat = true; break; + case HiveParser.TOK_TBLPARQUETFILE: + inputFormat = PARQUETFILE_INPUT; + outputFormat = PARQUETFILE_OUTPUT; + shared.serde = PARQUETFILE_SERDE; + storageFormat = true; + break; case HiveParser.TOK_TABLEFILEFORMAT: inputFormat = unescapeSQLString(child.getChild(0).getText()); outputFormat = unescapeSQLString(child.getChild(1).getText()); @@ -256,6 +269,10 @@ protected void fillDefaultStorageFormat(AnalyzeCreateCommonVars shared) { inputFormat = ORCFILE_INPUT; outputFormat = ORCFILE_OUTPUT; shared.serde = ORCFILE_SERDE; + } else if ("PARQUET".equalsIgnoreCase(conf.getVar(HiveConf.ConfVars.HIVEDEFAULTFILEFORMAT))) { + inputFormat = PARQUETFILE_INPUT; + outputFormat = PARQUETFILE_OUTPUT; + shared.serde = PARQUETFILE_SERDE; } else { inputFormat = TEXTFILE_INPUT; outputFormat = TEXTFILE_OUTPUT; diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/HiveLexer.g 
b/ql/src/java/org/apache/hadoop/hive/ql/parse/HiveLexer.g index f83c15d..aea9c1c 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/parse/HiveLexer.g +++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/HiveLexer.g @@ -153,6 +153,7 @@ KW_SEQUENCEFILE: 'SEQUENCEFILE'; KW_TEXTFILE: 'TEXTFILE'; KW_RCFILE: 'RCFILE'; KW_ORCFILE: 'ORC'; +KW_PARQUETFILE: 'PARQUET'; KW_INPUTFORMAT: 'INPUTFORMAT'; KW_OUTPUTFORMAT: 'OUTPUTFORMAT'; KW_INPUTDRIVER: 'INPUTDRIVER'; diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/HiveParser.g b/ql/src/java/org/apache/hadoop/hive/ql/parse/HiveParser.g index c15c4b5..6585fcc 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/parse/HiveParser.g +++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/HiveParser.g @@ -182,6 +182,7 @@ TOK_TABLEROWFORMATMAPKEYS; TOK_TABLEROWFORMATLINES; TOK_TABLEROWFORMATNULL; TOK_TBLORCFILE; +TOK_TBLPARQUETFILE; TOK_TBLSEQUENCEFILE; TOK_TBLTEXTFILE; TOK_TBLRCFILE; @@ -1197,6 +1198,7 @@ fileFormat | KW_TEXTFILE -> ^(TOK_TBLTEXTFILE) | KW_RCFILE -> ^(TOK_TBLRCFILE) | KW_ORCFILE -> ^(TOK_TBLORCFILE) + | KW_PARQUETFILE -> ^(TOK_TBLPARQUETFILE) | KW_INPUTFORMAT inFmt=StringLiteral KW_OUTPUTFORMAT outFmt=StringLiteral (KW_INPUTDRIVER inDriver=StringLiteral KW_OUTPUTDRIVER outDriver=StringLiteral)? -> ^(TOK_TABLEFILEFORMAT $inFmt $outFmt $inDriver? $outDriver?) | genericSpec=identifier -> ^(TOK_FILEFORMAT_GENERIC $genericSpec) @@ -1675,6 +1677,7 @@ tableFileFormat | KW_STORED KW_AS KW_TEXTFILE -> TOK_TBLTEXTFILE | KW_STORED KW_AS KW_RCFILE -> TOK_TBLRCFILE | KW_STORED KW_AS KW_ORCFILE -> TOK_TBLORCFILE + | KW_STORED KW_AS KW_PARQUETFILE -> TOK_TBLPARQUETFILE | KW_STORED KW_AS KW_INPUTFORMAT inFmt=StringLiteral KW_OUTPUTFORMAT outFmt=StringLiteral (KW_INPUTDRIVER inDriver=StringLiteral KW_OUTPUTDRIVER outDriver=StringLiteral)? -> ^(TOK_TABLEFILEFORMAT $inFmt $outFmt $inDriver? $outDriver?) 
| KW_STORED KW_BY storageHandler=StringLiteral diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/IdentifiersParser.g b/ql/src/java/org/apache/hadoop/hive/ql/parse/IdentifiersParser.g index 4147503..2b57116 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/parse/IdentifiersParser.g +++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/IdentifiersParser.g @@ -535,5 +535,5 @@ identifier nonReserved : - KW_TRUE | KW_FALSE | KW_LIKE | KW_EXISTS | KW_ASC | KW_DESC | KW_ORDER | KW_GROUP | KW_BY | KW_AS | KW_INSERT | KW_OVERWRITE | KW_OUTER | KW_LEFT | KW_RIGHT | KW_FULL | KW_PARTITION | KW_PARTITIONS | KW_TABLE | KW_TABLES | KW_COLUMNS | KW_INDEX | KW_INDEXES | KW_REBUILD | KW_FUNCTIONS | KW_SHOW | KW_MSCK | KW_REPAIR | KW_DIRECTORY | KW_LOCAL | KW_USING | KW_CLUSTER | KW_DISTRIBUTE | KW_SORT | KW_UNION | KW_LOAD | KW_EXPORT | KW_IMPORT | KW_DATA | KW_INPATH | KW_IS | KW_NULL | KW_CREATE | KW_EXTERNAL | KW_ALTER | KW_CHANGE | KW_FIRST | KW_AFTER | KW_DESCRIBE | KW_DROP | KW_RENAME | KW_IGNORE | KW_PROTECTION | KW_TO | KW_COMMENT | KW_BOOLEAN | KW_TINYINT | KW_SMALLINT | KW_INT | KW_BIGINT | KW_FLOAT | KW_DOUBLE | KW_DATE | KW_DATETIME | KW_TIMESTAMP | KW_DECIMAL | KW_STRING | KW_ARRAY | KW_STRUCT | KW_UNIONTYPE | KW_PARTITIONED | KW_CLUSTERED | KW_SORTED | KW_INTO | KW_BUCKETS | KW_ROW | KW_ROWS | KW_FORMAT | KW_DELIMITED | KW_FIELDS | KW_TERMINATED | KW_ESCAPED | KW_COLLECTION | KW_ITEMS | KW_KEYS | KW_KEY_TYPE | KW_LINES | KW_STORED | KW_FILEFORMAT | KW_SEQUENCEFILE | KW_TEXTFILE | KW_RCFILE | KW_ORCFILE | KW_INPUTFORMAT | KW_OUTPUTFORMAT | KW_INPUTDRIVER | KW_OUTPUTDRIVER | KW_OFFLINE | KW_ENABLE | KW_DISABLE | KW_READONLY | KW_NO_DROP | KW_LOCATION | KW_BUCKET | KW_OUT | KW_OF | KW_PERCENT | KW_ADD | KW_REPLACE | KW_RLIKE | KW_REGEXP | KW_TEMPORARY | KW_EXPLAIN | KW_FORMATTED | KW_PRETTY | KW_DEPENDENCY | KW_LOGICAL | KW_SERDE | KW_WITH | KW_DEFERRED | KW_SERDEPROPERTIES | KW_DBPROPERTIES | KW_LIMIT | KW_SET | KW_UNSET | KW_TBLPROPERTIES | KW_IDXPROPERTIES | KW_VALUE_TYPE | KW_ELEM_TYPE | KW_MAPJOIN | KW_STREAMTABLE | KW_HOLD_DDLTIME | KW_CLUSTERSTATUS | KW_UTC | KW_UTCTIMESTAMP | KW_LONG | KW_DELETE | KW_PLUS | KW_MINUS | KW_FETCH | KW_INTERSECT | KW_VIEW | KW_IN | KW_DATABASES | KW_MATERIALIZED | KW_SCHEMA | KW_SCHEMAS | KW_GRANT | KW_REVOKE | KW_SSL | KW_UNDO | KW_LOCK | KW_LOCKS | KW_UNLOCK | KW_SHARED | KW_EXCLUSIVE | KW_PROCEDURE | KW_UNSIGNED | KW_WHILE | KW_READ | KW_READS | KW_PURGE | KW_RANGE | KW_ANALYZE | KW_BEFORE | KW_BETWEEN | KW_BOTH | KW_BINARY | KW_CONTINUE | KW_CURSOR | KW_TRIGGER | KW_RECORDREADER | KW_RECORDWRITER | KW_SEMI | KW_LATERAL | KW_TOUCH | KW_ARCHIVE | KW_UNARCHIVE | KW_COMPUTE | KW_STATISTICS | KW_USE | KW_OPTION | KW_CONCATENATE | KW_SHOW_DATABASE | KW_UPDATE | KW_RESTRICT | KW_CASCADE | KW_SKEWED | KW_ROLLUP | KW_CUBE | KW_DIRECTORIES | KW_FOR | KW_GROUPING | KW_SETS | KW_TRUNCATE | KW_NOSCAN | KW_USER | KW_ROLE | KW_ROLES | KW_INNER | KW_DEFINED | KW_ADMIN + KW_TRUE | KW_FALSE | KW_LIKE | KW_EXISTS | KW_ASC | KW_DESC | KW_ORDER | KW_GROUP | KW_BY | KW_AS | KW_INSERT | KW_OVERWRITE | KW_OUTER | KW_LEFT | KW_RIGHT | KW_FULL | KW_PARTITION | KW_PARTITIONS | KW_TABLE | KW_TABLES | KW_COLUMNS | KW_INDEX | KW_INDEXES | KW_REBUILD | KW_FUNCTIONS | KW_SHOW | KW_MSCK | KW_REPAIR | KW_DIRECTORY | KW_LOCAL | KW_USING | KW_CLUSTER | KW_DISTRIBUTE | KW_SORT | KW_UNION | KW_LOAD | KW_EXPORT | KW_IMPORT | KW_DATA | KW_INPATH | KW_IS | KW_NULL | KW_CREATE | KW_EXTERNAL | KW_ALTER | KW_CHANGE | KW_FIRST | KW_AFTER | KW_DESCRIBE | KW_DROP | KW_RENAME | KW_IGNORE | 
KW_PROTECTION | KW_TO | KW_COMMENT | KW_BOOLEAN | KW_TINYINT | KW_SMALLINT | KW_INT | KW_BIGINT | KW_FLOAT | KW_DOUBLE | KW_DATE | KW_DATETIME | KW_TIMESTAMP | KW_DECIMAL | KW_STRING | KW_ARRAY | KW_STRUCT | KW_UNIONTYPE | KW_PARTITIONED | KW_CLUSTERED | KW_SORTED | KW_INTO | KW_BUCKETS | KW_ROW | KW_ROWS | KW_FORMAT | KW_DELIMITED | KW_FIELDS | KW_TERMINATED | KW_ESCAPED | KW_COLLECTION | KW_ITEMS | KW_KEYS | KW_KEY_TYPE | KW_LINES | KW_STORED | KW_FILEFORMAT | KW_SEQUENCEFILE | KW_TEXTFILE | KW_RCFILE | KW_ORCFILE | KW_PARQUETFILE | KW_INPUTFORMAT | KW_OUTPUTFORMAT | KW_INPUTDRIVER | KW_OUTPUTDRIVER | KW_OFFLINE | KW_ENABLE | KW_DISABLE | KW_READONLY | KW_NO_DROP | KW_LOCATION | KW_BUCKET | KW_OUT | KW_OF | KW_PERCENT | KW_ADD | KW_REPLACE | KW_RLIKE | KW_REGEXP | KW_TEMPORARY | KW_EXPLAIN | KW_FORMATTED | KW_PRETTY | KW_DEPENDENCY | KW_LOGICAL | KW_SERDE | KW_WITH | KW_DEFERRED | KW_SERDEPROPERTIES | KW_DBPROPERTIES | KW_LIMIT | KW_SET | KW_UNSET | KW_TBLPROPERTIES | KW_IDXPROPERTIES | KW_VALUE_TYPE | KW_ELEM_TYPE | KW_MAPJOIN | KW_STREAMTABLE | KW_HOLD_DDLTIME | KW_CLUSTERSTATUS | KW_UTC | KW_UTCTIMESTAMP | KW_LONG | KW_DELETE | KW_PLUS | KW_MINUS | KW_FETCH | KW_INTERSECT | KW_VIEW | KW_IN | KW_DATABASES | KW_MATERIALIZED | KW_SCHEMA | KW_SCHEMAS | KW_GRANT | KW_REVOKE | KW_SSL | KW_UNDO | KW_LOCK | KW_LOCKS | KW_UNLOCK | KW_SHARED | KW_EXCLUSIVE | KW_PROCEDURE | KW_UNSIGNED | KW_WHILE | KW_READ | KW_READS | KW_PURGE | KW_RANGE | KW_ANALYZE | KW_BEFORE | KW_BETWEEN | KW_BOTH | KW_BINARY | KW_CONTINUE | KW_CURSOR | KW_TRIGGER | KW_RECORDREADER | KW_RECORDWRITER | KW_SEMI | KW_LATERAL | KW_TOUCH | KW_ARCHIVE | KW_UNARCHIVE | KW_COMPUTE | KW_STATISTICS | KW_USE | KW_OPTION | KW_CONCATENATE | KW_SHOW_DATABASE | KW_UPDATE | KW_RESTRICT | KW_CASCADE | KW_SKEWED | KW_ROLLUP | KW_CUBE | KW_DIRECTORIES | KW_FOR | KW_GROUPING | KW_SETS | KW_TRUNCATE | KW_NOSCAN | KW_USER | KW_ROLE | KW_ROLES | KW_INNER | KW_DEFINED | KW_ADMIN ; diff --git a/ql/src/java/parquet/hive/DeprecatedParquetInputFormat.java b/ql/src/java/parquet/hive/DeprecatedParquetInputFormat.java new file mode 100644 index 0000000..1470957 --- /dev/null +++ b/ql/src/java/parquet/hive/DeprecatedParquetInputFormat.java @@ -0,0 +1,37 @@ +/** + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package parquet.hive; + +import org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat; +import org.apache.hadoop.io.ArrayWritable; + +import parquet.hadoop.ParquetInputFormat; + +/** + * Deprecated name of the parquet-hive input format. This class exists + * simply to provide backwards compatibility with users who specified + * this name in the Hive metastore. 
All users should now use + * {@link MapredParquetInputFormat MapredParquetInputFormat} + */ +@Deprecated +public class DeprecatedParquetInputFormat extends MapredParquetInputFormat { + + public DeprecatedParquetInputFormat() { + super(); + } + + public DeprecatedParquetInputFormat(final ParquetInputFormat realInputFormat) { + super(realInputFormat); + } +} diff --git a/ql/src/java/parquet/hive/DeprecatedParquetOutputFormat.java b/ql/src/java/parquet/hive/DeprecatedParquetOutputFormat.java new file mode 100644 index 0000000..2063702 --- /dev/null +++ b/ql/src/java/parquet/hive/DeprecatedParquetOutputFormat.java @@ -0,0 +1,36 @@ +/** + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package parquet.hive; + +import org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat; +import org.apache.hadoop.io.ArrayWritable; +import org.apache.hadoop.mapreduce.OutputFormat; + +/** + * Deprecated name of the parquet-hive output format. This class exists + * simply to provide backwards compatibility with users who specified + * this name in the Hive metastore. All users should now use + * {@link MapredParquetOutputFormat MapredParquetOutputFormat} + */ +@Deprecated +public class DeprecatedParquetOutputFormat extends MapredParquetOutputFormat { + + public DeprecatedParquetOutputFormat() { + super(); + } + + public DeprecatedParquetOutputFormat(final OutputFormat mapreduceOutputFormat) { + super(mapreduceOutputFormat); + } +} diff --git a/ql/src/java/parquet/hive/MapredParquetInputFormat.java b/ql/src/java/parquet/hive/MapredParquetInputFormat.java new file mode 100644 index 0000000..cbdc750 --- /dev/null +++ b/ql/src/java/parquet/hive/MapredParquetInputFormat.java @@ -0,0 +1,36 @@ +/** + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package parquet.hive; + +import org.apache.hadoop.io.ArrayWritable; + +import parquet.hadoop.ParquetInputFormat; + +/** + * Deprecated name of the parquet-hive input format. This class exists + * simply to provide backwards compatibility with users who specified + * this name in the Hive metastore. 
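+ * Existing table definitions that still reference the old class name keep working without an ALTER TABLE.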
All users should now use + * {@link org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat MapredParquetInputFormat} + */ +@Deprecated +public class MapredParquetInputFormat extends org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat { + + public MapredParquetInputFormat() { + super(); + } + + public MapredParquetInputFormat(final ParquetInputFormat realInputFormat) { + super(realInputFormat); + } +} diff --git a/ql/src/java/parquet/hive/MapredParquetOutputFormat.java b/ql/src/java/parquet/hive/MapredParquetOutputFormat.java new file mode 100644 index 0000000..5ccdf70 --- /dev/null +++ b/ql/src/java/parquet/hive/MapredParquetOutputFormat.java @@ -0,0 +1,35 @@ +/** + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package parquet.hive; + +import org.apache.hadoop.io.ArrayWritable; +import org.apache.hadoop.mapreduce.OutputFormat; + +/** + * Deprecated name of the parquet-hive output format. This class exists + * simply to provide backwards compatibility with users who specified + * this name in the Hive metastore. All users should now use + * {@link org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat MapredParquetOutputFormat} + */ +@Deprecated +public class MapredParquetOutputFormat extends org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat { + + public MapredParquetOutputFormat () { + super(); + } + + public MapredParquetOutputFormat(final OutputFormat mapreduceOutputFormat) { + super(mapreduceOutputFormat); + } +} diff --git a/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/TestHiveSchemaConverter.java b/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/TestHiveSchemaConverter.java new file mode 100644 index 0000000..f3657cc --- /dev/null +++ b/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/TestHiveSchemaConverter.java @@ -0,0 +1,109 @@ +package org.apache.hadoop.hive.ql.io.parquet; + +import static org.junit.Assert.assertEquals; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +import org.apache.hadoop.hive.ql.io.parquet.convert.HiveSchemaConverter; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils; +import org.junit.Test; + +import parquet.schema.MessageType; +import parquet.schema.MessageTypeParser; + +/** + * + * TestHiveSchemaConverter + * + * + * @author Mickaël Lacour + * + */ +public class TestHiveSchemaConverter { + + private List createHiveColumnsFrom(final String columnNamesStr) { + List columnNames; + if (columnNamesStr.length() == 0) { + columnNames = new ArrayList(); + } else { + columnNames = Arrays.asList(columnNamesStr.split(",")); + } + + return columnNames; + } + + private List createHiveTypeInfoFrom(final String columnsTypeStr) { + List columnTypes; + + if (columnsTypeStr.length() == 0) { + columnTypes = new ArrayList(); + } else { + columnTypes = TypeInfoUtils.getTypeInfosFromTypeString(columnsTypeStr); + } + + return columnTypes; + } + + private void testConversion(final String columnNamesStr, final String 
columnsTypeStr, final String expectedSchema) throws Exception { + final List columnNames = createHiveColumnsFrom(columnNamesStr); + final List columnTypes = createHiveTypeInfoFrom(columnsTypeStr); + final MessageType messageTypeFound = HiveSchemaConverter.convert(columnNames, columnTypes); + final MessageType expectedMT = MessageTypeParser.parseMessageType(expectedSchema); + assertEquals("converting " + columnNamesStr + ": " + columnsTypeStr + " to " + expectedSchema, expectedMT, messageTypeFound); + } + + @Test + public void testSimpleType() throws Exception { + testConversion( + "a,b,c", + "int,double,boolean", + "message hive_schema {\n" + + " optional int32 a;\n" + + " optional double b;\n" + + " optional boolean c;\n" + + "}\n"); + } + + @Test + public void testArray() throws Exception { + testConversion("arrayCol", + "array<int>", + "message hive_schema {\n" + + " optional group arrayCol (LIST) {\n" + + " repeated group bag {\n" + + " optional int32 array_element;\n" + + " }\n" + + " }\n" + + "}\n"); + } + + @Test + public void testStruct() throws Exception { + testConversion("structCol", + "struct<a:int,b:double,c:boolean>", + "message hive_schema {\n" + + " optional group structCol {\n" + + " optional int32 a;\n" + + " optional double b;\n" + + " optional boolean c;\n" + + " }\n" + + "}\n"); + } + + @Test + public void testMap() throws Exception { + testConversion("mapCol", + "map<string,string>", + "message hive_schema {\n" + + " optional group mapCol (MAP) {\n" + + " repeated group map (MAP_KEY_VALUE) {\n" + + " required binary key;\n" + + " optional binary value;\n" + + " }\n" + + " }\n" + + "}\n"); + } +} diff --git a/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/TestMapredParquetInputFormat.java b/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/TestMapredParquetInputFormat.java new file mode 100644 index 0000000..4bd5f43 --- /dev/null +++ b/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/TestMapredParquetInputFormat.java @@ -0,0 +1,31 @@ +package org.apache.hadoop.hive.ql.io.parquet; + +import static org.mockito.Mockito.mock; + +import org.apache.hadoop.io.ArrayWritable; +import org.junit.Test; + +import parquet.hadoop.ParquetInputFormat; + +/** + * + * Tests for MapredParquetInputFormat.
+ * + * @author Justin Coffey + * + */ +public class TestMapredParquetInputFormat { + @Test + public void testDefaultConstructor() { + new MapredParquetInputFormat(); + } + + @SuppressWarnings("unchecked") + @Test + public void testConstructorWithParquetInputFormat() { + new MapredParquetInputFormat( + (ParquetInputFormat) mock(ParquetInputFormat.class) + ); + } + +} diff --git a/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/TestMapredParquetOutputFormat.java b/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/TestMapredParquetOutputFormat.java new file mode 100644 index 0000000..0104857 --- /dev/null +++ b/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/TestMapredParquetOutputFormat.java @@ -0,0 +1,83 @@ +package org.apache.hadoop.hive.ql.io.parquet; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.fail; +import static org.mockito.Mockito.mock; + +import java.io.IOException; +import java.util.Properties; + +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.ql.io.parquet.write.DataWritableWriteSupport; +import org.apache.hadoop.hive.ql.io.parquet.write.ParquetRecordWriterWrapper; +import org.apache.hadoop.io.ArrayWritable; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.util.Progressable; +import org.junit.Test; + +import parquet.hadoop.ParquetOutputFormat; + +/** + * Tests for MapredParquetOutputFormat. + * + * @author Justin Coffey + * + */ +public class TestMapredParquetOutputFormat { + + @Test + public void testConstructor() { + new MapredParquetOutputFormat(); + } + + @SuppressWarnings("unchecked") + @Test + public void testConstructorWithFormat() { + new MapredParquetOutputFormat((ParquetOutputFormat) mock(ParquetOutputFormat.class)); + } + + @Test + public void testGetRecordWriterThrowsException() { + try { + new MapredParquetOutputFormat().getRecordWriter(null, null, null, null); + fail("should throw runtime exception."); + } catch (Exception e) { + assertEquals("Should never be used", e.getMessage()); + } + } + + @SuppressWarnings("unchecked") + @Test + public void testGetHiveRecordWriter() throws IOException { + Properties tableProps = new Properties(); + tableProps.setProperty("columns", "foo,bar"); + tableProps.setProperty("columns.types", "int:int"); + + final Progressable mockProgress = mock(Progressable.class); + final ParquetOutputFormat outputFormat = (ParquetOutputFormat) mock(ParquetOutputFormat.class); + + JobConf jobConf = new JobConf(); + + try { + new MapredParquetOutputFormat(outputFormat) { + @Override + protected ParquetRecordWriterWrapper getParquerRecordWriterWrapper( + ParquetOutputFormat realOutputFormat, + JobConf jobConf, + String finalOutPath, + Progressable progress + ) throws IOException { + assertEquals(outputFormat, realOutputFormat); + assertNotNull(jobConf.get(DataWritableWriteSupport.PARQUET_HIVE_SCHEMA)); + assertEquals("/foo", finalOutPath.toString()); + assertEquals(mockProgress, progress); + throw new RuntimeException("passed tests"); + } + }.getHiveRecordWriter(jobConf, new Path("/foo"), null, false, tableProps, mockProgress); + fail("should throw runtime exception."); + } catch (RuntimeException e) { + assertEquals("passed tests", e.getMessage()); + } + } +} diff --git a/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/TestParquetSerDe.java b/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/TestParquetSerDe.java new file mode 100644 index 0000000..03f4fd6 --- /dev/null +++ 
b/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/TestParquetSerDe.java @@ -0,0 +1,135 @@ +package org.apache.hadoop.hive.ql.io.parquet; + +import java.util.Properties; + +import junit.framework.TestCase; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe; +import org.apache.hadoop.hive.ql.io.parquet.writable.BinaryWritable; +import org.apache.hadoop.hive.serde2.SerDeException; +import org.apache.hadoop.hive.serde2.io.ByteWritable; +import org.apache.hadoop.hive.serde2.io.DoubleWritable; +import org.apache.hadoop.hive.serde2.io.ShortWritable; +import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; +import org.apache.hadoop.io.ArrayWritable; +import org.apache.hadoop.io.IntWritable; +import org.apache.hadoop.io.LongWritable; +import org.apache.hadoop.io.Writable; + +import parquet.io.api.Binary; + +/** + * + * testParquetHiveSerDe + * + * + * @author Mickaël Lacour + * + */ +public class TestParquetSerDe extends TestCase { + + public void testParquetHiveSerDe() throws Throwable { + try { + // Create the SerDe + System.out.println("test: testParquetHiveSerDe"); + + final ParquetHiveSerDe serDe = new ParquetHiveSerDe(); + final Configuration conf = new Configuration(); + final Properties tbl = createProperties(); + serDe.initialize(conf, tbl); + + // Data + final Writable[] arr = new Writable[8]; + + arr[0] = new ByteWritable((byte) 123); + arr[1] = new ShortWritable((short) 456); + arr[2] = new IntWritable(789); + arr[3] = new LongWritable(1000l); + arr[4] = new DoubleWritable((double) 5.3); + arr[5] = new BinaryWritable(Binary.fromString("hive and hadoop and parquet. Big family.")); + + final Writable[] mapContainer = new Writable[1]; + final Writable[] map = new Writable[3]; + for (int i = 0; i < 3; ++i) { + final Writable[] pair = new Writable[2]; + pair[0] = new BinaryWritable(Binary.fromString("key_" + i)); + pair[1] = new IntWritable(i); + map[i] = new ArrayWritable(Writable.class, pair); + } + mapContainer[0] = new ArrayWritable(Writable.class, map); + arr[6] = new ArrayWritable(Writable.class, mapContainer); + + final Writable[] arrayContainer = new Writable[1]; + final Writable[] array = new Writable[5]; + for (int i = 0; i < 5; ++i) { + array[i] = new BinaryWritable(Binary.fromString("elem_" + i)); + } + arrayContainer[0] = new ArrayWritable(Writable.class, array); + arr[7] = new ArrayWritable(Writable.class, arrayContainer); + + final ArrayWritable arrWritable = new ArrayWritable(Writable.class, arr); + // Test + deserializeAndSerializeLazySimple(serDe, arrWritable); + System.out.println("test: testParquetHiveSerDe - OK"); + + } catch (final Throwable e) { + e.printStackTrace(); + throw e; + } + } + + private void deserializeAndSerializeLazySimple(final ParquetHiveSerDe serDe, final ArrayWritable t) throws SerDeException { + + // Get the row structure + final StructObjectInspector oi = (StructObjectInspector) serDe.getObjectInspector(); + + // Deserialize + final Object row = serDe.deserialize(t); + assertEquals("deserialization gives the wrong object class", row.getClass(), ArrayWritable.class); + assertEquals("size correct after deserialization", serDe.getSerDeStats().getRawDataSize(), t.get().length); + assertEquals("deserialization gives the wrong object", t, row); + + // Serialize + final ArrayWritable serializedArr = (ArrayWritable) serDe.serialize(row, oi); + assertEquals("size correct after serialization", serDe.getSerDeStats().getRawDataSize(), serializedArr.get().length); + 
assertTrue("serialized object should be equal to starting object", arrayWritableEquals(t, serializedArr)); + } + + private Properties createProperties() { + final Properties tbl = new Properties(); + + // Set the configuration parameters + tbl.setProperty("columns", "abyte,ashort,aint,along,adouble,astring,amap,alist"); + tbl.setProperty("columns.types", "tinyint:smallint:int:bigint:double:string:map<string,int>:array<string>"); + tbl.setProperty(org.apache.hadoop.hive.serde.serdeConstants.SERIALIZATION_NULL_FORMAT, "NULL"); + return tbl; + } + + public static boolean arrayWritableEquals(final ArrayWritable a1, final ArrayWritable a2) { + final Writable[] a1Arr = a1.get(); + final Writable[] a2Arr = a2.get(); + + if (a1Arr.length != a2Arr.length) { + return false; + } + + for (int i = 0; i < a1Arr.length; ++i) { + if (a1Arr[i] instanceof ArrayWritable) { + if (!(a2Arr[i] instanceof ArrayWritable)) { + return false; + } + if (!arrayWritableEquals((ArrayWritable) a1Arr[i], (ArrayWritable) a2Arr[i])) { + return false; + } + } else { + if (!a1Arr[i].equals(a2Arr[i])) { + return false; + } + } + + } + return true; + } + +} diff --git a/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/serde/TestAbstractParquetMapInspector.java b/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/serde/TestAbstractParquetMapInspector.java new file mode 100644 index 0000000..66c5da6 --- /dev/null +++ b/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/serde/TestAbstractParquetMapInspector.java @@ -0,0 +1,89 @@ +package org.apache.hadoop.hive.ql.io.parquet.serde; + +import java.util.HashMap; +import java.util.Map; + +import junit.framework.TestCase; + +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; +import org.apache.hadoop.io.ArrayWritable; +import org.apache.hadoop.io.IntWritable; +import org.apache.hadoop.io.Writable; +import org.junit.Test; + +/** + * + * @author Rémy Pecqueur + */ +public class TestAbstractParquetMapInspector extends TestCase { + + class TestableAbstractParquetMapInspector extends AbstractParquetMapInspector { + + public TestableAbstractParquetMapInspector(ObjectInspector keyInspector, ObjectInspector valueInspector) { + super(keyInspector, valueInspector); + } + + @Override + public Object getMapValueElement(Object o, Object o1) { + throw new UnsupportedOperationException("Should not be called"); + } + } + private TestableAbstractParquetMapInspector inspector; + + @Override + public void setUp() { + inspector = new TestableAbstractParquetMapInspector(PrimitiveObjectInspectorFactory.javaIntObjectInspector, + PrimitiveObjectInspectorFactory.javaIntObjectInspector); + } + + @Test + public void testNullMap() { + assertEquals("Wrong size", -1, inspector.getMapSize(null)); + assertNull("Should be null", inspector.getMap(null)); + } + + @Test + public void testNullContainer() { + final ArrayWritable map = new ArrayWritable(ArrayWritable.class, null); + assertEquals("Wrong size", -1, inspector.getMapSize(map)); + assertNull("Should be null", inspector.getMap(map)); + } + + @Test + public void testEmptyContainer() { + final ArrayWritable map = new ArrayWritable(ArrayWritable.class, new ArrayWritable[0]); + assertEquals("Wrong size", -1, inspector.getMapSize(map)); + assertNull("Should be null", inspector.getMap(map)); + } + + @Test + public void testRegularMap() { + final Writable[] entry1 = new Writable[]{new IntWritable(0), new IntWritable(1)}; + final Writable[] entry2 = new Writable[]{new
IntWritable(2), new IntWritable(3)}; + + final ArrayWritable internalMap = new ArrayWritable(ArrayWritable.class, new Writable[]{ + new ArrayWritable(Writable.class, entry1), new ArrayWritable(Writable.class, entry2)}); + + final ArrayWritable map = new ArrayWritable(ArrayWritable.class, new Writable[]{internalMap}); + + final Map expected = new HashMap(); + expected.put(new IntWritable(0), new IntWritable(1)); + expected.put(new IntWritable(2), new IntWritable(3)); + + assertEquals("Wrong size", 2, inspector.getMapSize(map)); + assertEquals("Wrong result of inspection", expected, inspector.getMap(map)); + } + + @Test + public void testHashMap() { + final Map map = new HashMap(); + map.put(new IntWritable(0), new IntWritable(1)); + map.put(new IntWritable(2), new IntWritable(3)); + map.put(new IntWritable(4), new IntWritable(5)); + map.put(new IntWritable(6), new IntWritable(7)); + + assertEquals("Wrong size", 4, inspector.getMapSize(map)); + assertEquals("Wrong result of inspection", map, inspector.getMap(map)); + } +} diff --git a/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/serde/TestDeepParquetHiveMapInspector.java b/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/serde/TestDeepParquetHiveMapInspector.java new file mode 100644 index 0000000..33db935 --- /dev/null +++ b/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/serde/TestDeepParquetHiveMapInspector.java @@ -0,0 +1,81 @@ +package org.apache.hadoop.hive.ql.io.parquet.serde; + +import java.util.HashMap; +import java.util.Map; + +import junit.framework.TestCase; + +import org.apache.hadoop.hive.ql.io.parquet.serde.primitive.ParquetPrimitiveInspectorFactory; +import org.apache.hadoop.hive.serde2.io.ShortWritable; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; +import org.apache.hadoop.io.ArrayWritable; +import org.apache.hadoop.io.IntWritable; +import org.apache.hadoop.io.Writable; +import org.junit.Test; + +/** + * + * @author Rémy Pecqueur + */ +public class TestDeepParquetHiveMapInspector extends TestCase { + + private DeepParquetHiveMapInspector inspector; + + @Override + public void setUp() { + inspector = new DeepParquetHiveMapInspector(ParquetPrimitiveInspectorFactory.parquetShortInspector, + PrimitiveObjectInspectorFactory.javaIntObjectInspector); + } + + @Test + public void testNullMap() { + assertNull("Should be null", inspector.getMapValueElement(null, new ShortWritable((short) 0))); + } + + @Test + public void testNullContainer() { + final ArrayWritable map = new ArrayWritable(ArrayWritable.class, null); + assertNull("Should be null", inspector.getMapValueElement(map, new ShortWritable((short) 0))); + } + + @Test + public void testEmptyContainer() { + final ArrayWritable map = new ArrayWritable(ArrayWritable.class, new ArrayWritable[0]); + assertNull("Should be null", inspector.getMapValueElement(map, new ShortWritable((short) 0))); + } + + @Test + public void testRegularMap() { + final Writable[] entry1 = new Writable[]{new IntWritable(0), new IntWritable(1)}; + final Writable[] entry2 = new Writable[]{new IntWritable(2), new IntWritable(3)}; + + final ArrayWritable internalMap = new ArrayWritable(ArrayWritable.class, new Writable[]{ + new ArrayWritable(Writable.class, entry1), new ArrayWritable(Writable.class, entry2)}); + + final ArrayWritable map = new ArrayWritable(ArrayWritable.class, new Writable[]{internalMap}); + + assertEquals("Wrong result of inspection", new IntWritable(1), inspector.getMapValueElement(map, new IntWritable(0))); + assertEquals("Wrong result 
of inspection", new IntWritable(3), inspector.getMapValueElement(map, new IntWritable(2))); + assertEquals("Wrong result of inspection", new IntWritable(1), inspector.getMapValueElement(map, new ShortWritable((short) 0))); + assertEquals("Wrong result of inspection", new IntWritable(3), inspector.getMapValueElement(map, new ShortWritable((short) 2))); + } + + @Test + public void testHashMap() { + final Map map = new HashMap(); + map.put(new IntWritable(0), new IntWritable(1)); + map.put(new IntWritable(2), new IntWritable(3)); + map.put(new IntWritable(4), new IntWritable(5)); + map.put(new IntWritable(6), new IntWritable(7)); + + + assertEquals("Wrong result of inspection", new IntWritable(1), inspector.getMapValueElement(map, new IntWritable(0))); + assertEquals("Wrong result of inspection", new IntWritable(3), inspector.getMapValueElement(map, new IntWritable(2))); + assertEquals("Wrong result of inspection", new IntWritable(5), inspector.getMapValueElement(map, new IntWritable(4))); + assertEquals("Wrong result of inspection", new IntWritable(7), inspector.getMapValueElement(map, new IntWritable(6))); + assertEquals("Wrong result of inspection", new IntWritable(1), inspector.getMapValueElement(map, new ShortWritable((short) 0))); + assertEquals("Wrong result of inspection", new IntWritable(3), inspector.getMapValueElement(map, new ShortWritable((short) 2))); + assertEquals("Wrong result of inspection", new IntWritable(5), inspector.getMapValueElement(map, new ShortWritable((short) 4))); + assertEquals("Wrong result of inspection", new IntWritable(7), inspector.getMapValueElement(map, new ShortWritable((short) 6))); + } +} diff --git a/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/serde/TestParquetHiveArrayInspector.java b/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/serde/TestParquetHiveArrayInspector.java new file mode 100644 index 0000000..2232659 --- /dev/null +++ b/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/serde/TestParquetHiveArrayInspector.java @@ -0,0 +1,71 @@ +package org.apache.hadoop.hive.ql.io.parquet.serde; + +import java.util.ArrayList; +import java.util.List; + +import junit.framework.TestCase; + +import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; +import org.apache.hadoop.io.ArrayWritable; +import org.apache.hadoop.io.IntWritable; +import org.apache.hadoop.io.Writable; +import org.junit.Test; + +/** + * + * @author Rémy Pecqueur + */ +public class TestParquetHiveArrayInspector extends TestCase { + + private ParquetHiveArrayInspector inspector; + + @Override + public void setUp() { + inspector = new ParquetHiveArrayInspector(PrimitiveObjectInspectorFactory.javaIntObjectInspector); + } + + @Test + public void testNullArray() { + assertEquals("Wrong size", -1, inspector.getListLength(null)); + assertNull("Should be null", inspector.getList(null)); + assertNull("Should be null", inspector.getListElement(null, 0)); + } + + @Test + public void testNullContainer() { + final ArrayWritable list = new ArrayWritable(ArrayWritable.class, null); + assertEquals("Wrong size", -1, inspector.getListLength(list)); + assertNull("Should be null", inspector.getList(list)); + assertNull("Should be null", inspector.getListElement(list, 0)); + } + + @Test + public void testEmptyContainer() { + final ArrayWritable list = new ArrayWritable(ArrayWritable.class, new ArrayWritable[0]); + assertEquals("Wrong size", -1, inspector.getListLength(list)); + assertNull("Should be null", inspector.getList(list)); + assertNull("Should be null", 
inspector.getListElement(list, 0)); + } + + @Test + public void testRegularList() { + final ArrayWritable internalList = new ArrayWritable(Writable.class, + new Writable[]{new IntWritable(3), new IntWritable(5), new IntWritable(1)}); + final ArrayWritable list = new ArrayWritable(ArrayWritable.class, new ArrayWritable[]{internalList}); + + final List expected = new ArrayList(); + expected.add(new IntWritable(3)); + expected.add(new IntWritable(5)); + expected.add(new IntWritable(1)); + + assertEquals("Wrong size", 3, inspector.getListLength(list)); + assertEquals("Wrong result of inspection", expected, inspector.getList(list)); + + for (int i = 0; i < expected.size(); ++i) { + assertEquals("Wrong result of inspection", expected.get(i), inspector.getListElement(list, i)); + + } + + assertNull("Should be null", inspector.getListElement(list, 3)); + } +} diff --git a/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/serde/TestStandardParquetHiveMapInspector.java b/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/serde/TestStandardParquetHiveMapInspector.java new file mode 100644 index 0000000..8cccc0f --- /dev/null +++ b/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/serde/TestStandardParquetHiveMapInspector.java @@ -0,0 +1,79 @@ +package org.apache.hadoop.hive.ql.io.parquet.serde; + +import java.util.HashMap; +import java.util.Map; + +import junit.framework.TestCase; + +import org.apache.hadoop.hive.serde2.io.ShortWritable; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; +import org.apache.hadoop.io.ArrayWritable; +import org.apache.hadoop.io.IntWritable; +import org.apache.hadoop.io.Writable; +import org.junit.Test; + +/** + * + * @author Rémy Pecqueur + */ +public class TestStandardParquetHiveMapInspector extends TestCase { + + private StandardParquetHiveMapInspector inspector; + + @Override + public void setUp() { + inspector = new StandardParquetHiveMapInspector(PrimitiveObjectInspectorFactory.javaIntObjectInspector, + PrimitiveObjectInspectorFactory.javaIntObjectInspector); + } + + @Test + public void testNullMap() { + assertNull("Should be null", inspector.getMapValueElement(null, new IntWritable(0))); + } + + @Test + public void testNullContainer() { + final ArrayWritable map = new ArrayWritable(ArrayWritable.class, null); + assertNull("Should be null", inspector.getMapValueElement(map, new IntWritable(0))); + } + + @Test + public void testEmptyContainer() { + final ArrayWritable map = new ArrayWritable(ArrayWritable.class, new ArrayWritable[0]); + assertNull("Should be null", inspector.getMapValueElement(map, new IntWritable(0))); + } + + @Test + public void testRegularMap() { + final Writable[] entry1 = new Writable[]{new IntWritable(0), new IntWritable(1)}; + final Writable[] entry2 = new Writable[]{new IntWritable(2), new IntWritable(3)}; + + final ArrayWritable internalMap = new ArrayWritable(ArrayWritable.class, new Writable[]{ + new ArrayWritable(Writable.class, entry1), new ArrayWritable(Writable.class, entry2)}); + + final ArrayWritable map = new ArrayWritable(ArrayWritable.class, new Writable[]{internalMap}); + + assertEquals("Wrong result of inspection", new IntWritable(1), inspector.getMapValueElement(map, new IntWritable(0))); + assertEquals("Wrong result of inspection", new IntWritable(3), inspector.getMapValueElement(map, new IntWritable(2))); + assertNull("Wrong result of inspection", inspector.getMapValueElement(map, new ShortWritable((short) 0))); + assertNull("Wrong result of inspection", 
inspector.getMapValueElement(map, new ShortWritable((short) 2))); + } + + @Test + public void testHashMap() { + final Map map = new HashMap(); + map.put(new IntWritable(0), new IntWritable(1)); + map.put(new IntWritable(2), new IntWritable(3)); + map.put(new IntWritable(4), new IntWritable(5)); + map.put(new IntWritable(6), new IntWritable(7)); + + assertEquals("Wrong result of inspection", new IntWritable(1), inspector.getMapValueElement(map, new IntWritable(0))); + assertEquals("Wrong result of inspection", new IntWritable(3), inspector.getMapValueElement(map, new IntWritable(2))); + assertEquals("Wrong result of inspection", new IntWritable(5), inspector.getMapValueElement(map, new IntWritable(4))); + assertEquals("Wrong result of inspection", new IntWritable(7), inspector.getMapValueElement(map, new IntWritable(6))); + assertNull("Wrong result of inspection", inspector.getMapValueElement(map, new ShortWritable((short) 0))); + assertNull("Wrong result of inspection", inspector.getMapValueElement(map, new ShortWritable((short) 2))); + assertNull("Wrong result of inspection", inspector.getMapValueElement(map, new ShortWritable((short) 4))); + assertNull("Wrong result of inspection", inspector.getMapValueElement(map, new ShortWritable((short) 6))); + } +} diff --git a/ql/src/test/queries/clientpositive/parquet_create.q b/ql/src/test/queries/clientpositive/parquet_create.q new file mode 100644 index 0000000..0b976bd --- /dev/null +++ b/ql/src/test/queries/clientpositive/parquet_create.q @@ -0,0 +1,36 @@ +DROP TABLE parquet_create_staging; +DROP TABLE parquet_create; + +CREATE TABLE parquet_create_staging ( + id int, + str string, + mp MAP<STRING,STRING>, + lst ARRAY<STRING>, + strct STRUCT<A:STRING,B:STRING> +) ROW FORMAT DELIMITED +FIELDS TERMINATED BY '|' +COLLECTION ITEMS TERMINATED BY ',' +MAP KEYS TERMINATED BY ':'; + +CREATE TABLE parquet_create ( + id int, + str string, + mp MAP<STRING,STRING>, + lst ARRAY<STRING>, + strct STRUCT<A:STRING,B:STRING> +) STORED AS PARQUET; + +DESCRIBE FORMATTED parquet_create; + +LOAD DATA LOCAL INPATH '../../data/files/parquet_create.txt' OVERWRITE INTO TABLE parquet_create_staging; + +SELECT * FROM parquet_create_staging; + +INSERT OVERWRITE TABLE parquet_create SELECT * FROM parquet_create_staging; + +SELECT * FROM parquet_create group by id; +SELECT id, count(0) FROM parquet_create group by id; +SELECT str from parquet_create; +SELECT mp from parquet_create; +SELECT lst from parquet_create; +SELECT strct from parquet_create; diff --git a/ql/src/test/queries/clientpositive/parquet_partitioned.q b/ql/src/test/queries/clientpositive/parquet_partitioned.q new file mode 100644 index 0000000..103d26f --- /dev/null +++ b/ql/src/test/queries/clientpositive/parquet_partitioned.q @@ -0,0 +1,34 @@ +set hive.exec.dynamic.partition.mode=nonstrict; +set hive.exec.dynamic.partition=true; + +DROP TABLE parquet_partitioned_staging; +DROP TABLE parquet_partitioned; + +CREATE TABLE parquet_partitioned_staging ( + id int, + str string, + part string +) ROW FORMAT DELIMITED +FIELDS TERMINATED BY '|'; + +CREATE TABLE parquet_partitioned ( + id int, + str string +) PARTITIONED BY (part string) +STORED AS PARQUET; + +DESCRIBE FORMATTED parquet_partitioned; + +LOAD DATA LOCAL INPATH '../../data/files/parquet_partitioned.txt' OVERWRITE INTO TABLE parquet_partitioned_staging; + +SELECT * FROM parquet_partitioned_staging; + +INSERT OVERWRITE TABLE parquet_partitioned PARTITION (part) SELECT * FROM parquet_partitioned_staging; + +set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat; +SELECT * FROM parquet_partitioned; +SELECT part, COUNT(0) FROM
parquet_partitioned GROUP BY part; + +set hive.input.format=org.apache.hadoop.hive.ql.io.CombineHiveInputFormat; +SELECT * FROM parquet_partitioned; +SELECT part, COUNT(0) FROM parquet_partitioned GROUP BY part; diff --git a/ql/src/test/results/clientpositive/parquet_create.q.out b/ql/src/test/results/clientpositive/parquet_create.q.out new file mode 100644 index 0000000..34fdea2 --- /dev/null +++ b/ql/src/test/results/clientpositive/parquet_create.q.out @@ -0,0 +1,206 @@ +PREHOOK: query: DROP TABLE parquet_create_staging +PREHOOK: type: DROPTABLE +POSTHOOK: query: DROP TABLE parquet_create_staging +POSTHOOK: type: DROPTABLE +PREHOOK: query: DROP TABLE parquet_create +PREHOOK: type: DROPTABLE +POSTHOOK: query: DROP TABLE parquet_create +POSTHOOK: type: DROPTABLE +PREHOOK: query: CREATE TABLE parquet_create_staging ( + id int, + str string, + mp MAP, + lst ARRAY, + strct STRUCT +) ROW FORMAT DELIMITED +FIELDS TERMINATED BY '|' +COLLECTION ITEMS TERMINATED BY ',' +MAP KEYS TERMINATED BY ':' +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE parquet_create_staging ( + id int, + str string, + mp MAP, + lst ARRAY, + strct STRUCT +) ROW FORMAT DELIMITED +FIELDS TERMINATED BY '|' +COLLECTION ITEMS TERMINATED BY ',' +MAP KEYS TERMINATED BY ':' +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@parquet_create_staging +PREHOOK: query: CREATE TABLE parquet_create ( + id int, + str string, + mp MAP, + lst ARRAY, + strct STRUCT +) STORED AS PARQUET +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE parquet_create ( + id int, + str string, + mp MAP, + lst ARRAY, + strct STRUCT +) STORED AS PARQUET +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@parquet_create +PREHOOK: query: DESCRIBE FORMATTED parquet_create +PREHOOK: type: DESCTABLE +POSTHOOK: query: DESCRIBE FORMATTED parquet_create +POSTHOOK: type: DESCTABLE +# col_name data_type comment + +id int from deserializer +str string from deserializer +mp map from deserializer +lst array from deserializer +strct struct from deserializer + +# Detailed Table Information +Database: default +#### A masked pattern was here #### +Protect Mode: None +Retention: 0 +#### A masked pattern was here #### +Table Type: MANAGED_TABLE +Table Parameters: +#### A masked pattern was here #### + +# Storage Information +SerDe Library: org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe +InputFormat: org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat +OutputFormat: org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat +Compressed: No +Num Buckets: -1 +Bucket Columns: [] +Sort Columns: [] +Storage Desc Params: + serialization.format 1 +PREHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/parquet_create.txt' OVERWRITE INTO TABLE parquet_create_staging +PREHOOK: type: LOAD +PREHOOK: Output: default@parquet_create_staging +POSTHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/parquet_create.txt' OVERWRITE INTO TABLE parquet_create_staging +POSTHOOK: type: LOAD +POSTHOOK: Output: default@parquet_create_staging +PREHOOK: query: SELECT * FROM parquet_create_staging +PREHOOK: type: QUERY +PREHOOK: Input: default@parquet_create_staging +#### A masked pattern was here #### +POSTHOOK: query: SELECT * FROM parquet_create_staging +POSTHOOK: type: QUERY +POSTHOOK: Input: default@parquet_create_staging +#### A masked pattern was here #### +1 foo line1 {"key11":"value11","key12":"value12","key13":"value13"} ["a","b","c"] {"a":"one","b":"two"} +2 bar line2 {"key21":"value21","key22":"value22","key23":"value23"} ["d","e","f"] 
{"a":"three","b":"four"} +3 baz line3 {"key31":"value31","key32":"value32","key33":"value33"} ["g","h","i"] {"a":"five","b":"six"} +PREHOOK: query: INSERT OVERWRITE TABLE parquet_create SELECT * FROM parquet_create_staging +PREHOOK: type: QUERY +PREHOOK: Input: default@parquet_create_staging +PREHOOK: Output: default@parquet_create +POSTHOOK: query: INSERT OVERWRITE TABLE parquet_create SELECT * FROM parquet_create_staging +POSTHOOK: type: QUERY +POSTHOOK: Input: default@parquet_create_staging +POSTHOOK: Output: default@parquet_create +POSTHOOK: Lineage: parquet_create.id SIMPLE [(parquet_create_staging)parquet_create_staging.FieldSchema(name:id, type:int, comment:null), ] +POSTHOOK: Lineage: parquet_create.lst SIMPLE [(parquet_create_staging)parquet_create_staging.FieldSchema(name:lst, type:array, comment:null), ] +POSTHOOK: Lineage: parquet_create.mp SIMPLE [(parquet_create_staging)parquet_create_staging.FieldSchema(name:mp, type:map, comment:null), ] +POSTHOOK: Lineage: parquet_create.str SIMPLE [(parquet_create_staging)parquet_create_staging.FieldSchema(name:str, type:string, comment:null), ] +POSTHOOK: Lineage: parquet_create.strct SIMPLE [(parquet_create_staging)parquet_create_staging.FieldSchema(name:strct, type:struct, comment:null), ] +PREHOOK: query: SELECT * FROM parquet_create group by id +PREHOOK: type: QUERY +PREHOOK: Input: default@parquet_create +#### A masked pattern was here #### +POSTHOOK: query: SELECT * FROM parquet_create group by id +POSTHOOK: type: QUERY +POSTHOOK: Input: default@parquet_create +#### A masked pattern was here #### +POSTHOOK: Lineage: parquet_create.id SIMPLE [(parquet_create_staging)parquet_create_staging.FieldSchema(name:id, type:int, comment:null), ] +POSTHOOK: Lineage: parquet_create.lst SIMPLE [(parquet_create_staging)parquet_create_staging.FieldSchema(name:lst, type:array, comment:null), ] +POSTHOOK: Lineage: parquet_create.mp SIMPLE [(parquet_create_staging)parquet_create_staging.FieldSchema(name:mp, type:map, comment:null), ] +POSTHOOK: Lineage: parquet_create.str SIMPLE [(parquet_create_staging)parquet_create_staging.FieldSchema(name:str, type:string, comment:null), ] +POSTHOOK: Lineage: parquet_create.strct SIMPLE [(parquet_create_staging)parquet_create_staging.FieldSchema(name:strct, type:struct, comment:null), ] +1 +2 +3 +PREHOOK: query: SELECT id, count(0) FROM parquet_create group by id +PREHOOK: type: QUERY +PREHOOK: Input: default@parquet_create +#### A masked pattern was here #### +POSTHOOK: query: SELECT id, count(0) FROM parquet_create group by id +POSTHOOK: type: QUERY +POSTHOOK: Input: default@parquet_create +#### A masked pattern was here #### +POSTHOOK: Lineage: parquet_create.id SIMPLE [(parquet_create_staging)parquet_create_staging.FieldSchema(name:id, type:int, comment:null), ] +POSTHOOK: Lineage: parquet_create.lst SIMPLE [(parquet_create_staging)parquet_create_staging.FieldSchema(name:lst, type:array, comment:null), ] +POSTHOOK: Lineage: parquet_create.mp SIMPLE [(parquet_create_staging)parquet_create_staging.FieldSchema(name:mp, type:map, comment:null), ] +POSTHOOK: Lineage: parquet_create.str SIMPLE [(parquet_create_staging)parquet_create_staging.FieldSchema(name:str, type:string, comment:null), ] +POSTHOOK: Lineage: parquet_create.strct SIMPLE [(parquet_create_staging)parquet_create_staging.FieldSchema(name:strct, type:struct, comment:null), ] +1 1 +2 1 +3 1 +PREHOOK: query: SELECT str from parquet_create +PREHOOK: type: QUERY +PREHOOK: Input: default@parquet_create +#### A masked pattern was here #### +POSTHOOK: query: 
SELECT str from parquet_create +POSTHOOK: type: QUERY +POSTHOOK: Input: default@parquet_create +#### A masked pattern was here #### +POSTHOOK: Lineage: parquet_create.id SIMPLE [(parquet_create_staging)parquet_create_staging.FieldSchema(name:id, type:int, comment:null), ] +POSTHOOK: Lineage: parquet_create.lst SIMPLE [(parquet_create_staging)parquet_create_staging.FieldSchema(name:lst, type:array, comment:null), ] +POSTHOOK: Lineage: parquet_create.mp SIMPLE [(parquet_create_staging)parquet_create_staging.FieldSchema(name:mp, type:map, comment:null), ] +POSTHOOK: Lineage: parquet_create.str SIMPLE [(parquet_create_staging)parquet_create_staging.FieldSchema(name:str, type:string, comment:null), ] +POSTHOOK: Lineage: parquet_create.strct SIMPLE [(parquet_create_staging)parquet_create_staging.FieldSchema(name:strct, type:struct, comment:null), ] +foo line1 +bar line2 +baz line3 +PREHOOK: query: SELECT mp from parquet_create +PREHOOK: type: QUERY +PREHOOK: Input: default@parquet_create +#### A masked pattern was here #### +POSTHOOK: query: SELECT mp from parquet_create +POSTHOOK: type: QUERY +POSTHOOK: Input: default@parquet_create +#### A masked pattern was here #### +POSTHOOK: Lineage: parquet_create.id SIMPLE [(parquet_create_staging)parquet_create_staging.FieldSchema(name:id, type:int, comment:null), ] +POSTHOOK: Lineage: parquet_create.lst SIMPLE [(parquet_create_staging)parquet_create_staging.FieldSchema(name:lst, type:array, comment:null), ] +POSTHOOK: Lineage: parquet_create.mp SIMPLE [(parquet_create_staging)parquet_create_staging.FieldSchema(name:mp, type:map, comment:null), ] +POSTHOOK: Lineage: parquet_create.str SIMPLE [(parquet_create_staging)parquet_create_staging.FieldSchema(name:str, type:string, comment:null), ] +POSTHOOK: Lineage: parquet_create.strct SIMPLE [(parquet_create_staging)parquet_create_staging.FieldSchema(name:strct, type:struct, comment:null), ] +{"key12":"value12","key11":"value11","key13":"value13"} +{"key21":"value21","key23":"value23","key22":"value22"} +{"key33":"value33","key31":"value31","key32":"value32"} +PREHOOK: query: SELECT lst from parquet_create +PREHOOK: type: QUERY +PREHOOK: Input: default@parquet_create +#### A masked pattern was here #### +POSTHOOK: query: SELECT lst from parquet_create +POSTHOOK: type: QUERY +POSTHOOK: Input: default@parquet_create +#### A masked pattern was here #### +POSTHOOK: Lineage: parquet_create.id SIMPLE [(parquet_create_staging)parquet_create_staging.FieldSchema(name:id, type:int, comment:null), ] +POSTHOOK: Lineage: parquet_create.lst SIMPLE [(parquet_create_staging)parquet_create_staging.FieldSchema(name:lst, type:array, comment:null), ] +POSTHOOK: Lineage: parquet_create.mp SIMPLE [(parquet_create_staging)parquet_create_staging.FieldSchema(name:mp, type:map, comment:null), ] +POSTHOOK: Lineage: parquet_create.str SIMPLE [(parquet_create_staging)parquet_create_staging.FieldSchema(name:str, type:string, comment:null), ] +POSTHOOK: Lineage: parquet_create.strct SIMPLE [(parquet_create_staging)parquet_create_staging.FieldSchema(name:strct, type:struct, comment:null), ] +["a","b","c"] +["d","e","f"] +["g","h","i"] +PREHOOK: query: SELECT strct from parquet_create +PREHOOK: type: QUERY +PREHOOK: Input: default@parquet_create +#### A masked pattern was here #### +POSTHOOK: query: SELECT strct from parquet_create +POSTHOOK: type: QUERY +POSTHOOK: Input: default@parquet_create +#### A masked pattern was here #### +POSTHOOK: Lineage: parquet_create.id SIMPLE 
[(parquet_create_staging)parquet_create_staging.FieldSchema(name:id, type:int, comment:null), ] +POSTHOOK: Lineage: parquet_create.lst SIMPLE [(parquet_create_staging)parquet_create_staging.FieldSchema(name:lst, type:array, comment:null), ] +POSTHOOK: Lineage: parquet_create.mp SIMPLE [(parquet_create_staging)parquet_create_staging.FieldSchema(name:mp, type:map, comment:null), ] +POSTHOOK: Lineage: parquet_create.str SIMPLE [(parquet_create_staging)parquet_create_staging.FieldSchema(name:str, type:string, comment:null), ] +POSTHOOK: Lineage: parquet_create.strct SIMPLE [(parquet_create_staging)parquet_create_staging.FieldSchema(name:strct, type:struct, comment:null), ] +{"a":"one","b":"two"} +{"a":"three","b":"four"} +{"a":"five","b":"six"} diff --git a/ql/src/test/results/clientpositive/parquet_partitioned.q.out b/ql/src/test/results/clientpositive/parquet_partitioned.q.out new file mode 100644 index 0000000..ecba6ce --- /dev/null +++ b/ql/src/test/results/clientpositive/parquet_partitioned.q.out @@ -0,0 +1,174 @@ +PREHOOK: query: DROP TABLE parquet_partitioned_staging +PREHOOK: type: DROPTABLE +POSTHOOK: query: DROP TABLE parquet_partitioned_staging +POSTHOOK: type: DROPTABLE +PREHOOK: query: DROP TABLE parquet_partitioned +PREHOOK: type: DROPTABLE +POSTHOOK: query: DROP TABLE parquet_partitioned +POSTHOOK: type: DROPTABLE +PREHOOK: query: CREATE TABLE parquet_partitioned_staging ( + id int, + str string, + part string +) ROW FORMAT DELIMITED +FIELDS TERMINATED BY '|' +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE parquet_partitioned_staging ( + id int, + str string, + part string +) ROW FORMAT DELIMITED +FIELDS TERMINATED BY '|' +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@parquet_partitioned_staging +PREHOOK: query: CREATE TABLE parquet_partitioned ( + id int, + str string +) PARTITIONED BY (part string) +STORED AS PARQUET +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE parquet_partitioned ( + id int, + str string +) PARTITIONED BY (part string) +STORED AS PARQUET +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@parquet_partitioned +PREHOOK: query: DESCRIBE FORMATTED parquet_partitioned +PREHOOK: type: DESCTABLE +POSTHOOK: query: DESCRIBE FORMATTED parquet_partitioned +POSTHOOK: type: DESCTABLE +# col_name data_type comment + +id int from deserializer +str string from deserializer + +# Partition Information +# col_name data_type comment + +part string None + +# Detailed Table Information +Database: default +#### A masked pattern was here #### +Protect Mode: None +Retention: 0 +#### A masked pattern was here #### +Table Type: MANAGED_TABLE +Table Parameters: +#### A masked pattern was here #### + +# Storage Information +SerDe Library: org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe +InputFormat: org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat +OutputFormat: org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat +Compressed: No +Num Buckets: -1 +Bucket Columns: [] +Sort Columns: [] +Storage Desc Params: + serialization.format 1 +PREHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/parquet_partitioned.txt' OVERWRITE INTO TABLE parquet_partitioned_staging +PREHOOK: type: LOAD +PREHOOK: Output: default@parquet_partitioned_staging +POSTHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/parquet_partitioned.txt' OVERWRITE INTO TABLE parquet_partitioned_staging +POSTHOOK: type: LOAD +POSTHOOK: Output: default@parquet_partitioned_staging +PREHOOK: query: SELECT * FROM parquet_partitioned_staging +PREHOOK: type: 
QUERY +PREHOOK: Input: default@parquet_partitioned_staging +#### A masked pattern was here #### +POSTHOOK: query: SELECT * FROM parquet_partitioned_staging +POSTHOOK: type: QUERY +POSTHOOK: Input: default@parquet_partitioned_staging +#### A masked pattern was here #### +1 foo part1 +2 bar part2 +3 baz part2 +PREHOOK: query: INSERT OVERWRITE TABLE parquet_partitioned PARTITION (part) SELECT * FROM parquet_partitioned_staging +PREHOOK: type: QUERY +PREHOOK: Input: default@parquet_partitioned_staging +PREHOOK: Output: default@parquet_partitioned +POSTHOOK: query: INSERT OVERWRITE TABLE parquet_partitioned PARTITION (part) SELECT * FROM parquet_partitioned_staging +POSTHOOK: type: QUERY +POSTHOOK: Input: default@parquet_partitioned_staging +POSTHOOK: Output: default@parquet_partitioned@part=part1 +POSTHOOK: Output: default@parquet_partitioned@part=part2 +POSTHOOK: Lineage: parquet_partitioned PARTITION(part=part1).id SIMPLE [(parquet_partitioned_staging)parquet_partitioned_staging.FieldSchema(name:id, type:int, comment:null), ] +POSTHOOK: Lineage: parquet_partitioned PARTITION(part=part1).str SIMPLE [(parquet_partitioned_staging)parquet_partitioned_staging.FieldSchema(name:str, type:string, comment:null), ] +POSTHOOK: Lineage: parquet_partitioned PARTITION(part=part2).id SIMPLE [(parquet_partitioned_staging)parquet_partitioned_staging.FieldSchema(name:id, type:int, comment:null), ] +POSTHOOK: Lineage: parquet_partitioned PARTITION(part=part2).str SIMPLE [(parquet_partitioned_staging)parquet_partitioned_staging.FieldSchema(name:str, type:string, comment:null), ] +PREHOOK: query: SELECT * FROM parquet_partitioned +PREHOOK: type: QUERY +PREHOOK: Input: default@parquet_partitioned +PREHOOK: Input: default@parquet_partitioned@part=part1 +PREHOOK: Input: default@parquet_partitioned@part=part2 +#### A masked pattern was here #### +POSTHOOK: query: SELECT * FROM parquet_partitioned +POSTHOOK: type: QUERY +POSTHOOK: Input: default@parquet_partitioned +POSTHOOK: Input: default@parquet_partitioned@part=part1 +POSTHOOK: Input: default@parquet_partitioned@part=part2 +#### A masked pattern was here #### +POSTHOOK: Lineage: parquet_partitioned PARTITION(part=part1).id SIMPLE [(parquet_partitioned_staging)parquet_partitioned_staging.FieldSchema(name:id, type:int, comment:null), ] +POSTHOOK: Lineage: parquet_partitioned PARTITION(part=part1).str SIMPLE [(parquet_partitioned_staging)parquet_partitioned_staging.FieldSchema(name:str, type:string, comment:null), ] +POSTHOOK: Lineage: parquet_partitioned PARTITION(part=part2).id SIMPLE [(parquet_partitioned_staging)parquet_partitioned_staging.FieldSchema(name:id, type:int, comment:null), ] +POSTHOOK: Lineage: parquet_partitioned PARTITION(part=part2).str SIMPLE [(parquet_partitioned_staging)parquet_partitioned_staging.FieldSchema(name:str, type:string, comment:null), ] +1 foo part1 +2 bar part2 +3 baz part2 +PREHOOK: query: SELECT part, COUNT(0) FROM parquet_partitioned GROUP BY part +PREHOOK: type: QUERY +PREHOOK: Input: default@parquet_partitioned +PREHOOK: Input: default@parquet_partitioned@part=part1 +PREHOOK: Input: default@parquet_partitioned@part=part2 +#### A masked pattern was here #### +POSTHOOK: query: SELECT part, COUNT(0) FROM parquet_partitioned GROUP BY part +POSTHOOK: type: QUERY +POSTHOOK: Input: default@parquet_partitioned +POSTHOOK: Input: default@parquet_partitioned@part=part1 +POSTHOOK: Input: default@parquet_partitioned@part=part2 +#### A masked pattern was here #### +POSTHOOK: Lineage: parquet_partitioned PARTITION(part=part1).id SIMPLE 
[(parquet_partitioned_staging)parquet_partitioned_staging.FieldSchema(name:id, type:int, comment:null), ] +POSTHOOK: Lineage: parquet_partitioned PARTITION(part=part1).str SIMPLE [(parquet_partitioned_staging)parquet_partitioned_staging.FieldSchema(name:str, type:string, comment:null), ] +POSTHOOK: Lineage: parquet_partitioned PARTITION(part=part2).id SIMPLE [(parquet_partitioned_staging)parquet_partitioned_staging.FieldSchema(name:id, type:int, comment:null), ] +POSTHOOK: Lineage: parquet_partitioned PARTITION(part=part2).str SIMPLE [(parquet_partitioned_staging)parquet_partitioned_staging.FieldSchema(name:str, type:string, comment:null), ] +part1 1 +part2 2 +PREHOOK: query: SELECT * FROM parquet_partitioned +PREHOOK: type: QUERY +PREHOOK: Input: default@parquet_partitioned +PREHOOK: Input: default@parquet_partitioned@part=part1 +PREHOOK: Input: default@parquet_partitioned@part=part2 +#### A masked pattern was here #### +POSTHOOK: query: SELECT * FROM parquet_partitioned +POSTHOOK: type: QUERY +POSTHOOK: Input: default@parquet_partitioned +POSTHOOK: Input: default@parquet_partitioned@part=part1 +POSTHOOK: Input: default@parquet_partitioned@part=part2 +#### A masked pattern was here #### +POSTHOOK: Lineage: parquet_partitioned PARTITION(part=part1).id SIMPLE [(parquet_partitioned_staging)parquet_partitioned_staging.FieldSchema(name:id, type:int, comment:null), ] +POSTHOOK: Lineage: parquet_partitioned PARTITION(part=part1).str SIMPLE [(parquet_partitioned_staging)parquet_partitioned_staging.FieldSchema(name:str, type:string, comment:null), ] +POSTHOOK: Lineage: parquet_partitioned PARTITION(part=part2).id SIMPLE [(parquet_partitioned_staging)parquet_partitioned_staging.FieldSchema(name:id, type:int, comment:null), ] +POSTHOOK: Lineage: parquet_partitioned PARTITION(part=part2).str SIMPLE [(parquet_partitioned_staging)parquet_partitioned_staging.FieldSchema(name:str, type:string, comment:null), ] +1 foo part1 +2 bar part2 +3 baz part2 +PREHOOK: query: SELECT part, COUNT(0) FROM parquet_partitioned GROUP BY part +PREHOOK: type: QUERY +PREHOOK: Input: default@parquet_partitioned +PREHOOK: Input: default@parquet_partitioned@part=part1 +PREHOOK: Input: default@parquet_partitioned@part=part2 +#### A masked pattern was here #### +POSTHOOK: query: SELECT part, COUNT(0) FROM parquet_partitioned GROUP BY part +POSTHOOK: type: QUERY +POSTHOOK: Input: default@parquet_partitioned +POSTHOOK: Input: default@parquet_partitioned@part=part1 +POSTHOOK: Input: default@parquet_partitioned@part=part2 +#### A masked pattern was here #### +POSTHOOK: Lineage: parquet_partitioned PARTITION(part=part1).id SIMPLE [(parquet_partitioned_staging)parquet_partitioned_staging.FieldSchema(name:id, type:int, comment:null), ] +POSTHOOK: Lineage: parquet_partitioned PARTITION(part=part1).str SIMPLE [(parquet_partitioned_staging)parquet_partitioned_staging.FieldSchema(name:str, type:string, comment:null), ] +POSTHOOK: Lineage: parquet_partitioned PARTITION(part=part2).id SIMPLE [(parquet_partitioned_staging)parquet_partitioned_staging.FieldSchema(name:id, type:int, comment:null), ] +POSTHOOK: Lineage: parquet_partitioned PARTITION(part=part2).str SIMPLE [(parquet_partitioned_staging)parquet_partitioned_staging.FieldSchema(name:str, type:string, comment:null), ] +part1 1 +part2 2
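
Note (not part of the patch): the sketch below is one way a reviewer might exercise the backward-compatibility wrappers from the Hive CLI once the patched jars are deployed. The table names are arbitrary and chosen only for illustration; tables whose metastore entries still reference the old parquet.hive.* class names should continue to resolve, while new tables can simply use STORED AS PARQUET.

-- Hypothetical table spelled out with the deprecated class names kept for metastore compatibility.
DROP TABLE IF EXISTS parquet_compat_check;
CREATE TABLE parquet_compat_check (
  id int,
  str string
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
STORED AS
  INPUTFORMAT 'parquet.hive.DeprecatedParquetInputFormat'
  OUTPUTFORMAT 'parquet.hive.DeprecatedParquetOutputFormat';

-- Equivalent table using the shorthand exercised by the parquet_create.q / parquet_partitioned.q tests above.
DROP TABLE IF EXISTS parquet_compat_check_new;
CREATE TABLE parquet_compat_check_new (
  id int,
  str string
) STORED AS PARQUET;

-- Both should report ParquetHiveSerDe with the Mapred Parquet input/output formats.
DESCRIBE FORMATTED parquet_compat_check;
DESCRIBE FORMATTED parquet_compat_check_new;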