diff --git a/data/files/parquet_create.txt b/data/files/parquet_create.txt new file mode 100644 index 0000000..ccd48ee --- /dev/null +++ b/data/files/parquet_create.txt @@ -0,0 +1,3 @@ +1|foo line1|key11:value11,key12:value12,key13:value13|a,b,c|one,two +2|bar line2|key21:value21,key22:value22,key23:value23|d,e,f|three,four +3|baz line3|key31:value31,key32:value32,key33:value33|g,h,i|five,six diff --git a/pom.xml b/pom.xml index 41f5337..fbb21df 100644 --- a/pom.xml +++ b/pom.xml @@ -127,6 +127,7 @@ requires netty < 3.6.0 we force hadoops version --> 3.4.0.Final + 1.3.2 0.10.1 2.5.0 1.0.1 @@ -222,6 +223,17 @@ ${bonecp.version} + com.twitter + parquet-hadoop-bundle + ${parquet.version} + + + com.twitter + parquet-column + ${parquet.version} + tests + + com.sun.jersey jersey-core ${jersey.version} diff --git a/ql/pom.xml b/ql/pom.xml index 7087a4c..53d0b9e 100644 --- a/ql/pom.xml +++ b/ql/pom.xml @@ -67,6 +67,10 @@ ${kryo.version} + com.twitter + parquet-hadoop-bundle + + commons-codec commons-codec ${commons-codec.version} @@ -204,6 +208,12 @@ + com.twitter + parquet-column + tests + test + + junit junit ${junit.version} @@ -476,6 +486,7 @@ org.apache.hive:hive-exec org.apache.hive:hive-serde com.esotericsoftware.kryo:kryo + com.twitter:parquet-hadoop-bundle org.apache.thrift:libthrift commons-lang:commons-lang org.json:json diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/MapredParquetInputFormat.java b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/MapredParquetInputFormat.java new file mode 100644 index 0000000..2380ba9 --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/MapredParquetInputFormat.java @@ -0,0 +1,128 @@ +/** + * + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
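The parquet_create.txt fixture above appears to carry five columns per row (an int id, a string, a map of string pairs, and two string arrays); reading the row layout, '|' separates fields, ',' separates collection items, and ':' separates a map key from its value, although the patch itself does not spell those delimiters out. A minimal Java sketch of that assumed encoding, purely as a reading aid for the fixture:

import java.util.Arrays;

public class ParquetCreateRowSketch {
  public static void main(String[] args) {
    // First row of parquet_create.txt. The delimiter roles are assumptions
    // inferred from the layout: '|' separates fields, ',' separates
    // collection items, ':' separates a map key from its value.
    String row = "1|foo line1|key11:value11,key12:value12,key13:value13|a,b,c|one,two";
    String[] fields = row.split("\\|");
    int id = Integer.parseInt(fields[0]);          // int column
    String line = fields[1];                       // string column
    String[] mapEntries = fields[2].split(",");    // map<string,string>, one "key:value" per entry
    String[] firstList = fields[3].split(",");     // array<string>
    String[] secondList = fields[4].split(",");    // array<string>
    System.out.println(id + " | " + line + " | " + Arrays.toString(mapEntries)
        + " | " + Arrays.toString(firstList) + " | " + Arrays.toString(secondList));
  }
}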
+ */ +package org.apache.hadoop.hive.ql.io.parquet; + +import java.io.IOException; +import java.util.List; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.ql.io.parquet.read.DataWritableReadSupport; +import org.apache.hadoop.hive.ql.io.parquet.read.ParquetRecordReaderWrapper; +import org.apache.hadoop.io.ArrayWritable; +import org.apache.hadoop.mapred.FileInputFormat; +import org.apache.hadoop.mapred.InputSplit; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.RecordReader; +import org.apache.hadoop.mapreduce.JobContext; + +import parquet.hadoop.ParquetInputFormat; +import parquet.hadoop.ParquetInputSplit; +import parquet.hadoop.util.ContextUtil; + + +/** + * + * A Parquet InputFormat for Hive (with the deprecated package mapred) + * + * TODO : Refactor all of the wrappers here Talk about it on : https://github.com/Parquet/parquet-mr/pull/28s + * + */ +public class MapredParquetInputFormat extends FileInputFormat { + + public static final Log LOG = LogFactory.getLog(MapredParquetInputFormat.class); + + private final ParquetInputFormat realInput; + private final ProjectionPusher projectionPusher; + + public MapredParquetInputFormat() { + this(new ParquetInputFormat(DataWritableReadSupport.class), new ProjectionPusher()); + } + + protected MapredParquetInputFormat(final ParquetInputFormat inputFormat, ProjectionPusher pusher) { + this.realInput = inputFormat; + this.projectionPusher = pusher; + } + + @Override + public org.apache.hadoop.mapred.InputSplit[] getSplits( + final org.apache.hadoop.mapred.JobConf job, + final int numSplits + ) throws IOException { + Path[] dirs = getInputPathsForJob(job); + if (dirs.length == 0) { + throw new IOException("No input paths specified in job"); + } + + return getSplits(job, numSplits, makeQualifiedPathFromPaths(dirs, job)); + } + + protected Path makeQualifiedPathFromPaths( + Path[] paths, + final org.apache.hadoop.mapred.JobConf job + ) throws IOException { + return new Path((paths[paths.length - 1]).makeQualified(getFsForJob(job)).toUri().getPath()); + } + + protected FileSystem getFsForJob(org.apache.hadoop.mapred.JobConf job) throws IOException { + return FileSystem.get(job); + } + + protected Path[] getInputPathsForJob(org.apache.hadoop.mapred.JobConf job) { + return FileInputFormat.getInputPaths(job); + } + + public org.apache.hadoop.mapred.InputSplit[] getSplits( + final org.apache.hadoop.mapred.JobConf job, + final int numSplits, + Path tmpPath + ) throws IOException { + final JobConf cloneJobConf = projectionPusher.pushProjectionsAndFilters(job, tmpPath); + final List splits = realInput.getSplits(getJobContext(cloneJobConf)); + + final InputSplit[] resultSplits = new InputSplit[splits.size()]; + int i = 0; + + for (final org.apache.hadoop.mapreduce.InputSplit split : splits) { + try { + resultSplits[i++] = new ParquetInputSplitWrapper((ParquetInputSplit) split); + } catch (final InterruptedException e) { + throw new RuntimeException("Cannot create an InputSplitWrapper", e); + } + } + + return resultSplits; + } + + protected JobContext getJobContext(JobConf jobConf) { + return ContextUtil.newJobContext(jobConf, null); + } + + @Override + public org.apache.hadoop.mapred.RecordReader getRecordReader( + final org.apache.hadoop.mapred.InputSplit split, + final org.apache.hadoop.mapred.JobConf job, + final org.apache.hadoop.mapred.Reporter reporter + ) throws IOException { + try 
{ + return (RecordReader) new ParquetRecordReaderWrapper(realInput, split, job, reporter, projectionPusher); + } catch (final InterruptedException e) { + throw new RuntimeException("Cannot create a RecordReaderWrapper", e); + } + } +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/MapredParquetOutputFormat.java b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/MapredParquetOutputFormat.java new file mode 100644 index 0000000..aa5768a --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/MapredParquetOutputFormat.java @@ -0,0 +1,128 @@ +/** + * + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.ql.io.parquet; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Properties; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.ql.exec.FileSinkOperator; +import org.apache.hadoop.hive.ql.io.HiveOutputFormat; +import org.apache.hadoop.hive.ql.io.FSRecordWriter; +import org.apache.hadoop.hive.ql.io.parquet.convert.HiveSchemaConverter; +import org.apache.hadoop.hive.ql.io.parquet.write.DataWritableWriteSupport; +import org.apache.hadoop.hive.ql.io.parquet.write.ParquetRecordWriterWrapper; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils; +import org.apache.hadoop.hive.shims.ShimLoader; +import org.apache.hadoop.io.ArrayWritable; +import org.apache.hadoop.io.Writable; +import org.apache.hadoop.mapred.FileOutputFormat; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.RecordWriter; +import org.apache.hadoop.mapreduce.JobContext; +import org.apache.hadoop.mapreduce.OutputFormat; +import org.apache.hadoop.util.Progressable; + +import parquet.hadoop.ParquetOutputFormat; + +/** + * A Parquet OutputFormat for Hive (with the deprecated package mapred) + */ +@SuppressWarnings({"unchecked", "rawtypes"}) +public class MapredParquetOutputFormat extends FileOutputFormat implements +HiveOutputFormat { + + protected ParquetOutputFormat realOutputFormat; + public static final Log LOG = LogFactory.getLog(MapredParquetOutputFormat.class); + + public MapredParquetOutputFormat() { + realOutputFormat = new ParquetOutputFormat(new DataWritableWriteSupport()); + } + + public MapredParquetOutputFormat(final OutputFormat mapreduceOutputFormat) { + realOutputFormat = (ParquetOutputFormat) mapreduceOutputFormat; + } + + @Override + public void checkOutputSpecs(final FileSystem ignored, final JobConf job) throws IOException { + realOutputFormat.checkOutputSpecs(ShimLoader.getHadoopShims().getHCatShim().createJobContext(job, null)); + } + + @Override + public RecordWriter getRecordWriter( + final FileSystem ignored, + final JobConf job, + final String name, + final Progressable progress + ) throws IOException { + throw new RuntimeException("Should 
never be used"); + } + + /** + * + * Create the parquet schema from the hive schema, and return the RecordWriterWrapper which + * contains the real output format + */ + @Override + public FSRecordWriter getHiveRecordWriter( + final JobConf jobConf, + final Path finalOutPath, + final Class valueClass, + final boolean isCompressed, + final Properties tableProperties, + final Progressable progress) throws IOException { + + LOG.info("getHiveRecordWriter " + this); + LOG.info("creating new record writer..."); + + // Seriously? Hard coded property names? + final String columnNameProperty = tableProperties.getProperty("columns"); + final String columnTypeProperty = tableProperties.getProperty("columns.types"); + List columnNames; + List columnTypes; + + if (columnNameProperty.length() == 0) { + columnNames = new ArrayList(); + } else { + columnNames = Arrays.asList(columnNameProperty.split(",")); + } + + if (columnTypeProperty.length() == 0) { + columnTypes = new ArrayList(); + } else { + columnTypes = TypeInfoUtils.getTypeInfosFromTypeString(columnTypeProperty); + } + + DataWritableWriteSupport.setSchema(HiveSchemaConverter.convert(columnNames, columnTypes), jobConf); + return getParquerRecordWriterWrapper(realOutputFormat, jobConf, finalOutPath.toString(), progress); + } + + protected ParquetRecordWriterWrapper getParquerRecordWriterWrapper( + ParquetOutputFormat realOutputFormat, + JobConf jobConf, + String finalOutPath, + Progressable progress + ) throws IOException { + return new ParquetRecordWriterWrapper(realOutputFormat, jobConf, finalOutPath.toString(), progress); + } +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/ParquetInputSplitWrapper.java b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/ParquetInputSplitWrapper.java new file mode 100644 index 0000000..f18695f --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/ParquetInputSplitWrapper.java @@ -0,0 +1,102 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.ql.io.parquet; + +import java.io.DataInput; +import java.io.DataOutput; +import java.io.IOException; + +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.Writable; +import org.apache.hadoop.io.WritableUtils; +import org.apache.hadoop.mapred.FileSplit; +import org.apache.hadoop.mapred.InputSplit; +import org.apache.hadoop.util.ReflectionUtils; + +import parquet.hadoop.ParquetInputSplit; + +public class ParquetInputSplitWrapper extends FileSplit implements InputSplit { + + private ParquetInputSplit realSplit; + + public ParquetInputSplit getRealSplit() { + return realSplit; + } + + // MapReduce instantiates this. 
+ public ParquetInputSplitWrapper() { + super((Path) null, 0, 0, (String[]) null); + } + + public ParquetInputSplitWrapper(final ParquetInputSplit realSplit) throws IOException, InterruptedException { + super(realSplit.getPath(), realSplit.getStart(), realSplit.getLength(), realSplit.getLocations()); + this.realSplit = realSplit; + } + + @Override + public long getLength() { + if (realSplit == null) { + return 0; + } else { + try { + return realSplit.getLength(); + } catch (IOException ex) { + throw new RuntimeException("Cannot get the length of the ParquetInputSplit: " + realSplit, ex); + } catch (InterruptedException ex) { + throw new RuntimeException("Cannot get the length of the ParquetInputSplit: " + realSplit, ex); + } + } + } + + @Override + public String[] getLocations() throws IOException { + try { + return realSplit.getLocations(); + } catch (final InterruptedException e) { + throw new IOException(e); + } + } + + @Override + public void readFields(final DataInput in) throws IOException { + final String className = WritableUtils.readString(in); + Class splitClass; + + try { + splitClass = Class.forName(className); + } catch (final ClassNotFoundException e) { + throw new IOException(e); + } + + realSplit = (ParquetInputSplit) ReflectionUtils.newInstance(splitClass, null); + ((Writable) realSplit).readFields(in); + } + + @Override + public void write(final DataOutput out) throws IOException { + WritableUtils.writeString(out, realSplit.getClass().getName()); + ((Writable) realSplit).write(out); + } + + @Override + public Path getPath() { + return realSplit.getPath(); + } + + @Override + public long getStart() { + return realSplit.getStart(); + } + +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/ProjectionPusher.java b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/ProjectionPusher.java new file mode 100644 index 0000000..0288e38 --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/ProjectionPusher.java @@ -0,0 +1,174 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
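The wrapper above makes the new-API ParquetInputSplit usable as an old-API split: write() records the concrete split class name before delegating, and readFields() re-instantiates that class reflectively. A hedged sketch of that Writable round-trip, where realSplit stands for a split obtained from ParquetInputFormat (building one by hand needs block metadata and schema strings, so it is left out):

import org.apache.hadoop.hive.ql.io.parquet.ParquetInputSplitWrapper;
import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.DataOutputBuffer;

import parquet.hadoop.ParquetInputSplit;

public class SplitRoundTripSketch {

  // 'realSplit' is a placeholder for a split produced by ParquetInputFormat.getSplits();
  // constructing one by hand needs block metadata and schema strings, so it is elided.
  static ParquetInputSplitWrapper roundTrip(ParquetInputSplit realSplit) throws Exception {
    ParquetInputSplitWrapper original = new ParquetInputSplitWrapper(realSplit);

    // write() stores the concrete split class name, then delegates to the split itself.
    DataOutputBuffer out = new DataOutputBuffer();
    original.write(out);

    // readFields() re-instantiates that class reflectively and reads the split back,
    // which is what happens when MapReduce ships the split to a task.
    DataInputBuffer in = new DataInputBuffer();
    in.reset(out.getData(), out.getLength());
    ParquetInputSplitWrapper copy = new ParquetInputSplitWrapper();
    copy.readFields(in);
    return copy;
  }
}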
+ */ +package org.apache.hadoop.hive.ql.io.parquet; + +import java.io.IOException; +import java.io.Serializable; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Iterator; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.Map.Entry; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.ql.exec.Operator; +import org.apache.hadoop.hive.ql.exec.TableScanOperator; +import org.apache.hadoop.hive.ql.exec.Utilities; +import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc; +import org.apache.hadoop.hive.ql.plan.MapWork; +import org.apache.hadoop.hive.ql.plan.PartitionDesc; +import org.apache.hadoop.hive.ql.plan.TableScanDesc; +import org.apache.hadoop.hive.serde2.ColumnProjectionUtils; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.util.StringUtils; + +public class ProjectionPusher { + + public static final Log LOG = LogFactory.getLog(ProjectionPusher.class); + + private final Map pathToPartitionInfo = + new LinkedHashMap(); + /** + * MapredWork is the Hive object which describes input files, + * columns projections, and filters. + */ + private MapWork mapWork; + + private static final List virtualColumns; + + static { + List vcols = new ArrayList(); + vcols.add("INPUT__FILE__NAME"); + vcols.add("BLOCK__OFFSET__INSIDE__FILE"); + vcols.add("ROW__OFFSET__INSIDE__BLOCK"); + vcols.add("RAW__DATA__SIZE"); + virtualColumns = Collections.unmodifiableList(vcols); + } + + public List getColumns(final String columns) { + final List result = (List) StringUtils.getStringCollection(columns); + result.removeAll(virtualColumns); + return result; + } + + /** + * Sets the mrwork variable based on the current JobConf in order to get all partitions. 
+ * + * @param job + */ + private void updateMrWork(final JobConf job) { + final String plan = HiveConf.getVar(job, HiveConf.ConfVars.PLAN); + if (mapWork == null && plan != null && plan.length() > 0) { + mapWork = Utilities.getMapWork(job); + pathToPartitionInfo.clear(); + for (final Map.Entry entry : mapWork.getPathToPartitionInfo().entrySet()) { + pathToPartitionInfo.put(new Path(entry.getKey()).toUri().getPath().toString(), entry.getValue()); + } + } + } + + private void pushProjectionsAndFilters(final JobConf jobConf, + final String splitPath, final String splitPathWithNoSchema) { + + if (mapWork == null) { + //LOG.debug("Not pushing projections and filters because MapredWork is null"); + return; + } else if (mapWork.getPathToAliases() == null) { + //LOG.debug("Not pushing projections and filters because pathToAliases is null"); + return; + } + + final ArrayList aliases = new ArrayList(); + final Iterator>> iterator = mapWork.getPathToAliases().entrySet().iterator(); + + while (iterator.hasNext()) { + final Entry> entry = iterator.next(); + final String key = new Path(entry.getKey()).toUri().getPath(); + + if (splitPath.equals(key) || splitPathWithNoSchema.equals(key)) { + final ArrayList list = entry.getValue(); + for (final String val : list) { + aliases.add(val); + } + } + } + + for (final String alias : aliases) { + final Operator op = mapWork.getAliasToWork().get( + alias); + if (op != null && op instanceof TableScanOperator) { + final TableScanOperator tableScan = (TableScanOperator) op; + + // push down projections + final List list = tableScan.getNeededColumnIDs(); + + if (list != null) { + ColumnProjectionUtils.appendReadColumnIDs(jobConf, list); + } else { + ColumnProjectionUtils.setFullyReadColumns(jobConf); + } + + pushFilters(jobConf, tableScan); + } + } + } + + private void pushFilters(final JobConf jobConf, final TableScanOperator tableScan) { + + final TableScanDesc scanDesc = tableScan.getConf(); + if (scanDesc == null) { + LOG.debug("Not pushing filters because TableScanDesc is null"); + return; + } + + // construct column name list for reference by filter push down + Utilities.setColumnNameList(jobConf, tableScan); + + // push down filters + final ExprNodeGenericFuncDesc filterExpr = scanDesc.getFilterExpr(); + if (filterExpr == null) { + LOG.debug("Not pushing filters because FilterExpr is null"); + return; + } + + final String filterText = filterExpr.getExprString(); + final String filterExprSerialized = Utilities.serializeExpression(filterExpr); + jobConf.set( + TableScanDesc.FILTER_TEXT_CONF_STR, + filterText); + jobConf.set( + TableScanDesc.FILTER_EXPR_CONF_STR, + filterExprSerialized); + } + + + public JobConf pushProjectionsAndFilters(JobConf jobConf, Path path) + throws IOException { + updateMrWork(jobConf); // TODO: refactor this + final JobConf cloneJobConf = new JobConf(jobConf); + final PartitionDesc part = pathToPartitionInfo.get(path.toString()); + + if ((part != null) && (part.getTableDesc() != null)) { + Utilities.copyTableJobPropertiesToConf(part.getTableDesc(), cloneJobConf); + } + + pushProjectionsAndFilters(cloneJobConf, path.toString(), path.toUri().toString()); + return cloneJobConf; + } +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/ArrayWritableGroupConverter.java b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/ArrayWritableGroupConverter.java new file mode 100644 index 0000000..68fc99e --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/ArrayWritableGroupConverter.java @@ -0,0 
+1,95 @@ +/** + * + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.ql.io.parquet.convert; + +import org.apache.hadoop.io.ArrayWritable; +import org.apache.hadoop.io.Writable; + +import parquet.io.ParquetDecodingException; +import parquet.io.api.Converter; +import parquet.schema.GroupType; + +/** + * + * A ArrayWritableGroupConverter + * + */ +public class ArrayWritableGroupConverter extends HiveGroupConverter { + + private final Converter[] converters; + private final HiveGroupConverter parent; + private final int index; + private final boolean isMap; + private Writable currentValue; + private Writable[] mapPairContainer; + + public ArrayWritableGroupConverter(final GroupType groupType, final HiveGroupConverter parent, final int index) { + this.parent = parent; + this.index = index; + + if (groupType.getFieldCount() == 2) { + converters = new Converter[2]; + converters[0] = getConverterFromDescription(groupType.getType(0), 0, this); + converters[1] = getConverterFromDescription(groupType.getType(1), 1, this); + isMap = true; + } else if (groupType.getFieldCount() == 1) { + converters = new Converter[1]; + converters[0] = getConverterFromDescription(groupType.getType(0), 0, this); + isMap = false; + } else { + throw new RuntimeException("Invalid parquet hive schema: " + groupType); + } + + } + + @Override + public Converter getConverter(final int fieldIndex) { + return converters[fieldIndex]; + } + + @Override + public void start() { + if (isMap) { + mapPairContainer = new Writable[2]; + } + } + + @Override + public void end() { + if (isMap) { + currentValue = new ArrayWritable(Writable.class, mapPairContainer); + } + parent.add(index, currentValue); + } + + @Override + protected void set(final int index, final Writable value) { + if (index != 0 && mapPairContainer == null || index > 1) { + throw new ParquetDecodingException("Repeated group can only have one or two fields for maps. Not allowed to set for the index : " + index); + } + + if (isMap) { + mapPairContainer[index] = value; + } else { + currentValue = value; + } + } + + @Override + protected void add(final int index, final Writable value) { + set(index, value); + } +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/DataWritableGroupConverter.java b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/DataWritableGroupConverter.java new file mode 100644 index 0000000..700f3b3 --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/DataWritableGroupConverter.java @@ -0,0 +1,140 @@ +/** + * + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.ql.io.parquet.convert; + +import java.util.ArrayList; +import java.util.List; + +import org.apache.hadoop.io.ArrayWritable; +import org.apache.hadoop.io.Writable; + +import parquet.io.api.Converter; +import parquet.schema.GroupType; +import parquet.schema.Type; + +/** + * + * A MapWritableGroupConverter, real converter between hive and parquet types recursively for complex types. + * + */ +public class DataWritableGroupConverter extends HiveGroupConverter { + + private final Converter[] converters; + private final HiveGroupConverter parent; + private final int index; + private final Object[] currentArr; + private Writable[] rootMap; + + public DataWritableGroupConverter(final GroupType requestedSchema, final GroupType tableSchema) { + this(requestedSchema, null, 0, tableSchema); + final int fieldCount = tableSchema.getFieldCount(); + this.rootMap = new Writable[fieldCount]; + } + + public DataWritableGroupConverter(final GroupType groupType, final HiveGroupConverter parent, final int index) { + this(groupType, parent, index, groupType); + } + + public DataWritableGroupConverter(final GroupType selectedGroupType, final HiveGroupConverter parent, final int index, final GroupType containingGroupType) { + this.parent = parent; + this.index = index; + final int totalFieldCount = containingGroupType.getFieldCount(); + final int selectedFieldCount = selectedGroupType.getFieldCount(); + + currentArr = new Object[totalFieldCount]; + converters = new Converter[selectedFieldCount]; + + int i = 0; + for (final Type subtype : selectedGroupType.getFields()) { + if (containingGroupType.getFields().contains(subtype)) { + converters[i] = getConverterFromDescription(subtype, containingGroupType.getFieldIndex(subtype.getName()), this); + } else { + throw new RuntimeException("Group type [" + containingGroupType + "] does not contain requested field: " + subtype); + } + ++i; + } + } + + final public ArrayWritable getCurrentArray() { + final Writable[] writableArr; + if (this.rootMap != null) { // We're at the root : we can safely re-use the same map to save perf + writableArr = this.rootMap; + } else { + writableArr = new Writable[currentArr.length]; + } + + for (int i = 0; i < currentArr.length; i++) { + final Object obj = currentArr[i]; + if (obj instanceof List) { + final List objList = (List)obj; + final ArrayWritable arr = new ArrayWritable(Writable.class, objList.toArray(new Writable[objList.size()])); + writableArr[i] = arr; + } else { + writableArr[i] = (Writable) obj; + } + } + return new ArrayWritable(Writable.class, writableArr); + } + + @Override + final protected void set(final int index, final Writable value) { + currentArr[index] = value; + } + + @Override + public Converter getConverter(final int fieldIndex) { + return converters[fieldIndex]; + } + + @Override + public void start() { + for (int i = 0; i < currentArr.length; i++) { + currentArr[i] = null; + } + } + + @Override + public void end() { + if (parent != null) { + parent.set(index, getCurrentArray()); + } + } + + @Override + protected void add(final int index, final Writable value) { + + if (currentArr[index] != null) { + + final Object obj = currentArr[index]; + if (obj instanceof List) { + final List list = (List) obj; + list.add(value); + } else { + throw new RuntimeException("This should be a List: " + obj); + } + + } else { + // create a list here because we don't know the final length of 
the object + // and it is more flexible than ArrayWritable. + // + // converted to ArrayWritable by getCurrentArray(). + final List buffer = new ArrayList(); + buffer.add(value); + currentArr[index] = (Object) buffer; + } + + } +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/DataWritableRecordConverter.java b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/DataWritableRecordConverter.java new file mode 100644 index 0000000..7087f16 --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/DataWritableRecordConverter.java @@ -0,0 +1,46 @@ +/** + * + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.ql.io.parquet.convert; + +import org.apache.hadoop.io.ArrayWritable; + +import parquet.io.api.GroupConverter; +import parquet.io.api.RecordMaterializer; +import parquet.schema.GroupType; + +/** + * + * A MapWritableReadSupport, encapsulates the tuples + * + */ +public class DataWritableRecordConverter extends RecordMaterializer { + + private final DataWritableGroupConverter root; + + public DataWritableRecordConverter(final GroupType requestedSchema, final GroupType tableSchema) { + this.root = new DataWritableGroupConverter(requestedSchema, tableSchema); + } + + @Override + public ArrayWritable getCurrentRecord() { + return root.getCurrentArray(); + } + + @Override + public GroupConverter getRootConverter() { + return root; + } +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/ETypeConverter.java b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/ETypeConverter.java new file mode 100644 index 0000000..f977df2 --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/ETypeConverter.java @@ -0,0 +1,163 @@ +/** + * + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
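DataWritableGroupConverter and DataWritableRecordConverter above are driven by parquet-mr's record assembly during a read; once a record ends, getCurrentRecord() exposes it as an ArrayWritable with one slot per table column. A rough sketch of that interaction for a hypothetical two-column schema, where the start()/addInt()/end() calls stand in for what the library does internally:

import org.apache.hadoop.hive.ql.io.parquet.convert.DataWritableRecordConverter;
import org.apache.hadoop.io.ArrayWritable;

import parquet.io.api.GroupConverter;
import parquet.schema.MessageType;
import parquet.schema.MessageTypeParser;

public class ConverterAssemblySketch {
  public static void main(String[] args) {
    // Hypothetical two-column schema; in a real read both the requested and the
    // table schema come from DataWritableReadSupport's ReadContext.
    MessageType schema = MessageTypeParser.parseMessageType(
        "message hive_schema { optional int32 id; optional binary line; }");
    DataWritableRecordConverter materializer = new DataWritableRecordConverter(schema, schema);

    // Simulate what parquet-mr's record assembly does for one row with id = 1.
    GroupConverter root = materializer.getRootConverter();
    root.start();
    root.getConverter(0).asPrimitiveConverter().addInt(1);
    root.end();

    ArrayWritable row = materializer.getCurrentRecord();
    System.out.println(row.get().length); // one Writable slot per table column -> 2
  }
}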
+ */ +package org.apache.hadoop.hive.ql.io.parquet.convert; + +import java.math.BigDecimal; + +import org.apache.hadoop.hive.ql.io.parquet.writable.BinaryWritable; +import org.apache.hadoop.hive.ql.io.parquet.writable.BinaryWritable.DicBinaryWritable; +import org.apache.hadoop.hive.serde2.io.DoubleWritable; +import org.apache.hadoop.io.BooleanWritable; +import org.apache.hadoop.io.FloatWritable; +import org.apache.hadoop.io.IntWritable; +import org.apache.hadoop.io.LongWritable; + +import parquet.column.Dictionary; +import parquet.io.api.Binary; +import parquet.io.api.Converter; +import parquet.io.api.PrimitiveConverter; + +/** + * + * ETypeConverter is an easy way to set the converter for the right type. + * + * + */ +public enum ETypeConverter { + + EDOUBLE_CONVERTER(Double.TYPE) { + @Override + Converter getConverter(final Class type, final int index, final HiveGroupConverter parent) { + return new PrimitiveConverter() { + @Override + final public void addDouble(final double value) { + parent.set(index, new DoubleWritable(value)); + } + }; + } + }, + EBOOLEAN_CONVERTER(Boolean.TYPE) { + @Override + Converter getConverter(final Class type, final int index, final HiveGroupConverter parent) { + return new PrimitiveConverter() { + @Override + final public void addBoolean(final boolean value) { + parent.set(index, new BooleanWritable(value)); + } + }; + } + }, + EFLOAT_CONVERTER(Float.TYPE) { + @Override + Converter getConverter(final Class type, final int index, final HiveGroupConverter parent) { + return new PrimitiveConverter() { + @Override + final public void addFloat(final float value) { + parent.set(index, new FloatWritable(value)); + } + }; + } + }, + EINT32_CONVERTER(Integer.TYPE) { + @Override + Converter getConverter(final Class type, final int index, final HiveGroupConverter parent) { + return new PrimitiveConverter() { + @Override + final public void addInt(final int value) { + parent.set(index, new IntWritable(value)); + } + }; + } + }, + EINT64_CONVERTER(Long.TYPE) { + @Override + Converter getConverter(final Class type, final int index, final HiveGroupConverter parent) { + return new PrimitiveConverter() { + @Override + final public void addLong(final long value) { + parent.set(index, new LongWritable(value)); + } + }; + } + }, + EINT96_CONVERTER(BigDecimal.class) { + @Override + Converter getConverter(final Class type, final int index, final HiveGroupConverter parent) { + return new PrimitiveConverter() { + @Override + final public void addDouble(final double value) { + parent.set(index, new DoubleWritable(value)); + } + }; + } + }, + EBINARY_CONVERTER(Binary.class) { + @Override + Converter getConverter(final Class type, final int index, final HiveGroupConverter parent) { + return new PrimitiveConverter() { + private Binary[] dictBinary; + private String[] dict; + + @Override + public boolean hasDictionarySupport() { + return true; + } + + @Override + public void setDictionary(Dictionary dictionary) { + dictBinary = new Binary[dictionary.getMaxId() + 1]; + dict = new String[dictionary.getMaxId() + 1]; + for (int i = 0; i <= dictionary.getMaxId(); i++) { + Binary binary = dictionary.decodeToBinary(i); + dictBinary[i] = binary; + dict[i] = binary.toStringUsingUTF8(); + } + } + + @Override + public void addValueFromDictionary(int dictionaryId) { + parent.set(index, new DicBinaryWritable(dictBinary[dictionaryId], dict[dictionaryId])); + } + + @Override + final public void addBinary(Binary value) { + parent.set(index, new BinaryWritable(value)); + } + }; + } + }; + final Class 
_type; + + private ETypeConverter(final Class type) { + this._type = type; + } + + private Class getType() { + return _type; + } + + abstract Converter getConverter(final Class type, final int index, final HiveGroupConverter parent); + + static public Converter getNewConverter(final Class type, final int index, final HiveGroupConverter parent) { + for (final ETypeConverter eConverter : values()) { + if (eConverter.getType() == type) { + return eConverter.getConverter(type, index, parent); + } + } + throw new RuntimeException("Converter not found ... for type : " + type); + } + +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/HiveGroupConverter.java b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/HiveGroupConverter.java new file mode 100644 index 0000000..6138424 --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/HiveGroupConverter.java @@ -0,0 +1,47 @@ +/** + * + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.ql.io.parquet.convert; + +import org.apache.hadoop.io.Writable; + +import parquet.io.api.Converter; +import parquet.io.api.GroupConverter; +import parquet.schema.Type; +import parquet.schema.Type.Repetition; + +public abstract class HiveGroupConverter extends GroupConverter { + + static protected Converter getConverterFromDescription(final Type type, final int index, final HiveGroupConverter parent) { + if (type == null) { + return null; + } + + if (type.isPrimitive()) { + return ETypeConverter.getNewConverter(type.asPrimitiveType().getPrimitiveTypeName().javaType, index, parent); + } else { + if (type.asGroupType().getRepetition() == Repetition.REPEATED) { + return new ArrayWritableGroupConverter(type.asGroupType(), parent, index); + } else { + return new DataWritableGroupConverter(type.asGroupType(), parent, index); + } + } + } + + abstract protected void set(int index, Writable value); + + abstract protected void add(int index, Writable value); + +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/HiveSchemaConverter.java b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/HiveSchemaConverter.java new file mode 100644 index 0000000..7ec91fe --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/HiveSchemaConverter.java @@ -0,0 +1,136 @@ +/** + * + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
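ETypeConverter.getNewConverter above is looked up by the Java class that parquet-mr attaches to each primitive type, which is how HiveGroupConverter.getConverterFromDescription turns a Parquet PrimitiveTypeName into one of these converters. A tiny sketch of that keying (the arrow comments assume parquet-mr's standard javaType values):

import parquet.schema.PrimitiveType.PrimitiveTypeName;

public class TypeDispatchSketch {
  public static void main(String[] args) {
    // The same field the converters key on via getPrimitiveTypeName().javaType,
    // e.g. INT32 -> int (Integer.TYPE), INT64 -> long, BINARY -> parquet.io.api.Binary.
    for (PrimitiveTypeName name : PrimitiveTypeName.values()) {
      System.out.println(name + " -> " + name.javaType);
    }
  }
}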
+ */ +package org.apache.hadoop.hive.ql.io.parquet.convert; + +import java.util.List; + +import org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category; +import org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.MapTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; + +import parquet.Log; +import parquet.schema.GroupType; +import parquet.schema.MessageType; +import parquet.schema.OriginalType; +import parquet.schema.PrimitiveType; +import parquet.schema.PrimitiveType.PrimitiveTypeName; +import parquet.schema.Type; +import parquet.schema.Type.Repetition; + +/** + * + * A HiveSchemaConverter + * + * + */ +public class HiveSchemaConverter { + + private static final Log LOG = Log.getLog(HiveSchemaConverter.class); + + static public MessageType convert(final List columnNames, final List columnTypes) { + final MessageType schema = new MessageType("hive_schema", convertTypes(columnNames, columnTypes)); + return schema; + } + + static private Type[] convertTypes(final List columnNames, final List columnTypes) { + if (columnNames.size() != columnTypes.size()) { + throw new RuntimeException("Mismatched Hive columns and types. Hive columns names found : " + columnNames + + " . And Hive types found : " + columnTypes); + } + + final Type[] types = new Type[columnNames.size()]; + + for (int i = 0; i < columnNames.size(); ++i) { + types[i] = convertType(columnNames.get(i), columnTypes.get(i)); + } + + return types; + } + + static private Type convertType(final String name, final TypeInfo typeInfo) { + return convertType(name, typeInfo, Repetition.OPTIONAL); + } + + static private Type convertType(final String name, final TypeInfo typeInfo, final Repetition repetition) { + if (typeInfo.getCategory().equals(Category.PRIMITIVE)) { + if (typeInfo.equals(TypeInfoFactory.stringTypeInfo)) { + return new PrimitiveType(repetition, PrimitiveTypeName.BINARY, name); + } else if (typeInfo.equals(TypeInfoFactory.intTypeInfo) || typeInfo.equals(TypeInfoFactory.shortTypeInfo) || typeInfo.equals(TypeInfoFactory.byteTypeInfo)) { + return new PrimitiveType(repetition, PrimitiveTypeName.INT32, name); + } else if (typeInfo.equals(TypeInfoFactory.longTypeInfo)) { + return new PrimitiveType(repetition, PrimitiveTypeName.INT64, name); + } else if (typeInfo.equals(TypeInfoFactory.doubleTypeInfo)) { + return new PrimitiveType(repetition, PrimitiveTypeName.DOUBLE, name); + } else if (typeInfo.equals(TypeInfoFactory.floatTypeInfo)) { + return new PrimitiveType(repetition, PrimitiveTypeName.FLOAT, name); + } else if (typeInfo.equals(TypeInfoFactory.booleanTypeInfo)) { + return new PrimitiveType(repetition, PrimitiveTypeName.BOOLEAN, name); + } else if (typeInfo.equals(TypeInfoFactory.binaryTypeInfo)) { + // TODO : binaryTypeInfo is a byte array. 
Need to map it + throw new UnsupportedOperationException("Binary type not implemented"); + } else if (typeInfo.equals(TypeInfoFactory.timestampTypeInfo)) { + throw new UnsupportedOperationException("Timestamp type not implemented"); + } else if (typeInfo.equals(TypeInfoFactory.voidTypeInfo)) { + throw new UnsupportedOperationException("Void type not implemented"); + } else if (typeInfo.equals(TypeInfoFactory.unknownTypeInfo)) { + throw new UnsupportedOperationException("Unknown type not implemented"); + } else { + throw new RuntimeException("Unknown type: " + typeInfo); + } + } else if (typeInfo.getCategory().equals(Category.LIST)) { + return convertArrayType(name, (ListTypeInfo) typeInfo); + } else if (typeInfo.getCategory().equals(Category.STRUCT)) { + return convertStructType(name, (StructTypeInfo) typeInfo); + } else if (typeInfo.getCategory().equals(Category.MAP)) { + return convertMapType(name, (MapTypeInfo) typeInfo); + } else if (typeInfo.getCategory().equals(Category.UNION)) { + throw new UnsupportedOperationException("Union type not implemented"); + } else { + throw new RuntimeException("Unknown type: " + typeInfo); + } + } + + // An optional group containing a repeated anonymous group "bag", containing + // 1 anonymous element "array_element" + static private GroupType convertArrayType(final String name, final ListTypeInfo typeInfo) { + final TypeInfo subType = typeInfo.getListElementTypeInfo(); + return listWrapper(name, OriginalType.LIST, new GroupType(Repetition.REPEATED, ParquetHiveSerDe.ARRAY.toString(), convertType("array_element", subType))); + } + + // An optional group containing multiple elements + static private GroupType convertStructType(final String name, final StructTypeInfo typeInfo) { + final List columnNames = typeInfo.getAllStructFieldNames(); + final List columnTypes = typeInfo.getAllStructFieldTypeInfos(); + return new GroupType(Repetition.OPTIONAL, name, convertTypes(columnNames, columnTypes)); + + } + + // An optional group containing a repeated anonymous group "map", containing + // 2 elements: "key", "value" + static private GroupType convertMapType(final String name, final MapTypeInfo typeInfo) { + final Type keyType = convertType(ParquetHiveSerDe.MAP_KEY.toString(), typeInfo.getMapKeyTypeInfo(), Repetition.REQUIRED); + final Type valueType = convertType(ParquetHiveSerDe.MAP_VALUE.toString(), typeInfo.getMapValueTypeInfo()); + return listWrapper(name, OriginalType.MAP_KEY_VALUE, new GroupType(Repetition.REPEATED, ParquetHiveSerDe.MAP.toString(), keyType, valueType)); + } + + static private GroupType listWrapper(final String name, final OriginalType originalType, final GroupType groupType) { + return new GroupType(Repetition.OPTIONAL, name, originalType, groupType); + } +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/read/DataWritableReadSupport.java b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/read/DataWritableReadSupport.java new file mode 100644 index 0000000..ff16988 --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/read/DataWritableReadSupport.java @@ -0,0 +1,135 @@ +/** + * + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
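HiveSchemaConverter above is what MapredParquetOutputFormat.getHiveRecordWriter feeds with the column names and TypeInfos parsed from the "columns" and "columns.types" table properties. A small, hedged sketch of the conversion for a layout like the parquet_create.txt fixture (the column names are made up):

import java.util.Arrays;
import java.util.List;

import org.apache.hadoop.hive.ql.io.parquet.convert.HiveSchemaConverter;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;

import parquet.schema.MessageType;

public class SchemaConversionSketch {
  public static void main(String[] args) {
    // Column names are hypothetical; the type string mirrors the fixture's layout.
    List<String> names = Arrays.asList("id", "line", "kv", "lst", "strings");
    List<TypeInfo> types = TypeInfoUtils.getTypeInfosFromTypeString(
        "int,string,map<string,string>,array<string>,array<string>");

    MessageType schema = HiveSchemaConverter.convert(names, types);

    // Prints a "message hive_schema { ... }" schema: int becomes optional int32,
    // string becomes optional binary, and the map/array types become the wrapped
    // repeated groups built by convertMapType/convertArrayType above.
    System.out.println(schema);
  }
}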
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.ql.io.parquet.read; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hive.ql.io.parquet.convert.DataWritableRecordConverter; +import org.apache.hadoop.hive.serde2.ColumnProjectionUtils; +import org.apache.hadoop.io.ArrayWritable; +import org.apache.hadoop.util.StringUtils; + +import parquet.hadoop.api.ReadSupport; +import parquet.io.api.RecordMaterializer; +import parquet.schema.MessageType; +import parquet.schema.MessageTypeParser; +import parquet.schema.PrimitiveType; +import parquet.schema.PrimitiveType.PrimitiveTypeName; +import parquet.schema.Type; +import parquet.schema.Type.Repetition; + +/** + * + * A MapWritableReadSupport + * + * Manages the translation between Hive and Parquet + * + */ +public class DataWritableReadSupport extends ReadSupport { + + public static final String HIVE_SCHEMA_KEY = "HIVE_TABLE_SCHEMA"; + private static final List virtualColumns; + + static { + List vcols = new ArrayList(); + vcols.add("INPUT__FILE__NAME"); + vcols.add("BLOCK__OFFSET__INSIDE__FILE"); + vcols.add("ROW__OFFSET__INSIDE__BLOCK"); + vcols.add("RAW__DATA__SIZE"); + virtualColumns = Collections.unmodifiableList(vcols); + } + + /** + * From a string which columns names (including hive column), return a list + * of string columns + * + * @param comma separated list of columns + * @return list with virtual columns removed + */ + private static List getColumns(final String columns) { + final List result = (List) StringUtils.getStringCollection(columns); + result.removeAll(virtualColumns); + return result; + } + /** + * + * It creates the readContext for Parquet side with the requested schema during the init phase. 
+ * + * @param configuration needed to get the wanted columns + * @param keyValueMetaData // unused + * @param fileSchema parquet file schema + * @return the parquet ReadContext + */ + @Override + public parquet.hadoop.api.ReadSupport.ReadContext init(final Configuration configuration, final Map keyValueMetaData, final MessageType fileSchema) { + final String columns = configuration.get("columns"); + final Map contextMetadata = new HashMap(); + if (columns != null) { + final List listColumns = getColumns(columns); + + final List typeListTable = new ArrayList(); + for (final String col : listColumns) { + if (fileSchema.containsField(col)) { + typeListTable.add(fileSchema.getType(col)); + } else { // dummy type, should not be called + typeListTable.add(new PrimitiveType(Repetition.OPTIONAL, PrimitiveTypeName.BINARY, col)); + } + } + MessageType tableSchema = new MessageType("table_schema", typeListTable); + contextMetadata.put(HIVE_SCHEMA_KEY, tableSchema.toString()); + + MessageType requestedSchemaByUser = tableSchema; + final List indexColumnsWanted = ColumnProjectionUtils.getReadColumnIDs(configuration); + + final List typeListWanted = new ArrayList(); + for (final Integer idx : indexColumnsWanted) { + typeListWanted.add(tableSchema.getType(listColumns.get(idx))); + } + requestedSchemaByUser = new MessageType(fileSchema.getName(), typeListWanted); + + return new ReadContext(requestedSchemaByUser, contextMetadata); + } else { + contextMetadata.put(HIVE_SCHEMA_KEY, fileSchema.toString()); + return new ReadContext(fileSchema, contextMetadata); + } + } + + /** + * + * It creates the hive read support to interpret data from parquet to hive + * + * @param configuration // unused + * @param keyValueMetaData + * @param fileSchema // unused + * @param readContext containing the requested schema and the schema of the hive table + * @return Record Materialize for Hive + */ + @Override + public RecordMaterializer prepareForRead(final Configuration configuration, final Map keyValueMetaData, final MessageType fileSchema, + final parquet.hadoop.api.ReadSupport.ReadContext readContext) { + final Map metadata = readContext.getReadSupportMetadata(); + if (metadata == null) { + throw new RuntimeException("ReadContext not initialized properly. Don't know the Hive Schema."); + } + final MessageType tableSchema = MessageTypeParser.parseMessageType(metadata.get(HIVE_SCHEMA_KEY)); + return new DataWritableRecordConverter(readContext.getRequestedSchema(), tableSchema); + } +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/read/ParquetRecordReaderWrapper.java b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/read/ParquetRecordReaderWrapper.java new file mode 100644 index 0000000..bbdeca2 --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/read/ParquetRecordReaderWrapper.java @@ -0,0 +1,235 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
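DataWritableReadSupport above picks the requested schema from the "columns" property and the projected column ids; the ParquetRecordReaderWrapper that follows plugs that into the deprecated mapred API through MapredParquetInputFormat. A rough end-to-end sketch of the read path, assuming the Void/ArrayWritable key-value types implied by the wrapper's createKey()/next() signatures (the path and column list are placeholders that Hive normally supplies):

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat;
import org.apache.hadoop.io.ArrayWritable;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;

public class ReadPathSketch {
  public static void main(String[] args) throws Exception {
    // Placeholder path and column list; Hive fills these in (along with the
    // projected column ids) before handing the JobConf to the input format.
    JobConf job = new JobConf();
    job.set("columns", "id,line");
    FileInputFormat.setInputPaths(job, new Path("/tmp/parquet_table"));

    MapredParquetInputFormat inputFormat = new MapredParquetInputFormat();
    InputSplit[] splits = inputFormat.getSplits(job, 1);

    for (InputSplit split : splits) {
      RecordReader<Void, ArrayWritable> reader =
          inputFormat.getRecordReader(split, job, Reporter.NULL);
      ArrayWritable row = reader.createValue();
      while (reader.next(null, row)) {
        // 'row' holds one Writable per column of the table schema
      }
      reader.close();
    }
  }
}

Note that the wrapper's constructor already reads the first record, which is why createValue() can hand back a correctly sized ArrayWritable before the first next() call.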
+ */ +package org.apache.hadoop.hive.ql.io.parquet.read; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.ql.io.parquet.ParquetInputSplitWrapper; +import org.apache.hadoop.hive.ql.io.parquet.ProjectionPusher; +import org.apache.hadoop.io.ArrayWritable; +import org.apache.hadoop.io.Writable; +import org.apache.hadoop.mapred.FileSplit; +import org.apache.hadoop.mapred.InputSplit; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.RecordReader; +import org.apache.hadoop.mapred.Reporter; +import org.apache.hadoop.mapreduce.TaskAttemptContext; +import org.apache.hadoop.mapreduce.TaskAttemptID; + +import parquet.hadoop.ParquetFileReader; +import parquet.hadoop.ParquetInputFormat; +import parquet.hadoop.ParquetInputSplit; +import parquet.hadoop.api.ReadSupport.ReadContext; +import parquet.hadoop.metadata.BlockMetaData; +import parquet.hadoop.metadata.FileMetaData; +import parquet.hadoop.metadata.ParquetMetadata; +import parquet.hadoop.util.ContextUtil; +import parquet.schema.MessageTypeParser; + +public class ParquetRecordReaderWrapper implements RecordReader { + public static final Log LOG = LogFactory.getLog(ParquetRecordReaderWrapper.class); + + private final long splitLen; // for getPos() + + private org.apache.hadoop.mapreduce.RecordReader realReader; + // expect readReader return same Key & Value objects (common case) + // this avoids extra serialization & deserialization of these objects + private ArrayWritable valueObj = null; + private boolean firstRecord = false; + private boolean eof = false; + private int schemaSize; + + private final ProjectionPusher projectionPusher; + + public ParquetRecordReaderWrapper( + final ParquetInputFormat newInputFormat, + final InputSplit oldSplit, + final JobConf oldJobConf, + final Reporter reporter, + final ProjectionPusher pusher) + throws IOException, InterruptedException { + this.splitLen = oldSplit.getLength(); + this.projectionPusher = pusher; + + final ParquetInputSplit split = getSplit(oldSplit, oldJobConf); + + TaskAttemptID taskAttemptID = TaskAttemptID.forName(oldJobConf.get("mapred.task.id")); + if (taskAttemptID == null) { + taskAttemptID = new TaskAttemptID(); + } + + // create a TaskInputOutputContext + final TaskAttemptContext taskContext = ContextUtil.newTaskAttemptContext(oldJobConf, taskAttemptID); + + if (split != null) { + try { + realReader = newInputFormat.createRecordReader(split, taskContext); + realReader.initialize(split, taskContext); + + // read once to gain access to key and value objects + if (realReader.nextKeyValue()) { + firstRecord = true; + valueObj = realReader.getCurrentValue(); + } else { + eof = true; + } + } catch (final InterruptedException e) { + throw new IOException(e); + } + } else { + realReader = null; + eof = true; + if (valueObj == null) { // Should initialize the value for createValue + valueObj = new ArrayWritable(Writable.class, new Writable[schemaSize]); + } + } + } + + @Override + public void close() throws IOException { + if (realReader != null) { + realReader.close(); + } + } + + @Override + public Void createKey() { + return null; + } + + @Override + public ArrayWritable createValue() { + return valueObj; + } + + @Override + public long getPos() throws IOException { + return (long) (splitLen * getProgress()); + } + + @Override + public float getProgress() throws IOException { + if 
(realReader == null) { + return 1f; + } else { + try { + return realReader.getProgress(); + } catch (final InterruptedException e) { + throw new IOException(e); + } + } + } + + @Override + public boolean next(final Void key, final ArrayWritable value) throws IOException { + if (eof) { + return false; + } + + try { + if (firstRecord) { // key & value are already read. + firstRecord = false; + } else if (!realReader.nextKeyValue()) { + eof = true; // strictly not required, just for consistency + return false; + } + + final ArrayWritable tmpCurValue = realReader.getCurrentValue(); + + if (value != tmpCurValue) { + final Writable[] arrValue = value.get(); + final Writable[] arrCurrent = tmpCurValue.get(); + if (value != null && arrValue.length == arrCurrent.length) { + System.arraycopy(arrCurrent, 0, arrValue, 0, arrCurrent.length); + } else { + if (arrValue.length != arrCurrent.length) { + throw new IOException("DeprecatedParquetHiveInput : size of object differs. Value size : " + arrValue.length + ", Current Object size : " + + arrCurrent.length); + } else { + throw new IOException("DeprecatedParquetHiveInput can not support RecordReaders that don't return same key & value & value is null"); + } + } + } + return true; + + } catch (final InterruptedException e) { + throw new IOException(e); + } + } + + /** + * gets a ParquetInputSplit corresponding to a split given by Hive + * + * @param oldSplit The split given by Hive + * @param conf The JobConf of the Hive job + * @return a ParquetInputSplit corresponding to the oldSplit + * @throws IOException if the config cannot be enhanced or if the footer cannot be read from the file + */ + protected ParquetInputSplit getSplit( + final InputSplit oldSplit, + final JobConf conf + ) throws IOException { + + ParquetInputSplit split; + + if (oldSplit instanceof ParquetInputSplitWrapper) { + split = ((ParquetInputSplitWrapper) oldSplit).getRealSplit(); + } else if (oldSplit instanceof FileSplit) { + final Path finalPath = ((FileSplit) oldSplit).getPath(); + final JobConf cloneJob = projectionPusher.pushProjectionsAndFilters(conf, finalPath.getParent()); + + final ParquetMetadata parquetMetadata = ParquetFileReader.readFooter(cloneJob, finalPath); + final List blocks = parquetMetadata.getBlocks(); + final FileMetaData fileMetaData = parquetMetadata.getFileMetaData(); + + final ReadContext readContext = new DataWritableReadSupport().init(cloneJob, fileMetaData.getKeyValueMetaData(), fileMetaData.getSchema()); + schemaSize = MessageTypeParser.parseMessageType(readContext.getReadSupportMetadata().get(DataWritableReadSupport.HIVE_SCHEMA_KEY)).getFieldCount(); + + final List splitGroup = new ArrayList(); + final long splitStart = ((FileSplit) oldSplit).getStart(); + final long splitLength = ((FileSplit) oldSplit).getLength(); + for (final BlockMetaData block : blocks) { + final long firstDataPage = block.getColumns().get(0).getFirstDataPageOffset(); + if (firstDataPage >= splitStart && firstDataPage < splitStart + splitLength) { + splitGroup.add(block); + } + } + + if (splitGroup.isEmpty()) { + LOG.warn("Skipping split, could not find row group in: " + (FileSplit) oldSplit); + split = null; + } else { + split = new ParquetInputSplit(finalPath, + splitStart, + splitLength, + ((FileSplit) oldSplit).getLocations(), + splitGroup, + readContext.getRequestedSchema().toString(), + fileMetaData.getSchema().toString(), + fileMetaData.getKeyValueMetaData(), + readContext.getReadSupportMetadata()); + } + + } else { + throw new IllegalArgumentException("Unknown split type: 
" + oldSplit); + } + + return split; + } +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/serde/AbstractParquetMapInspector.java b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/serde/AbstractParquetMapInspector.java new file mode 100644 index 0000000..7e00c4b --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/serde/AbstractParquetMapInspector.java @@ -0,0 +1,162 @@ +/** + * + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.ql.io.parquet.serde; + +import java.util.HashMap; +import java.util.Map; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.SettableMapObjectInspector; +import org.apache.hadoop.io.ArrayWritable; +import org.apache.hadoop.io.Writable; + +public abstract class AbstractParquetMapInspector implements SettableMapObjectInspector { + + protected final ObjectInspector keyInspector; + protected final ObjectInspector valueInspector; + + public AbstractParquetMapInspector(final ObjectInspector keyInspector, final ObjectInspector valueInspector) { + this.keyInspector = keyInspector; + this.valueInspector = valueInspector; + } + + @Override + public String getTypeName() { + return "map<" + keyInspector.getTypeName() + "," + valueInspector.getTypeName() + ">"; + } + + @Override + public Category getCategory() { + return Category.MAP; + } + + @Override + public ObjectInspector getMapKeyObjectInspector() { + return keyInspector; + } + + @Override + public ObjectInspector getMapValueObjectInspector() { + return valueInspector; + } + + @Override + public Map getMap(final Object data) { + if (data == null) { + return null; + } + + if (data instanceof ArrayWritable) { + final Writable[] mapContainer = ((ArrayWritable) data).get(); + + if (mapContainer == null || mapContainer.length == 0) { + return null; + } + + final Writable[] mapArray = ((ArrayWritable) mapContainer[0]).get(); + final Map map = new HashMap(); + + for (final Writable obj : mapArray) { + final ArrayWritable mapObj = (ArrayWritable) obj; + final Writable[] arr = mapObj.get(); + map.put(arr[0], arr[1]); + } + + return map; + } + + if (data instanceof Map) { + return (Map) data; + } + + throw new UnsupportedOperationException("Cannot inspect " + data.getClass().getCanonicalName()); + } + + @Override + public int getMapSize(final Object data) { + if (data == null) { + return -1; + } + + if (data instanceof ArrayWritable) { + final Writable[] mapContainer = ((ArrayWritable) data).get(); + + if (mapContainer == null || mapContainer.length == 0) { + return -1; + } else { + return ((ArrayWritable) mapContainer[0]).get().length; + } + } + + if (data instanceof Map) { + return ((Map) data).size(); + } + + throw new UnsupportedOperationException("Cannot inspect " + data.getClass().getCanonicalName()); + } + + @Override + public Object create() { + Map m = new HashMap(); + return m; + } + + @Override + public Object put(Object map, Object key, Object value) { + Map m = (HashMap) 
map; + m.put(key, value); + return m; + } + + @Override + public Object remove(Object map, Object key) { + Map m = (HashMap) map; + m.remove(key); + return m; + } + + @Override + public Object clear(Object map) { + Map m = (HashMap) map; + m.clear(); + return m; + } + + @Override + public boolean equals(Object obj) { + if (obj == null) { + return false; + } + if (getClass() != obj.getClass()) { + return false; + } + final StandardParquetHiveMapInspector other = (StandardParquetHiveMapInspector) obj; + if (this.keyInspector != other.keyInspector && (this.keyInspector == null || !this.keyInspector.equals(other.keyInspector))) { + return false; + } + if (this.valueInspector != other.valueInspector && (this.valueInspector == null || !this.valueInspector.equals(other.valueInspector))) { + return false; + } + return true; + } + + @Override + public int hashCode() { + int hash = 7; + hash = 59 * hash + (this.keyInspector != null ? this.keyInspector.hashCode() : 0); + hash = 59 * hash + (this.valueInspector != null ? this.valueInspector.hashCode() : 0); + return hash; + } +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/serde/ArrayWritableObjectInspector.java b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/serde/ArrayWritableObjectInspector.java new file mode 100644 index 0000000..829a3df --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/serde/ArrayWritableObjectInspector.java @@ -0,0 +1,224 @@ +/** + * + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.ql.io.parquet.serde; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; + +import org.apache.hadoop.hive.ql.io.parquet.serde.primitive.ParquetPrimitiveInspectorFactory; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.SettableStructObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.StructField; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; +import org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.MapTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; +import org.apache.hadoop.io.ArrayWritable; + +/** + * + * The ArrayWritableObjectInspector will inspect an ArrayWritable, considering it as a Hive struct.
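+ * Struct fields are stored positionally: field i of the struct is element i of the underlying ArrayWritable.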
+ * It can also inspect a List if Hive decides to inspect the result of an inspection. + * + */ +public class ArrayWritableObjectInspector extends SettableStructObjectInspector { + + private final TypeInfo typeInfo; + private final List fieldInfos; + private final List fieldNames; + private final List fields; + private final HashMap fieldsByName; + + public ArrayWritableObjectInspector(final StructTypeInfo rowTypeInfo) { + + typeInfo = rowTypeInfo; + fieldNames = rowTypeInfo.getAllStructFieldNames(); + fieldInfos = rowTypeInfo.getAllStructFieldTypeInfos(); + fields = new ArrayList(fieldNames.size()); + fieldsByName = new HashMap(); + + for (int i = 0; i < fieldNames.size(); ++i) { + final String name = fieldNames.get(i); + final TypeInfo fieldInfo = fieldInfos.get(i); + + final StructFieldImpl field = new StructFieldImpl(name, getObjectInspector(fieldInfo), i); + fields.add(field); + fieldsByName.put(name, field); + } + } + + private ObjectInspector getObjectInspector(final TypeInfo typeInfo) { + if (typeInfo.equals(TypeInfoFactory.doubleTypeInfo)) { + return PrimitiveObjectInspectorFactory.writableDoubleObjectInspector; + } else if (typeInfo.equals(TypeInfoFactory.booleanTypeInfo)) { + return PrimitiveObjectInspectorFactory.writableBooleanObjectInspector; + } else if (typeInfo.equals(TypeInfoFactory.floatTypeInfo)) { + return PrimitiveObjectInspectorFactory.writableFloatObjectInspector; + } else if (typeInfo.equals(TypeInfoFactory.intTypeInfo)) { + return PrimitiveObjectInspectorFactory.writableIntObjectInspector; + } else if (typeInfo.equals(TypeInfoFactory.longTypeInfo)) { + return PrimitiveObjectInspectorFactory.writableLongObjectInspector; + } else if (typeInfo.equals(TypeInfoFactory.stringTypeInfo)) { + return ParquetPrimitiveInspectorFactory.parquetStringInspector; + } else if (typeInfo.getCategory().equals(Category.STRUCT)) { + return new ArrayWritableObjectInspector((StructTypeInfo) typeInfo); + } else if (typeInfo.getCategory().equals(Category.LIST)) { + final TypeInfo subTypeInfo = ((ListTypeInfo) typeInfo).getListElementTypeInfo(); + return new ParquetHiveArrayInspector(getObjectInspector(subTypeInfo)); + } else if (typeInfo.getCategory().equals(Category.MAP)) { + final TypeInfo keyTypeInfo = ((MapTypeInfo) typeInfo).getMapKeyTypeInfo(); + final TypeInfo valueTypeInfo = ((MapTypeInfo) typeInfo).getMapValueTypeInfo(); + if (keyTypeInfo.equals(TypeInfoFactory.stringTypeInfo) || keyTypeInfo.equals(TypeInfoFactory.byteTypeInfo) + || keyTypeInfo.equals(TypeInfoFactory.shortTypeInfo)) { + return new DeepParquetHiveMapInspector(getObjectInspector(keyTypeInfo), getObjectInspector(valueTypeInfo)); + } else { + return new StandardParquetHiveMapInspector(getObjectInspector(keyTypeInfo), getObjectInspector(valueTypeInfo)); + } + } else if (typeInfo.equals(TypeInfoFactory.timestampTypeInfo)) { + throw new UnsupportedOperationException("timestamp not implemented yet"); + } else if (typeInfo.equals(TypeInfoFactory.byteTypeInfo)) { + return ParquetPrimitiveInspectorFactory.parquetByteInspector; + } else if (typeInfo.equals(TypeInfoFactory.shortTypeInfo)) { + return ParquetPrimitiveInspectorFactory.parquetShortInspector; + } else { + throw new RuntimeException("Unknown field info: " + typeInfo); + } + + } + + @Override + public Category getCategory() { + return Category.STRUCT; + } + + @Override + public String getTypeName() { + return typeInfo.getTypeName(); + } + + @Override + public List getAllStructFieldRefs() { + return fields; + } + + @Override + public Object getStructFieldData(final 
Object data, final StructField fieldRef) { + if (data == null) { + return null; + } + + if (data instanceof ArrayWritable) { + final ArrayWritable arr = (ArrayWritable) data; + return arr.get()[((StructFieldImpl) fieldRef).getIndex()]; + } + + throw new UnsupportedOperationException("Cannot inspect " + data.getClass().getCanonicalName()); + } + + @Override + public StructField getStructFieldRef(final String name) { + return fieldsByName.get(name); + } + + @Override + public List getStructFieldsDataAsList(final Object data) { + if (data == null) { + return null; + } + + if (data instanceof ArrayWritable) { + final ArrayWritable arr = (ArrayWritable) data; + final Object[] arrWritable = arr.get(); + return new ArrayList(Arrays.asList(arrWritable)); + } + + throw new UnsupportedOperationException("Cannot inspect " + data.getClass().getCanonicalName()); + } + + @Override + public Object create() { + final ArrayList list = new ArrayList(fields.size()); + for (int i = 0; i < fields.size(); ++i) { + list.add(null); + } + return list; + } + + @Override + public Object setStructFieldData(Object struct, StructField field, Object fieldValue) { + final ArrayList list = (ArrayList) struct; + list.set(((StructFieldImpl) field).getIndex(), fieldValue); + return list; + } + + @Override + public boolean equals(Object obj) { + if (obj == null) { + return false; + } + if (getClass() != obj.getClass()) { + return false; + } + final ArrayWritableObjectInspector other = (ArrayWritableObjectInspector) obj; + if (this.typeInfo != other.typeInfo && (this.typeInfo == null || !this.typeInfo.equals(other.typeInfo))) { + return false; + } + return true; + } + + @Override + public int hashCode() { + int hash = 5; + hash = 29 * hash + (this.typeInfo != null ? this.typeInfo.hashCode() : 0); + return hash; + } + + class StructFieldImpl implements StructField { + + private final String name; + private final ObjectInspector inspector; + private final int index; + + public StructFieldImpl(final String name, final ObjectInspector inspector, final int index) { + this.name = name; + this.inspector = inspector; + this.index = index; + } + + @Override + public String getFieldComment() { + return ""; + } + + @Override + public String getFieldName() { + return name; + } + + public int getIndex() { + return index; + } + + @Override + public ObjectInspector getFieldObjectInspector() { + return inspector; + } + } +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/serde/DeepParquetHiveMapInspector.java b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/serde/DeepParquetHiveMapInspector.java new file mode 100644 index 0000000..927c9dd --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/serde/DeepParquetHiveMapInspector.java @@ -0,0 +1,83 @@ +/** + * + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.hadoop.hive.ql.io.parquet.serde; + +import java.util.Map; + +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector; +import org.apache.hadoop.io.ArrayWritable; +import org.apache.hadoop.io.Writable; + +/** + * The DeepParquetHiveMapInspector will inspect an ArrayWritable, considering it as a Hive map.
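+ * The writable form is an outer ArrayWritable whose single element is an ArrayWritable of two-element {key, value} ArrayWritables.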
+ * It can also inspect a Map if Hive decides to inspect the result of an inspection.
+ * When trying to access elements from the map it will iterate over all keys, inspecting them and comparing them to the + * desired key. + */ +public class DeepParquetHiveMapInspector extends AbstractParquetMapInspector { + + public DeepParquetHiveMapInspector(final ObjectInspector keyInspector, final ObjectInspector valueInspector) { + super(keyInspector, valueInspector); + } + + @Override + public Object getMapValueElement(final Object data, final Object key) { + if (data == null || key == null) { + return null; + } + + if (data instanceof ArrayWritable) { + final Writable[] mapContainer = ((ArrayWritable) data).get(); + + if (mapContainer == null || mapContainer.length == 0) { + return null; + } + + final Writable[] mapArray = ((ArrayWritable) mapContainer[0]).get(); + + for (final Writable obj : mapArray) { + final ArrayWritable mapObj = (ArrayWritable) obj; + final Writable[] arr = mapObj.get(); + if (key.equals(arr[0]) || key.equals(((PrimitiveObjectInspector) keyInspector).getPrimitiveJavaObject(arr[0])) + || key.equals(((PrimitiveObjectInspector) keyInspector).getPrimitiveWritableObject(arr[0]))) { + return arr[1]; + } + } + + return null; + } + + if (data instanceof Map) { + final Map map = (Map) data; + + if (map.containsKey(key)) { + return map.get(key); + } + + for (final Map.Entry entry : map.entrySet()) { + if (key.equals(((PrimitiveObjectInspector) keyInspector).getPrimitiveJavaObject(entry.getKey())) + || key.equals(((PrimitiveObjectInspector) keyInspector).getPrimitiveWritableObject(entry.getKey()))) { + return entry.getValue(); + } + } + + return null; + } + + throw new UnsupportedOperationException("Cannot inspect " + data.getClass().getCanonicalName()); + } +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/serde/ParquetHiveArrayInspector.java b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/serde/ParquetHiveArrayInspector.java new file mode 100644 index 0000000..853d3a4 --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/serde/ParquetHiveArrayInspector.java @@ -0,0 +1,186 @@ +/** + * + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.ql.io.parquet.serde; + +import java.util.ArrayList; +import java.util.List; + +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.SettableListObjectInspector; +import org.apache.hadoop.io.ArrayWritable; +import org.apache.hadoop.io.Writable; + +/** + * The ParquetHiveArrayInspector will inspect an ArrayWritable, considering it as an Hive array.
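+ * The writable form is an outer ArrayWritable whose single element is an ArrayWritable containing the list elements.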
+ * It can also inspect a List if Hive decides to inspect the result of an inspection. + */ +public class ParquetHiveArrayInspector implements SettableListObjectInspector { + + ObjectInspector arrayElementInspector; + + public ParquetHiveArrayInspector(final ObjectInspector arrayElementInspector) { + this.arrayElementInspector = arrayElementInspector; + } + + @Override + public String getTypeName() { + return "array<" + arrayElementInspector.getTypeName() + ">"; + } + + @Override + public Category getCategory() { + return Category.LIST; + } + + @Override + public ObjectInspector getListElementObjectInspector() { + return arrayElementInspector; + } + + @Override + public Object getListElement(final Object data, final int index) { + if (data == null) { + return null; + } + + if (data instanceof ArrayWritable) { + final Writable[] listContainer = ((ArrayWritable) data).get(); + + if (listContainer == null || listContainer.length == 0) { + return null; + } + + final Writable subObj = listContainer[0]; + + if (subObj == null) { + return null; + } + + if (index >= 0 && index < ((ArrayWritable) subObj).get().length) { + return ((ArrayWritable) subObj).get()[index]; + } else { + return null; + } + } + + throw new UnsupportedOperationException("Cannot inspect " + data.getClass().getCanonicalName()); + } + + @Override + public int getListLength(final Object data) { + if (data == null) { + return -1; + } + + if (data instanceof ArrayWritable) { + final Writable[] listContainer = ((ArrayWritable) data).get(); + + if (listContainer == null || listContainer.length == 0) { + return -1; + } + + final Writable subObj = listContainer[0]; + + if (subObj == null) { + return 0; + } + + return ((ArrayWritable) subObj).get().length; + } + + throw new UnsupportedOperationException("Cannot inspect " + data.getClass().getCanonicalName()); + } + + @Override + public List getList(final Object data) { + if (data == null) { + return null; + } + + if (data instanceof ArrayWritable) { + final Writable[] listContainer = ((ArrayWritable) data).get(); + + if (listContainer == null || listContainer.length == 0) { + return null; + } + + final Writable subObj = listContainer[0]; + + if (subObj == null) { + return null; + } + + final Writable[] array = ((ArrayWritable) subObj).get(); + final List list = new ArrayList(); + + for (final Writable obj : array) { + list.add(obj); + } + + return list; + } + + throw new UnsupportedOperationException("Cannot inspect " + data.getClass().getCanonicalName()); + } + + @Override + public Object create(final int size) { + final ArrayList result = new ArrayList(size); + for (int i = 0; i < size; ++i) { + result.add(null); + } + return result; + } + + @Override + public Object set(final Object list, final int index, final Object element) { + final ArrayList l = (ArrayList) list; + l.set(index, element); + return list; + } + + @Override + public Object resize(final Object list, final int newSize) { + final ArrayList l = (ArrayList) list; + l.ensureCapacity(newSize); + while (l.size() < newSize) { + l.add(null); + } + while (l.size() > newSize) { + l.remove(l.size() - 1); + } + return list; + } + + @Override + public boolean equals(final Object o) { + if (o == null || o.getClass() != getClass()) { + return false; + } else if (o == this) { + return true; + } else { + final ObjectInspector other = ((ParquetHiveArrayInspector) o).arrayElementInspector; + return other.equals(arrayElementInspector); + } + } + + @Override + public int hashCode() { + int hash = 3; + hash = 29 * hash + 
(this.arrayElementInspector != null ? this.arrayElementInspector.hashCode() : 0); + return hash; + } +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/serde/ParquetHiveSerDe.java b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/serde/ParquetHiveSerDe.java new file mode 100644 index 0000000..49c81da --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/serde/ParquetHiveSerDe.java @@ -0,0 +1,287 @@ +/** + * + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.ql.io.parquet.serde; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Map; +import java.util.Map.Entry; +import java.util.Properties; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hive.ql.io.parquet.writable.BinaryWritable; +import org.apache.hadoop.hive.serde2.SerDe; +import org.apache.hadoop.hive.serde2.SerDeException; +import org.apache.hadoop.hive.serde2.SerDeStats; +import org.apache.hadoop.hive.serde2.io.ByteWritable; +import org.apache.hadoop.hive.serde2.io.DoubleWritable; +import org.apache.hadoop.hive.serde2.io.ShortWritable; +import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.MapObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category; +import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.StructField; +import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.BooleanObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.ByteObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.DoubleObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.FloatObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.IntObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.LongObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.ShortObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector; +import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils; +import org.apache.hadoop.io.ArrayWritable; +import org.apache.hadoop.io.BooleanWritable; +import org.apache.hadoop.io.FloatWritable; +import org.apache.hadoop.io.IntWritable; +import org.apache.hadoop.io.LongWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.Writable; + +import parquet.io.api.Binary; + +/** + * + * A ParquetHiveSerDe for Hive (with the deprecated package mapred) + * + */ +public class 
ParquetHiveSerDe implements SerDe { + + public static Text MAP_KEY = new Text("key"); + public static Text MAP_VALUE = new Text("value"); + public static Text MAP = new Text("map"); + public static Text ARRAY = new Text("bag"); + private SerDeStats stats; + ObjectInspector objInspector; + + private enum LAST_OPERATION { + + SERIALIZE, + DESERIALIZE, + UNKNOWN + } + LAST_OPERATION status; + private long serializedSize; + private long deserializedSize; + + @Override + final public void initialize(final Configuration conf, final Properties tbl) throws SerDeException { + + final TypeInfo rowTypeInfo; + final List columnNames; + final List columnTypes; + // Get column names and sort order + final String columnNameProperty = tbl.getProperty("columns"); + final String columnTypeProperty = tbl.getProperty("columns.types"); + + if (columnNameProperty.length() == 0) { + columnNames = new ArrayList(); + } else { + columnNames = Arrays.asList(columnNameProperty.split(",")); + } + + if (columnTypeProperty.length() == 0) { + columnTypes = new ArrayList(); + } else { + columnTypes = TypeInfoUtils.getTypeInfosFromTypeString(columnTypeProperty); + } + + if (columnNames.size() != columnTypes.size()) { + throw new RuntimeException("ParquetHiveSerde initialization failed. Number of column name and column type differs."); + } + + // Create row related objects + rowTypeInfo = TypeInfoFactory.getStructTypeInfo(columnNames, columnTypes); + this.objInspector = new ArrayWritableObjectInspector((StructTypeInfo) rowTypeInfo); + + // Stats part + stats = new SerDeStats(); + serializedSize = 0; + deserializedSize = 0; + status = LAST_OPERATION.UNKNOWN; + } + + @Override + public Object deserialize(final Writable blob) throws SerDeException { + status = LAST_OPERATION.DESERIALIZE; + deserializedSize = 0; + if (blob instanceof ArrayWritable) { + deserializedSize = ((ArrayWritable) blob).get().length; + return blob; + } else { + return null; + } + } + + @Override + public ObjectInspector getObjectInspector() throws SerDeException { + return objInspector; + } + + @Override + public Class getSerializedClass() { + return ArrayWritable.class; + } + + @Override + public Writable serialize(final Object obj, final ObjectInspector objInspector) throws SerDeException { + if (!objInspector.getCategory().equals(Category.STRUCT)) { + throw new SerDeException("Cannot serialize " + objInspector.getCategory() + ". 
Can only serialize a struct"); + } + + final ArrayWritable serializeData = createStruct(obj, (StructObjectInspector) objInspector); + + serializedSize = serializeData.get().length; + status = LAST_OPERATION.SERIALIZE; + + return serializeData; + } + + private ArrayWritable createStruct(final Object obj, final StructObjectInspector inspector) throws SerDeException { + + final List fields = inspector.getAllStructFieldRefs(); + final Writable[] arr = new Writable[fields.size()]; + + int i = 0; + + for (final StructField field : fields) { + final Object subObj = inspector.getStructFieldData(obj, field); + final ObjectInspector subInspector = field.getFieldObjectInspector(); + + arr[i] = createObject(subObj, subInspector); + ++i; + } + + return new ArrayWritable(Writable.class, arr); + + } + + private Writable createMap(final Object obj, final MapObjectInspector inspector) throws SerDeException { + final Map sourceMap = inspector.getMap(obj); + final ObjectInspector keyInspector = inspector.getMapKeyObjectInspector(); + final ObjectInspector valueInspector = inspector.getMapValueObjectInspector(); + final List array = new ArrayList(); + + if (sourceMap != null) { + for (final Entry keyValue : sourceMap.entrySet()) { + final Writable key = createObject(keyValue.getKey(), keyInspector); + final Writable value = createObject(keyValue.getValue(), valueInspector); + + if (key != null) { + Writable[] arr = new Writable[2]; + arr[0] = key; + arr[1] = value; + array.add(new ArrayWritable(Writable.class, arr)); + } + + } + } + + if (array.size() > 0) { + final ArrayWritable subArray = new ArrayWritable(ArrayWritable.class, array.toArray(new ArrayWritable[array.size()])); + return new ArrayWritable(Writable.class, new Writable[] {subArray}); + } else { + return null; + } + } + + private ArrayWritable createArray(final Object obj, final ListObjectInspector inspector) throws SerDeException { + final List sourceArray = inspector.getList(obj); + final ObjectInspector subInspector = inspector.getListElementObjectInspector(); + final List array = new ArrayList(); + + if (sourceArray != null) { + for (final Object curObj : sourceArray) { + final Writable newObj = createObject(curObj, subInspector); + if (newObj != null) { + array.add(newObj); + } + } + } + + if (array.size() > 0) { + final ArrayWritable subArray = new ArrayWritable(array.get(0).getClass(), array.toArray(new Writable[array.size()])); + return new ArrayWritable(Writable.class, new Writable[] {subArray}); + } else { + return null; + } + } + + private Writable createPrimitive(final Object obj, final PrimitiveObjectInspector inspector) throws SerDeException { + + if (obj == null) { + return null; + } + + switch (inspector.getPrimitiveCategory()) { + case VOID: + return null; + case BOOLEAN: + return new BooleanWritable(((BooleanObjectInspector) inspector).get(obj) ? 
Boolean.TRUE : Boolean.FALSE); + case BYTE: + return new ByteWritable((byte) ((ByteObjectInspector) inspector).get(obj)); + case DOUBLE: + return new DoubleWritable(((DoubleObjectInspector) inspector).get(obj)); + case FLOAT: + return new FloatWritable(((FloatObjectInspector) inspector).get(obj)); + case INT: + return new IntWritable(((IntObjectInspector) inspector).get(obj)); + case LONG: + return new LongWritable(((LongObjectInspector) inspector).get(obj)); + case SHORT: + return new ShortWritable((short) ((ShortObjectInspector) inspector).get(obj)); + case STRING: + return new BinaryWritable(Binary.fromString(((StringObjectInspector) inspector).getPrimitiveJavaObject(obj))); + default: + throw new SerDeException("Unknown primitive : " + inspector.getPrimitiveCategory()); + } + } + + private Writable createObject(final Object obj, final ObjectInspector inspector) throws SerDeException { + switch (inspector.getCategory()) { + case STRUCT: + return createStruct(obj, (StructObjectInspector) inspector); + case LIST: + return createArray(obj, (ListObjectInspector) inspector); + case MAP: + return createMap(obj, (MapObjectInspector) inspector); + case PRIMITIVE: + return createPrimitive(obj, (PrimitiveObjectInspector) inspector); + default: + throw new SerDeException("Unknown data type" + inspector.getCategory()); + } + } + + // + @Override + public SerDeStats getSerDeStats() { + // must be different + assert (status != LAST_OPERATION.UNKNOWN); + + if (status == LAST_OPERATION.SERIALIZE) { + stats.setRawDataSize(serializedSize); + } else { + stats.setRawDataSize(deserializedSize); + } + return stats; + } +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/serde/StandardParquetHiveMapInspector.java b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/serde/StandardParquetHiveMapInspector.java new file mode 100644 index 0000000..c9f8236 --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/serde/StandardParquetHiveMapInspector.java @@ -0,0 +1,66 @@ +/** + * + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.ql.io.parquet.serde; + +import java.util.Map; + +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.io.ArrayWritable; +import org.apache.hadoop.io.Writable; + +/** + * The StandardParquetHiveMapInspector will inspect an ArrayWritable, considering it as a Hive map.
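+ * Lookups compare the requested key against the stored key writable with equals() only; see DeepParquetHiveMapInspector for lookups that also convert the stored key through the key inspector.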
+ * It can also inspect a Map if Hive decides to inspect the result of an inspection. + */ +public class StandardParquetHiveMapInspector extends AbstractParquetMapInspector { + + public StandardParquetHiveMapInspector(final ObjectInspector keyInspector, final ObjectInspector valueInspector) { + super(keyInspector, valueInspector); + } + + @Override + public Object getMapValueElement(final Object data, final Object key) { + if (data == null || key == null) { + return null; + } + + if (data instanceof ArrayWritable) { + final Writable[] mapContainer = ((ArrayWritable) data).get(); + + if (mapContainer == null || mapContainer.length == 0) { + return null; + } + + final Writable[] mapArray = ((ArrayWritable) mapContainer[0]).get(); + + for (final Writable obj : mapArray) { + final ArrayWritable mapObj = (ArrayWritable) obj; + final Writable[] arr = mapObj.get(); + if (key.equals(arr[0])) { + return arr[1]; + } + } + + return null; + } + + if (data instanceof Map) { + return ((Map) data).get(key); + } + + throw new UnsupportedOperationException("Cannot inspect " + data.getClass().getCanonicalName()); + } +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/serde/primitive/ParquetByteInspector.java b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/serde/primitive/ParquetByteInspector.java new file mode 100644 index 0000000..7a47712 --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/serde/primitive/ParquetByteInspector.java @@ -0,0 +1,57 @@ +/* + * + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.ql.io.parquet.serde.primitive; + +import org.apache.hadoop.hive.serde2.io.ByteWritable; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.AbstractPrimitiveJavaObjectInspector; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.SettableByteObjectInspector; +import org.apache.hadoop.io.IntWritable; + +/** + * The ParquetByteInspector can inspect both ByteWritables and IntWritables into bytes. + */ +public class ParquetByteInspector extends AbstractPrimitiveJavaObjectInspector implements SettableByteObjectInspector { + + ParquetByteInspector() { + super(TypeInfoFactory.byteTypeInfo); + } + + @Override + public Object getPrimitiveWritableObject(final Object o) { + return o == null ? null : new ByteWritable(get(o)); + } + + @Override + public Object create(final byte val) { + return new ByteWritable(val); + } + + @Override + public Object set(final Object o, final byte val) { + ((ByteWritable) o).set(val); + return o; + } + + @Override + public byte get(Object o) { + // Accept int writables and convert them. 
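+ // Parquet has no 1-byte physical type, so a tinyint column can come back as an IntWritable.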
+ if (o instanceof IntWritable) { + return (byte) ((IntWritable) o).get(); + } + return ((ByteWritable) o).get(); + } +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/serde/primitive/ParquetPrimitiveInspectorFactory.java b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/serde/primitive/ParquetPrimitiveInspectorFactory.java new file mode 100644 index 0000000..ff3800a --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/serde/primitive/ParquetPrimitiveInspectorFactory.java @@ -0,0 +1,30 @@ +/* + * + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.ql.io.parquet.serde.primitive; + +/** + * The ParquetPrimitiveInspectorFactory allows us to be sure that the same object is inspected by the same inspector. + */ +public class ParquetPrimitiveInspectorFactory { + + public static final ParquetByteInspector parquetByteInspector = new ParquetByteInspector(); + public static final ParquetShortInspector parquetShortInspector = new ParquetShortInspector(); + public static final ParquetStringInspector parquetStringInspector = new ParquetStringInspector(); + + private ParquetPrimitiveInspectorFactory() { + // prevent instantiation + } +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/serde/primitive/ParquetShortInspector.java b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/serde/primitive/ParquetShortInspector.java new file mode 100644 index 0000000..f22c1ae --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/serde/primitive/ParquetShortInspector.java @@ -0,0 +1,57 @@ +/* + * + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.ql.io.parquet.serde.primitive; + +import org.apache.hadoop.hive.serde2.io.ShortWritable; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.AbstractPrimitiveJavaObjectInspector; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.SettableShortObjectInspector; +import org.apache.hadoop.io.IntWritable; + +/** + * The ParquetShortInspector can inspect both ShortWritables and IntWritables into shorts. + */ +public class ParquetShortInspector extends AbstractPrimitiveJavaObjectInspector implements SettableShortObjectInspector { + + ParquetShortInspector() { + super(TypeInfoFactory.shortTypeInfo); + } + + @Override + public Object getPrimitiveWritableObject(final Object o) { + return o == null ? 
null : new ShortWritable(get(o)); + } + + @Override + public Object create(final short val) { + return new ShortWritable(val); + } + + @Override + public Object set(final Object o, final short val) { + ((ShortWritable) o).set(val); + return o; + } + + @Override + public short get(Object o) { + // Accept int writables and convert them. + if (o instanceof IntWritable) { + return (short) ((IntWritable) o).get(); + } + return ((ShortWritable) o).get(); + } +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/serde/primitive/ParquetStringInspector.java b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/serde/primitive/ParquetStringInspector.java new file mode 100644 index 0000000..5caffd7 --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/serde/primitive/ParquetStringInspector.java @@ -0,0 +1,100 @@ +/* + * + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.ql.io.parquet.serde.primitive; + +import org.apache.hadoop.hive.ql.io.parquet.writable.BinaryWritable; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.AbstractPrimitiveJavaObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.SettableStringObjectInspector; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; +import org.apache.hadoop.io.Text; + +import parquet.io.api.Binary; + +/** + * The ParquetStringInspector inspects a BinaryWritable to give a Text or String. + */ +public class ParquetStringInspector extends AbstractPrimitiveJavaObjectInspector implements SettableStringObjectInspector { + + + ParquetStringInspector() { + super(TypeInfoFactory.stringTypeInfo); + } + + @Override + public Text getPrimitiveWritableObject(final Object o) { + if (o == null) { + return null; + } + + if (o instanceof BinaryWritable) { + return new Text(((BinaryWritable) o).getBytes()); + } + + if (o instanceof Text) { + return (Text) o; + } + + if (o instanceof String) { + return new Text((String) o); + } + + throw new UnsupportedOperationException("Cannot inspect " + o.getClass().getCanonicalName()); + } + + @Override + public String getPrimitiveJavaObject(final Object o) { + if (o == null) { + return null; + } + + if (o instanceof BinaryWritable) { + return ((BinaryWritable) o).getString(); + } + + if (o instanceof Text) { + return ((Text) o).toString(); + } + + if (o instanceof String) { + return (String) o; + } + + throw new UnsupportedOperationException("Cannot inspect " + o.getClass().getCanonicalName()); + } + + @Override + public Object set(final Object o, final Text text) { + return new BinaryWritable(text == null ? null : Binary.fromByteArray(text.getBytes())); + } + + @Override + public Object set(final Object o, final String string) { + return new BinaryWritable(string == null ? 
null : Binary.fromString(string)); + } + + @Override + public Object create(final Text text) { + if (text == null) { + return null; + } + return text.toString(); + } + + @Override + public Object create(final String string) { + return string; + } +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/writable/BigDecimalWritable.java b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/writable/BigDecimalWritable.java new file mode 100644 index 0000000..fdec25d --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/writable/BigDecimalWritable.java @@ -0,0 +1,152 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.io.parquet.writable; + +import java.io.DataInput; +import java.io.DataOutput; +import java.io.IOException; +import java.math.BigDecimal; +import java.math.BigInteger; + +import org.apache.hadoop.hive.serde2.ByteStream.Output; +import org.apache.hadoop.hive.serde2.lazybinary.LazyBinaryUtils; +import org.apache.hadoop.hive.serde2.lazybinary.LazyBinaryUtils.VInt; +import org.apache.hadoop.io.WritableComparable; +import org.apache.hadoop.io.WritableUtils; + +/** + * This file is taken from hive 0.11 + * website : http://hive.apache.org/ + * github : https://github.com/apache/hive + * branch : branch-0.11 + * Issue : https://issues.apache.org/jira/browse/HIVE-2693 + * License : Same as the header of this file + * + */ +public class BigDecimalWritable implements WritableComparable { + + private byte[] internalStorage = new byte[0]; + private int scale; + + private final VInt vInt = new VInt(); // reusable integer + + public BigDecimalWritable() { + } + + public BigDecimalWritable(final byte[] bytes, final int scale) { + set(bytes, scale); + } + + public BigDecimalWritable(final BigDecimalWritable writable) { + set(writable.getBigDecimal()); + } + + public BigDecimalWritable(final BigDecimal value) { + set(value); + } + + public void set(BigDecimal value) { + value = value.stripTrailingZeros(); + if (value.compareTo(BigDecimal.ZERO) == 0) { + // Special case for 0, because java doesn't strip zeros correctly on + // that number. 
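+ // (On older JDKs new BigDecimal("0.00").stripTrailingZeros() keeps scale 2, so normalize zero explicitly.)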
+ value = BigDecimal.ZERO; + } + set(value.unscaledValue().toByteArray(), value.scale()); + } + + public void set(final BigDecimalWritable writable) { + set(writable.getBigDecimal()); + } + + public void set(final byte[] bytes, final int scale) { + this.internalStorage = bytes; + this.scale = scale; + } + + public void setFromBytes(final byte[] bytes, int offset, final int length) { + LazyBinaryUtils.readVInt(bytes, offset, vInt); + scale = vInt.value; + offset += vInt.length; + LazyBinaryUtils.readVInt(bytes, offset, vInt); + offset += vInt.length; + if (internalStorage.length != vInt.value) { + internalStorage = new byte[vInt.value]; + } + System.arraycopy(bytes, offset, internalStorage, 0, vInt.value); + } + + public BigDecimal getBigDecimal() { + return new BigDecimal(new BigInteger(internalStorage), scale); + } + + @Override + public void readFields(final DataInput in) throws IOException { + scale = WritableUtils.readVInt(in); + final int byteArrayLen = WritableUtils.readVInt(in); + if (internalStorage.length != byteArrayLen) { + internalStorage = new byte[byteArrayLen]; + } + in.readFully(internalStorage); + } + + @Override + public void write(final DataOutput out) throws IOException { + WritableUtils.writeVInt(out, scale); + WritableUtils.writeVInt(out, internalStorage.length); + out.write(internalStorage); + } + + @Override + public int compareTo(final BigDecimalWritable that) { + return getBigDecimal().compareTo(that.getBigDecimal()); + } + + public void writeToByteStream(final Output byteStream) { + LazyBinaryUtils.writeVInt(byteStream, scale); + LazyBinaryUtils.writeVInt(byteStream, internalStorage.length); + byteStream.write(internalStorage, 0, internalStorage.length); + } + + @Override + public String toString() { + return getBigDecimal().toString(); + } + + @Override + public boolean equals(final Object other) { + if (other == null || !(other instanceof BigDecimalWritable)) { + return false; + } + final BigDecimalWritable bdw = (BigDecimalWritable) other; + + // 'equals' and 'compareTo' are not compatible with BigDecimals. We want + // compareTo which returns true iff the numbers are equal (e.g.: 3.14 is + // the same as 3.140). 'Equals' returns true iff equal and the same + // scale + // is set in the decimals (e.g.: 3.14 is not the same as 3.140) + return getBigDecimal().compareTo(bdw.getBigDecimal()) == 0; + } + + @Override + public int hashCode() { + return getBigDecimal().hashCode(); + } + +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/writable/BinaryWritable.java b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/writable/BinaryWritable.java new file mode 100644 index 0000000..10b4a20 --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/writable/BinaryWritable.java @@ -0,0 +1,94 @@ +/** + * + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.hadoop.hive.ql.io.parquet.writable; + +import java.io.DataInput; +import java.io.DataOutput; +import java.io.IOException; + +import org.apache.hadoop.io.Writable; + +import parquet.io.api.Binary; + +/** + * + * A Wrapper to support constructor with Binary and String + * + * TODO : remove it, and call BytesWritable with the getBytes() + * + */ +public class BinaryWritable implements Writable { + + private Binary binary; + + public BinaryWritable(final Binary binary) { + this.binary = binary; + } + + public Binary getBinary() { + return binary; + } + + public byte[] getBytes() { + return binary.getBytes(); + } + + public String getString() { + return binary.toStringUsingUTF8(); + } + + @Override + public void readFields(DataInput input) throws IOException { + byte[] bytes = new byte[input.readInt()]; + input.readFully(bytes); + binary = Binary.fromByteArray(bytes); + } + + @Override + public void write(DataOutput output) throws IOException { + output.writeInt(binary.length()); + binary.writeTo(output); + } + + @Override + public int hashCode() { + return binary == null ? 0 : binary.hashCode(); + } + + @Override + public boolean equals(Object obj) { + if (obj instanceof BinaryWritable) { + final BinaryWritable other = (BinaryWritable)obj; + return binary.equals(other.binary); + } + return false; + } + + public static class DicBinaryWritable extends BinaryWritable { + + private String string; + + public DicBinaryWritable(Binary binary, String string) { + super(binary); + this.string = string; + } + + public String getString() { + return string; + } + } + +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/write/DataWritableWriteSupport.java b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/write/DataWritableWriteSupport.java new file mode 100644 index 0000000..cebded9 --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/write/DataWritableWriteSupport.java @@ -0,0 +1,62 @@ +/** + * + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.hadoop.hive.ql.io.parquet.write; + +import java.util.HashMap; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.ArrayWritable; + +import parquet.hadoop.api.WriteSupport; +import parquet.io.api.RecordConsumer; +import parquet.schema.MessageType; +import parquet.schema.MessageTypeParser; + +/** + * + * DataWritableWriteSupport is a WriteSupport for the DataWritableWriter + * + */ +public class DataWritableWriteSupport extends WriteSupport { + + public static final String PARQUET_HIVE_SCHEMA = "parquet.hive.schema"; + + public static void setSchema(final MessageType schema, final Configuration configuration) { + configuration.set(PARQUET_HIVE_SCHEMA, schema.toString()); + } + + public static MessageType getSchema(final Configuration configuration) { + return MessageTypeParser.parseMessageType(configuration.get(PARQUET_HIVE_SCHEMA)); + } + private DataWritableWriter writer; + private MessageType schema; + + @Override + public WriteContext init(final Configuration configuration) { + schema = getSchema(configuration); + return new WriteContext(schema, new HashMap()); + } + + @Override + public void prepareForWrite(final RecordConsumer recordConsumer) { + writer = new DataWritableWriter(recordConsumer, schema); + } + + @Override + public void write(final ArrayWritable record) { + writer.write(record); + } +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/write/DataWritableWriter.java b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/write/DataWritableWriter.java new file mode 100644 index 0000000..3e6814b --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/write/DataWritableWriter.java @@ -0,0 +1,162 @@ +/** + * + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.hadoop.hive.ql.io.parquet.write; + +import org.apache.hadoop.hive.ql.io.parquet.writable.BigDecimalWritable; +import org.apache.hadoop.hive.ql.io.parquet.writable.BinaryWritable; +import org.apache.hadoop.hive.serde2.io.ByteWritable; +import org.apache.hadoop.hive.serde2.io.DoubleWritable; +import org.apache.hadoop.hive.serde2.io.ShortWritable; +import org.apache.hadoop.io.ArrayWritable; +import org.apache.hadoop.io.BooleanWritable; +import org.apache.hadoop.io.FloatWritable; +import org.apache.hadoop.io.IntWritable; +import org.apache.hadoop.io.LongWritable; +import org.apache.hadoop.io.Writable; + +import parquet.io.ParquetEncodingException; +import parquet.io.api.RecordConsumer; +import parquet.schema.GroupType; +import parquet.schema.Type; + +/** + * + * DataWritableWriter is a writer, + * that will read an ArrayWritable and give the data to parquet + * with the expected schema + * + */ +public class DataWritableWriter { + + private final RecordConsumer recordConsumer; + private final GroupType schema; + + public DataWritableWriter(final RecordConsumer recordConsumer, final GroupType schema) { + this.recordConsumer = recordConsumer; + this.schema = schema; + } + + public void write(final ArrayWritable arr) { + + if (arr == null) { + return; + } + recordConsumer.startMessage(); + writeData(arr, schema); + recordConsumer.endMessage(); + } + + private void writeData(final ArrayWritable arr, final GroupType type) { + + if (arr == null) { + return; + } + + final int fieldCount = type.getFieldCount(); + Writable[] values = arr.get(); + for (int field = 0; field < fieldCount; ++field) { + final Type fieldType = type.getType(field); + final String fieldName = fieldType.getName(); + final Writable value = values[field]; + if (value == null) { + continue; + } + recordConsumer.startField(fieldName, field); + + if (fieldType.isPrimitive()) { + writePrimitive(value); + } else { + recordConsumer.startGroup(); + if (value instanceof ArrayWritable) { + if (fieldType.asGroupType().getRepetition().equals(Type.Repetition.REPEATED)) { + writeArray((ArrayWritable) value, fieldType.asGroupType()); + } else { + writeData((ArrayWritable) value, fieldType.asGroupType()); + } + } else if (value != null) { + throw new ParquetEncodingException("This should be an ArrayWritable or MapWritable: " + value); + } + + recordConsumer.endGroup(); + } + + recordConsumer.endField(fieldName, field); + } + } + + private void writeArray(final ArrayWritable array, final GroupType type) { + if (array == null) { + return; + } + + final Writable[] subValues = array.get(); + + final int fieldCount = type.getFieldCount(); + for (int field = 0; field < fieldCount; ++field) { + final Type subType = type.getType(field); + recordConsumer.startField(subType.getName(), field); + for (int i = 0; i < subValues.length; ++i) { + final Writable subValue = subValues[i]; + if (subValue != null) { + if (subType.isPrimitive()) { + if (subValue instanceof ArrayWritable) { + writePrimitive(((ArrayWritable) subValue).get()[field]);// 0 ? 
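+ // The element arrived wrapped in its own ArrayWritable; the writable at this field's index is written as the primitive value.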
+ } else { + writePrimitive(subValue); + } + } else { + if (!(subValue instanceof ArrayWritable)) { + throw new RuntimeException("This should be a ArrayWritable: " + subValue); + } else { + recordConsumer.startGroup(); + writeData((ArrayWritable) subValue, subType.asGroupType()); + recordConsumer.endGroup(); + } + } + } + } + recordConsumer.endField(subType.getName(), field); + } + + } + + private void writePrimitive(final Writable value) { + if (value == null) { + return; + } + if (value instanceof DoubleWritable) { + recordConsumer.addDouble(((DoubleWritable) value).get()); + } else if (value instanceof BooleanWritable) { + recordConsumer.addBoolean(((BooleanWritable) value).get()); + } else if (value instanceof FloatWritable) { + recordConsumer.addFloat(((FloatWritable) value).get()); + } else if (value instanceof IntWritable) { + recordConsumer.addInteger(((IntWritable) value).get()); + } else if (value instanceof LongWritable) { + recordConsumer.addLong(((LongWritable) value).get()); + } else if (value instanceof ShortWritable) { + recordConsumer.addInteger(((ShortWritable) value).get()); + } else if (value instanceof ByteWritable) { + recordConsumer.addInteger(((ByteWritable) value).get()); + } else if (value instanceof BigDecimalWritable) { + throw new UnsupportedOperationException("BigDecimal writing not implemented"); + } else if (value instanceof BinaryWritable) { + recordConsumer.addBinary(((BinaryWritable) value).getBinary()); + } else { + throw new RuntimeException("Unknown value type: " + value + " " + value.getClass()); + } + } +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/write/ParquetRecordWriterWrapper.java b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/write/ParquetRecordWriterWrapper.java new file mode 100644 index 0000000..eb779b4 --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/write/ParquetRecordWriterWrapper.java @@ -0,0 +1,92 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.hadoop.hive.ql.io.parquet.write; + +import java.io.IOException; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.ql.exec.FileSinkOperator; +import org.apache.hadoop.io.ArrayWritable; +import org.apache.hadoop.io.Writable; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.RecordWriter; +import org.apache.hadoop.mapred.Reporter; +import org.apache.hadoop.mapreduce.OutputFormat; +import org.apache.hadoop.mapreduce.TaskAttemptContext; +import org.apache.hadoop.mapreduce.TaskAttemptID; +import org.apache.hadoop.util.Progressable; +import org.apache.hadoop.hive.ql.io.FSRecordWriter; + +import parquet.hadoop.ParquetOutputFormat; +import parquet.hadoop.util.ContextUtil; + +public class ParquetRecordWriterWrapper implements RecordWriter, + FSRecordWriter { + + public static final Log LOG = LogFactory.getLog(ParquetRecordWriterWrapper.class); + + private final org.apache.hadoop.mapreduce.RecordWriter realWriter; + private TaskAttemptContext taskContext; + + public ParquetRecordWriterWrapper( + final OutputFormat realOutputFormat, + final JobConf jobConf, + final String name, + final Progressable progress) throws IOException { + try { + // create a TaskInputOutputContext + TaskAttemptID taskAttemptID = TaskAttemptID.forName(jobConf.get("mapred.task.id")); + if (taskAttemptID == null) { + taskAttemptID = new TaskAttemptID(); + } + taskContext = ContextUtil.newTaskAttemptContext(jobConf, taskAttemptID); + + LOG.info("creating real writer to write at " + name); + realWriter = (org.apache.hadoop.mapreduce.RecordWriter) ((ParquetOutputFormat) realOutputFormat) + .getRecordWriter(taskContext, new Path(name)); + LOG.info("real writer: " + realWriter); + } catch (final InterruptedException e) { + throw new IOException(e); + } + } + + @Override + public void close(final Reporter reporter) throws IOException { + try { + realWriter.close(taskContext); + } catch (final InterruptedException e) { + throw new IOException(e); + } + } + @Override + public void write(final Void key, final ArrayWritable value) throws IOException { + try { + realWriter.write(key, value); + } catch (final InterruptedException e) { + throw new IOException(e); + } + } + + @Override + public void close(final boolean abort) throws IOException { + close(null); + } + + @Override + public void write(final Writable w) throws IOException { + write(null, (ArrayWritable) w); + } +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/BaseSemanticAnalyzer.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/BaseSemanticAnalyzer.java index 13d0a56..fa136fb 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/parse/BaseSemanticAnalyzer.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/BaseSemanticAnalyzer.java @@ -59,6 +59,9 @@ import org.apache.hadoop.hive.ql.io.orc.OrcInputFormat; import org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat; import org.apache.hadoop.hive.ql.io.orc.OrcSerde; +import org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat; +import org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat; +import org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe; import org.apache.hadoop.hive.ql.lib.Node; import org.apache.hadoop.hive.ql.metadata.Hive; import org.apache.hadoop.hive.ql.metadata.HiveException; @@ -138,6 +141,10 @@ protected static final String ORCFILE_SERDE = OrcSerde.class .getName(); + protected static final String PARQUETFILE_INPUT = 
MapredParquetInputFormat.class.getName(); + protected static final String PARQUETFILE_OUTPUT = MapredParquetOutputFormat.class.getName(); + protected static final String PARQUETFILE_SERDE = ParquetHiveSerDe.class.getName(); + class RowFormatParams { String fieldDelim = null; String fieldEscape = null; @@ -225,6 +232,12 @@ protected boolean fillStorageFormat(ASTNode child, AnalyzeCreateCommonVars share shared.serde = ORCFILE_SERDE; storageFormat = true; break; + case HiveParser.TOK_TBLPARQUETFILE: + inputFormat = PARQUETFILE_INPUT; + outputFormat = PARQUETFILE_OUTPUT; + shared.serde = PARQUETFILE_SERDE; + storageFormat = true; + break; case HiveParser.TOK_TABLEFILEFORMAT: inputFormat = unescapeSQLString(child.getChild(0).getText()); outputFormat = unescapeSQLString(child.getChild(1).getText()); @@ -256,6 +269,10 @@ protected void fillDefaultStorageFormat(AnalyzeCreateCommonVars shared) { inputFormat = ORCFILE_INPUT; outputFormat = ORCFILE_OUTPUT; shared.serde = ORCFILE_SERDE; + } else if ("PARQUET".equalsIgnoreCase(conf.getVar(HiveConf.ConfVars.HIVEDEFAULTFILEFORMAT))) { + inputFormat = PARQUETFILE_INPUT; + outputFormat = PARQUETFILE_OUTPUT; + shared.serde = PARQUETFILE_SERDE; } else { inputFormat = TEXTFILE_INPUT; outputFormat = TEXTFILE_OUTPUT; diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/HiveLexer.g b/ql/src/java/org/apache/hadoop/hive/ql/parse/HiveLexer.g index f83c15d..aea9c1c 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/parse/HiveLexer.g +++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/HiveLexer.g @@ -153,6 +153,7 @@ KW_SEQUENCEFILE: 'SEQUENCEFILE'; KW_TEXTFILE: 'TEXTFILE'; KW_RCFILE: 'RCFILE'; KW_ORCFILE: 'ORC'; +KW_PARQUETFILE: 'PARQUET'; KW_INPUTFORMAT: 'INPUTFORMAT'; KW_OUTPUTFORMAT: 'OUTPUTFORMAT'; KW_INPUTDRIVER: 'INPUTDRIVER'; diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/HiveParser.g b/ql/src/java/org/apache/hadoop/hive/ql/parse/HiveParser.g index 1ce6bf3..09134a4 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/parse/HiveParser.g +++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/HiveParser.g @@ -182,6 +182,7 @@ TOK_TABLEROWFORMATMAPKEYS; TOK_TABLEROWFORMATLINES; TOK_TABLEROWFORMATNULL; TOK_TBLORCFILE; +TOK_TBLPARQUETFILE; TOK_TBLSEQUENCEFILE; TOK_TBLTEXTFILE; TOK_TBLRCFILE; @@ -1197,6 +1198,7 @@ fileFormat | KW_TEXTFILE -> ^(TOK_TBLTEXTFILE) | KW_RCFILE -> ^(TOK_TBLRCFILE) | KW_ORCFILE -> ^(TOK_TBLORCFILE) + | KW_PARQUETFILE -> ^(TOK_TBLPARQUETFILE) | KW_INPUTFORMAT inFmt=StringLiteral KW_OUTPUTFORMAT outFmt=StringLiteral (KW_INPUTDRIVER inDriver=StringLiteral KW_OUTPUTDRIVER outDriver=StringLiteral)? -> ^(TOK_TABLEFILEFORMAT $inFmt $outFmt $inDriver? $outDriver?) | genericSpec=identifier -> ^(TOK_FILEFORMAT_GENERIC $genericSpec) @@ -1674,6 +1676,7 @@ tableFileFormat | KW_STORED KW_AS KW_TEXTFILE -> TOK_TBLTEXTFILE | KW_STORED KW_AS KW_RCFILE -> TOK_TBLRCFILE | KW_STORED KW_AS KW_ORCFILE -> TOK_TBLORCFILE + | KW_STORED KW_AS KW_PARQUETFILE -> TOK_TBLPARQUETFILE | KW_STORED KW_AS KW_INPUTFORMAT inFmt=StringLiteral KW_OUTPUTFORMAT outFmt=StringLiteral (KW_INPUTDRIVER inDriver=StringLiteral KW_OUTPUTDRIVER outDriver=StringLiteral)? -> ^(TOK_TABLEFILEFORMAT $inFmt $outFmt $inDriver? $outDriver?) 
| KW_STORED KW_BY storageHandler=StringLiteral diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/IdentifiersParser.g b/ql/src/java/org/apache/hadoop/hive/ql/parse/IdentifiersParser.g index 4147503..2b57116 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/parse/IdentifiersParser.g +++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/IdentifiersParser.g @@ -535,5 +535,5 @@ identifier nonReserved : - KW_TRUE | KW_FALSE | KW_LIKE | KW_EXISTS | KW_ASC | KW_DESC | KW_ORDER | KW_GROUP | KW_BY | KW_AS | KW_INSERT | KW_OVERWRITE | KW_OUTER | KW_LEFT | KW_RIGHT | KW_FULL | KW_PARTITION | KW_PARTITIONS | KW_TABLE | KW_TABLES | KW_COLUMNS | KW_INDEX | KW_INDEXES | KW_REBUILD | KW_FUNCTIONS | KW_SHOW | KW_MSCK | KW_REPAIR | KW_DIRECTORY | KW_LOCAL | KW_USING | KW_CLUSTER | KW_DISTRIBUTE | KW_SORT | KW_UNION | KW_LOAD | KW_EXPORT | KW_IMPORT | KW_DATA | KW_INPATH | KW_IS | KW_NULL | KW_CREATE | KW_EXTERNAL | KW_ALTER | KW_CHANGE | KW_FIRST | KW_AFTER | KW_DESCRIBE | KW_DROP | KW_RENAME | KW_IGNORE | KW_PROTECTION | KW_TO | KW_COMMENT | KW_BOOLEAN | KW_TINYINT | KW_SMALLINT | KW_INT | KW_BIGINT | KW_FLOAT | KW_DOUBLE | KW_DATE | KW_DATETIME | KW_TIMESTAMP | KW_DECIMAL | KW_STRING | KW_ARRAY | KW_STRUCT | KW_UNIONTYPE | KW_PARTITIONED | KW_CLUSTERED | KW_SORTED | KW_INTO | KW_BUCKETS | KW_ROW | KW_ROWS | KW_FORMAT | KW_DELIMITED | KW_FIELDS | KW_TERMINATED | KW_ESCAPED | KW_COLLECTION | KW_ITEMS | KW_KEYS | KW_KEY_TYPE | KW_LINES | KW_STORED | KW_FILEFORMAT | KW_SEQUENCEFILE | KW_TEXTFILE | KW_RCFILE | KW_ORCFILE | KW_INPUTFORMAT | KW_OUTPUTFORMAT | KW_INPUTDRIVER | KW_OUTPUTDRIVER | KW_OFFLINE | KW_ENABLE | KW_DISABLE | KW_READONLY | KW_NO_DROP | KW_LOCATION | KW_BUCKET | KW_OUT | KW_OF | KW_PERCENT | KW_ADD | KW_REPLACE | KW_RLIKE | KW_REGEXP | KW_TEMPORARY | KW_EXPLAIN | KW_FORMATTED | KW_PRETTY | KW_DEPENDENCY | KW_LOGICAL | KW_SERDE | KW_WITH | KW_DEFERRED | KW_SERDEPROPERTIES | KW_DBPROPERTIES | KW_LIMIT | KW_SET | KW_UNSET | KW_TBLPROPERTIES | KW_IDXPROPERTIES | KW_VALUE_TYPE | KW_ELEM_TYPE | KW_MAPJOIN | KW_STREAMTABLE | KW_HOLD_DDLTIME | KW_CLUSTERSTATUS | KW_UTC | KW_UTCTIMESTAMP | KW_LONG | KW_DELETE | KW_PLUS | KW_MINUS | KW_FETCH | KW_INTERSECT | KW_VIEW | KW_IN | KW_DATABASES | KW_MATERIALIZED | KW_SCHEMA | KW_SCHEMAS | KW_GRANT | KW_REVOKE | KW_SSL | KW_UNDO | KW_LOCK | KW_LOCKS | KW_UNLOCK | KW_SHARED | KW_EXCLUSIVE | KW_PROCEDURE | KW_UNSIGNED | KW_WHILE | KW_READ | KW_READS | KW_PURGE | KW_RANGE | KW_ANALYZE | KW_BEFORE | KW_BETWEEN | KW_BOTH | KW_BINARY | KW_CONTINUE | KW_CURSOR | KW_TRIGGER | KW_RECORDREADER | KW_RECORDWRITER | KW_SEMI | KW_LATERAL | KW_TOUCH | KW_ARCHIVE | KW_UNARCHIVE | KW_COMPUTE | KW_STATISTICS | KW_USE | KW_OPTION | KW_CONCATENATE | KW_SHOW_DATABASE | KW_UPDATE | KW_RESTRICT | KW_CASCADE | KW_SKEWED | KW_ROLLUP | KW_CUBE | KW_DIRECTORIES | KW_FOR | KW_GROUPING | KW_SETS | KW_TRUNCATE | KW_NOSCAN | KW_USER | KW_ROLE | KW_ROLES | KW_INNER | KW_DEFINED | KW_ADMIN + KW_TRUE | KW_FALSE | KW_LIKE | KW_EXISTS | KW_ASC | KW_DESC | KW_ORDER | KW_GROUP | KW_BY | KW_AS | KW_INSERT | KW_OVERWRITE | KW_OUTER | KW_LEFT | KW_RIGHT | KW_FULL | KW_PARTITION | KW_PARTITIONS | KW_TABLE | KW_TABLES | KW_COLUMNS | KW_INDEX | KW_INDEXES | KW_REBUILD | KW_FUNCTIONS | KW_SHOW | KW_MSCK | KW_REPAIR | KW_DIRECTORY | KW_LOCAL | KW_USING | KW_CLUSTER | KW_DISTRIBUTE | KW_SORT | KW_UNION | KW_LOAD | KW_EXPORT | KW_IMPORT | KW_DATA | KW_INPATH | KW_IS | KW_NULL | KW_CREATE | KW_EXTERNAL | KW_ALTER | KW_CHANGE | KW_FIRST | KW_AFTER | KW_DESCRIBE | KW_DROP | KW_RENAME | KW_IGNORE | 
KW_PROTECTION | KW_TO | KW_COMMENT | KW_BOOLEAN | KW_TINYINT | KW_SMALLINT | KW_INT | KW_BIGINT | KW_FLOAT | KW_DOUBLE | KW_DATE | KW_DATETIME | KW_TIMESTAMP | KW_DECIMAL | KW_STRING | KW_ARRAY | KW_STRUCT | KW_UNIONTYPE | KW_PARTITIONED | KW_CLUSTERED | KW_SORTED | KW_INTO | KW_BUCKETS | KW_ROW | KW_ROWS | KW_FORMAT | KW_DELIMITED | KW_FIELDS | KW_TERMINATED | KW_ESCAPED | KW_COLLECTION | KW_ITEMS | KW_KEYS | KW_KEY_TYPE | KW_LINES | KW_STORED | KW_FILEFORMAT | KW_SEQUENCEFILE | KW_TEXTFILE | KW_RCFILE | KW_ORCFILE | KW_PARQUETFILE | KW_INPUTFORMAT | KW_OUTPUTFORMAT | KW_INPUTDRIVER | KW_OUTPUTDRIVER | KW_OFFLINE | KW_ENABLE | KW_DISABLE | KW_READONLY | KW_NO_DROP | KW_LOCATION | KW_BUCKET | KW_OUT | KW_OF | KW_PERCENT | KW_ADD | KW_REPLACE | KW_RLIKE | KW_REGEXP | KW_TEMPORARY | KW_EXPLAIN | KW_FORMATTED | KW_PRETTY | KW_DEPENDENCY | KW_LOGICAL | KW_SERDE | KW_WITH | KW_DEFERRED | KW_SERDEPROPERTIES | KW_DBPROPERTIES | KW_LIMIT | KW_SET | KW_UNSET | KW_TBLPROPERTIES | KW_IDXPROPERTIES | KW_VALUE_TYPE | KW_ELEM_TYPE | KW_MAPJOIN | KW_STREAMTABLE | KW_HOLD_DDLTIME | KW_CLUSTERSTATUS | KW_UTC | KW_UTCTIMESTAMP | KW_LONG | KW_DELETE | KW_PLUS | KW_MINUS | KW_FETCH | KW_INTERSECT | KW_VIEW | KW_IN | KW_DATABASES | KW_MATERIALIZED | KW_SCHEMA | KW_SCHEMAS | KW_GRANT | KW_REVOKE | KW_SSL | KW_UNDO | KW_LOCK | KW_LOCKS | KW_UNLOCK | KW_SHARED | KW_EXCLUSIVE | KW_PROCEDURE | KW_UNSIGNED | KW_WHILE | KW_READ | KW_READS | KW_PURGE | KW_RANGE | KW_ANALYZE | KW_BEFORE | KW_BETWEEN | KW_BOTH | KW_BINARY | KW_CONTINUE | KW_CURSOR | KW_TRIGGER | KW_RECORDREADER | KW_RECORDWRITER | KW_SEMI | KW_LATERAL | KW_TOUCH | KW_ARCHIVE | KW_UNARCHIVE | KW_COMPUTE | KW_STATISTICS | KW_USE | KW_OPTION | KW_CONCATENATE | KW_SHOW_DATABASE | KW_UPDATE | KW_RESTRICT | KW_CASCADE | KW_SKEWED | KW_ROLLUP | KW_CUBE | KW_DIRECTORIES | KW_FOR | KW_GROUPING | KW_SETS | KW_TRUNCATE | KW_NOSCAN | KW_USER | KW_ROLE | KW_ROLES | KW_INNER | KW_DEFINED | KW_ADMIN ; diff --git a/ql/src/java/parquet/hive/DeprecatedParquetInputFormat.java b/ql/src/java/parquet/hive/DeprecatedParquetInputFormat.java new file mode 100644 index 0000000..c508709 --- /dev/null +++ b/ql/src/java/parquet/hive/DeprecatedParquetInputFormat.java @@ -0,0 +1,38 @@ +/** + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package parquet.hive; + +import org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat; +import org.apache.hadoop.hive.ql.io.parquet.ProjectionPusher; +import org.apache.hadoop.io.ArrayWritable; + +import parquet.hadoop.ParquetInputFormat; + +/** + * Deprecated name of the parquet-hive input format. This class exists + * simply to provide backwards compatibility with users who specified + * this name in the Hive metastore. 
All users should now use + * {@link MapredParquetInputFormat MapredParquetInputFormat} + */ +@Deprecated +public class DeprecatedParquetInputFormat extends MapredParquetInputFormat { + + public DeprecatedParquetInputFormat() { + super(); + } + + public DeprecatedParquetInputFormat(final ParquetInputFormat realInputFormat) { + super(realInputFormat, new ProjectionPusher()); + } +} diff --git a/ql/src/java/parquet/hive/DeprecatedParquetOutputFormat.java b/ql/src/java/parquet/hive/DeprecatedParquetOutputFormat.java new file mode 100644 index 0000000..2063702 --- /dev/null +++ b/ql/src/java/parquet/hive/DeprecatedParquetOutputFormat.java @@ -0,0 +1,36 @@ +/** + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package parquet.hive; + +import org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat; +import org.apache.hadoop.io.ArrayWritable; +import org.apache.hadoop.mapreduce.OutputFormat; + +/** + * Deprecated name of the parquet-hive output format. This class exists + * simply to provide backwards compatibility with users who specified + * this name in the Hive metastore. All users should now use + * {@link MapredParquetOutputFormat MapredParquetOutputFormat} + */ +@Deprecated +public class DeprecatedParquetOutputFormat extends MapredParquetOutputFormat { + + public DeprecatedParquetOutputFormat() { + super(); + } + + public DeprecatedParquetOutputFormat(final OutputFormat mapreduceOutputFormat) { + super(mapreduceOutputFormat); + } +} diff --git a/ql/src/java/parquet/hive/MapredParquetInputFormat.java b/ql/src/java/parquet/hive/MapredParquetInputFormat.java new file mode 100644 index 0000000..418e2de --- /dev/null +++ b/ql/src/java/parquet/hive/MapredParquetInputFormat.java @@ -0,0 +1,37 @@ +/** + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package parquet.hive; + +import org.apache.hadoop.hive.ql.io.parquet.ProjectionPusher; +import org.apache.hadoop.io.ArrayWritable; + +import parquet.hadoop.ParquetInputFormat; + +/** + * Deprecated name of the parquet-hive input format. This class exists + * simply to provide backwards compatibility with users who specified + * this name in the Hive metastore. 
All users should now use + * {@link org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat MapredParquetInputFormat} + */ +@Deprecated +public class MapredParquetInputFormat extends org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat { + + public MapredParquetInputFormat() { + super(); + } + + public MapredParquetInputFormat(final ParquetInputFormat realInputFormat) { + super(realInputFormat, new ProjectionPusher()); + } +} diff --git a/ql/src/java/parquet/hive/MapredParquetOutputFormat.java b/ql/src/java/parquet/hive/MapredParquetOutputFormat.java new file mode 100644 index 0000000..5ccdf70 --- /dev/null +++ b/ql/src/java/parquet/hive/MapredParquetOutputFormat.java @@ -0,0 +1,35 @@ +/** + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package parquet.hive; + +import org.apache.hadoop.io.ArrayWritable; +import org.apache.hadoop.mapreduce.OutputFormat; + +/** + * Deprecated name of the parquet-hive output format. This class exists + * simply to provide backwards compatibility with users who specified + * this name in the Hive metastore. All users should now use + * {@link org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat MapredParquetOutputFormat} + */ +@Deprecated +public class MapredParquetOutputFormat extends org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat { + + public MapredParquetOutputFormat () { + super(); + } + + public MapredParquetOutputFormat(final OutputFormat mapreduceOutputFormat) { + super(mapreduceOutputFormat); + } +} diff --git a/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/TestHiveSchemaConverter.java b/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/TestHiveSchemaConverter.java new file mode 100644 index 0000000..8f3a2dc --- /dev/null +++ b/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/TestHiveSchemaConverter.java @@ -0,0 +1,121 @@ +/** + * + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.hadoop.hive.ql.io.parquet; + +import static org.junit.Assert.assertEquals; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +import org.apache.hadoop.hive.ql.io.parquet.convert.HiveSchemaConverter; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils; +import org.junit.Test; + +import parquet.schema.MessageType; +import parquet.schema.MessageTypeParser; + +/** + * + * TestHiveSchemaConverter + * + */ +public class TestHiveSchemaConverter { + + private List createHiveColumnsFrom(final String columnNamesStr) { + List columnNames; + if (columnNamesStr.length() == 0) { + columnNames = new ArrayList(); + } else { + columnNames = Arrays.asList(columnNamesStr.split(",")); + } + + return columnNames; + } + + private List createHiveTypeInfoFrom(final String columnsTypeStr) { + List columnTypes; + + if (columnsTypeStr.length() == 0) { + columnTypes = new ArrayList(); + } else { + columnTypes = TypeInfoUtils.getTypeInfosFromTypeString(columnsTypeStr); + } + + return columnTypes; + } + + private void testConversion(final String columnNamesStr, final String columnsTypeStr, final String expectedSchema) throws Exception { + final List columnNames = createHiveColumnsFrom(columnNamesStr); + final List columnTypes = createHiveTypeInfoFrom(columnsTypeStr); + final MessageType messageTypeFound = HiveSchemaConverter.convert(columnNames, columnTypes); + final MessageType expectedMT = MessageTypeParser.parseMessageType(expectedSchema); + assertEquals("converting " + columnNamesStr + ": " + columnsTypeStr + " to " + expectedSchema, expectedMT, messageTypeFound); + } + + @Test + public void testSimpleType() throws Exception { + testConversion( + "a,b,c", + "int,double,boolean", + "message hive_schema {\n" + + " optional int32 a;\n" + + " optional double b;\n" + + " optional boolean c;\n" + + "}\n"); + } + + @Test + public void testArray() throws Exception { + testConversion("arrayCol", + "array", + "message hive_schema {\n" + + " optional group arrayCol (LIST) {\n" + + " repeated group bag {\n" + + " optional int32 array_element;\n" + + " }\n" + + " }\n" + + "}\n"); + } + + @Test + public void testStruct() throws Exception { + testConversion("structCol", + "struct", + "message hive_schema {\n" + + " optional group structCol {\n" + + " optional int32 a;\n" + + " optional double b;\n" + + " optional boolean c;\n" + + " }\n" + + "}\n"); + } + + @Test + public void testMap() throws Exception { + testConversion("mapCol", + "map", + "message hive_schema {\n" + + " optional group mapCol (MAP) {\n" + + " repeated group map (MAP_KEY_VALUE) {\n" + + " required binary key;\n" + + " optional binary value;\n" + + " }\n" + + " }\n" + + "}\n"); + } +} diff --git a/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/TestMapredParquetInputFormat.java b/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/TestMapredParquetInputFormat.java new file mode 100644 index 0000000..09789bf --- /dev/null +++ b/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/TestMapredParquetInputFormat.java @@ -0,0 +1,127 @@ +/** + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.ql.io.parquet; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; +import static org.junit.Assert.fail; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; + +import java.io.IOException; +import java.net.URI; +import java.net.URISyntaxException; +import java.util.ArrayList; +import java.util.List; + +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.ArrayWritable; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapreduce.JobContext; +import org.junit.Test; + +import parquet.hadoop.ParquetInputFormat; +import parquet.hadoop.ParquetInputSplit; + +/** + * + * Tests for MapredParquetInputFormat. + * + */ +public class TestMapredParquetInputFormat { + @Test + public void testDefaultConstructor() { + new MapredParquetInputFormat(); + } + + @SuppressWarnings("unchecked") + @Test + public void testConstructorWithParquetInputFormat() { + new MapredParquetInputFormat( + (ParquetInputFormat) mock(ParquetInputFormat.class), + mock(ProjectionPusher.class) + ); + } + + @SuppressWarnings("unchecked") + @Test + public void testGetSplits() throws IOException { + ParquetInputFormat inputFormat = (ParquetInputFormat) mock(ParquetInputFormat.class); + ProjectionPusher pusher = mock(ProjectionPusher.class); + + List inputSplits = new ArrayList(); + inputSplits.add(mock(ParquetInputSplit.class)); + inputSplits.add(mock(ParquetInputSplit.class)); + + when(inputFormat.getSplits(null)).thenReturn(inputSplits); + + MapredParquetInputFormat mapRedInputFormat = new MapredParquetInputFormat(inputFormat, pusher) { + @Override + protected JobContext getJobContext(JobConf jobConf) { + return null; + } + }; + + try { + org.apache.hadoop.mapred.InputSplit[] wrappedSplits = mapRedInputFormat.getSplits( + mock(org.apache.hadoop.mapred.JobConf.class), + 0, + mock(Path.class) + ); + assertEquals(2, wrappedSplits.length); + assertTrue(wrappedSplits[0] instanceof ParquetInputSplitWrapper); + assertTrue(wrappedSplits[1] instanceof ParquetInputSplitWrapper); + assertEquals(((ParquetInputSplitWrapper) wrappedSplits[0]).getRealSplit(), inputSplits.get(0)); + assertEquals(((ParquetInputSplitWrapper) wrappedSplits[1]).getRealSplit(), inputSplits.get(1)); + } catch (IOException e) { + fail(); + } + } + + @SuppressWarnings("unchecked") + @Test(expected = IOException.class) + public void testGetSplitsEmptyDirs() throws IOException { + new MapredParquetInputFormat( + (ParquetInputFormat) mock(ParquetInputFormat.class), + mock(ProjectionPusher.class) + ) { + @Override + protected Path[] getInputPathsForJob(org.apache.hadoop.mapred.JobConf job) { + return new Path[] {}; + } + }.getSplits(null, 0); + } + + @SuppressWarnings("unchecked") + @Test + public void testMakeQualified() throws IOException, URISyntaxException { + + final FileSystem fs = mock(FileSystem.class); + when(fs.getUri()).thenReturn(new URI("file:///foo/bar")); + + Path realPath = new MapredParquetInputFormat( + (ParquetInputFormat) mock(ParquetInputFormat.class), + 
mock(ProjectionPusher.class) + ) { + @Override + protected FileSystem getFsForJob(org.apache.hadoop.mapred.JobConf job) throws IOException { + return fs; + } + }.makeQualifiedPathFromPaths(new Path[] { new Path("file:///foo/bar") }, null); + + assertEquals(new Path("/foo/bar"), realPath); + } + +} diff --git a/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/TestMapredParquetOutputFormat.java b/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/TestMapredParquetOutputFormat.java new file mode 100644 index 0000000..0407460 --- /dev/null +++ b/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/TestMapredParquetOutputFormat.java @@ -0,0 +1,94 @@ +/** + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.ql.io.parquet; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.fail; +import static org.mockito.Mockito.mock; + +import java.io.IOException; +import java.util.Properties; + +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.ql.io.parquet.write.DataWritableWriteSupport; +import org.apache.hadoop.hive.ql.io.parquet.write.ParquetRecordWriterWrapper; +import org.apache.hadoop.io.ArrayWritable; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.util.Progressable; +import org.junit.Test; + +import parquet.hadoop.ParquetOutputFormat; + +/** + * Tests for MapredParquetOutputFormat. 
+ * + */ +public class TestMapredParquetOutputFormat { + + @Test + public void testConstructor() { + new MapredParquetOutputFormat(); + } + + @SuppressWarnings("unchecked") + @Test + public void testConstructorWithFormat() { + new MapredParquetOutputFormat((ParquetOutputFormat) mock(ParquetOutputFormat.class)); + } + + @Test + public void testGetRecordWriterThrowsException() { + try { + new MapredParquetOutputFormat().getRecordWriter(null, null, null, null); + fail("should throw runtime exception."); + } catch (Exception e) { + assertEquals("Should never be used", e.getMessage()); + } + } + + @SuppressWarnings("unchecked") + @Test + public void testGetHiveRecordWriter() throws IOException { + Properties tableProps = new Properties(); + tableProps.setProperty("columns", "foo,bar"); + tableProps.setProperty("columns.types", "int:int"); + + final Progressable mockProgress = mock(Progressable.class); + final ParquetOutputFormat outputFormat = (ParquetOutputFormat) mock(ParquetOutputFormat.class); + + JobConf jobConf = new JobConf(); + + try { + new MapredParquetOutputFormat(outputFormat) { + @Override + protected ParquetRecordWriterWrapper getParquerRecordWriterWrapper( + ParquetOutputFormat realOutputFormat, + JobConf jobConf, + String finalOutPath, + Progressable progress + ) throws IOException { + assertEquals(outputFormat, realOutputFormat); + assertNotNull(jobConf.get(DataWritableWriteSupport.PARQUET_HIVE_SCHEMA)); + assertEquals("/foo", finalOutPath.toString()); + assertEquals(mockProgress, progress); + throw new RuntimeException("passed tests"); + } + }.getHiveRecordWriter(jobConf, new Path("/foo"), null, false, tableProps, mockProgress); + fail("should throw runtime exception."); + } catch (RuntimeException e) { + assertEquals("passed tests", e.getMessage()); + } + } +} diff --git a/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/TestParquetSerDe.java b/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/TestParquetSerDe.java new file mode 100644 index 0000000..4b0f88d --- /dev/null +++ b/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/TestParquetSerDe.java @@ -0,0 +1,120 @@ +/** + * + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.hadoop.hive.ql.io.parquet; + +import java.util.Properties; + +import junit.framework.TestCase; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe; +import org.apache.hadoop.hive.ql.io.parquet.writable.BinaryWritable; +import org.apache.hadoop.hive.serde2.SerDeException; +import org.apache.hadoop.hive.serde2.io.ByteWritable; +import org.apache.hadoop.hive.serde2.io.DoubleWritable; +import org.apache.hadoop.hive.serde2.io.ShortWritable; +import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; +import org.apache.hadoop.io.ArrayWritable; +import org.apache.hadoop.io.IntWritable; +import org.apache.hadoop.io.LongWritable; +import org.apache.hadoop.io.Writable; + +import parquet.io.api.Binary; + +/** + * + * TestParquetHiveSerDe + * + */ +public class TestParquetSerDe extends TestCase { + + public void testParquetHiveSerDe() throws Throwable { + try { + // Create the SerDe + System.out.println("test: testParquetHiveSerDe"); + + final ParquetHiveSerDe serDe = new ParquetHiveSerDe(); + final Configuration conf = new Configuration(); + final Properties tbl = createProperties(); + serDe.initialize(conf, tbl); + + // Data + final Writable[] arr = new Writable[8]; + + arr[0] = new ByteWritable((byte) 123); + arr[1] = new ShortWritable((short) 456); + arr[2] = new IntWritable(789); + arr[3] = new LongWritable(1000l); + arr[4] = new DoubleWritable((double) 5.3); + arr[5] = new BinaryWritable(Binary.fromString("hive and hadoop and parquet. Big family.")); + + final Writable[] mapContainer = new Writable[1]; + final Writable[] map = new Writable[3]; + for (int i = 0; i < 3; ++i) { + final Writable[] pair = new Writable[2]; + pair[0] = new BinaryWritable(Binary.fromString("key_" + i)); + pair[1] = new IntWritable(i); + map[i] = new ArrayWritable(Writable.class, pair); + } + mapContainer[0] = new ArrayWritable(Writable.class, map); + arr[6] = new ArrayWritable(Writable.class, mapContainer); + + final Writable[] arrayContainer = new Writable[1]; + final Writable[] array = new Writable[5]; + for (int i = 0; i < 5; ++i) { + array[i] = new BinaryWritable(Binary.fromString("elem_" + i)); + } + arrayContainer[0] = new ArrayWritable(Writable.class, array); + arr[7] = new ArrayWritable(Writable.class, arrayContainer); + + final ArrayWritable arrWritable = new ArrayWritable(Writable.class, arr); + // Test + deserializeAndSerializeLazySimple(serDe, arrWritable); + System.out.println("test: testParquetHiveSerDe - OK"); + + } catch (final Throwable e) { + e.printStackTrace(); + throw e; + } + } + + private void deserializeAndSerializeLazySimple(final ParquetHiveSerDe serDe, final ArrayWritable t) throws SerDeException { + + // Get the row structure + final StructObjectInspector oi = (StructObjectInspector) serDe.getObjectInspector(); + + // Deserialize + final Object row = serDe.deserialize(t); + assertEquals("deserialization gives the wrong object class", row.getClass(), ArrayWritable.class); + assertEquals("size correct after deserialization", serDe.getSerDeStats().getRawDataSize(), t.get().length); + assertEquals("deserialization gives the wrong object", t, row); + + // Serialize + final ArrayWritable serializedArr = (ArrayWritable) serDe.serialize(row, oi); + assertEquals("size correct after serialization", serDe.getSerDeStats().getRawDataSize(), serializedArr.get().length); + assertTrue("serialized object should be equal to starting object", UtilitiesTestMethods.arrayWritableEquals(t, serializedArr)); + } 
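// Illustrative aside (comments only; a sketch based on the Writable layout the test above builds,
// not part of the original patch): the serde represents each complex Hive value as nested
// ArrayWritables. A map field is an ArrayWritable holding a single ArrayWritable of key/value
// pairs, each pair itself a two-element ArrayWritable; a list field is an ArrayWritable holding a
// single ArrayWritable of elements; strings travel as BinaryWritable wrapping parquet's Binary.
// For example, a one-entry map {"k": 1} would be assembled roughly as:
//
//   new ArrayWritable(Writable.class, new Writable[] {            // field value (map container)
//       new ArrayWritable(Writable.class, new Writable[] {        // the map's entries
//           new ArrayWritable(Writable.class, new Writable[] {    // one key/value pair
//               new BinaryWritable(Binary.fromString("k")),
//               new IntWritable(1) }) }) });
//
// The inspector tests further below (TestAbstractParquetMapInspector, TestParquetHiveArrayInspector)
// exercise the same container-of-entries convention from the read side.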
+ + private Properties createProperties() { + final Properties tbl = new Properties(); + + // Set the configuration parameters + tbl.setProperty("columns", "abyte,ashort,aint,along,adouble,astring,amap,alist"); + tbl.setProperty("columns.types", "tinyint:smallint:int:bigint:double:string:map:array"); + tbl.setProperty(org.apache.hadoop.hive.serde.serdeConstants.SERIALIZATION_NULL_FORMAT, "NULL"); + return tbl; + } +} diff --git a/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/UtilitiesTestMethods.java b/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/UtilitiesTestMethods.java new file mode 100644 index 0000000..ddd1a71 --- /dev/null +++ b/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/UtilitiesTestMethods.java @@ -0,0 +1,258 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.ql.io.parquet; + +import java.io.IOException; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.ql.io.parquet.writable.BinaryWritable; +import org.apache.hadoop.hive.serde2.io.DoubleWritable; +import org.apache.hadoop.io.ArrayWritable; +import org.apache.hadoop.io.IntWritable; +import org.apache.hadoop.io.Writable; + +import parquet.bytes.BytesInput; +import parquet.column.ColumnDescriptor; +import parquet.column.page.Page; +import parquet.column.page.PageReader; +import parquet.column.page.mem.MemPageStore; +import parquet.hadoop.ParquetFileWriter; +import parquet.hadoop.metadata.CompressionCodecName; +import parquet.io.api.Binary; +import parquet.io.api.RecordConsumer; +import parquet.schema.GroupType; +import parquet.schema.MessageType; +import parquet.schema.Type; + +public class UtilitiesTestMethods { + + public static void writeToFile(final Path file, final Configuration configuration, final MessageType schema, final MemPageStore pageStore, final int recordCount) + throws IOException { + final ParquetFileWriter w = startFile(file, configuration, schema); + writeBlock(schema, pageStore, recordCount, w); + endFile(w); + } + + public static void endFile(final ParquetFileWriter w) throws IOException { + w.end(new HashMap()); + } + + public static boolean smartCheckSchema(final GroupType expectedSchema, final GroupType actualSchema) { + if (expectedSchema.getFieldCount() != actualSchema.getFieldCount()) { + return false; + } + + for (int i = 0; i < expectedSchema.getFieldCount(); ++i) { + Type expectedType = expectedSchema.getType(i); + Type actualType = actualSchema.getType(i); + + if (!expectedType.getName().equals(actualType.getName()) + || expectedType.getRepetition() != actualType.getRepetition() + || expectedType.isPrimitive() != actualType.isPrimitive()) { + return false; + } + + if (expectedType.isPrimitive()) { + if (expectedType.asPrimitiveType().getPrimitiveTypeName() != actualType.asPrimitiveType().getPrimitiveTypeName() + || expectedType.asPrimitiveType().getTypeLength() != actualType.asPrimitiveType().getTypeLength()) { 
+ return false; + } + } else { + if (!smartCheckSchema(expectedType.asGroupType(), actualType.asGroupType())) { + return false; + } + } + } + + return true; + } + + public static boolean smartCheckArray(final Writable[] arrValue, final Writable[] arrExpected, final Integer[] arrCheckIndexValues) { + + int i = 0; + for (final Integer index : arrCheckIndexValues) { + if (index != Integer.MIN_VALUE) { + final Writable value = arrValue[index]; + final Writable expectedValue = arrExpected[index]; + + if (((value == null && expectedValue == null) + || (((value != null && expectedValue != null) && (value.equals(expectedValue)))) + || (value != null && expectedValue != null && value instanceof ArrayWritable && expectedValue instanceof ArrayWritable && arrayWritableEquals((ArrayWritable) value, (ArrayWritable) expectedValue))) == false) { + return false; + } + } else { + final Writable value = arrValue[i]; + if (value != null) { + return false; + } + } + ++i; + } + + return true; + } + + public static boolean arrayWritableEquals(final ArrayWritable a1, final ArrayWritable a2) { + final Writable[] a1Arr = a1.get(); + final Writable[] a2Arr = a2.get(); + + if (a1Arr.length != a2Arr.length) { + return false; + } + + for (int i = 0; i < a1Arr.length; ++i) { + if (a1Arr[i] instanceof ArrayWritable) { + if (!(a2Arr[i] instanceof ArrayWritable)) { + return false; + } + if (!arrayWritableEquals((ArrayWritable) a1Arr[i], (ArrayWritable) a2Arr[i])) { + return false; + } + } else { + if (!a1Arr[i].equals(a2Arr[i])) { + return false; + } + } + + } + return true; + } + + static public ArrayWritable createArrayWritable(final Integer custkey, final String name, final String address, final Integer nationkey, final String phone, final Double acctbal, final String mktsegment, final String comment, final Map map, final List list) { + + final Writable[] arr = new Writable[11]; // The last one is for the unknown column + arr[0] = new IntWritable(custkey); + if (name != null) { + arr[1] = new BinaryWritable(Binary.fromString(name)); + } + if (address != null) { + arr[2] = new BinaryWritable(Binary.fromString(address)); + } + if (nationkey != null) { + arr[3] = new IntWritable(nationkey); + } + if (phone != null) { + arr[4] = new BinaryWritable(Binary.fromString(phone)); + } + if (acctbal != null) { + arr[5] = new DoubleWritable(acctbal); + } + if (mktsegment != null) { + arr[6] = new BinaryWritable(Binary.fromString(mktsegment)); + } + if (comment != null) { + arr[7] = new BinaryWritable(Binary.fromString(comment)); + } + if (map != null) { + final Writable[] mapContainer = new Writable[1]; + final Writable[] mapArr = new Writable[map.size()]; + int i = 0; + for (Map.Entry entry : map.entrySet()) { + final Writable[] pair = new Writable[2]; + pair[0] = new BinaryWritable(Binary.fromString(entry.getKey())); + pair[1] = new BinaryWritable(Binary.fromString(entry.getValue())); + mapArr[i] = new ArrayWritable(Writable.class, pair); + ++i; + } + mapContainer[0] = new ArrayWritable(Writable.class, mapArr); + arr[8] = new ArrayWritable(Writable.class, mapContainer); + } + if (list != null) { + final Writable[] listContainer = new Writable[1]; + final Writable[] listArr = new Writable[list.size()]; + for (int i = 0; i < list.size(); ++i) { + listArr[i] = new IntWritable(list.get(i)); + } + listContainer[0] = new ArrayWritable(Writable.class, listArr); + arr[9] = new ArrayWritable(Writable.class, listContainer); + } + + return new ArrayWritable(Writable.class, arr); + } + + public static void writeBlock(final MessageType schema, 
final MemPageStore pageStore, + final int recordCount, final ParquetFileWriter w) throws IOException { + w.startBlock(recordCount); + final List columns = schema.getColumns(); + for (final ColumnDescriptor columnDescriptor : columns) { + final PageReader pageReader = pageStore.getPageReader(columnDescriptor); + final long totalValueCount = pageReader.getTotalValueCount(); + w.startColumn(columnDescriptor, totalValueCount, CompressionCodecName.UNCOMPRESSED); + int n = 0; + do { + final Page page = pageReader.readPage(); + n += page.getValueCount(); + // TODO: change INTFC + w.writeDataPage( + page.getValueCount(), + (int) page.getBytes().size(), + BytesInput.from(page.getBytes().toByteArray()), + page.getRlEncoding(), + page.getDlEncoding(), + page.getValueEncoding()); + } while (n < totalValueCount); + w.endColumn(); + } + w.endBlock(); + } + + public static ParquetFileWriter startFile(final Path file, + final Configuration configuration, final MessageType schema) throws IOException { + final ParquetFileWriter w = new ParquetFileWriter(configuration, schema, file); + w.start(); + return w; + } + + public static void writeField(final RecordConsumer recordWriter, final int index, final String name, final Object value) { + if (value != null) { + recordWriter.startField(name, index); + if (value instanceof Integer) { + recordWriter.addInteger((Integer) value); + } else if (value instanceof String) { + recordWriter.addBinary(Binary.fromString((String) value)); + } else if (value instanceof Double) { + recordWriter.addDouble((Double) value); + } else if (value instanceof Map) { + recordWriter.startGroup(); + recordWriter.startField("map", 0); + for (Map.Entry entry : ((Map) value).entrySet()) { + recordWriter.startGroup(); + writeField(recordWriter, 0, "key", entry.getKey()); + writeField(recordWriter, 1, "value", entry.getValue()); + recordWriter.endGroup(); + } + recordWriter.endField("map", 0); + recordWriter.endGroup(); + } else if (value instanceof List) { + recordWriter.startGroup(); + recordWriter.startField("bag", 0); + for (Object element : (List) value) { + recordWriter.startGroup(); + writeField(recordWriter, 0, "array_element", element); + recordWriter.endGroup(); + } + recordWriter.endField("bag", 0); + recordWriter.endGroup(); + } else { + throw new IllegalArgumentException(value.getClass().getName() + " not supported"); + } + + recordWriter.endField(name, index); + } + } +} diff --git a/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/serde/TestAbstractParquetMapInspector.java b/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/serde/TestAbstractParquetMapInspector.java new file mode 100644 index 0000000..2e3f7e7 --- /dev/null +++ b/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/serde/TestAbstractParquetMapInspector.java @@ -0,0 +1,98 @@ +/* + * + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.hadoop.hive.ql.io.parquet.serde; + +import java.util.HashMap; +import java.util.Map; +import junit.framework.TestCase; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; +import org.apache.hadoop.io.ArrayWritable; +import org.apache.hadoop.io.IntWritable; +import org.apache.hadoop.io.Writable; +import org.junit.Test; + +public class TestAbstractParquetMapInspector extends TestCase { + + class TestableAbstractParquetMapInspector extends AbstractParquetMapInspector { + + public TestableAbstractParquetMapInspector(ObjectInspector keyInspector, ObjectInspector valueInspector) { + super(keyInspector, valueInspector); + } + + @Override + public Object getMapValueElement(Object o, Object o1) { + throw new UnsupportedOperationException("Should not be called"); + } + } + private TestableAbstractParquetMapInspector inspector; + + @Override + public void setUp() { + inspector = new TestableAbstractParquetMapInspector(PrimitiveObjectInspectorFactory.javaIntObjectInspector, + PrimitiveObjectInspectorFactory.javaIntObjectInspector); + } + + @Test + public void testNullMap() { + assertEquals("Wrong size", -1, inspector.getMapSize(null)); + assertNull("Should be null", inspector.getMap(null)); + } + + @Test + public void testNullContainer() { + final ArrayWritable map = new ArrayWritable(ArrayWritable.class, null); + assertEquals("Wrong size", -1, inspector.getMapSize(map)); + assertNull("Should be null", inspector.getMap(map)); + } + + @Test + public void testEmptyContainer() { + final ArrayWritable map = new ArrayWritable(ArrayWritable.class, new ArrayWritable[0]); + assertEquals("Wrong size", -1, inspector.getMapSize(map)); + assertNull("Should be null", inspector.getMap(map)); + } + + @Test + public void testRegularMap() { + final Writable[] entry1 = new Writable[]{new IntWritable(0), new IntWritable(1)}; + final Writable[] entry2 = new Writable[]{new IntWritable(2), new IntWritable(3)}; + + final ArrayWritable internalMap = new ArrayWritable(ArrayWritable.class, new Writable[]{ + new ArrayWritable(Writable.class, entry1), new ArrayWritable(Writable.class, entry2)}); + + final ArrayWritable map = new ArrayWritable(ArrayWritable.class, new Writable[]{internalMap}); + + final Map expected = new HashMap(); + expected.put(new IntWritable(0), new IntWritable(1)); + expected.put(new IntWritable(2), new IntWritable(3)); + + assertEquals("Wrong size", 2, inspector.getMapSize(map)); + assertEquals("Wrong result of inspection", expected, inspector.getMap(map)); + } + + @Test + public void testHashMap() { + final Map map = new HashMap(); + map.put(new IntWritable(0), new IntWritable(1)); + map.put(new IntWritable(2), new IntWritable(3)); + map.put(new IntWritable(4), new IntWritable(5)); + map.put(new IntWritable(6), new IntWritable(7)); + + assertEquals("Wrong size", 4, inspector.getMapSize(map)); + assertEquals("Wrong result of inspection", map, inspector.getMap(map)); + } +} diff --git a/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/serde/TestDeepParquetHiveMapInspector.java b/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/serde/TestDeepParquetHiveMapInspector.java new file mode 100644 index 0000000..ca740e0 --- /dev/null +++ b/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/serde/TestDeepParquetHiveMapInspector.java @@ -0,0 +1,92 @@ +/* + * + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance 
with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.ql.io.parquet.serde; + +import java.util.HashMap; +import java.util.Map; + +import junit.framework.TestCase; + +import org.apache.hadoop.hive.ql.io.parquet.serde.primitive.ParquetPrimitiveInspectorFactory; +import org.apache.hadoop.hive.serde2.io.ShortWritable; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; +import org.apache.hadoop.io.ArrayWritable; +import org.apache.hadoop.io.IntWritable; +import org.apache.hadoop.io.Writable; +import org.junit.Test; + +public class TestDeepParquetHiveMapInspector extends TestCase { + + private DeepParquetHiveMapInspector inspector; + + @Override + public void setUp() { + inspector = new DeepParquetHiveMapInspector(ParquetPrimitiveInspectorFactory.parquetShortInspector, + PrimitiveObjectInspectorFactory.javaIntObjectInspector); + } + + @Test + public void testNullMap() { + assertNull("Should be null", inspector.getMapValueElement(null, new ShortWritable((short) 0))); + } + + @Test + public void testNullContainer() { + final ArrayWritable map = new ArrayWritable(ArrayWritable.class, null); + assertNull("Should be null", inspector.getMapValueElement(map, new ShortWritable((short) 0))); + } + + @Test + public void testEmptyContainer() { + final ArrayWritable map = new ArrayWritable(ArrayWritable.class, new ArrayWritable[0]); + assertNull("Should be null", inspector.getMapValueElement(map, new ShortWritable((short) 0))); + } + + @Test + public void testRegularMap() { + final Writable[] entry1 = new Writable[]{new IntWritable(0), new IntWritable(1)}; + final Writable[] entry2 = new Writable[]{new IntWritable(2), new IntWritable(3)}; + + final ArrayWritable internalMap = new ArrayWritable(ArrayWritable.class, new Writable[]{ + new ArrayWritable(Writable.class, entry1), new ArrayWritable(Writable.class, entry2)}); + + final ArrayWritable map = new ArrayWritable(ArrayWritable.class, new Writable[]{internalMap}); + + assertEquals("Wrong result of inspection", new IntWritable(1), inspector.getMapValueElement(map, new IntWritable(0))); + assertEquals("Wrong result of inspection", new IntWritable(3), inspector.getMapValueElement(map, new IntWritable(2))); + assertEquals("Wrong result of inspection", new IntWritable(1), inspector.getMapValueElement(map, new ShortWritable((short) 0))); + assertEquals("Wrong result of inspection", new IntWritable(3), inspector.getMapValueElement(map, new ShortWritable((short) 2))); + } + + @Test + public void testHashMap() { + final Map map = new HashMap(); + map.put(new IntWritable(0), new IntWritable(1)); + map.put(new IntWritable(2), new IntWritable(3)); + map.put(new IntWritable(4), new IntWritable(5)); + map.put(new IntWritable(6), new IntWritable(7)); + + + assertEquals("Wrong result of inspection", new IntWritable(1), inspector.getMapValueElement(map, new IntWritable(0))); + assertEquals("Wrong result of inspection", new IntWritable(3), inspector.getMapValueElement(map, new IntWritable(2))); + assertEquals("Wrong result of inspection", new IntWritable(5), inspector.getMapValueElement(map, new 
IntWritable(4))); + assertEquals("Wrong result of inspection", new IntWritable(7), inspector.getMapValueElement(map, new IntWritable(6))); + assertEquals("Wrong result of inspection", new IntWritable(1), inspector.getMapValueElement(map, new ShortWritable((short) 0))); + assertEquals("Wrong result of inspection", new IntWritable(3), inspector.getMapValueElement(map, new ShortWritable((short) 2))); + assertEquals("Wrong result of inspection", new IntWritable(5), inspector.getMapValueElement(map, new ShortWritable((short) 4))); + assertEquals("Wrong result of inspection", new IntWritable(7), inspector.getMapValueElement(map, new ShortWritable((short) 6))); + } +} diff --git a/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/serde/TestParquetHiveArrayInspector.java b/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/serde/TestParquetHiveArrayInspector.java new file mode 100644 index 0000000..aa3bbf5 --- /dev/null +++ b/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/serde/TestParquetHiveArrayInspector.java @@ -0,0 +1,80 @@ +/* + * + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.ql.io.parquet.serde; + +import java.util.ArrayList; +import java.util.List; +import junit.framework.TestCase; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; +import org.apache.hadoop.io.ArrayWritable; +import org.apache.hadoop.io.IntWritable; +import org.apache.hadoop.io.Writable; +import org.junit.Test; + +public class TestParquetHiveArrayInspector extends TestCase { + + private ParquetHiveArrayInspector inspector; + + @Override + public void setUp() { + inspector = new ParquetHiveArrayInspector(PrimitiveObjectInspectorFactory.javaIntObjectInspector); + } + + @Test + public void testNullArray() { + assertEquals("Wrong size", -1, inspector.getListLength(null)); + assertNull("Should be null", inspector.getList(null)); + assertNull("Should be null", inspector.getListElement(null, 0)); + } + + @Test + public void testNullContainer() { + final ArrayWritable list = new ArrayWritable(ArrayWritable.class, null); + assertEquals("Wrong size", -1, inspector.getListLength(list)); + assertNull("Should be null", inspector.getList(list)); + assertNull("Should be null", inspector.getListElement(list, 0)); + } + + @Test + public void testEmptyContainer() { + final ArrayWritable list = new ArrayWritable(ArrayWritable.class, new ArrayWritable[0]); + assertEquals("Wrong size", -1, inspector.getListLength(list)); + assertNull("Should be null", inspector.getList(list)); + assertNull("Should be null", inspector.getListElement(list, 0)); + } + + @Test + public void testRegularList() { + final ArrayWritable internalList = new ArrayWritable(Writable.class, + new Writable[]{new IntWritable(3), new IntWritable(5), new IntWritable(1)}); + final ArrayWritable list = new ArrayWritable(ArrayWritable.class, new ArrayWritable[]{internalList}); + + final List expected = new ArrayList(); + expected.add(new IntWritable(3)); + expected.add(new IntWritable(5)); + 
expected.add(new IntWritable(1)); + + assertEquals("Wrong size", 3, inspector.getListLength(list)); + assertEquals("Wrong result of inspection", expected, inspector.getList(list)); + + for (int i = 0; i < expected.size(); ++i) { + assertEquals("Wrong result of inspection", expected.get(i), inspector.getListElement(list, i)); + + } + + assertNull("Should be null", inspector.getListElement(list, 3)); + } +} diff --git a/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/serde/TestStandardParquetHiveMapInspector.java b/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/serde/TestStandardParquetHiveMapInspector.java new file mode 100644 index 0000000..a7f009d --- /dev/null +++ b/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/serde/TestStandardParquetHiveMapInspector.java @@ -0,0 +1,88 @@ +/* + * + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.ql.io.parquet.serde; + +import java.util.HashMap; +import java.util.Map; +import junit.framework.TestCase; +import org.apache.hadoop.hive.serde2.io.ShortWritable; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; +import org.apache.hadoop.io.ArrayWritable; +import org.apache.hadoop.io.IntWritable; +import org.apache.hadoop.io.Writable; +import org.junit.Test; + +public class TestStandardParquetHiveMapInspector extends TestCase { + + private StandardParquetHiveMapInspector inspector; + + @Override + public void setUp() { + inspector = new StandardParquetHiveMapInspector(PrimitiveObjectInspectorFactory.javaIntObjectInspector, + PrimitiveObjectInspectorFactory.javaIntObjectInspector); + } + + @Test + public void testNullMap() { + assertNull("Should be null", inspector.getMapValueElement(null, new IntWritable(0))); + } + + @Test + public void testNullContainer() { + final ArrayWritable map = new ArrayWritable(ArrayWritable.class, null); + assertNull("Should be null", inspector.getMapValueElement(map, new IntWritable(0))); + } + + @Test + public void testEmptyContainer() { + final ArrayWritable map = new ArrayWritable(ArrayWritable.class, new ArrayWritable[0]); + assertNull("Should be null", inspector.getMapValueElement(map, new IntWritable(0))); + } + + @Test + public void testRegularMap() { + final Writable[] entry1 = new Writable[]{new IntWritable(0), new IntWritable(1)}; + final Writable[] entry2 = new Writable[]{new IntWritable(2), new IntWritable(3)}; + + final ArrayWritable internalMap = new ArrayWritable(ArrayWritable.class, new Writable[]{ + new ArrayWritable(Writable.class, entry1), new ArrayWritable(Writable.class, entry2)}); + + final ArrayWritable map = new ArrayWritable(ArrayWritable.class, new Writable[]{internalMap}); + + assertEquals("Wrong result of inspection", new IntWritable(1), inspector.getMapValueElement(map, new IntWritable(0))); + assertEquals("Wrong result of inspection", new IntWritable(3), inspector.getMapValueElement(map, new IntWritable(2))); + assertNull("Wrong result of inspection", inspector.getMapValueElement(map, new 
ShortWritable((short) 0))); + assertNull("Wrong result of inspection", inspector.getMapValueElement(map, new ShortWritable((short) 2))); + } + + @Test + public void testHashMap() { + final Map map = new HashMap(); + map.put(new IntWritable(0), new IntWritable(1)); + map.put(new IntWritable(2), new IntWritable(3)); + map.put(new IntWritable(4), new IntWritable(5)); + map.put(new IntWritable(6), new IntWritable(7)); + + assertEquals("Wrong result of inspection", new IntWritable(1), inspector.getMapValueElement(map, new IntWritable(0))); + assertEquals("Wrong result of inspection", new IntWritable(3), inspector.getMapValueElement(map, new IntWritable(2))); + assertEquals("Wrong result of inspection", new IntWritable(5), inspector.getMapValueElement(map, new IntWritable(4))); + assertEquals("Wrong result of inspection", new IntWritable(7), inspector.getMapValueElement(map, new IntWritable(6))); + assertNull("Wrong result of inspection", inspector.getMapValueElement(map, new ShortWritable((short) 0))); + assertNull("Wrong result of inspection", inspector.getMapValueElement(map, new ShortWritable((short) 2))); + assertNull("Wrong result of inspection", inspector.getMapValueElement(map, new ShortWritable((short) 4))); + assertNull("Wrong result of inspection", inspector.getMapValueElement(map, new ShortWritable((short) 6))); + } +} diff --git a/ql/src/test/queries/clientpositive/parquet_create.q b/ql/src/test/queries/clientpositive/parquet_create.q new file mode 100644 index 0000000..0b976bd --- /dev/null +++ b/ql/src/test/queries/clientpositive/parquet_create.q @@ -0,0 +1,36 @@ +DROP TABLE parquet_create_staging; +DROP TABLE parquet_create; + +CREATE TABLE parquet_create_staging ( + id int, + str string, + mp MAP, + lst ARRAY, + strct STRUCT +) ROW FORMAT DELIMITED +FIELDS TERMINATED BY '|' +COLLECTION ITEMS TERMINATED BY ',' +MAP KEYS TERMINATED BY ':'; + +CREATE TABLE parquet_create ( + id int, + str string, + mp MAP, + lst ARRAY, + strct STRUCT +) STORED AS PARQUET; + +DESCRIBE FORMATTED parquet_create; + +LOAD DATA LOCAL INPATH '../../data/files/parquet_create.txt' OVERWRITE INTO TABLE parquet_create_staging; + +SELECT * FROM parquet_create_staging; + +INSERT OVERWRITE TABLE parquet_create SELECT * FROM parquet_create_staging; + +SELECT * FROM parquet_create group by id; +SELECT id, count(0) FROM parquet_create group by id; +SELECT str from parquet_create; +SELECT mp from parquet_create; +SELECT lst from parquet_create; +SELECT strct from parquet_create; diff --git a/ql/src/test/results/clientpositive/parquet_create.q.out b/ql/src/test/results/clientpositive/parquet_create.q.out new file mode 100644 index 0000000..34fdea2 --- /dev/null +++ b/ql/src/test/results/clientpositive/parquet_create.q.out @@ -0,0 +1,206 @@ +PREHOOK: query: DROP TABLE parquet_create_staging +PREHOOK: type: DROPTABLE +POSTHOOK: query: DROP TABLE parquet_create_staging +POSTHOOK: type: DROPTABLE +PREHOOK: query: DROP TABLE parquet_create +PREHOOK: type: DROPTABLE +POSTHOOK: query: DROP TABLE parquet_create +POSTHOOK: type: DROPTABLE +PREHOOK: query: CREATE TABLE parquet_create_staging ( + id int, + str string, + mp MAP, + lst ARRAY, + strct STRUCT +) ROW FORMAT DELIMITED +FIELDS TERMINATED BY '|' +COLLECTION ITEMS TERMINATED BY ',' +MAP KEYS TERMINATED BY ':' +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE parquet_create_staging ( + id int, + str string, + mp MAP, + lst ARRAY, + strct STRUCT +) ROW FORMAT DELIMITED +FIELDS TERMINATED BY '|' +COLLECTION ITEMS TERMINATED BY ',' +MAP KEYS TERMINATED BY ':' 
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@parquet_create_staging
+PREHOOK: query: CREATE TABLE parquet_create (
+  id int,
+  str string,
+  mp MAP<string,string>,
+  lst ARRAY<string>,
+  strct STRUCT<a:string,b:string>
+) STORED AS PARQUET
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: CREATE TABLE parquet_create (
+  id int,
+  str string,
+  mp MAP<string,string>,
+  lst ARRAY<string>,
+  strct STRUCT<a:string,b:string>
+) STORED AS PARQUET
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@parquet_create
+PREHOOK: query: DESCRIBE FORMATTED parquet_create
+PREHOOK: type: DESCTABLE
+POSTHOOK: query: DESCRIBE FORMATTED parquet_create
+POSTHOOK: type: DESCTABLE
+# col_name	data_type	comment
+
+id	int	from deserializer
+str	string	from deserializer
+mp	map<string,string>	from deserializer
+lst	array<string>	from deserializer
+strct	struct<a:string,b:string>	from deserializer
+
+# Detailed Table Information
+Database:	default
+#### A masked pattern was here ####
+Protect Mode:	None
+Retention:	0
+#### A masked pattern was here ####
+Table Type:	MANAGED_TABLE
+Table Parameters:
+#### A masked pattern was here ####
+
+# Storage Information
+SerDe Library:	org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe
+InputFormat:	org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat
+OutputFormat:	org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat
+Compressed:	No
+Num Buckets:	-1
+Bucket Columns:	[]
+Sort Columns:	[]
+Storage Desc Params:
+	serialization.format	1
+PREHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/parquet_create.txt' OVERWRITE INTO TABLE parquet_create_staging
+PREHOOK: type: LOAD
+PREHOOK: Output: default@parquet_create_staging
+POSTHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/parquet_create.txt' OVERWRITE INTO TABLE parquet_create_staging
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@parquet_create_staging
+PREHOOK: query: SELECT * FROM parquet_create_staging
+PREHOOK: type: QUERY
+PREHOOK: Input: default@parquet_create_staging
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT * FROM parquet_create_staging
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@parquet_create_staging
+#### A masked pattern was here ####
+1	foo line1	{"key11":"value11","key12":"value12","key13":"value13"}	["a","b","c"]	{"a":"one","b":"two"}
+2	bar line2	{"key21":"value21","key22":"value22","key23":"value23"}	["d","e","f"]	{"a":"three","b":"four"}
+3	baz line3	{"key31":"value31","key32":"value32","key33":"value33"}	["g","h","i"]	{"a":"five","b":"six"}
+PREHOOK: query: INSERT OVERWRITE TABLE parquet_create SELECT * FROM parquet_create_staging
+PREHOOK: type: QUERY
+PREHOOK: Input: default@parquet_create_staging
+PREHOOK: Output: default@parquet_create
+POSTHOOK: query: INSERT OVERWRITE TABLE parquet_create SELECT * FROM parquet_create_staging
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@parquet_create_staging
+POSTHOOK: Output: default@parquet_create
+POSTHOOK: Lineage: parquet_create.id SIMPLE [(parquet_create_staging)parquet_create_staging.FieldSchema(name:id, type:int, comment:null), ]
+POSTHOOK: Lineage: parquet_create.lst SIMPLE [(parquet_create_staging)parquet_create_staging.FieldSchema(name:lst, type:array<string>, comment:null), ]
+POSTHOOK: Lineage: parquet_create.mp SIMPLE [(parquet_create_staging)parquet_create_staging.FieldSchema(name:mp, type:map<string,string>, comment:null), ]
+POSTHOOK: Lineage: parquet_create.str SIMPLE [(parquet_create_staging)parquet_create_staging.FieldSchema(name:str, type:string, comment:null), ]
+POSTHOOK: Lineage: parquet_create.strct SIMPLE [(parquet_create_staging)parquet_create_staging.FieldSchema(name:strct, type:struct<a:string,b:string>, comment:null), ]
+PREHOOK: query: SELECT * FROM parquet_create group by id
+PREHOOK: type: QUERY
+PREHOOK: Input: default@parquet_create
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT * FROM parquet_create group by id
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@parquet_create
+#### A masked pattern was here ####
+POSTHOOK: Lineage: parquet_create.id SIMPLE [(parquet_create_staging)parquet_create_staging.FieldSchema(name:id, type:int, comment:null), ]
+POSTHOOK: Lineage: parquet_create.lst SIMPLE [(parquet_create_staging)parquet_create_staging.FieldSchema(name:lst, type:array<string>, comment:null), ]
+POSTHOOK: Lineage: parquet_create.mp SIMPLE [(parquet_create_staging)parquet_create_staging.FieldSchema(name:mp, type:map<string,string>, comment:null), ]
+POSTHOOK: Lineage: parquet_create.str SIMPLE [(parquet_create_staging)parquet_create_staging.FieldSchema(name:str, type:string, comment:null), ]
+POSTHOOK: Lineage: parquet_create.strct SIMPLE [(parquet_create_staging)parquet_create_staging.FieldSchema(name:strct, type:struct<a:string,b:string>, comment:null), ]
+1
+2
+3
+PREHOOK: query: SELECT id, count(0) FROM parquet_create group by id
+PREHOOK: type: QUERY
+PREHOOK: Input: default@parquet_create
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT id, count(0) FROM parquet_create group by id
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@parquet_create
+#### A masked pattern was here ####
+POSTHOOK: Lineage: parquet_create.id SIMPLE [(parquet_create_staging)parquet_create_staging.FieldSchema(name:id, type:int, comment:null), ]
+POSTHOOK: Lineage: parquet_create.lst SIMPLE [(parquet_create_staging)parquet_create_staging.FieldSchema(name:lst, type:array<string>, comment:null), ]
+POSTHOOK: Lineage: parquet_create.mp SIMPLE [(parquet_create_staging)parquet_create_staging.FieldSchema(name:mp, type:map<string,string>, comment:null), ]
+POSTHOOK: Lineage: parquet_create.str SIMPLE [(parquet_create_staging)parquet_create_staging.FieldSchema(name:str, type:string, comment:null), ]
+POSTHOOK: Lineage: parquet_create.strct SIMPLE [(parquet_create_staging)parquet_create_staging.FieldSchema(name:strct, type:struct<a:string,b:string>, comment:null), ]
+1	1
+2	1
+3	1
+PREHOOK: query: SELECT str from parquet_create
+PREHOOK: type: QUERY
+PREHOOK: Input: default@parquet_create
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT str from parquet_create
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@parquet_create
+#### A masked pattern was here ####
+POSTHOOK: Lineage: parquet_create.id SIMPLE [(parquet_create_staging)parquet_create_staging.FieldSchema(name:id, type:int, comment:null), ]
+POSTHOOK: Lineage: parquet_create.lst SIMPLE [(parquet_create_staging)parquet_create_staging.FieldSchema(name:lst, type:array<string>, comment:null), ]
+POSTHOOK: Lineage: parquet_create.mp SIMPLE [(parquet_create_staging)parquet_create_staging.FieldSchema(name:mp, type:map<string,string>, comment:null), ]
+POSTHOOK: Lineage: parquet_create.str SIMPLE [(parquet_create_staging)parquet_create_staging.FieldSchema(name:str, type:string, comment:null), ]
+POSTHOOK: Lineage: parquet_create.strct SIMPLE [(parquet_create_staging)parquet_create_staging.FieldSchema(name:strct, type:struct<a:string,b:string>, comment:null), ]
+foo line1
+bar line2
+baz line3
+PREHOOK: query: SELECT mp from parquet_create
+PREHOOK: type: QUERY
+PREHOOK: Input: default@parquet_create
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT mp from parquet_create
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@parquet_create
+#### A masked pattern was here ####
+POSTHOOK: Lineage: parquet_create.id SIMPLE [(parquet_create_staging)parquet_create_staging.FieldSchema(name:id, type:int, comment:null), ]
+POSTHOOK: Lineage: parquet_create.lst SIMPLE [(parquet_create_staging)parquet_create_staging.FieldSchema(name:lst, type:array<string>, comment:null), ]
+POSTHOOK: Lineage: parquet_create.mp SIMPLE [(parquet_create_staging)parquet_create_staging.FieldSchema(name:mp, type:map<string,string>, comment:null), ]
+POSTHOOK: Lineage: parquet_create.str SIMPLE [(parquet_create_staging)parquet_create_staging.FieldSchema(name:str, type:string, comment:null), ]
+POSTHOOK: Lineage: parquet_create.strct SIMPLE [(parquet_create_staging)parquet_create_staging.FieldSchema(name:strct, type:struct<a:string,b:string>, comment:null), ]
+{"key12":"value12","key11":"value11","key13":"value13"}
+{"key21":"value21","key23":"value23","key22":"value22"}
+{"key33":"value33","key31":"value31","key32":"value32"}
+PREHOOK: query: SELECT lst from parquet_create
+PREHOOK: type: QUERY
+PREHOOK: Input: default@parquet_create
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT lst from parquet_create
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@parquet_create
+#### A masked pattern was here ####
+POSTHOOK: Lineage: parquet_create.id SIMPLE [(parquet_create_staging)parquet_create_staging.FieldSchema(name:id, type:int, comment:null), ]
+POSTHOOK: Lineage: parquet_create.lst SIMPLE [(parquet_create_staging)parquet_create_staging.FieldSchema(name:lst, type:array<string>, comment:null), ]
+POSTHOOK: Lineage: parquet_create.mp SIMPLE [(parquet_create_staging)parquet_create_staging.FieldSchema(name:mp, type:map<string,string>, comment:null), ]
+POSTHOOK: Lineage: parquet_create.str SIMPLE [(parquet_create_staging)parquet_create_staging.FieldSchema(name:str, type:string, comment:null), ]
+POSTHOOK: Lineage: parquet_create.strct SIMPLE [(parquet_create_staging)parquet_create_staging.FieldSchema(name:strct, type:struct<a:string,b:string>, comment:null), ]
+["a","b","c"]
+["d","e","f"]
+["g","h","i"]
+PREHOOK: query: SELECT strct from parquet_create
+PREHOOK: type: QUERY
+PREHOOK: Input: default@parquet_create
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT strct from parquet_create
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@parquet_create
+#### A masked pattern was here ####
+POSTHOOK: Lineage: parquet_create.id SIMPLE [(parquet_create_staging)parquet_create_staging.FieldSchema(name:id, type:int, comment:null), ]
+POSTHOOK: Lineage: parquet_create.lst SIMPLE [(parquet_create_staging)parquet_create_staging.FieldSchema(name:lst, type:array<string>, comment:null), ]
+POSTHOOK: Lineage: parquet_create.mp SIMPLE [(parquet_create_staging)parquet_create_staging.FieldSchema(name:mp, type:map<string,string>, comment:null), ]
+POSTHOOK: Lineage: parquet_create.str SIMPLE [(parquet_create_staging)parquet_create_staging.FieldSchema(name:str, type:string, comment:null), ]
+POSTHOOK: Lineage: parquet_create.strct SIMPLE [(parquet_create_staging)parquet_create_staging.FieldSchema(name:strct, type:struct<a:string,b:string>, comment:null), ]
+{"a":"one","b":"two"}
+{"a":"three","b":"four"}
+{"a":"five","b":"six"}
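
Note (not part of the patch): a minimal HiveQL sketch of what the new STORED AS PARQUET clause resolves to, using the SerDe and input/output format classes shown in the DESCRIBE FORMATTED output above; the table name parquet_explicit and its columns are illustrative only, and this assumes a Hive build with this patch applied.

    -- Explicit equivalent of STORED AS PARQUET (illustrative sketch)
    CREATE TABLE parquet_explicit (
      id int,
      str string
    )
    ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
    STORED AS
      INPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat'
      OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat';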