diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/ColumnarSplitSizeEstimator.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/ColumnarSplitSizeEstimator.java new file mode 100644 index 0000000..bf830eb --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/ColumnarSplitSizeEstimator.java @@ -0,0 +1,59 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + *
+ * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.ql.exec.tez; + +import java.io.IOException; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.hive.ql.io.ColumnarSplit; +import org.apache.hadoop.hive.ql.io.HiveInputFormat; +import org.apache.hadoop.mapred.InputSplit; +import org.apache.hadoop.mapred.split.SplitSizeEstimator; + +/** + * Split size estimator for columnar file formats. + */ +public class ColumnarSplitSizeEstimator implements SplitSizeEstimator { + private static final Log LOG = LogFactory.getLog(ColumnarSplitSizeEstimator.class); + private static final boolean isDebugEnabled = LOG.isDebugEnabled(); + + @Override + public long getEstimatedSize(InputSplit inputSplit) throws IOException { + long colProjSize = inputSplit.getLength(); + + if (inputSplit instanceof ColumnarSplit) { + colProjSize = ((ColumnarSplit) inputSplit).getColumnarProjectionSize(); + if (isDebugEnabled) { + LOG.debug("Estimated column projection size: " + colProjSize); + } + return colProjSize; + } else if (inputSplit instanceof HiveInputFormat.HiveInputSplit) { + InputSplit innerSplit = ((HiveInputFormat.HiveInputSplit) inputSplit).getInputSplit(); + + if (innerSplit instanceof ColumnarSplit) { + colProjSize = ((ColumnarSplit) innerSplit).getColumnarProjectionSize(); + if (isDebugEnabled) { + LOG.debug("Estimated column projection size: " + colProjSize); + } + return colProjSize; + } + } + return colProjSize; + } +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/SplitGrouper.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/SplitGrouper.java index c169677..1e8384c 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/SplitGrouper.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/SplitGrouper.java @@ -88,7 +88,7 @@ InputSplit[] rawSplits = inputSplitCollection.toArray(new InputSplit[0]); InputSplit[] groupedSplits = tezGrouper.getGroupedSplits(conf, rawSplits, bucketTaskMap.get(bucketId), - HiveInputFormat.class.getName()); + HiveInputFormat.class.getName(), new ColumnarSplitSizeEstimator()); LOG.info("Original split size is " + rawSplits.length + " grouped split size is " + groupedSplits.length + ", for bucket: " + bucketId); diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/ColumnarSplit.java b/ql/src/java/org/apache/hadoop/hive/ql/io/ColumnarSplit.java new file mode 100644 index 0000000..ed8fc35 --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/ColumnarSplit.java @@ -0,0 +1,33 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.ql.io; + +/** + * Interface when implemented should return the estimated size of columnar projections + * that will be read from the split. This information will be used by split grouper for better + * grouping based on the actual data read instead of the complete split length. + */ +public interface ColumnarSplit { + + /** + * Return the estimation size of the column projections that will be read from this split. + * + * @return - estimated column projection size that will be read in bytes + */ + long getColumnarProjectionSize(); +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java index 3ae4688..9f135f1 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java @@ -1029,7 +1029,7 @@ private boolean isStripeSatisfyPredicate(StripeStatistics stripeStatistics, if (isDebugEnabled) { for (OrcSplit split : splits) { LOG.debug(split + " projected_columns_uncompressed_size: " - + split.getProjectedColumnsUncompressedSize()); + + split.getColumnarProjectionSize()); } } return splits; diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcSplit.java b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcSplit.java index 0c7dd40..b56731a 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcSplit.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcSplit.java @@ -26,6 +26,7 @@ import java.util.List; import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.ql.io.ColumnarSplit; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.WritableUtils; import org.apache.hadoop.mapred.FileSplit; @@ -36,7 +37,7 @@ * OrcFileSplit. Holds file meta info * */ -public class OrcSplit extends FileSplit { +public class OrcSplit extends FileSplit implements ColumnarSplit { private ReaderImpl.FileMetaInfo fileMetaInfo; private boolean hasFooter; private boolean isOriginal; @@ -65,7 +66,7 @@ public OrcSplit(Path path, long offset, long length, String[] hosts, this.isOriginal = isOriginal; this.hasBase = hasBase; this.deltas.addAll(deltas); - this.projColsUncompressedSize = projectedDataSize; + this.projColsUncompressedSize = projectedDataSize <= 0 ? length : projectedDataSize; } @Override @@ -152,7 +153,8 @@ public boolean hasBase() { return deltas; } - public long getProjectedColumnsUncompressedSize() { + @Override + public long getColumnarProjectionSize() { return projColsUncompressedSize; } }