diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/ColumnarSplitSizeEstimator.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/ColumnarSplitSizeEstimator.java new file mode 100644 index 0000000..bf830eb --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/ColumnarSplitSizeEstimator.java @@ -0,0 +1,59 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + *
+ * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.ql.exec.tez; + +import java.io.IOException; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.hive.ql.io.ColumnarSplit; +import org.apache.hadoop.hive.ql.io.HiveInputFormat; +import org.apache.hadoop.mapred.InputSplit; +import org.apache.hadoop.mapred.split.SplitSizeEstimator; + +/** + * Split size estimator for columnar file formats. + */ +public class ColumnarSplitSizeEstimator implements SplitSizeEstimator { + private static final Log LOG = LogFactory.getLog(ColumnarSplitSizeEstimator.class); + private static final boolean isDebugEnabled = LOG.isDebugEnabled(); + + @Override + public long getEstimatedSize(InputSplit inputSplit) throws IOException { + long colProjSize = inputSplit.getLength(); + + if (inputSplit instanceof ColumnarSplit) { + colProjSize = ((ColumnarSplit) inputSplit).getColumnarProjectionSize(); + if (isDebugEnabled) { + LOG.debug("Estimated column projection size: " + colProjSize); + } + return colProjSize; + } else if (inputSplit instanceof HiveInputFormat.HiveInputSplit) { + InputSplit innerSplit = ((HiveInputFormat.HiveInputSplit) inputSplit).getInputSplit(); + + if (innerSplit instanceof ColumnarSplit) { + colProjSize = ((ColumnarSplit) innerSplit).getColumnarProjectionSize(); + if (isDebugEnabled) { + LOG.debug("Estimated column projection size: " + colProjSize); + } + return colProjSize; + } + } + return colProjSize; + } +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/SplitGrouper.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/SplitGrouper.java index c12da37..f95aabf 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/SplitGrouper.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/SplitGrouper.java @@ -90,7 +90,7 @@ InputSplit[] rawSplits = inputSplitCollection.toArray(new InputSplit[0]); InputSplit[] groupedSplits = tezGrouper.getGroupedSplits(conf, rawSplits, bucketTaskMap.get(bucketId), - HiveInputFormat.class.getName()); + HiveInputFormat.class.getName(), new ColumnarSplitSizeEstimator()); LOG.info("Original split size is " + rawSplits.length + " grouped split size is " + groupedSplits.length + ", for bucket: " + bucketId); diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/ColumnarSplit.java b/ql/src/java/org/apache/hadoop/hive/ql/io/ColumnarSplit.java new file mode 100644 index 0000000..ed8fc35 --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/ColumnarSplit.java @@ -0,0 +1,33 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.ql.io; + +/** + * Interface when implemented should return the estimated size of columnar projections + * that will be read from the split. This information will be used by split grouper for better + * grouping based on the actual data read instead of the complete split length. + */ +public interface ColumnarSplit { + + /** + * Return the estimation size of the column projections that will be read from this split. + * + * @return - estimated column projection size that will be read in bytes + */ + long getColumnarProjectionSize(); +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java index 2548106..340f55c 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java @@ -1053,7 +1053,7 @@ private boolean isStripeSatisfyPredicate(StripeStatistics stripeStatistics, if (isDebugEnabled) { for (OrcSplit split : splits) { LOG.debug(split + " projected_columns_uncompressed_size: " - + split.getProjectedColumnsUncompressedSize()); + + split.getColumnarProjectionSize()); } } return splits; diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcSplit.java b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcSplit.java index 1263346..fa78703 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcSplit.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcSplit.java @@ -28,6 +28,7 @@ import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.ql.io.ColumnarSplit; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.WritableUtils; import org.apache.hadoop.mapred.FileSplit; @@ -38,7 +39,7 @@ * OrcFileSplit. Holds file meta info * */ -public class OrcSplit extends FileSplit { +public class OrcSplit extends FileSplit implements ColumnarSplit { private static final Log LOG = LogFactory.getLog(OrcSplit.class); private FileMetaInfo fileMetaInfo; @@ -74,7 +75,7 @@ public OrcSplit(Path path, Long fileId, long offset, long length, String[] hosts this.isOriginal = isOriginal; this.hasBase = hasBase; this.deltas.addAll(deltas); - this.projColsUncompressedSize = projectedDataSize; + this.projColsUncompressedSize = projectedDataSize <= 0 ? length : projectedDataSize; } @Override @@ -173,7 +174,8 @@ public Long getFileId() { return fileId; } - public long getProjectedColumnsUncompressedSize() { + @Override + public long getColumnarProjectionSize() { return projColsUncompressedSize; }