diff --git pom.xml pom.xml
index acacf81..c02e410 100644
--- pom.xml
+++ pom.xml
@@ -154,7 +154,7 @@
1.0.1
1.7.5
4.0.4
- 0.5.2
+ 0.7.0-SNAPSHOT
2.2.0
1.3.0
2.10
diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/tez/ColumnarSplitSizeEstimator.java ql/src/java/org/apache/hadoop/hive/ql/exec/tez/ColumnarSplitSizeEstimator.java
new file mode 100644
index 0000000..3328631
--- /dev/null
+++ ql/src/java/org/apache/hadoop/hive/ql/exec/tez/ColumnarSplitSizeEstimator.java
@@ -0,0 +1,61 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.ql.exec.tez;
+
+import java.io.IOException;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.hive.ql.io.ColumnarSplit;
+import org.apache.hadoop.hive.ql.io.CombineHiveInputFormat.AvoidSplitCombination;
+import org.apache.hadoop.hive.ql.io.HiveInputFormat;
+import org.apache.hadoop.mapred.InputSplit;
+import org.apache.hadoop.mapred.split.SplitSizeEstimator;
+
+/**
+ * Split size estimator for columnar file formats.
+ */
+public class ColumnarSplitSizeEstimator implements SplitSizeEstimator {
+ private static final Log LOG = LogFactory.getLog(ColumnarSplitSizeEstimator.class);
+ private static final boolean isDebugEnabled = LOG.isDebugEnabled();
+
+ @Override
+ public long getEstimatedSize(InputSplit inputSplit) throws IOException {
+ final long splitSize = inputSplit.getLength();
+ final long colProjSize;
+
+ if (inputSplit instanceof HiveInputFormat.HiveInputSplit) {
+ inputSplit = ((HiveInputFormat.HiveInputSplit)inputSplit).getInputSplit();
+ }
+
+ if (inputSplit instanceof ColumnarSplit) {
+ colProjSize = ((ColumnarSplit) inputSplit).getColumnarProjectionSize();
+ if (isDebugEnabled) {
+ LOG.debug("Estimated column projection size: " + colProjSize);
+ }
+ if (colProjSize <= 0) {
+ /* columnar splits of unknown size - estimate worst-case */
+ return Integer.MAX_VALUE;
+ }
+ } else {
+ colProjSize = splitSize;
+ }
+
+ return colProjSize;
+ }
+}
diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/tez/SplitGrouper.java ql/src/java/org/apache/hadoop/hive/ql/exec/tez/SplitGrouper.java
index c169677..1e8384c 100644
--- ql/src/java/org/apache/hadoop/hive/ql/exec/tez/SplitGrouper.java
+++ ql/src/java/org/apache/hadoop/hive/ql/exec/tez/SplitGrouper.java
@@ -88,7 +88,7 @@
InputSplit[] rawSplits = inputSplitCollection.toArray(new InputSplit[0]);
InputSplit[] groupedSplits =
tezGrouper.getGroupedSplits(conf, rawSplits, bucketTaskMap.get(bucketId),
- HiveInputFormat.class.getName());
+ HiveInputFormat.class.getName(), new ColumnarSplitSizeEstimator());
LOG.info("Original split size is " + rawSplits.length + " grouped split size is "
+ groupedSplits.length + ", for bucket: " + bucketId);
diff --git ql/src/java/org/apache/hadoop/hive/ql/io/ColumnarSplit.java ql/src/java/org/apache/hadoop/hive/ql/io/ColumnarSplit.java
new file mode 100644
index 0000000..ed8fc35
--- /dev/null
+++ ql/src/java/org/apache/hadoop/hive/ql/io/ColumnarSplit.java
@@ -0,0 +1,33 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.ql.io;
+
+/**
+ * Interface when implemented should return the estimated size of columnar projections
+ * that will be read from the split. This information will be used by split grouper for better
+ * grouping based on the actual data read instead of the complete split length.
+ */
+public interface ColumnarSplit {
+
+ /**
+ * Return the estimation size of the column projections that will be read from this split.
+ *
+ * @return - estimated column projection size that will be read in bytes
+ */
+ long getColumnarProjectionSize();
+}
diff --git ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java
index 3ae4688..9f135f1 100644
--- ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java
+++ ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java
@@ -1029,7 +1029,7 @@ private boolean isStripeSatisfyPredicate(StripeStatistics stripeStatistics,
if (isDebugEnabled) {
for (OrcSplit split : splits) {
LOG.debug(split + " projected_columns_uncompressed_size: "
- + split.getProjectedColumnsUncompressedSize());
+ + split.getColumnarProjectionSize());
}
}
return splits;
diff --git ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcSplit.java ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcSplit.java
index 0c7dd40..b56731a 100644
--- ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcSplit.java
+++ ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcSplit.java
@@ -26,6 +26,7 @@
import java.util.List;
import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.ql.io.ColumnarSplit;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableUtils;
import org.apache.hadoop.mapred.FileSplit;
@@ -36,7 +37,7 @@
* OrcFileSplit. Holds file meta info
*
*/
-public class OrcSplit extends FileSplit {
+public class OrcSplit extends FileSplit implements ColumnarSplit {
private ReaderImpl.FileMetaInfo fileMetaInfo;
private boolean hasFooter;
private boolean isOriginal;
@@ -65,7 +66,7 @@ public OrcSplit(Path path, long offset, long length, String[] hosts,
this.isOriginal = isOriginal;
this.hasBase = hasBase;
this.deltas.addAll(deltas);
- this.projColsUncompressedSize = projectedDataSize;
+ this.projColsUncompressedSize = projectedDataSize <= 0 ? length : projectedDataSize;
}
@Override
@@ -152,7 +153,8 @@ public boolean hasBase() {
return deltas;
}
- public long getProjectedColumnsUncompressedSize() {
+ @Override
+ public long getColumnarProjectionSize() {
return projColsUncompressedSize;
}
}