diff --git common/src/java/org/apache/hadoop/hive/ql/log/PerfLogger.java common/src/java/org/apache/hadoop/hive/ql/log/PerfLogger.java index 2707987..f1181fd 100644 --- common/src/java/org/apache/hadoop/hive/ql/log/PerfLogger.java +++ common/src/java/org/apache/hadoop/hive/ql/log/PerfLogger.java @@ -218,11 +218,11 @@ } - public ImmutableMap getStartTimes() { + public Map getStartTimes() { return ImmutableMap.copyOf(startTimes); } - public ImmutableMap getEndTimes() { + public Map getEndTimes() { return ImmutableMap.copyOf(endTimes); } diff --git itests/hive-blobstore/pom.xml itests/hive-blobstore/pom.xml index efc6b37..09955c5 100644 --- itests/hive-blobstore/pom.xml +++ itests/hive-blobstore/pom.xml @@ -49,10 +49,6 @@ protobuf-java - org.apache.calcite - calcite-core - - org.apache.hive hive-common test @@ -64,6 +60,11 @@ org.apache.hive + hive-exec + test + + + org.apache.hive hive-standalone-metastore-common test @@ -94,6 +95,12 @@ org.apache.hive hive-it-util test + + + org.apache.calcite + calcite-core + + org.apache.hive @@ -101,11 +108,6 @@ test - org.apache.hive - hive-exec - test - - org.apache.hadoop hadoop-common test diff --git itests/hive-minikdc/pom.xml itests/hive-minikdc/pom.xml index f1328aa..22cf244 100644 --- itests/hive-minikdc/pom.xml +++ itests/hive-minikdc/pom.xml @@ -42,14 +42,6 @@ protobuf-java - org.apache.calcite - calcite-core - - - org.apache.calcite - calcite-linq4j - - org.apache.hive hive-common test diff --git itests/hive-unit/pom.xml itests/hive-unit/pom.xml index bc20cd6..103975f 100644 --- itests/hive-unit/pom.xml +++ itests/hive-unit/pom.xml @@ -40,6 +40,15 @@ org.apache.hive + hive-exec + + + org.apache.hive + hive-exec + tests + + + org.apache.hive hive-jdbc @@ -52,10 +61,6 @@ org.apache.hive - hive-exec - - - org.apache.hive hive-llap-server @@ -175,11 +180,6 @@ org.apache.hive - hive-exec - tests - - - org.apache.hive hive-common tests test diff --git itests/qtest-accumulo/pom.xml itests/qtest-accumulo/pom.xml index b0373d5..a35d2a8 100644 --- itests/qtest-accumulo/pom.xml +++ itests/qtest-accumulo/pom.xml @@ -56,12 +56,18 @@ org.apache.hive hive-contrib test - - - org.apache.hive - hive-exec - - + + + org.apache.hive + hive-exec + test + core + + + org.apache.hive + hive-exec + test + tests org.apache.hive @@ -96,8 +102,8 @@ test - org.apache.hive - hive-exec + org.apache.calcite + calcite-core org.apache.hive @@ -115,24 +121,6 @@ hive-udf test - - org.apache.hive - hive-exec - test - core - - - org.apache.hive - hive-exec - test - tests - - - org.apache.hive - hive-exec - tests - test - junit diff --git itests/qtest-kudu/pom.xml itests/qtest-kudu/pom.xml index 132d22c..f23399f 100644 --- itests/qtest-kudu/pom.xml +++ itests/qtest-kudu/pom.xml @@ -45,12 +45,18 @@ org.apache.hive hive-contrib test - - - org.apache.hive - hive-exec - - + + + org.apache.hive + hive-exec + test + core + + + org.apache.hive + hive-exec + test + tests org.apache.hive @@ -85,8 +91,8 @@ test - org.apache.hive - hive-exec + org.apache.calcite + calcite-core org.apache.hive @@ -104,18 +110,6 @@ hive-udf test - - org.apache.hive - hive-exec - test - core - - - org.apache.hive - hive-exec - test - tests - junit diff --git itests/qtest-spark/pom.xml itests/qtest-spark/pom.xml index b6bbeef..60d032d 100644 --- itests/qtest-spark/pom.xml +++ itests/qtest-spark/pom.xml @@ -119,6 +119,11 @@ org.apache.hive + hive-exec + test + + + org.apache.hive hive-standalone-metastore-common test @@ -149,6 +154,10 @@ hive-it-util + org.apache.calcite + calcite-core + + org.apache.hive hive-it-druid @@ -165,11 +174,6 @@ hive-udf test - - org.apache.hive - hive-exec - test - diff --git itests/src/test/resources/testconfiguration.properties itests/src/test/resources/testconfiguration.properties index 9d27b8d..3f6d42c 100644 --- itests/src/test/resources/testconfiguration.properties +++ itests/src/test/resources/testconfiguration.properties @@ -817,6 +817,8 @@ schq_materialized.q,\ schq_analyze.q,\ schq_ingest.q,\ + sketches_hll.q,\ + sketches_theta.q,\ table_access_keys_stats.q,\ temp_table_llap_partitioned.q,\ tez_bmj_schema_evolution.q,\ diff --git pom.xml pom.xml index d804a19..579e745 100644 --- pom.xml +++ pom.xml @@ -225,6 +225,10 @@ 2.2.4 1.2 2.0.1 + 2.4.0 + 3.0.11 + 1.23 + 1.0.0-incubating @@ -1007,6 +1011,26 @@ re2j ${re2j.version} + + com.jayway.jsonpath + json-path + ${json-path.version} + + + org.codehaus.janino + commons-compiler + ${janino.version} + + + org.codehaus.janino + janino + ${janino.version} + + + org.yaml + snakeyaml + ${snakeyaml.version} + diff --git ql/pom.xml ql/pom.xml index d5c83b6..c15228a 100644 --- ql/pom.xml +++ ql/pom.xml @@ -313,6 +313,11 @@ test + org.apache.datasketches + datasketches-hive + ${datasketches.version} + + com.lmax disruptor ${disruptor.version} @@ -778,6 +783,22 @@ + + com.jayway.jsonpath + json-path + + + org.codehaus.janino + commons-compiler + + + org.codehaus.janino + janino + + + org.yaml + snakeyaml + @@ -991,6 +1012,9 @@ io.dropwizard.metrics:metrics-jvm io.dropwizard.metrics:metrics-json com.zaxxer:HikariCP + org.apache.calcite:* + org.apache.calcite.avatica:avatica + org.apache.datasketches:* @@ -1015,8 +1039,16 @@ org.apache.hive.com.zaxxer.hikari - com.google.guava - org.apache.hive.com.google.guava + com.google.common + org.apache.hive.com.google.common + + + com.google.thirdparty.publicsuffix + org.apache.hive.com.google.thirdparty.publicsuffix + + + org.apache.datasketches + org.apache.hive.org.apache.datasketches diff --git ql/src/java/org/apache/hadoop/hive/ql/QueryDisplay.java ql/src/java/org/apache/hadoop/hive/ql/QueryDisplay.java index ddeb954..1aa5be3 100644 --- ql/src/java/org/apache/hadoop/hive/ql/QueryDisplay.java +++ ql/src/java/org/apache/hadoop/hive/ql/QueryDisplay.java @@ -17,7 +17,6 @@ */ package org.apache.hadoop.hive.ql; -import com.google.common.collect.ImmutableMap; import org.apache.hadoop.hive.common.LogUtils; import org.apache.hadoop.hive.ql.exec.Task; import org.apache.hadoop.hive.ql.exec.TaskResult; @@ -303,7 +302,7 @@ * @param phase phase of query * @param hmsTimings map of HMS Client method-calls and duration in miliseconds, during given phase. */ - public synchronized void setHmsTimings(Phase phase, ImmutableMap hmsTimings) { + public synchronized void setHmsTimings(Phase phase, Map hmsTimings) { hmsTimingMap.put(phase, hmsTimings); } @@ -319,7 +318,7 @@ * @param phase phase of query * @param perfLogStarts map of PerfLogger call-trace name and start time in miliseconds, during given phase. */ - public synchronized void setPerfLogStarts(Phase phase, ImmutableMap perfLogStarts) { + public synchronized void setPerfLogStarts(Phase phase, Map perfLogStarts) { perfLogStartMap.put(phase, perfLogStarts); } @@ -335,7 +334,7 @@ * @param phase phase of query * @param perfLogEnds map of PerfLogger call-trace name and end time in miliseconds, during given phase. */ - public synchronized void setPerfLogEnds(Phase phase, ImmutableMap perfLogEnds) { + public synchronized void setPerfLogEnds(Phase phase, Map perfLogEnds) { perfLogEndMap.put(phase, perfLogEnds); } diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/DataSketchesFunctions.java ql/src/java/org/apache/hadoop/hive/ql/exec/DataSketchesFunctions.java new file mode 100644 index 0000000..b9d265f --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/exec/DataSketchesFunctions.java @@ -0,0 +1,221 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec; + +import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFResolver2; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF; + +/** + * Registers functions from the DataSketches library as builtin functions. + * + * In an effort to show a more consistent + */ +public class DataSketchesFunctions { + + private static final String DATA_TO_SKETCH = "sketch"; + private static final String SKETCH_TO_ESTIMATE_WITH_ERROR_BOUNDS = "estimate_bounds"; + private static final String SKETCH_TO_ESTIMATE = "estimate"; + private static final String SKETCH_TO_STRING = "stringify"; + private static final String UNION_SKETCH = "union"; + private static final String UNION_SKETCH1 = "union_f"; + private static final String GET_N = "n"; + private static final String GET_CDF = "cdf"; + private static final String GET_PMF = "pmf"; + private static final String GET_QUANTILES = "quantiles"; + private static final String GET_QUANTILE = "quantile"; + private static final String GET_RANK = "rank"; + private static final String INTERSECT_SKETCH = "intersect"; + private static final String INTERSECT_SKETCH1 = "intersect_f"; + private static final String EXCLUDE_SKETCH = "exclude"; + private static final String GET_K = "k"; + private static final String GET_FREQUENT_ITEMS = "frequent_items"; + private static final String T_TEST = "ttest"; + private static final String SKETCH_TO_MEANS = "means"; + private static final String SKETCH_TO_NUMBER_OF_RETAINED_ENTRIES = "n_retained"; + private static final String SKETCH_TO_QUANTILES_SKETCH = "quantiles_sketch"; + private static final String SKETCH_TO_VALUES = "values"; + private static final String SKETCH_TO_VARIANCES = "variances"; + private static final String SKETCH_TO_PERCENTILE = "percentile"; + + private final Registry system; + + public DataSketchesFunctions(Registry system) { + this.system = system; + } + + public static void register(Registry system) { + DataSketchesFunctions dsf = new DataSketchesFunctions(system); + String prefix = "ds"; + dsf.registerHll(prefix); + dsf.registerCpc(prefix); + dsf.registerKll(prefix); + dsf.registerTheta(prefix); + dsf.registerTuple(prefix); + dsf.registerQuantiles(prefix); + dsf.registerFrequencies(prefix); + } + + private void registerHll(String prefix) { + String p = prefix + "_hll_"; + registerUDAF(org.apache.datasketches.hive.hll.DataToSketchUDAF.class, p + DATA_TO_SKETCH); + registerUDF(org.apache.datasketches.hive.hll.SketchToEstimateAndErrorBoundsUDF.class, + p + SKETCH_TO_ESTIMATE_WITH_ERROR_BOUNDS); + registerUDF(org.apache.datasketches.hive.hll.SketchToEstimateUDF.class, p + SKETCH_TO_ESTIMATE); + registerUDF(org.apache.datasketches.hive.hll.SketchToStringUDF.class, p + SKETCH_TO_STRING); + registerUDF(org.apache.datasketches.hive.hll.UnionSketchUDF.class, p + UNION_SKETCH1); + registerUDAF(org.apache.datasketches.hive.hll.UnionSketchUDAF.class, p + UNION_SKETCH); + } + + private void registerCpc(String prefix) { + String p = prefix + "_cpc_"; + registerUDAF(org.apache.datasketches.hive.cpc.DataToSketchUDAF.class, p + DATA_TO_SKETCH); + // FIXME: normalize GetEstimateAndErrorBoundsUDF vs SketchToEstimateAndErrorBoundsUDF + registerUDF(org.apache.datasketches.hive.cpc.GetEstimateAndErrorBoundsUDF.class, + p + SKETCH_TO_ESTIMATE_WITH_ERROR_BOUNDS); + // FIXME: normalize GetEstimateUDF vs SketchToEstimateUDF + registerUDF(org.apache.datasketches.hive.cpc.GetEstimateUDF.class, p + SKETCH_TO_ESTIMATE); + registerUDF(org.apache.datasketches.hive.cpc.SketchToStringUDF.class, p + SKETCH_TO_STRING); + registerUDF(org.apache.datasketches.hive.cpc.UnionSketchUDF.class, p + UNION_SKETCH1); + registerUDAF(org.apache.datasketches.hive.cpc.UnionSketchUDAF.class, p + UNION_SKETCH); + } + + private void registerKll(String prefix) { + String p = prefix + "_kll_"; + registerUDAF(org.apache.datasketches.hive.kll.DataToSketchUDAF.class, p + DATA_TO_SKETCH); + registerUDF(org.apache.datasketches.hive.kll.SketchToStringUDF.class, p + SKETCH_TO_STRING); + // registerUDF(org.apache.datasketches.hive.kll.UnionSketchUDF.class, p + UNION_SKETCH); + registerUDAF(org.apache.datasketches.hive.kll.UnionSketchUDAF.class, p + UNION_SKETCH); + + registerUDF(org.apache.datasketches.hive.kll.GetNUDF.class, p + GET_N); + registerUDF(org.apache.datasketches.hive.kll.GetCdfUDF.class, p + GET_CDF); + registerUDF(org.apache.datasketches.hive.kll.GetPmfUDF.class, p + GET_PMF); + registerUDF(org.apache.datasketches.hive.kll.GetQuantilesUDF.class, p + GET_QUANTILES); + registerUDF(org.apache.datasketches.hive.kll.GetQuantileUDF.class, p + GET_QUANTILE); + registerUDF(org.apache.datasketches.hive.kll.GetRankUDF.class, p + GET_RANK); + } + + private void registerTheta(String prefix) { + String p = prefix + "_theta_"; + registerUDAF(org.apache.datasketches.hive.theta.DataToSketchUDAF.class, p + DATA_TO_SKETCH); + // FIXME: missing? + //registerUDF(org.apache.datasketches.hive.theta.SketchToStringUDF.class, p + SKETCH_TO_STRING); + registerUDF(org.apache.datasketches.hive.theta.UnionSketchUDF.class, p + UNION_SKETCH1); + registerUDAF(org.apache.datasketches.hive.theta.UnionSketchUDAF.class, p + UNION_SKETCH); + registerUDF(org.apache.datasketches.hive.theta.IntersectSketchUDF.class, p + INTERSECT_SKETCH1); + registerUDAF(org.apache.datasketches.hive.theta.IntersectSketchUDAF.class, p + INTERSECT_SKETCH); + registerUDF(org.apache.datasketches.hive.theta.EstimateSketchUDF.class, p + SKETCH_TO_ESTIMATE); + registerUDF(org.apache.datasketches.hive.theta.ExcludeSketchUDF.class, p + EXCLUDE_SKETCH); + + } + + private void registerTuple(String prefix) { + registerTupleArrayOfDoubles(prefix + "_tuple_arrayofdouble"); + registerTupleDoubleSummary(prefix + "_tuple_doublesummary"); + } + + private void registerTupleArrayOfDoubles(String string) { + String p = string + "_"; + registerUDAF(org.apache.datasketches.hive.tuple.DataToArrayOfDoublesSketchUDAF.class, p + DATA_TO_SKETCH); + // FIXME: missing? + //registerUDF(org.apache.datasketches.hive.theta.SketchToStringUDF.class, p + SKETCH_TO_STRING); + registerUDAF(org.apache.datasketches.hive.tuple.UnionArrayOfDoublesSketchUDAF.class, p + UNION_SKETCH); + registerUDF(org.apache.datasketches.hive.tuple.ArrayOfDoublesSketchesTTestUDF.class, p + T_TEST); + registerUDF(org.apache.datasketches.hive.tuple.ArrayOfDoublesSketchToEstimatesUDF.class, p + SKETCH_TO_ESTIMATE); + registerUDF(org.apache.datasketches.hive.tuple.ArrayOfDoublesSketchToEstimateAndErrorBoundsUDF.class, + p + SKETCH_TO_ESTIMATE_WITH_ERROR_BOUNDS); + registerUDF(org.apache.datasketches.hive.tuple.ArrayOfDoublesSketchToMeansUDF.class, p + SKETCH_TO_MEANS); + registerUDF(org.apache.datasketches.hive.tuple.ArrayOfDoublesSketchToNumberOfRetainedEntriesUDF.class, + p + SKETCH_TO_NUMBER_OF_RETAINED_ENTRIES); + registerUDF(org.apache.datasketches.hive.tuple.ArrayOfDoublesSketchToQuantilesSketchUDF.class, + p + SKETCH_TO_QUANTILES_SKETCH); + registerUDTF(org.apache.datasketches.hive.tuple.ArrayOfDoublesSketchToValuesUDTF.class, p + SKETCH_TO_VALUES); + registerUDF(org.apache.datasketches.hive.tuple.ArrayOfDoublesSketchToVariancesUDF.class, p + SKETCH_TO_VARIANCES); + } + + private void registerTupleDoubleSummary(String string) { + String p = string + "_"; + registerUDAF(org.apache.datasketches.hive.tuple.DataToDoubleSummarySketchUDAF.class, p + DATA_TO_SKETCH); + // FIXME: missing? + //registerUDF(org.apache.datasketches.hive.theta.SketchToStringUDF.class, p + SKETCH_TO_STRING); + registerUDAF(org.apache.datasketches.hive.tuple.UnionDoubleSummarySketchUDAF.class, p + UNION_SKETCH); + registerUDF(org.apache.datasketches.hive.tuple.DoubleSummarySketchToEstimatesUDF.class, p + SKETCH_TO_ESTIMATE); + registerUDF(org.apache.datasketches.hive.tuple.DoubleSummarySketchToPercentileUDF.class, p + SKETCH_TO_PERCENTILE); + } + + private void registerQuantiles(String prefix) { + registerQuantilesString(prefix + "_quantile"); + registerQuantilesDoubles(prefix + "_quantile"); + } + + private void registerFrequencies(String prefix) { + String p = prefix + "_freq_"; + registerUDAF(org.apache.datasketches.hive.frequencies.DataToStringsSketchUDAF.class, p + DATA_TO_SKETCH); + // FIXME: missing? + //registerUDF(org.apache.datasketches.hive.frequencies.DoublesSketchToStringUDF.class, p + SKETCH_TO_STRING); + //registerUDF(org.apache.datasketches.hive.quantiles.UnionItemsSketchUDAF.class, p + UNION_SKETCH); + registerUDAF(org.apache.datasketches.hive.frequencies.UnionStringsSketchUDAF.class, p + UNION_SKETCH); + registerUDTF(org.apache.datasketches.hive.frequencies.GetFrequentItemsFromStringsSketchUDTF.class, + p + GET_FREQUENT_ITEMS); + } + + private void registerQuantilesString(String prefix) { + String p = prefix + "_strings_"; + registerUDAF(org.apache.datasketches.hive.quantiles.DataToStringsSketchUDAF.class, p + DATA_TO_SKETCH); + registerUDF(org.apache.datasketches.hive.quantiles.StringsSketchToStringUDF.class, p + SKETCH_TO_STRING); + //registerUDF(org.apache.datasketches.hive.quantiles.UnionItemsSketchUDAF.class, p + UNION_SKETCH); + registerUDAF(org.apache.datasketches.hive.quantiles.UnionStringsSketchUDAF.class, p + UNION_SKETCH); + registerUDF(org.apache.datasketches.hive.quantiles.GetNFromStringsSketchUDF.class, p + GET_N); + registerUDF(org.apache.datasketches.hive.quantiles.GetKFromStringsSketchUDF.class, p + GET_K); + registerUDF(org.apache.datasketches.hive.quantiles.GetCdfFromStringsSketchUDF.class, p + GET_CDF); + registerUDF(org.apache.datasketches.hive.quantiles.GetPmfFromStringsSketchUDF.class, p + GET_PMF); + registerUDF(org.apache.datasketches.hive.quantiles.GetQuantileFromStringsSketchUDF.class, p + GET_QUANTILE); + registerUDF(org.apache.datasketches.hive.quantiles.GetQuantilesFromStringsSketchUDF.class, p + GET_QUANTILES); + } + + private void registerQuantilesDoubles(String prefix) { + String p = prefix + "_doubles_"; + registerUDAF(org.apache.datasketches.hive.quantiles.DataToDoublesSketchUDAF.class, p + DATA_TO_SKETCH); + registerUDF(org.apache.datasketches.hive.quantiles.DoublesSketchToStringUDF.class, p + SKETCH_TO_STRING); + //registerUDF(org.apache.datasketches.hive.quantiles.UnionItemsSketchUDAF.class, p + UNION_SKETCH); + registerUDAF(org.apache.datasketches.hive.quantiles.UnionDoublesSketchUDAF.class, p + UNION_SKETCH); + registerUDF(org.apache.datasketches.hive.quantiles.GetNFromDoublesSketchUDF.class, p + GET_N); + registerUDF(org.apache.datasketches.hive.quantiles.GetKFromDoublesSketchUDF.class, p + GET_K); + registerUDF(org.apache.datasketches.hive.quantiles.GetCdfFromDoublesSketchUDF.class, p + GET_CDF); + registerUDF(org.apache.datasketches.hive.quantiles.GetPmfFromDoublesSketchUDF.class, p + GET_PMF); + registerUDF(org.apache.datasketches.hive.quantiles.GetQuantileFromDoublesSketchUDF.class, p + GET_QUANTILE); + registerUDF(org.apache.datasketches.hive.quantiles.GetQuantilesFromDoublesSketchUDF.class, p + GET_QUANTILES); + } + + private void registerUDF(Class udfClass, String name) { + system.registerUDF(name, udfClass, false); + } + + private void registerUDAF(Class udafClass, String name) { + try { + system.registerGenericUDAF(name, udafClass.newInstance()); + } catch (InstantiationException | IllegalAccessException e) { + throw new RuntimeException("Unable to register: " + name, e); + } + } + + private void registerUDTF(Class udtfClass, String name) { + system.registerGenericUDTF(name, udtfClass); + } + +} diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java index db5ee8d..dc3781a 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java @@ -140,8 +140,6 @@ import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorUtils; import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorUtils.PrimitiveGrouping; import org.apache.hadoop.hive.serde2.typeinfo.HiveDecimalUtils; -import org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo; -import org.apache.hadoop.hive.serde2.typeinfo.MapTypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; @@ -475,6 +473,7 @@ system.registerGenericUDAF("percentile_cont", new GenericUDAFPercentileCont()); system.registerGenericUDAF("percentile_disc", new GenericUDAFPercentileDisc()); + DataSketchesFunctions.register(system); // Generic UDFs system.registerGenericUDF("reflect", GenericUDFReflect.class); diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/reloperators/HiveAggregate.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/reloperators/HiveAggregate.java index 6b841a5..5e5e928 100644 --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/reloperators/HiveAggregate.java +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/reloperators/HiveAggregate.java @@ -77,12 +77,12 @@ @Override protected RelDataType deriveRowType() { return deriveRowType(getCluster().getTypeFactory(), getInput().getRowType(), - indicator, groupSet, groupSets, aggCalls); + indicator, groupSet, aggCalls); } public static RelDataType deriveRowType(RelDataTypeFactory typeFactory, final RelDataType inputRowType, boolean indicator, - ImmutableBitSet groupSet, List groupSets, + ImmutableBitSet groupSet, final List aggCalls) { final List groupList = groupSet.asList(); assert groupList.size() == groupSet.cardinality(); @@ -100,10 +100,11 @@ typeFactory.createSqlType(SqlTypeName.BOOLEAN), false); String name = "i$" + fieldList.get(groupKey).getName(); int i = 0; + StringBuilder nameBuilder = new StringBuilder(name); while (containedNames.contains(name)) { - name += "_" + i++; + nameBuilder.append('_').append(i++); } - containedNames.add(name); + containedNames.add(nameBuilder.toString()); builder.add(name, booleanType); } } diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveRelDecorrelator.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveRelDecorrelator.java index 4526fc6..ab56ce8 100644 --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveRelDecorrelator.java +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveRelDecorrelator.java @@ -2427,7 +2427,7 @@ operand(LogicalCorrelate.class, operand(RelNode.class, any()), operand(Project.class, - operand(Aggregate.class, null, Aggregate.IS_SIMPLE, + operandJ(Aggregate.class, null, Aggregate::isSimple, operand(Project.class, operand(RelNode.class, any())))))); } diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveSubQueryRemoveRule.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveSubQueryRemoveRule.java index 04d688c..d2d6f70 100644 --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveSubQueryRemoveRule.java +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveSubQueryRemoveRule.java @@ -46,13 +46,13 @@ import org.apache.calcite.util.Pair; import org.apache.calcite.util.Util; -import com.google.common.base.Predicate; import com.google.common.collect.ImmutableList; import java.math.BigDecimal; import java.util.ArrayList; import java.util.List; import java.util.Set; +import java.util.function.Predicate; import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.ql.optimizer.calcite.CalciteSubqueryRuntimeException; @@ -79,10 +79,10 @@ */ public class HiveSubQueryRemoveRule extends RelOptRule { - private HiveConf conf; + private final HiveConf conf; public HiveSubQueryRemoveRule(HiveConf conf) { - super(operand(RelNode.class, null, HiveSubQueryFinder.RELNODE_PREDICATE, any()), + super(operandJ(RelNode.class, null, HiveSubQueryFinder.RELNODE_PREDICATE, any()), HiveRelFactories.HIVE_BUILDER, "SubQueryRemoveRule:Filter"); this.conf = conf; } @@ -597,7 +597,7 @@ * Returns whether a {@link Project} contains a sub-query. */ public static final Predicate RELNODE_PREDICATE = new Predicate() { - @Override public boolean apply(RelNode relNode) { + @Override public boolean test(RelNode relNode) { if (relNode instanceof Project) { Project project = (Project) relNode; for (RexNode node : project.getProjects()) { diff --git ql/src/test/queries/clientpositive/sketches_hll.q ql/src/test/queries/clientpositive/sketches_hll.q new file mode 100644 index 0000000..56467a6 --- /dev/null +++ ql/src/test/queries/clientpositive/sketches_hll.q @@ -0,0 +1,16 @@ +-- prepare input data +create temporary table sketch_input (id int, category char(1)); +insert into table sketch_input values + (1, 'a'), (2, 'a'), (3, 'a'), (4, 'a'), (5, 'a'), (6, 'a'), (7, 'a'), (8, 'a'), (9, 'a'), (10, 'a'), + (6, 'b'), (7, 'b'), (8, 'b'), (9, 'b'), (10, 'b'), (11, 'b'), (12, 'b'), (13, 'b'), (14, 'b'), (15, 'b'); + +-- build sketches per category +create temporary table sketch_intermediate (category char(1), sketch binary); +insert into sketch_intermediate select category, ds_hll_sketch(id) from sketch_input group by category; + +-- get unique count estimates per category +select category, ds_hll_estimate(sketch) from sketch_intermediate; + + +-- union sketches across categories and get overall unique count estimate +select ds_hll_estimate(ds_hll_union(sketch)) from sketch_intermediate; diff --git ql/src/test/queries/clientpositive/sketches_theta.q ql/src/test/queries/clientpositive/sketches_theta.q new file mode 100644 index 0000000..6ab7278 --- /dev/null +++ ql/src/test/queries/clientpositive/sketches_theta.q @@ -0,0 +1,33 @@ +-- see here: https://datasketches.apache.org/docs/Theta/ThetaHiveUDFs.html + +create temporary table theta_input (id int, category char(1)); +insert into table theta_input values + (1, 'a'), (2, 'a'), (3, 'a'), (4, 'a'), (5, 'a'), (6, 'a'), (7, 'a'), (8, 'a'), (9, 'a'), (10, 'a'), + (6, 'b'), (7, 'b'), (8, 'b'), (9, 'b'), (10, 'b'), (11, 'b'), (12, 'b'), (13, 'b'), (14, 'b'), (15, 'b'); + +create temporary table sketch_intermediate (category char(1), sketch binary); +insert into sketch_intermediate select category, ds_theta_sketch(id) from theta_input group by category; + +select category, ds_theta_estimate(sketch) from sketch_intermediate; + +select ds_theta_estimate(ds_theta_union(sketch)) from sketch_intermediate; + + + +create temporary table sketch_input (id1 int, id2 int); +insert into table sketch_input values + (1, 2), (2, 4), (3, 6), (4, 8), (5, 10), (6, 12), (7, 14), (8, 16), (9, 18), (10, 20); + +create temporary table sketch_intermediate2 (sketch1 binary, sketch2 binary); + +insert into sketch_intermediate2 select ds_theta_sketch(id1), ds_theta_sketch(id2) from sketch_input; + +select + ds_theta_estimate(sketch1), + ds_theta_estimate(sketch2), + ds_theta_estimate(ds_theta_union_f(sketch1, sketch2)), + ds_theta_estimate(ds_theta_intersect_f(sketch1, sketch2)), + ds_theta_estimate(ds_theta_exclude(sketch1, sketch2)), + ds_theta_estimate(ds_theta_exclude(sketch2, sketch1)) +from sketch_intermediate2; + diff --git ql/src/test/results/clientpositive/llap/sketches_hll.q.out ql/src/test/results/clientpositive/llap/sketches_hll.q.out new file mode 100644 index 0000000..9ebce86 --- /dev/null +++ ql/src/test/results/clientpositive/llap/sketches_hll.q.out @@ -0,0 +1,59 @@ +PREHOOK: query: create temporary table sketch_input (id int, category char(1)) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@sketch_input +POSTHOOK: query: create temporary table sketch_input (id int, category char(1)) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@sketch_input +PREHOOK: query: insert into table sketch_input values + (1, 'a'), (2, 'a'), (3, 'a'), (4, 'a'), (5, 'a'), (6, 'a'), (7, 'a'), (8, 'a'), (9, 'a'), (10, 'a'), + (6, 'b'), (7, 'b'), (8, 'b'), (9, 'b'), (10, 'b'), (11, 'b'), (12, 'b'), (13, 'b'), (14, 'b'), (15, 'b') +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@sketch_input +POSTHOOK: query: insert into table sketch_input values + (1, 'a'), (2, 'a'), (3, 'a'), (4, 'a'), (5, 'a'), (6, 'a'), (7, 'a'), (8, 'a'), (9, 'a'), (10, 'a'), + (6, 'b'), (7, 'b'), (8, 'b'), (9, 'b'), (10, 'b'), (11, 'b'), (12, 'b'), (13, 'b'), (14, 'b'), (15, 'b') +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@sketch_input +POSTHOOK: Lineage: sketch_input.category SCRIPT [] +POSTHOOK: Lineage: sketch_input.id SCRIPT [] +PREHOOK: query: create temporary table sketch_intermediate (category char(1), sketch binary) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@sketch_intermediate +POSTHOOK: query: create temporary table sketch_intermediate (category char(1), sketch binary) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@sketch_intermediate +PREHOOK: query: insert into sketch_intermediate select category, ds_hll_sketch(id) from sketch_input group by category +PREHOOK: type: QUERY +PREHOOK: Input: default@sketch_input +PREHOOK: Output: default@sketch_intermediate +POSTHOOK: query: insert into sketch_intermediate select category, ds_hll_sketch(id) from sketch_input group by category +POSTHOOK: type: QUERY +POSTHOOK: Input: default@sketch_input +POSTHOOK: Output: default@sketch_intermediate +POSTHOOK: Lineage: sketch_intermediate.category SIMPLE [(sketch_input)sketch_input.FieldSchema(name:category, type:char(1), comment:null), ] +POSTHOOK: Lineage: sketch_intermediate.sketch EXPRESSION [(sketch_input)sketch_input.FieldSchema(name:id, type:int, comment:null), ] +PREHOOK: query: select category, ds_hll_estimate(sketch) from sketch_intermediate +PREHOOK: type: QUERY +PREHOOK: Input: default@sketch_intermediate +#### A masked pattern was here #### +POSTHOOK: query: select category, ds_hll_estimate(sketch) from sketch_intermediate +POSTHOOK: type: QUERY +POSTHOOK: Input: default@sketch_intermediate +#### A masked pattern was here #### +a 10.000000223517425 +b 10.000000223517425 +PREHOOK: query: select ds_hll_estimate(ds_hll_union(sketch)) from sketch_intermediate +PREHOOK: type: QUERY +PREHOOK: Input: default@sketch_intermediate +#### A masked pattern was here #### +POSTHOOK: query: select ds_hll_estimate(ds_hll_union(sketch)) from sketch_intermediate +POSTHOOK: type: QUERY +POSTHOOK: Input: default@sketch_intermediate +#### A masked pattern was here #### +15.000000521540663 diff --git ql/src/test/results/clientpositive/llap/sketches_theta.q.out ql/src/test/results/clientpositive/llap/sketches_theta.q.out new file mode 100644 index 0000000..b3ea64d --- /dev/null +++ ql/src/test/results/clientpositive/llap/sketches_theta.q.out @@ -0,0 +1,120 @@ +PREHOOK: query: create temporary table theta_input (id int, category char(1)) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@theta_input +POSTHOOK: query: create temporary table theta_input (id int, category char(1)) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@theta_input +PREHOOK: query: insert into table theta_input values + (1, 'a'), (2, 'a'), (3, 'a'), (4, 'a'), (5, 'a'), (6, 'a'), (7, 'a'), (8, 'a'), (9, 'a'), (10, 'a'), + (6, 'b'), (7, 'b'), (8, 'b'), (9, 'b'), (10, 'b'), (11, 'b'), (12, 'b'), (13, 'b'), (14, 'b'), (15, 'b') +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@theta_input +POSTHOOK: query: insert into table theta_input values + (1, 'a'), (2, 'a'), (3, 'a'), (4, 'a'), (5, 'a'), (6, 'a'), (7, 'a'), (8, 'a'), (9, 'a'), (10, 'a'), + (6, 'b'), (7, 'b'), (8, 'b'), (9, 'b'), (10, 'b'), (11, 'b'), (12, 'b'), (13, 'b'), (14, 'b'), (15, 'b') +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@theta_input +POSTHOOK: Lineage: theta_input.category SCRIPT [] +POSTHOOK: Lineage: theta_input.id SCRIPT [] +PREHOOK: query: create temporary table sketch_intermediate (category char(1), sketch binary) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@sketch_intermediate +POSTHOOK: query: create temporary table sketch_intermediate (category char(1), sketch binary) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@sketch_intermediate +PREHOOK: query: insert into sketch_intermediate select category, ds_theta_sketch(id) from theta_input group by category +PREHOOK: type: QUERY +PREHOOK: Input: default@theta_input +PREHOOK: Output: default@sketch_intermediate +POSTHOOK: query: insert into sketch_intermediate select category, ds_theta_sketch(id) from theta_input group by category +POSTHOOK: type: QUERY +POSTHOOK: Input: default@theta_input +POSTHOOK: Output: default@sketch_intermediate +POSTHOOK: Lineage: sketch_intermediate.category SIMPLE [(theta_input)theta_input.FieldSchema(name:category, type:char(1), comment:null), ] +POSTHOOK: Lineage: sketch_intermediate.sketch EXPRESSION [(theta_input)theta_input.FieldSchema(name:id, type:int, comment:null), ] +PREHOOK: query: select category, ds_theta_estimate(sketch) from sketch_intermediate +PREHOOK: type: QUERY +PREHOOK: Input: default@sketch_intermediate +#### A masked pattern was here #### +POSTHOOK: query: select category, ds_theta_estimate(sketch) from sketch_intermediate +POSTHOOK: type: QUERY +POSTHOOK: Input: default@sketch_intermediate +#### A masked pattern was here #### +a 10.0 +b 10.0 +PREHOOK: query: select ds_theta_estimate(ds_theta_union(sketch)) from sketch_intermediate +PREHOOK: type: QUERY +PREHOOK: Input: default@sketch_intermediate +#### A masked pattern was here #### +POSTHOOK: query: select ds_theta_estimate(ds_theta_union(sketch)) from sketch_intermediate +POSTHOOK: type: QUERY +POSTHOOK: Input: default@sketch_intermediate +#### A masked pattern was here #### +15.0 +PREHOOK: query: create temporary table sketch_input (id1 int, id2 int) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@sketch_input +POSTHOOK: query: create temporary table sketch_input (id1 int, id2 int) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@sketch_input +PREHOOK: query: insert into table sketch_input values + (1, 2), (2, 4), (3, 6), (4, 8), (5, 10), (6, 12), (7, 14), (8, 16), (9, 18), (10, 20) +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@sketch_input +POSTHOOK: query: insert into table sketch_input values + (1, 2), (2, 4), (3, 6), (4, 8), (5, 10), (6, 12), (7, 14), (8, 16), (9, 18), (10, 20) +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@sketch_input +POSTHOOK: Lineage: sketch_input.id1 SCRIPT [] +POSTHOOK: Lineage: sketch_input.id2 SCRIPT [] +PREHOOK: query: create temporary table sketch_intermediate2 (sketch1 binary, sketch2 binary) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@sketch_intermediate2 +POSTHOOK: query: create temporary table sketch_intermediate2 (sketch1 binary, sketch2 binary) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@sketch_intermediate2 +PREHOOK: query: insert into sketch_intermediate2 select ds_theta_sketch(id1), ds_theta_sketch(id2) from sketch_input +PREHOOK: type: QUERY +PREHOOK: Input: default@sketch_input +PREHOOK: Output: default@sketch_intermediate2 +POSTHOOK: query: insert into sketch_intermediate2 select ds_theta_sketch(id1), ds_theta_sketch(id2) from sketch_input +POSTHOOK: type: QUERY +POSTHOOK: Input: default@sketch_input +POSTHOOK: Output: default@sketch_intermediate2 +POSTHOOK: Lineage: sketch_intermediate2.sketch1 EXPRESSION [(sketch_input)sketch_input.FieldSchema(name:id1, type:int, comment:null), ] +POSTHOOK: Lineage: sketch_intermediate2.sketch2 EXPRESSION [(sketch_input)sketch_input.FieldSchema(name:id2, type:int, comment:null), ] +PREHOOK: query: select + ds_theta_estimate(sketch1), + ds_theta_estimate(sketch2), + ds_theta_estimate(ds_theta_union_f(sketch1, sketch2)), + ds_theta_estimate(ds_theta_intersect_f(sketch1, sketch2)), + ds_theta_estimate(ds_theta_exclude(sketch1, sketch2)), + ds_theta_estimate(ds_theta_exclude(sketch2, sketch1)) +from sketch_intermediate2 +PREHOOK: type: QUERY +PREHOOK: Input: default@sketch_intermediate2 +#### A masked pattern was here #### +POSTHOOK: query: select + ds_theta_estimate(sketch1), + ds_theta_estimate(sketch2), + ds_theta_estimate(ds_theta_union_f(sketch1, sketch2)), + ds_theta_estimate(ds_theta_intersect_f(sketch1, sketch2)), + ds_theta_estimate(ds_theta_exclude(sketch1, sketch2)), + ds_theta_estimate(ds_theta_exclude(sketch2, sketch1)) +from sketch_intermediate2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@sketch_intermediate2 +#### A masked pattern was here #### +10.0 10.0 15.0 5.0 5.0 5.0 diff --git ql/src/test/results/clientpositive/show_functions.q.out ql/src/test/results/clientpositive/show_functions.q.out index 0453400..4b38cfb 100644 --- ql/src/test/results/clientpositive/show_functions.q.out +++ ql/src/test/results/clientpositive/show_functions.q.out @@ -105,6 +105,69 @@ degrees dense_rank div +ds_cpc_estimate +ds_cpc_estimate_bounds +ds_cpc_sketch +ds_cpc_stringify +ds_cpc_union +ds_cpc_union_f +ds_freq_frequent_items +ds_freq_sketch +ds_freq_union +ds_hll_estimate +ds_hll_estimate_bounds +ds_hll_sketch +ds_hll_stringify +ds_hll_union +ds_hll_union_f +ds_kll_cdf +ds_kll_n +ds_kll_pmf +ds_kll_quantile +ds_kll_quantiles +ds_kll_rank +ds_kll_sketch +ds_kll_stringify +ds_kll_union +ds_quantile_doubles_cdf +ds_quantile_doubles_k +ds_quantile_doubles_n +ds_quantile_doubles_pmf +ds_quantile_doubles_quantile +ds_quantile_doubles_quantiles +ds_quantile_doubles_sketch +ds_quantile_doubles_stringify +ds_quantile_doubles_union +ds_quantile_strings_cdf +ds_quantile_strings_k +ds_quantile_strings_n +ds_quantile_strings_pmf +ds_quantile_strings_quantile +ds_quantile_strings_quantiles +ds_quantile_strings_sketch +ds_quantile_strings_stringify +ds_quantile_strings_union +ds_theta_estimate +ds_theta_exclude +ds_theta_intersect +ds_theta_intersect_f +ds_theta_sketch +ds_theta_union +ds_theta_union_f +ds_tuple_arrayofdouble_estimate +ds_tuple_arrayofdouble_estimate_bounds +ds_tuple_arrayofdouble_means +ds_tuple_arrayofdouble_n_retained +ds_tuple_arrayofdouble_quantiles_sketch +ds_tuple_arrayofdouble_sketch +ds_tuple_arrayofdouble_ttest +ds_tuple_arrayofdouble_union +ds_tuple_arrayofdouble_values +ds_tuple_arrayofdouble_variances +ds_tuple_doublesummary_estimate +ds_tuple_doublesummary_percentile +ds_tuple_doublesummary_sketch +ds_tuple_doublesummary_union e elt encode @@ -392,6 +455,16 @@ current_database current_date decode +ds_cpc_estimate +ds_hll_estimate +ds_kll_quantile +ds_quantile_doubles_quantile +ds_quantile_strings_quantile +ds_theta_estimate +ds_theta_exclude +ds_tuple_arrayofdouble_estimate +ds_tuple_doublesummary_estimate +ds_tuple_doublesummary_percentile e encode explode @@ -540,6 +613,69 @@ degrees dense_rank div +ds_cpc_estimate +ds_cpc_estimate_bounds +ds_cpc_sketch +ds_cpc_stringify +ds_cpc_union +ds_cpc_union_f +ds_freq_frequent_items +ds_freq_sketch +ds_freq_union +ds_hll_estimate +ds_hll_estimate_bounds +ds_hll_sketch +ds_hll_stringify +ds_hll_union +ds_hll_union_f +ds_kll_cdf +ds_kll_n +ds_kll_pmf +ds_kll_quantile +ds_kll_quantiles +ds_kll_rank +ds_kll_sketch +ds_kll_stringify +ds_kll_union +ds_quantile_doubles_cdf +ds_quantile_doubles_k +ds_quantile_doubles_n +ds_quantile_doubles_pmf +ds_quantile_doubles_quantile +ds_quantile_doubles_quantiles +ds_quantile_doubles_sketch +ds_quantile_doubles_stringify +ds_quantile_doubles_union +ds_quantile_strings_cdf +ds_quantile_strings_k +ds_quantile_strings_n +ds_quantile_strings_pmf +ds_quantile_strings_quantile +ds_quantile_strings_quantiles +ds_quantile_strings_sketch +ds_quantile_strings_stringify +ds_quantile_strings_union +ds_theta_estimate +ds_theta_exclude +ds_theta_intersect +ds_theta_intersect_f +ds_theta_sketch +ds_theta_union +ds_theta_union_f +ds_tuple_arrayofdouble_estimate +ds_tuple_arrayofdouble_estimate_bounds +ds_tuple_arrayofdouble_means +ds_tuple_arrayofdouble_n_retained +ds_tuple_arrayofdouble_quantiles_sketch +ds_tuple_arrayofdouble_sketch +ds_tuple_arrayofdouble_ttest +ds_tuple_arrayofdouble_union +ds_tuple_arrayofdouble_values +ds_tuple_arrayofdouble_variances +ds_tuple_doublesummary_estimate +ds_tuple_doublesummary_percentile +ds_tuple_doublesummary_sketch +ds_tuple_doublesummary_union e elt encode