diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java index bbe7fb0697..8d36c28a9c 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java @@ -530,6 +530,7 @@ system.registerGenericUDTF("posexplode", GenericUDTFPosExplode.class); system.registerGenericUDTF("stack", GenericUDTFStack.class); system.registerGenericUDTF("get_splits", GenericUDTFGetSplits.class); + system.registerGenericUDTF("get_sql_schema", GenericUDTFGetSQLSchema.class); //PTF declarations system.registerGenericUDF(LEAD_FUNC_NAME, GenericUDFLead.class); diff --git ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDTFGetSQLSchema.java ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDTFGetSQLSchema.java new file mode 100644 index 0000000000..960ad19ab2 --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDTFGetSQLSchema.java @@ -0,0 +1,126 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.udf.generic; + +import java.io.IOException; +import java.nio.charset.StandardCharsets; +import java.util.Arrays; +import java.util.List; + +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.metastore.api.FieldSchema; +import org.apache.hadoop.hive.ql.exec.Description; +import org.apache.hadoop.hive.ql.exec.UDFArgumentException; +import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException; +import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.parse.ParseException; +import org.apache.hadoop.hive.ql.parse.ParseUtils; +import org.apache.hadoop.hive.ql.session.SessionState; +import org.apache.hadoop.hive.ql.udf.UDFType; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; +import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector; +import org.apache.hadoop.mapred.JobConf; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * GenericUDTFGetSQLSchema. + */ +@Description(name = "get_sql_schema", value = "_FUNC_(string) - " + + "Takes query as argument. Returns schema (column names and types) of the resultset " + + " that would be generated when the query is executed. " + + "Can be invoked like: select get_sql_schema(\"select * from some_table\")." + + "NOTE: This does not produce any output for DDL queries like show tables/databases/... and others.") +@UDFType(deterministic = false) +public class GenericUDTFGetSQLSchema extends GenericUDTF { + private static final Logger LOG = LoggerFactory.getLogger(GenericUDTFGetSQLSchema.class); + + protected transient StringObjectInspector stringOI; + protected transient JobConf jc; + + private final transient Object[] nameTypePair = new Object[2]; + + @Override + public void process(Object[] arguments) throws HiveException { + + String query = stringOI.getPrimitiveJavaObject(arguments[0]); + LOG.debug("Getting schema for Query: {}", query); + HiveConf conf = new HiveConf(SessionState.get().getConf()); + List fieldSchemas = null; + try { + fieldSchemas = ParseUtils.parseQueryAndGetSchema(conf, query); + } catch (IOException | ParseException e) { + throw new HiveException(e); + } + + if (fieldSchemas != null) { + for (FieldSchema fieldSchema : fieldSchemas) { + nameTypePair[0] = fieldSchema.getName().getBytes(StandardCharsets.UTF_8); + nameTypePair[1] = fieldSchema.getType().getBytes(StandardCharsets.UTF_8); + forward(nameTypePair); + } + } + } + + @Override + public StructObjectInspector initialize(ObjectInspector[] arguments) + throws UDFArgumentException { + + LOG.debug("initializing GenericUDTFGetSQLSchema"); + + if (SessionState.get() == null || SessionState.get().getConf() == null) { + throw new IllegalStateException("Cannot run GET_SQL_SCHEMA outside HS2"); + } + LOG.debug("Initialized conf, jc and metastore connection"); + + if (arguments.length != 1) { + throw new UDFArgumentLengthException( + "The function GET_SQL_SCHEMA accepts 1 argument."); + } else if (!(arguments[0] instanceof StringObjectInspector)) { + LOG.error("Got " + arguments[0].getTypeName() + " instead of string."); + throw new UDFArgumentTypeException(0, "\"" + + "string\" is expected at function GET_SQL_SCHEMA, " + "but \"" + + arguments[0].getTypeName() + "\" is found"); + } + + stringOI = (StringObjectInspector) arguments[0]; + + List names = Arrays.asList("col_name", "col_type"); + List fieldOIs = Arrays.asList(PrimitiveObjectInspectorFactory.javaByteArrayObjectInspector, + PrimitiveObjectInspectorFactory.javaByteArrayObjectInspector); + StructObjectInspector outputOI = ObjectInspectorFactory + .getStandardStructObjectInspector(names, fieldOIs); + + LOG.debug("done initializing GenericUDTFGetSQLSchema"); + return outputOI; + } + + @Override + public String toString() { + return "get_sql_schema"; + } + + @Override + public void close() throws HiveException { + } +} diff --git ql/src/test/org/apache/hadoop/hive/ql/udf/generic/TestGenericUDTFGetSQLSchema.java ql/src/test/org/apache/hadoop/hive/ql/udf/generic/TestGenericUDTFGetSQLSchema.java new file mode 100644 index 0000000000..3615d2b778 --- /dev/null +++ ql/src/test/org/apache/hadoop/hive/ql/udf/generic/TestGenericUDTFGetSQLSchema.java @@ -0,0 +1,110 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.ql.udf.generic; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.session.SessionState; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; +import org.junit.AfterClass; +import org.junit.BeforeClass; +import org.junit.Test; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +/** + * TestGenericUDTFGetSQLSchema. + */ +public class TestGenericUDTFGetSQLSchema { + + private static SessionState sessionState; + + @BeforeClass + public static void setUpBeforeClass() throws Exception { + HiveConf conf = new HiveConf(); + conf.set("hive.security.authorization.manager", + "org.apache.hadoop.hive.ql.security.authorization.DefaultHiveAuthorizationProvider"); + sessionState = SessionState.start(conf); + } + + @AfterClass + public static void tearDownAfterClass() throws Exception { + SessionState.endStart(sessionState); + } + + @Test + public void testWithComplexTypes() throws Exception { + invokeUDTFAndTest("select array('val1','val2') c1," + + " named_struct('a',1,'b','2') c2, " + + " array(array(1)) c3," + + " array(named_struct('a',1,'b','2')) c4," + + " map(1,1) c5", + new String[]{"c1", "array", + "c2", "struct", + "c3", "array>", + "c4", "array>", + "c5", "map" + }); + } + + @Test + public void testWithSimpleTypes() throws Exception { + invokeUDTFAndTest("select 1 as c1, 'Happy Valentines Day' as c2, 2.2 as c3, cast(2.2 as float) c4, " + + "cast(2.2 as double) c5, " + + "cast('2019-02-14' as date) c6", + new String[]{"c1", "int", + "c2", "string", + "c3", "decimal(2,1)", + "c4", "float", + "c5", "double", + "c6", "date" + }); + } + + @Test + public void testWithDDL() throws Exception { + invokeUDTFAndTest("show tables", new String[]{}); + } + + private void invokeUDTFAndTest(String query, String[] expected) throws HiveException { + + GenericUDTFGetSQLSchema genericUDTFGetSQLSchema = new GenericUDTFGetSQLSchema(); + List actual = new ArrayList<>(); + genericUDTFGetSQLSchema.collector = input -> { + if (input != null) { + Object[] udfOutput = (Object[]) input; + actual.add(new String((byte[]) udfOutput[0])); + actual.add(new String((byte[]) udfOutput[1])); + } + }; + + genericUDTFGetSQLSchema + .initialize(new ObjectInspector[]{PrimitiveObjectInspectorFactory.javaStringObjectInspector}); + genericUDTFGetSQLSchema.process(new Object[]{query}); + + assertEquals(expected.length, actual.size()); + assertTrue("Failed for query: " + query + ". Expected: " + Arrays.toString(expected) + + ". Actual: " + actual, Arrays.equals(expected, actual.toArray())); + } + +} diff --git ql/src/test/queries/clientpositive/udtf_get_sql_schema.q ql/src/test/queries/clientpositive/udtf_get_sql_schema.q new file mode 100644 index 0000000000..b8fadd4bc2 --- /dev/null +++ ql/src/test/queries/clientpositive/udtf_get_sql_schema.q @@ -0,0 +1,17 @@ +set hive.fetch.task.conversion=more; +set hive.mapred.mode=nonstrict; + +describe function get_sql_schema; +describe function extended get_sql_schema; + +create table t1(c1 int, c2 float, c3 double, c4 string, c5 date, c6 array, c7 struct, c8 map); +insert into t1 select 1, 1.1, 2.2, 'val1', '2019-02-15', array(1), named_struct('a',1,'b','2'), map(1,1); + +explain select get_sql_schema('select * from t1'); +select get_sql_schema('select * from t1'); + +create external table t2(c1 int, c2 float, c3 double, c4 string, c5 date, c6 array, c7 struct, c8 map); +insert into t2 select 1, 1.1, 2.2, 'val1', '2019-02-15', array(1), named_struct('a',1,'b','2'), map(1,1); + +explain select get_sql_schema('select * from t2'); +select get_sql_schema('select * from t2'); diff --git ql/src/test/results/clientpositive/show_functions.q.out ql/src/test/results/clientpositive/show_functions.q.out index 0fdcbda66f..5368bd38f0 100644 --- ql/src/test/results/clientpositive/show_functions.q.out +++ ql/src/test/results/clientpositive/show_functions.q.out @@ -110,6 +110,7 @@ from_unixtime from_utc_timestamp get_json_object get_splits +get_sql_schema greatest grouping hash diff --git ql/src/test/results/clientpositive/udtf_get_sql_schema.q.out ql/src/test/results/clientpositive/udtf_get_sql_schema.q.out new file mode 100644 index 0000000000..2faf5aad57 --- /dev/null +++ ql/src/test/results/clientpositive/udtf_get_sql_schema.q.out @@ -0,0 +1,158 @@ +PREHOOK: query: describe function get_sql_schema +PREHOOK: type: DESCFUNCTION +POSTHOOK: query: describe function get_sql_schema +POSTHOOK: type: DESCFUNCTION +get_sql_schema(string) - Takes query as argument. Returns schema (column names and types) of the resultset that would be generated when the query is executed. Can be invoked like: select get_sql_schema("select * from some_table").NOTE: This does not produce any output for DDL queries like show tables/databases/... and others. +PREHOOK: query: describe function extended get_sql_schema +PREHOOK: type: DESCFUNCTION +POSTHOOK: query: describe function extended get_sql_schema +POSTHOOK: type: DESCFUNCTION +get_sql_schema(string) - Takes query as argument. Returns schema (column names and types) of the resultset that would be generated when the query is executed. Can be invoked like: select get_sql_schema("select * from some_table").NOTE: This does not produce any output for DDL queries like show tables/databases/... and others. +Function class:org.apache.hadoop.hive.ql.udf.generic.GenericUDTFGetSQLSchema +Function type:BUILTIN +PREHOOK: query: create table t1(c1 int, c2 float, c3 double, c4 string, c5 date, c6 array, c7 struct, c8 map) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@t1 +POSTHOOK: query: create table t1(c1 int, c2 float, c3 double, c4 string, c5 date, c6 array, c7 struct, c8 map) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@t1 +PREHOOK: query: insert into t1 select 1, 1.1, 2.2, 'val1', '2019-02-15', array(1), named_struct('a',1,'b','2'), map(1,1) +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@t1 +POSTHOOK: query: insert into t1 select 1, 1.1, 2.2, 'val1', '2019-02-15', array(1), named_struct('a',1,'b','2'), map(1,1) +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@t1 +POSTHOOK: Lineage: t1.c1 SIMPLE [] +POSTHOOK: Lineage: t1.c2 EXPRESSION [] +POSTHOOK: Lineage: t1.c3 EXPRESSION [] +POSTHOOK: Lineage: t1.c4 SIMPLE [] +POSTHOOK: Lineage: t1.c5 EXPRESSION [] +POSTHOOK: Lineage: t1.c6 EXPRESSION [] +POSTHOOK: Lineage: t1.c7 EXPRESSION [] +POSTHOOK: Lineage: t1.c8 EXPRESSION [] +PREHOOK: query: explain select get_sql_schema('select * from t1') +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +#### A masked pattern was here #### +POSTHOOK: query: explain select get_sql_schema('select * from t1') +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +#### A masked pattern was here #### +STAGE DEPENDENCIES: + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + TableScan + alias: _dummy_table + Row Limit Per Split: 1 + Statistics: Num rows: 1 Data size: 10 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: 'select * from t1' (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 100 Basic stats: COMPLETE Column stats: COMPLETE + UDTF Operator + Statistics: Num rows: 1 Data size: 100 Basic stats: COMPLETE Column stats: COMPLETE + function name: get_sql_schema + Select Operator + expressions: col_name (type: binary), col_type (type: binary) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + ListSink + +PREHOOK: query: select get_sql_schema('select * from t1') +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +#### A masked pattern was here #### +POSTHOOK: query: select get_sql_schema('select * from t1') +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +#### A masked pattern was here #### +t1.c1 int +t1.c2 float +t1.c3 double +t1.c4 string +t1.c5 date +t1.c6 array +t1.c7 struct +t1.c8 map +PREHOOK: query: create external table t2(c1 int, c2 float, c3 double, c4 string, c5 date, c6 array, c7 struct, c8 map) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@t2 +POSTHOOK: query: create external table t2(c1 int, c2 float, c3 double, c4 string, c5 date, c6 array, c7 struct, c8 map) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@t2 +PREHOOK: query: insert into t2 select 1, 1.1, 2.2, 'val1', '2019-02-15', array(1), named_struct('a',1,'b','2'), map(1,1) +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@t2 +POSTHOOK: query: insert into t2 select 1, 1.1, 2.2, 'val1', '2019-02-15', array(1), named_struct('a',1,'b','2'), map(1,1) +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@t2 +POSTHOOK: Lineage: t2.c1 SIMPLE [] +POSTHOOK: Lineage: t2.c2 EXPRESSION [] +POSTHOOK: Lineage: t2.c3 EXPRESSION [] +POSTHOOK: Lineage: t2.c4 SIMPLE [] +POSTHOOK: Lineage: t2.c5 EXPRESSION [] +POSTHOOK: Lineage: t2.c6 EXPRESSION [] +POSTHOOK: Lineage: t2.c7 EXPRESSION [] +POSTHOOK: Lineage: t2.c8 EXPRESSION [] +PREHOOK: query: explain select get_sql_schema('select * from t2') +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +#### A masked pattern was here #### +POSTHOOK: query: explain select get_sql_schema('select * from t2') +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +#### A masked pattern was here #### +STAGE DEPENDENCIES: + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + TableScan + alias: _dummy_table + Row Limit Per Split: 1 + Statistics: Num rows: 1 Data size: 10 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: 'select * from t2' (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 100 Basic stats: COMPLETE Column stats: COMPLETE + UDTF Operator + Statistics: Num rows: 1 Data size: 100 Basic stats: COMPLETE Column stats: COMPLETE + function name: get_sql_schema + Select Operator + expressions: col_name (type: binary), col_type (type: binary) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + ListSink + +PREHOOK: query: select get_sql_schema('select * from t2') +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +#### A masked pattern was here #### +POSTHOOK: query: select get_sql_schema('select * from t2') +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +#### A masked pattern was here #### +t2.c1 int +t2.c2 float +t2.c3 double +t2.c4 string +t2.c5 date +t2.c6 array +t2.c7 struct +t2.c8 map