diff --git a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java index 73492ff..a96ebd3 100644 --- a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java +++ b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java @@ -660,6 +660,16 @@ private static void populateLlapDaemonVarsSet(Set llapDaemonVarsSetLocal @Deprecated METASTORE_CAPABILITY_CHECK("hive.metastore.client.capability.check", true, "Whether to check client capabilities for potentially breaking API usage."), + METASTORE_CLIENT_CACHE_ENABLED("hive.metastore.client.cache.enabled", false, + "Whether to enable metastore client cache"), + METASTORE_CLIENT_CACHE_EXPIRY_TIME("hive.metastore.client.cache.expiry.time", "120s", + new TimeValidator(TimeUnit.SECONDS), "Expiry time for metastore client cache"), + METASTORE_CLIENT_CACHE_INITIAL_CAPACITY("hive.metastore.client.cache.initial.capacity", 50, + "Initial capacity for metastore client cache"), + METASTORE_CLIENT_CACHE_MAX_CAPACITY("hive.metastore.client.cache.max.capacity", 50, + "Max capacity for metastore client cache"), + METASTORE_CLIENT_CACHE_STATS_ENABLED("hive.metastore.client.cache.stats.enabled", false, + "Whether to enable metastore client cache stats"), METASTORE_FASTPATH("hive.metastore.fastpath", false, "Used to avoid all of the proxies and object copies in the metastore. Note, if this is " + "set, you MUST use a local metastore (hive.metastore.uris must be empty) otherwise " + @@ -2486,6 +2496,10 @@ private static void populateLlapDaemonVarsSet(Set llapDaemonVarsSetLocal "It will also increase the background load on the Hadoop cluster as more MapReduce jobs\n" + "will be running in the background."), + HIVE_STREAMING_CONNECTION_CLIENT_HEARTBEAT_INTERVAL("hive.streaming.connection.client.heartbeat.interval", "5s", + new TimeValidator(TimeUnit.SECONDS), "Specify a time interval for hive streaming connection client to\n" + + "heartbeat all open transactions with metastore"), + HIVE_COMPACTOR_WORKER_TIMEOUT("hive.compactor.worker.timeout", "86400s", new TimeValidator(TimeUnit.SECONDS), "Time in seconds after which a compaction job will be declared failed and the\n" + diff --git a/itests/hive-unit/src/test/java/org/apache/hadoop/hive/ql/txn/compactor/TestCompactor.java b/itests/hive-unit/src/test/java/org/apache/hadoop/hive/ql/txn/compactor/TestCompactor.java index 82ba775..37cc8a0 100644 --- a/itests/hive-unit/src/test/java/org/apache/hadoop/hive/ql/txn/compactor/TestCompactor.java +++ b/itests/hive-unit/src/test/java/org/apache/hadoop/hive/ql/txn/compactor/TestCompactor.java @@ -17,6 +17,9 @@ */ package org.apache.hadoop.hive.ql.txn.compactor; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertNull; + import java.io.File; import java.io.FileWriter; import java.io.IOException; @@ -70,10 +73,10 @@ import org.apache.hadoop.hive.ql.session.SessionState; import org.apache.hadoop.mapred.JobConf; import org.apache.hive.hcatalog.common.HCatUtil; -import org.apache.hive.streaming.DelimitedInputWriter; -import org.apache.hive.streaming.HiveEndPoint; +import org.apache.hive.streaming.HiveStreamingConnection; import org.apache.hive.streaming.StreamingConnection; import org.apache.hive.streaming.StreamingException; +import org.apache.hive.streaming.StrictDelimitedInputWriter; import org.apache.hive.streaming.TransactionBatch; import org.apache.orc.OrcConf; import org.junit.After; @@ -160,7 +163,6 @@ public void tearDown() { @Test public void 
schemaEvolutionAddColDynamicPartitioningInsert() throws Exception { String tblName = "dpct"; - List colNames = Arrays.asList("a", "b"); executeStatementOnDriver("drop table if exists " + tblName, driver); executeStatementOnDriver("CREATE TABLE " + tblName + "(a INT, b STRING) " + " PARTITIONED BY(ds string)" + @@ -241,7 +243,6 @@ public void schemaEvolutionAddColDynamicPartitioningInsert() throws Exception { @Test public void schemaEvolutionAddColDynamicPartitioningUpdate() throws Exception { String tblName = "udpct"; - List colNames = Arrays.asList("a", "b"); executeStatementOnDriver("drop table if exists " + tblName, driver); executeStatementOnDriver("CREATE TABLE " + tblName + "(a INT, b STRING) " + " PARTITIONED BY(ds string)" + @@ -402,7 +403,7 @@ public void testStatsAfterCompactionPartTbl() throws Exception { Map> stats = msClient.getPartitionColumnStatistics(ci.dbname, ci.tableName, Arrays.asList(ci.partName), colNames); List colStats = stats.get(ci.partName); - Assert.assertNotNull("No stats found for partition " + ci.partName, colStats); + assertNotNull("No stats found for partition " + ci.partName, colStats); Assert.assertEquals("Expected column 'a' at index 0", "a", colStats.get(0).getColName()); Assert.assertEquals("Expected column 'b' at index 1", "b", colStats.get(1).getColName()); LongColumnStatsData colAStats = colStats.get(0).getStatsData().getLongStats(); @@ -423,31 +424,31 @@ public void testStatsAfterCompactionPartTbl() throws Exception { LongColumnStatsData colAStatsPart2 = colStats.get(0).getStatsData().getLongStats(); StringColumnStatsData colBStatsPart2 = colStats.get(1).getStatsData().getStringStats(); + StrictDelimitedInputWriter writer = StrictDelimitedInputWriter.newBuilder() + .withFieldDelimiter(',') + .build(); + StreamingConnection connection = HiveStreamingConnection.newBuilder() + .withDatabase(ci.dbname) + .withTable(ci.tableName) + .withStaticPartitionValues(Arrays.asList("0")) + .withAgentInfo("UT_" + Thread.currentThread().getName()) + .withHiveConf(conf) + .withRecordWriter(writer) + .connect(); + connection.beginNextTransaction(); + Assert.assertEquals(TransactionBatch.TxnState.OPEN, connection.getCurrentTransactionState()); + connection.write("50,Kiev".getBytes()); + connection.write("51,St. Petersburg".getBytes()); + connection.write("44,Boston".getBytes()); + connection.commit(); + + connection.beginNextTransaction(); + connection.write("52,Tel Aviv".getBytes()); + connection.write("53,Atlantis".getBytes()); + connection.write("53,Boston".getBytes()); + connection.commit(); - HiveEndPoint endPt = new HiveEndPoint(null, ci.dbname, ci.tableName, Arrays.asList("0")); - DelimitedInputWriter writer = new DelimitedInputWriter(new String[] {"a","b"},",", endPt); - /*next call will eventually end up in HiveEndPoint.createPartitionIfNotExists() which - makes an operation on Driver - * and starts it's own CliSessionState and then closes it, which removes it from ThreadLoacal; - * thus the session - * created in this class is gone after this; I fixed it in HiveEndPoint*/ - StreamingConnection connection = endPt.newConnection(true, "UT_" + Thread.currentThread().getName()); - - TransactionBatch txnBatch = connection.fetchTransactionBatch(2, writer); - txnBatch.beginNextTransaction(); - Assert.assertEquals(TransactionBatch.TxnState.OPEN, txnBatch.getCurrentTransactionState()); - txnBatch.write("50,Kiev".getBytes()); - txnBatch.write("51,St. 
Petersburg".getBytes()); - txnBatch.write("44,Boston".getBytes()); - txnBatch.commit(); - - txnBatch.beginNextTransaction(); - txnBatch.write("52,Tel Aviv".getBytes()); - txnBatch.write("53,Atlantis".getBytes()); - txnBatch.write("53,Boston".getBytes()); - txnBatch.commit(); - - txnBatch.close(); + connection.close(); connection.close(); execSelectAndDumpData("select * from " + ci.getFullTableName(), driver, ci.getFullTableName()); @@ -473,7 +474,7 @@ public void testStatsAfterCompactionPartTbl() throws Exception { stats = msClient.getPartitionColumnStatistics(ci.dbname, ci.tableName, Arrays.asList(ci.partName), colNames); colStats = stats.get(ci.partName); - Assert.assertNotNull("No stats found for partition " + ci.partName, colStats); + assertNotNull("No stats found for partition " + ci.partName, colStats); Assert.assertEquals("Expected column 'a' at index 0", "a", colStats.get(0).getColName()); Assert.assertEquals("Expected column 'b' at index 1", "b", colStats.get(1).getColName()); colAStats = colStats.get(0).getStatsData().getLongStats(); @@ -501,7 +502,6 @@ public void testStatsAfterCompactionPartTbl() throws Exception { @Test public void dynamicPartitioningInsert() throws Exception { String tblName = "dpct"; - List colNames = Arrays.asList("a", "b"); executeStatementOnDriver("drop table if exists " + tblName, driver); executeStatementOnDriver("CREATE TABLE " + tblName + "(a INT, b STRING) " + " PARTITIONED BY(ds string)" + @@ -538,7 +538,6 @@ public void dynamicPartitioningInsert() throws Exception { @Test public void dynamicPartitioningUpdate() throws Exception { String tblName = "udpct"; - List colNames = Arrays.asList("a", "b"); executeStatementOnDriver("drop table if exists " + tblName, driver); executeStatementOnDriver("CREATE TABLE " + tblName + "(a INT, b STRING) " + " PARTITIONED BY(ds string)" + @@ -578,7 +577,6 @@ public void dynamicPartitioningUpdate() throws Exception { @Test public void dynamicPartitioningDelete() throws Exception { String tblName = "ddpct"; - List colNames = Arrays.asList("a", "b"); executeStatementOnDriver("drop table if exists " + tblName, driver); executeStatementOnDriver("CREATE TABLE " + tblName + "(a INT, b STRING) " + " PARTITIONED BY(ds string)" + @@ -620,7 +618,6 @@ public void dynamicPartitioningDelete() throws Exception { public void minorCompactWhileStreaming() throws Exception { String dbName = "default"; String tblName = "cws"; - List colNames = Arrays.asList("a", "b"); String columnNamesProperty = "a,b"; String columnTypesProperty = "int:string"; executeStatementOnDriver("drop table if exists " + tblName, driver); @@ -628,17 +625,15 @@ public void minorCompactWhileStreaming() throws Exception { " CLUSTERED BY(a) INTO 1 BUCKETS" + //currently ACID requires table to be bucketed " STORED AS ORC TBLPROPERTIES ('transactional'='true')", driver); - HiveEndPoint endPt = new HiveEndPoint(null, dbName, tblName, null); - DelimitedInputWriter writer = new DelimitedInputWriter(new String[] {"a","b"},",", endPt); - StreamingConnection connection = endPt.newConnection(false, "UT_" + Thread.currentThread().getName()); + StreamingConnection connection = null; try { // Write a couple of batches for (int i = 0; i < 2; i++) { - writeBatch(connection, writer, false); + writeBatch(dbName, tblName, false); } // Start a third batch, but don't close it. 
- writeBatch(connection, writer, true); + connection = writeBatch(dbName, tblName, true); // Now, compact TxnStore txnHandler = TxnUtils.getTxnStore(conf); @@ -675,7 +670,9 @@ public void minorCompactWhileStreaming() throws Exception { 0, 1L, 4L, 1); } finally { - connection.close(); + if (connection != null) { + connection.close(); + } } } @@ -683,7 +680,6 @@ public void minorCompactWhileStreaming() throws Exception { public void majorCompactWhileStreaming() throws Exception { String dbName = "default"; String tblName = "cws"; - List colNames = Arrays.asList("a", "b"); String columnNamesProperty = "a,b"; String columnTypesProperty = "int:string"; executeStatementOnDriver("drop table if exists " + tblName, driver); @@ -691,18 +687,16 @@ public void majorCompactWhileStreaming() throws Exception { " CLUSTERED BY(a) INTO 1 BUCKETS" + //currently ACID requires table to be bucketed " STORED AS ORC TBLPROPERTIES ('transactional'='true') ", driver); - HiveEndPoint endPt = new HiveEndPoint(null, dbName, tblName, null); - DelimitedInputWriter writer = new DelimitedInputWriter(new String[] {"a","b"},",", endPt); - StreamingConnection connection = endPt.newConnection(false, "UT_" + Thread.currentThread().getName()); + StreamingConnection connection = null; try { // Write a couple of batches for (int i = 0; i < 2; i++) { - writeBatch(connection, writer, false); + writeBatch(dbName, tblName, false); } // Start a third batch, but don't close it. this delta will be ignored by compaction since // it has an open txn in it - writeBatch(connection, writer, true); + connection = writeBatch(dbName, tblName, true); // Now, compact TxnStore txnHandler = TxnUtils.getTxnStore(conf); @@ -728,16 +722,16 @@ public void majorCompactWhileStreaming() throws Exception { Assert.assertEquals(name, "base_0000004"); checkExpectedTxnsPresent(stat[0].getPath(), null, columnNamesProperty, columnTypesProperty, 0, 1L, 4L, 1); } finally { - connection.close(); + if (connection != null) { + connection.close(); + } } } @Test public void minorCompactAfterAbort() throws Exception { - String agentInfo = "UT_" + Thread.currentThread().getName(); String dbName = "default"; String tblName = "cws"; - List colNames = Arrays.asList("a", "b"); String columnNamesProperty = "a,b"; String columnTypesProperty = "int:string"; executeStatementOnDriver("drop table if exists " + tblName, driver); @@ -745,21 +739,30 @@ public void minorCompactAfterAbort() throws Exception { " CLUSTERED BY(a) INTO 1 BUCKETS" + //currently ACID requires table to be bucketed " STORED AS ORC TBLPROPERTIES ('transactional'='true')", driver); - HiveEndPoint endPt = new HiveEndPoint(null, dbName, tblName, null); - DelimitedInputWriter writer = new DelimitedInputWriter(new String[] {"a","b"},",", endPt); - StreamingConnection connection = endPt.newConnection(false, "UT_" + Thread.currentThread().getName()); + StreamingConnection connection = null; try { // Write a couple of batches for (int i = 0; i < 2; i++) { - writeBatch(connection, writer, false); + connection = writeBatch(dbName, tblName, false); + assertNull(connection); } + StrictDelimitedInputWriter writer = StrictDelimitedInputWriter.newBuilder() + .withFieldDelimiter(',') + .build(); + StreamingConnection connection2 = HiveStreamingConnection.newBuilder() + .withDatabase(dbName) + .withTable(tblName) + .withAgentInfo("UT_" + Thread.currentThread().getName()) + .withHiveConf(conf) + .withRecordWriter(writer) + .withTransactionBatchSize(2) + .connect(); // Start a third batch, abort everything, don't properly close it 
- TransactionBatch txnBatch = connection.fetchTransactionBatch(2, writer); - txnBatch.beginNextTransaction(); - txnBatch.abort(); - txnBatch.beginNextTransaction(); - txnBatch.abort(); + connection2.beginNextTransaction(); + connection2.abort(); + connection2.beginNextTransaction(); + connection2.abort(); // Now, compact TxnStore txnHandler = TxnUtils.getTxnStore(conf); @@ -793,8 +796,11 @@ public void minorCompactAfterAbort() throws Exception { Assert.fail("Expected: " + Arrays.toString(expected) + ", found: " + Arrays.toString(names)); } checkExpectedTxnsPresent(null, new Path[]{resultDelta}, columnNamesProperty, columnTypesProperty, 0, 1L, 4L, 1); + connection2.close(); } finally { - connection.close(); + if (connection != null) { + connection.close(); + } } } @@ -802,7 +808,6 @@ public void minorCompactAfterAbort() throws Exception { public void majorCompactAfterAbort() throws Exception { String dbName = "default"; String tblName = "cws"; - List colNames = Arrays.asList("a", "b"); String columnNamesProperty = "a,b"; String columnTypesProperty = "int:string"; executeStatementOnDriver("drop table if exists " + tblName, driver); @@ -810,21 +815,30 @@ public void majorCompactAfterAbort() throws Exception { " CLUSTERED BY(a) INTO 1 BUCKETS" + //currently ACID requires table to be bucketed " STORED AS ORC TBLPROPERTIES ('transactional'='true')", driver); - HiveEndPoint endPt = new HiveEndPoint(null, dbName, tblName, null); - DelimitedInputWriter writer = new DelimitedInputWriter(new String[] {"a","b"},",", endPt); - StreamingConnection connection = endPt.newConnection(false, "UT_" + Thread.currentThread().getName()); + StreamingConnection connection = null; try { // Write a couple of batches for (int i = 0; i < 2; i++) { - writeBatch(connection, writer, false); + connection = writeBatch(dbName, tblName, false); + assertNull(connection); } + StrictDelimitedInputWriter writer = StrictDelimitedInputWriter.newBuilder() + .withFieldDelimiter(',') + .build(); + StreamingConnection connection2 = HiveStreamingConnection.newBuilder() + .withDatabase(dbName) + .withTable(tblName) + .withAgentInfo("UT_" + Thread.currentThread().getName()) + .withHiveConf(conf) + .withRecordWriter(writer) + .withTransactionBatchSize(2) + .connect(); // Start a third batch, but don't close it. 
- TransactionBatch txnBatch = connection.fetchTransactionBatch(2, writer); - txnBatch.beginNextTransaction(); - txnBatch.abort(); - txnBatch.beginNextTransaction(); - txnBatch.abort(); + connection2.beginNextTransaction(); + connection2.abort(); + connection2.beginNextTransaction(); + connection2.abort(); // Now, compact @@ -855,8 +869,11 @@ public void majorCompactAfterAbort() throws Exception { Assert.fail("majorCompactAfterAbort name " + name + " not equals to base_0000004"); } checkExpectedTxnsPresent(stat[0].getPath(), null, columnNamesProperty, columnTypesProperty, 0, 1L, 4L, 1); + connection2.close(); } finally { - connection.close(); + if (connection != null) { + connection.close(); + } } } @@ -864,7 +881,6 @@ public void majorCompactAfterAbort() throws Exception { public void majorCompactWhileStreamingForSplitUpdate() throws Exception { String dbName = "default"; String tblName = "cws"; - List colNames = Arrays.asList("a", "b"); String columnNamesProperty = "a,b"; String columnTypesProperty = "int:string"; executeStatementOnDriver("drop table if exists " + tblName, driver); @@ -873,17 +889,15 @@ public void majorCompactWhileStreamingForSplitUpdate() throws Exception { " STORED AS ORC TBLPROPERTIES ('transactional'='true', " + "'transactional_properties'='default') ", driver); // this turns on split-update U=D+I - HiveEndPoint endPt = new HiveEndPoint(null, dbName, tblName, null); - DelimitedInputWriter writer = new DelimitedInputWriter(new String[] {"a","b"},",", endPt); - StreamingConnection connection = endPt.newConnection(false, "UT_" + Thread.currentThread().getName()); + StreamingConnection connection = null; try { // Write a couple of batches for (int i = 0; i < 2; i++) { - writeBatch(connection, writer, false); + writeBatch(dbName, tblName, false); } // Start a third batch, but don't close it. 
- writeBatch(connection, writer, true); + connection = writeBatch(dbName, tblName, true); // Now, compact TxnStore txnHandler = TxnUtils.getTxnStore(conf); @@ -909,16 +923,16 @@ public void majorCompactWhileStreamingForSplitUpdate() throws Exception { Assert.assertEquals(name, "base_0000004"); checkExpectedTxnsPresent(stat[0].getPath(), null, columnNamesProperty, columnTypesProperty, 0, 1L, 4L, 2); } finally { - connection.close(); + if (connection != null) { + connection.close(); + } } } @Test public void testMinorCompactionForSplitUpdateWithInsertsAndDeletes() throws Exception { - String agentInfo = "UT_" + Thread.currentThread().getName(); String dbName = "default"; String tblName = "cws"; - List colNames = Arrays.asList("a", "b"); String columnNamesProperty = "a,b"; String columnTypesProperty = "int:string"; executeStatementOnDriver("drop table if exists " + tblName, driver); @@ -995,10 +1009,8 @@ public void testMinorCompactionForSplitUpdateWithInsertsAndDeletes() throws Exce @Test public void testMinorCompactionForSplitUpdateWithOnlyInserts() throws Exception { - String agentInfo = "UT_" + Thread.currentThread().getName(); String dbName = "default"; String tblName = "cws"; - List colNames = Arrays.asList("a", "b"); String columnNamesProperty = "a,b"; String columnTypesProperty = "int:string"; executeStatementOnDriver("drop table if exists " + tblName, driver); @@ -1076,7 +1088,6 @@ public void testMinorCompactionForSplitUpdateWithOnlyInserts() throws Exception public void minorCompactWhileStreamingWithSplitUpdate() throws Exception { String dbName = "default"; String tblName = "cws"; - List colNames = Arrays.asList("a", "b"); String columnNamesProperty = "a,b"; String columnTypesProperty = "int:string"; executeStatementOnDriver("drop table if exists " + tblName, driver); @@ -1085,17 +1096,15 @@ public void minorCompactWhileStreamingWithSplitUpdate() throws Exception { " STORED AS ORC TBLPROPERTIES ('transactional'='true'," + "'transactional_properties'='default')", driver); - HiveEndPoint endPt = new HiveEndPoint(null, dbName, tblName, null); - DelimitedInputWriter writer = new DelimitedInputWriter(new String[] {"a","b"},",", endPt); - StreamingConnection connection = endPt.newConnection(false, "UT_" + Thread.currentThread().getName()); + StreamingConnection connection = null; try { // Write a couple of batches for (int i = 0; i < 2; i++) { - writeBatch(connection, writer, false); + writeBatch(dbName, tblName, false); } // Start a third batch, but don't close it. - writeBatch(connection, writer, true); + connection = writeBatch(dbName, tblName, true); // Now, compact TxnStore txnHandler = TxnUtils.getTxnStore(conf); @@ -1151,7 +1160,9 @@ public void minorCompactWhileStreamingWithSplitUpdate() throws Exception { checkExpectedTxnsPresent(null, new Path[]{minorCompactedDeleteDelta}, columnNamesProperty, columnTypesProperty, 0, 0L, 0L, 1); } finally { - connection.close(); + if (connection != null) { + connection.close(); + } } } @@ -1329,25 +1340,34 @@ public void testCompactionInfoHashCode() { Assert.assertEquals("The hash codes must be equal", compactionInfo.hashCode(), compactionInfo1.hashCode()); } - private void writeBatch(StreamingConnection connection, DelimitedInputWriter writer, - boolean closeEarly) - throws InterruptedException, StreamingException { - TransactionBatch txnBatch = connection.fetchTransactionBatch(2, writer); - txnBatch.beginNextTransaction(); - txnBatch.write("50,Kiev".getBytes()); - txnBatch.write("51,St. 
Petersburg".getBytes()); - txnBatch.write("44,Boston".getBytes()); - txnBatch.commit(); + private StreamingConnection writeBatch(String dbName, String tblName, boolean closeEarly) throws StreamingException { + StrictDelimitedInputWriter writer = StrictDelimitedInputWriter.newBuilder() + .withFieldDelimiter(',') + .build(); + StreamingConnection connection = HiveStreamingConnection.newBuilder() + .withDatabase(dbName) + .withTable(tblName) + .withAgentInfo("UT_" + Thread.currentThread().getName()) + .withHiveConf(conf) + .withRecordWriter(writer) + .withTransactionBatchSize(2) + .connect(); + connection.beginNextTransaction(); + connection.write("50,Kiev".getBytes()); + connection.write("51,St. Petersburg".getBytes()); + connection.write("44,Boston".getBytes()); + connection.commit(); if (!closeEarly) { - txnBatch.beginNextTransaction(); - txnBatch.write("52,Tel Aviv".getBytes()); - txnBatch.write("53,Atlantis".getBytes()); - txnBatch.write("53,Boston".getBytes()); - txnBatch.commit(); - - txnBatch.close(); + connection.beginNextTransaction(); + connection.write("52,Tel Aviv".getBytes()); + connection.write("53,Atlantis".getBytes()); + connection.write("53,Boston".getBytes()); + connection.commit(); + connection.close(); + return null; } + return connection; } private void checkExpectedTxnsPresent(Path base, Path[] deltas, String columnNamesProperty, diff --git a/metastore/src/java/org/apache/hadoop/hive/metastore/HiveClientCache.java b/metastore/src/java/org/apache/hadoop/hive/metastore/HiveClientCache.java new file mode 100644 index 0000000..6c33f63 --- /dev/null +++ b/metastore/src/java/org/apache/hadoop/hive/metastore/HiveClientCache.java @@ -0,0 +1,536 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.hadoop.hive.metastore; + +import java.io.IOException; +import java.util.concurrent.Callable; +import java.util.concurrent.ConcurrentMap; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.Executors; +import java.util.concurrent.ScheduledFuture; +import java.util.concurrent.ThreadFactory; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicInteger; + +import javax.security.auth.login.LoginException; + +import org.apache.commons.lang.builder.EqualsBuilder; +import org.apache.commons.lang.builder.HashCodeBuilder; +import org.apache.hadoop.hive.common.classification.InterfaceAudience; +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.metastore.annotation.NoReconnect; +import org.apache.hadoop.hive.metastore.api.MetaException; +import org.apache.hadoop.hive.shims.Utils; +import org.apache.hadoop.security.UserGroupInformation; +import org.apache.hive.common.util.ShutdownHookManager; +import org.apache.thrift.TException; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.google.common.cache.Cache; +import com.google.common.cache.CacheBuilder; +import com.google.common.cache.RemovalListener; +import com.google.common.cache.RemovalNotification; +import com.google.common.util.concurrent.ThreadFactoryBuilder; + +/** + * A thread safe time expired cache for HiveMetaStoreClient + */ +class HiveClientCache { + public final static int DEFAULT_HIVE_CACHE_EXPIRY_TIME_SECONDS = 2 * 60; + public final static int DEFAULT_HIVE_CACHE_INITIAL_CAPACITY = 50; + public final static int DEFAULT_HIVE_CACHE_MAX_CAPACITY = 50; + public final static boolean DEFAULT_HIVE_CLIENT_CACHE_STATS_ENABLED = false; + + private final Cache hiveCache; + private static final Logger LOG = LoggerFactory.getLogger(HiveClientCache.class); + private final int timeout; + // This lock is used to make sure removalListener won't close a client that is being contemplated for returning by get() + private final Object CACHE_TEARDOWN_LOCK = new Object(); + + private static final AtomicInteger nextId = new AtomicInteger(0); + + private final ScheduledFuture cleanupHandle; // used to cleanup cache + + private boolean enableStats; + + // Since HiveMetaStoreClient is not threadsafe, hive clients are not shared across threads. + // Thread local variable containing each thread's unique ID, is used as one of the keys for the cache + // causing each thread to get a different client even if the conf is same. + private static final ThreadLocal threadId = + new ThreadLocal() { + @Override + protected Integer initialValue() { + return nextId.getAndIncrement(); + } + }; + + private int getThreadId() { + return threadId.get(); + } + + public static IMetaStoreClient getNonCachedHiveMetastoreClient(HiveConf hiveConf) throws MetaException { + return RetryingMetaStoreClient.getProxy(hiveConf, true); + } + + public HiveClientCache(HiveConf hiveConf) { + this((int) HiveConf.getTimeVar(hiveConf, HiveConf.ConfVars.METASTORE_CLIENT_CACHE_EXPIRY_TIME, TimeUnit.SECONDS), + HiveConf.getIntVar(hiveConf, HiveConf.ConfVars.METASTORE_CLIENT_CACHE_INITIAL_CAPACITY), + HiveConf.getIntVar(hiveConf, HiveConf.ConfVars.METASTORE_CLIENT_CACHE_MAX_CAPACITY), + HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.METASTORE_CLIENT_CACHE_STATS_ENABLED)); + } + + /** + * @deprecated This constructor will be made private or removed as more configuration properties are required. 
+ */ + @Deprecated + public HiveClientCache(final int timeout) { + this(timeout, DEFAULT_HIVE_CACHE_INITIAL_CAPACITY, DEFAULT_HIVE_CACHE_MAX_CAPACITY, DEFAULT_HIVE_CLIENT_CACHE_STATS_ENABLED); + } + + /** + * @param timeout the length of time in seconds after a client is created that it should be automatically removed + */ + private HiveClientCache(final int timeout, final int initialCapacity, final int maxCapacity, final boolean enableStats) { + this.timeout = timeout; + this.enableStats = enableStats; + + LOG.info("Initializing cache: eviction-timeout=" + timeout + " initial-capacity=" + initialCapacity + " maximum-capacity=" + maxCapacity); + + CacheBuilder builder = CacheBuilder.newBuilder() + .initialCapacity(initialCapacity) + .maximumSize(maxCapacity) + .expireAfterAccess(timeout, TimeUnit.SECONDS) + .removalListener(createRemovalListener()); + + /* + * Guava versions <12.0 have stats collection enabled by default and do not expose a recordStats method. + * Check for newer versions of the library and ensure that stats collection is enabled by default. + */ + try { + java.lang.reflect.Method m = builder.getClass().getMethod("recordStats", null); + m.invoke(builder, null); + } catch (NoSuchMethodException e) { + LOG.debug("Using a version of guava <12.0. Stats collection is enabled by default."); + } catch (Exception e) { + LOG.warn("Unable to invoke recordStats method.", e); + } + + this.hiveCache = builder.build(); + + /* + * We need to use a cleanup interval, which is how often the cleanup thread will kick in + * and go do a check to see if any of the connections can be expired. We don't want to + * do this too often, because it'd be like having a mini-GC going off every so often, + * so we limit it to a minimum of DEFAULT_HIVE_CACHE_EXPIRY_TIME_SECONDS. If the client + * has explicitly set a larger timeout on the cache, though, we respect that, and use that + */ + long cleanupInterval = timeout > DEFAULT_HIVE_CACHE_EXPIRY_TIME_SECONDS ? timeout : DEFAULT_HIVE_CACHE_EXPIRY_TIME_SECONDS; + + this.cleanupHandle = createCleanupThread(cleanupInterval); + + createShutdownHook(); + } + + private RemovalListener createRemovalListener() { + RemovalListener listener = + new RemovalListener() { + @Override + public void onRemoval(RemovalNotification notification) { + ICacheableMetaStoreClient hiveMetaStoreClient = notification.getValue(); + if (hiveMetaStoreClient != null) { + if (LOG.isDebugEnabled()) { + LOG.debug("Evicting client: " + Integer.toHexString(System.identityHashCode(hiveMetaStoreClient))); + } + + // TODO: This global lock may not be necessary as all concurrent methods in ICacheableMetaStoreClient + // are synchronized. + synchronized (CACHE_TEARDOWN_LOCK) { + hiveMetaStoreClient.setExpiredFromCache(); + hiveMetaStoreClient.tearDownIfUnused(); + } + } + } + }; + + return listener; + } + + private ScheduledFuture createCleanupThread(long interval) { + // Add a maintenance thread that will attempt to trigger a cache clean continuously + Runnable cleanupThread = new Runnable() { + @Override + public void run() { + cleanup(); + } + }; + + /** + * Create the cleanup handle. In addition to cleaning up every cleanupInterval, we add + * a slight offset, so that the very first time it runs, it runs with a slight delay, so + * as to catch any other connections that were closed when the first timeout happened. 
+ * As a result, the time we can expect an unused connection to be reaped is + * 5 seconds after the first timeout, and then after that, it'll check for whether or not + * it can be cleaned every max(DEFAULT_HIVE_CACHE_EXPIRY_TIME_SECONDS,timeout) seconds + */ + ThreadFactory daemonThreadFactory = (new ThreadFactoryBuilder()).setDaemon(true) + .setNameFormat("HiveClientCache-cleaner-%d") + .build(); + + return Executors.newScheduledThreadPool(1, daemonThreadFactory) + .scheduleWithFixedDelay(cleanupThread, timeout + 5, interval, TimeUnit.SECONDS); + } + + private void createShutdownHook() { + // Add a shutdown hook for cleanup, if there are elements remaining in the cache which were not cleaned up. + // This is the best effort approach. Ignore any error while doing so. Notice that most of the clients + // would get cleaned up via either the removalListener or the close() call, only the active clients + // that are in the cache or expired but being used in other threads wont get cleaned. The following code will only + // clean the active cache ones. The ones expired from cache but being hold by other threads are in the mercy + // of finalize() being called. + Thread cleanupHiveClientShutdownThread = new Thread() { + @Override + public void run() { + LOG.debug("Cleaning up hive client cache in ShutDown hook"); + cleanupHandle.cancel(false); // Cancel the maintenance thread. + closeAllClientsQuietly(); + } + }; + + ShutdownHookManager.addShutdownHook(cleanupHiveClientShutdownThread); + } + + /** + * Note: This doesn't check if they are being used or not, meant only to be called during shutdown etc. + */ + void closeAllClientsQuietly() { + try { + ConcurrentMap elements = hiveCache.asMap(); + for (ICacheableMetaStoreClient cacheableHiveMetaStoreClient : elements.values()) { + cacheableHiveMetaStoreClient.tearDown(); + } + } catch (Exception e) { + LOG.warn("Clean up of hive clients in the cache failed. Ignored", e); + } + + if (this.enableStats) { + LOG.info("Cache statistics after shutdown: size=" + hiveCache.size() + " " + hiveCache.stats()); + } + } + + public void cleanup() { + // TODO: periodically reload a new HiveConf to check if stats reporting is enabled. + hiveCache.cleanUp(); + + if (enableStats) { + LOG.info("Cache statistics after cleanup: size=" + hiveCache.size() + " " + hiveCache.stats()); + } + } + + /** + * Returns a cached client if exists or else creates one, caches and returns it. It also checks that the client is + * healthy and can be reused + * @param hiveConf + * @return the hive client + * @throws MetaException + * @throws IOException + * @throws LoginException + */ + public IMetaStoreClient get(final HiveConf hiveConf) throws MetaException, IOException, LoginException { + final HiveClientCacheKey cacheKey = HiveClientCacheKey.fromHiveConf(hiveConf, getThreadId()); + ICacheableMetaStoreClient cacheableHiveMetaStoreClient = null; + + // the hmsc is not shared across threads. So the only way it could get closed while we are doing healthcheck + // is if removalListener closes it. 
The synchronization takes care that removalListener won't do it + synchronized (CACHE_TEARDOWN_LOCK) { + cacheableHiveMetaStoreClient = getOrCreate(cacheKey); + cacheableHiveMetaStoreClient.acquire(); + } + if (!cacheableHiveMetaStoreClient.isOpen()) { + synchronized (CACHE_TEARDOWN_LOCK) { + hiveCache.invalidate(cacheKey); + cacheableHiveMetaStoreClient.close(); + cacheableHiveMetaStoreClient = getOrCreate(cacheKey); + cacheableHiveMetaStoreClient.acquire(); + } + } + return cacheableHiveMetaStoreClient; + } + + /** + * Return from cache if exists else create/cache and return + * @param cacheKey + * @return + * @throws IOException + * @throws MetaException + * @throws LoginException + */ + private ICacheableMetaStoreClient getOrCreate(final HiveClientCacheKey cacheKey) + throws IOException, MetaException, LoginException { + try { + return hiveCache.get(cacheKey, new Callable() { + @Override + public ICacheableMetaStoreClient call() throws MetaException { + // This is called from HCat, so always allow embedded metastore (as was the default). + return + (ICacheableMetaStoreClient) RetryingMetaStoreClient.getProxy(cacheKey.getHiveConf(), + new Class[]{HiveConf.class, Integer.class, Boolean.class}, + new Object[]{cacheKey.getHiveConf(), timeout, true}, + CacheableHiveMetaStoreClient.class.getName()); + } + }); + } catch (ExecutionException e) { + Throwable t = e.getCause(); + if (t instanceof IOException) { + throw (IOException) t; + } else if (t instanceof MetaException) { + throw (MetaException) t; + } else if (t instanceof LoginException) { + throw (LoginException) t; + } else { + throw new IOException("Error creating hiveMetaStoreClient", t); + } + } + } + + /** + * A class to wrap HiveConf and expose equality based only on UserGroupInformation and the metaStoreURIs. + * This becomes the key for the cache and this way the same HiveMetaStoreClient would be returned if + * UserGroupInformation and metaStoreURIs are same. This function can evolve to express + * the cases when HiveConf is different but the same hiveMetaStoreClient can be used + */ + static class HiveClientCacheKey { + final private String metaStoreURIs; + final private UserGroupInformation ugi; + final private HiveConf hiveConf; + final private int threadId; + + private HiveClientCacheKey(HiveConf hiveConf, final int threadId) throws IOException, LoginException { + this.metaStoreURIs = hiveConf.getVar(HiveConf.ConfVars.METASTOREURIS); + ugi = Utils.getUGI(); + this.hiveConf = hiveConf; + this.threadId = threadId; + } + + public static HiveClientCacheKey fromHiveConf(HiveConf hiveConf, final int threadId) throws IOException, LoginException { + return new HiveClientCacheKey(hiveConf, threadId); + } + + public HiveConf getHiveConf() { + return hiveConf; + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + HiveClientCacheKey that = (HiveClientCacheKey) o; + return new EqualsBuilder(). + append(this.metaStoreURIs, + that.metaStoreURIs). + append(this.ugi, that.ugi). + append(this.threadId, that.threadId).isEquals(); + } + + @Override + public int hashCode() { + return new HashCodeBuilder(). + append(metaStoreURIs). + append(ugi). 
+ append(threadId).toHashCode(); + } + + @Override + public String toString() { + return "HiveClientCacheKey: uri=" + this.metaStoreURIs + " ugi=" + this.ugi + " thread=" + this.threadId; + } + } + + @InterfaceAudience.Private + public interface ICacheableMetaStoreClient extends IMetaStoreClient { + @NoReconnect + void acquire(); + + @NoReconnect + void setExpiredFromCache(); + + @NoReconnect + AtomicInteger getUsers(); + + @NoReconnect + boolean isClosed(); + + /** + * @deprecated This method is not used internally and should not be visible through HCatClient.create. + */ + @Deprecated + @NoReconnect + boolean isOpen(); + + @NoReconnect + void tearDownIfUnused(); + + @NoReconnect + void tearDown(); + } + + /** + * Add # of current users on HiveMetaStoreClient, so that the client can be cleaned when no one is using it. + */ + static class CacheableHiveMetaStoreClient extends HiveMetaStoreClient implements ICacheableMetaStoreClient { + + private final AtomicInteger users = new AtomicInteger(0); + private volatile boolean expiredFromCache = false; + private boolean isClosed = false; + + CacheableHiveMetaStoreClient(final HiveConf conf, final Integer timeout, Boolean allowEmbedded) + throws MetaException { + super(conf, null, allowEmbedded); + } + + /** + * Increments the user count and optionally renews the expiration time. + * renew should correspond with the expiration policy of the cache. + * When the policy is expireAfterAccess, the expiration time should be extended. + * When the policy is expireAfterWrite, the expiration time should not be extended. + * A mismatch with the policy will lead to closing the connection unnecessarily after the initial + * expiration time is generated. + */ + public synchronized void acquire() { + users.incrementAndGet(); + if (users.get() > 1) { + LOG.warn("Unexpected increment of user count beyond one: " + users.get() + " " + this); + } + } + + /** + * Decrements the user count. + */ + private void release() { + if (users.get() > 0) { + users.decrementAndGet(); + } else { + LOG.warn("Unexpected attempt to decrement user count of zero: " + users.get() + " " + this); + } + } + + /** + * Communicate to the client that it is no longer in the cache. + * The expiration time should be voided to allow the connection to be closed at the first opportunity. + */ + public synchronized void setExpiredFromCache() { + if (users.get() != 0) { + LOG.warn("Evicted client has non-zero user count: " + users.get()); + } + + expiredFromCache = true; + } + + public boolean isClosed() { + return isClosed; + } + + /* + * Used only for Debugging or testing purposes + */ + public AtomicInteger getUsers() { + return users; + } + + /** + * Make a call to hive meta store and see if the client is still usable. Some calls where the user provides + * invalid data renders the client unusable for future use (example: create a table with very long table name) + * @return + */ + @Deprecated + public boolean isOpen() { + try { + // Look for an unlikely database name and see if either MetaException or TException is thrown + super.getDatabases("NonExistentDatabaseUsedForHealthCheck"); + } catch (TException e) { + return false; + } + return true; + } + + /** + * Decrement the user count and piggyback this to set expiry flag as well, then teardown(), if conditions are met. + * This *MUST* be called by anyone who uses this client. + */ + @Override + public synchronized void close() { + release(); + tearDownIfUnused(); + } + + /** + * Attempt to tear down the client connection. 
+     * The connection will be closed if the following conditions hold:
+     *  1. There are no active users holding the client.
+     *  2. The client has been evicted from the cache.
+     */
+    public synchronized void tearDownIfUnused() {
+      if (users.get() != 0) {
+        LOG.warn("Non-zero user count preventing client tear down: users=" + users.get() + " expired=" + expiredFromCache);
+      }
+
+      if (users.get() == 0 && expiredFromCache) {
+        this.tearDown();
+      }
+    }
+
+    /**
+     * Close the underlying objects irrespective of whether they are in use or not.
+     */
+    public void tearDown() {
+      try {
+        if (!isClosed) {
+          super.close();
+        }
+        isClosed = true;
+      } catch (Exception e) {
+        LOG.warn("Error closing hive metastore client. Ignored.", e);
+      }
+    }
+
+    @Override
+    public String toString() {
+      return "HCatClient: thread: " + Thread.currentThread().getId() + " users=" + users.get() +
+          " expired=" + expiredFromCache + " closed=" + isClosed;
+    }
+
+    /**
+     * GC is attempting to destroy the object.
+     * No one references this client anymore, so it can be torn down without worrying about user counts.
+     * @throws Throwable
+     */
+    @Override
+    protected void finalize() throws Throwable {
+      if (users.get() != 0) {
+        LOG.warn("Closing client with non-zero user count: users=" + users.get() + " expired=" + expiredFromCache);
+      }
+
+      try {
+        this.tearDown();
+      } finally {
+        super.finalize();
+      }
+    }
+  }
+}
diff --git a/metastore/src/java/org/apache/hadoop/hive/metastore/HiveMetaStoreUtils.java b/metastore/src/java/org/apache/hadoop/hive/metastore/HiveMetaStoreUtils.java
index a66c135..75a7201 100644
--- a/metastore/src/java/org/apache/hadoop/hive/metastore/HiveMetaStoreUtils.java
+++ b/metastore/src/java/org/apache/hadoop/hive/metastore/HiveMetaStoreUtils.java
@@ -18,9 +18,13 @@
 package org.apache.hadoop.hive.metastore;

+import java.io.IOException;
 import java.util.ArrayList;
 import java.util.List;

+import javax.security.auth.login.LoginException;
+
+import org.apache.hadoop.hive.conf.HiveConf;
 import org.apache.hadoop.hive.metastore.utils.MetaStoreUtils;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -42,6 +46,7 @@ public class HiveMetaStoreUtils {

   protected static final Logger LOG = LoggerFactory.getLogger("hive.log");
+  private static volatile HiveClientCache hiveClientCache;

   /**
    * getDeserializer
@@ -210,4 +215,34 @@ public static FieldSchema getFieldSchemaFromTypeInfo(String fieldName,
         "generated by TypeInfoUtils.getFieldSchemaFromTypeInfo");
   }

+  /**
+   * Get or create a hive client depending on whether it exists in the cache or not
+   * @param hiveConf The hive configuration
+   * @return the client
+   * @throws MetaException When HiveMetaStoreClient couldn't be created
+   * @throws IOException
+   */
+  public static IMetaStoreClient getHiveMetastoreClient(HiveConf hiveConf)
+    throws MetaException, IOException {
+
+    if (!HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.METASTORE_CLIENT_CACHE_ENABLED)){
+      // If cache is disabled, don't use it.
+      return HiveClientCache.getNonCachedHiveMetastoreClient(hiveConf);
+    }
+
+    // Singleton behaviour: create the cache instance if required.
+ if (hiveClientCache == null) { + synchronized (IMetaStoreClient.class) { + if (hiveClientCache == null) { + hiveClientCache = new HiveClientCache(hiveConf); + } + } + } + try { + return hiveClientCache.get(hiveConf); + } catch (LoginException e) { + throw new IOException("Couldn't create hiveMetaStoreClient, Error getting UGI for user", e); + } + } + } diff --git a/serde/src/java/org/apache/hadoop/hive/serde2/JsonSerDe.java b/serde/src/java/org/apache/hadoop/hive/serde2/JsonSerDe.java new file mode 100644 index 0000000..40b2e8e --- /dev/null +++ b/serde/src/java/org/apache/hadoop/hive/serde2/JsonSerDe.java @@ -0,0 +1,669 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.hadoop.hive.serde2; + +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.nio.charset.CharacterCodingException; +import java.sql.Date; +import java.sql.Timestamp; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.Properties; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hive.common.type.HiveChar; +import org.apache.hadoop.hive.common.type.HiveDecimal; +import org.apache.hadoop.hive.common.type.HiveVarchar; +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.serde.serdeConstants; +import org.apache.hadoop.hive.serde2.lazy.LazyFactory; +import org.apache.hadoop.hive.serde2.lazy.LazyObjectBase; +import org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe; +import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.MapObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.StandardStructObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.StructField; +import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.UnionObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.BinaryObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.BooleanObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.ByteObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.DateObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.DoubleObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.FloatObjectInspector; +import 
org.apache.hadoop.hive.serde2.objectinspector.primitive.HiveCharObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.HiveDecimalObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.HiveVarcharObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.IntObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.LongObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.ShortObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.TimestampObjectInspector; +import org.apache.hadoop.hive.serde2.typeinfo.BaseCharTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.MapTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.Writable; +import org.apache.hive.common.util.HiveStringUtils; +import org.apache.hive.common.util.TimestampParser; +import org.codehaus.jackson.JsonFactory; +import org.codehaus.jackson.JsonParseException; +import org.codehaus.jackson.JsonParser; +import org.codehaus.jackson.JsonToken; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +@SerDeSpec(schemaProps = {serdeConstants.LIST_COLUMNS, + serdeConstants.LIST_COLUMN_TYPES, + serdeConstants.TIMESTAMP_FORMATS}) + +// FIXME: move TestJsonSerDe from hcat to serde2 +public class JsonSerDe extends AbstractSerDe { + + private static final Logger LOG = LoggerFactory.getLogger(JsonSerDe.class); + private List columnNames; + private StructTypeInfo schema; + + private JsonFactory jsonFactory = null; + + private StandardStructObjectInspector cachedObjectInspector; + private TimestampParser tsParser; + + @Override + public void initialize(Configuration conf, Properties tbl) + throws SerDeException { + List columnTypes; + StructTypeInfo rowTypeInfo; + + LOG.debug("Initializing JsonSerDe: {}", tbl.entrySet()); + + // Get column names and types + String columnNameProperty = tbl.getProperty(serdeConstants.LIST_COLUMNS); + String columnTypeProperty = tbl.getProperty(serdeConstants.LIST_COLUMN_TYPES); + final String columnNameDelimiter = tbl.containsKey(serdeConstants.COLUMN_NAME_DELIMITER) ? 
tbl + .getProperty(serdeConstants.COLUMN_NAME_DELIMITER) : String.valueOf(SerDeUtils.COMMA); + // all table column names + if (columnNameProperty.isEmpty()) { + columnNames = Collections.emptyList(); + } else { + columnNames = Arrays.asList(columnNameProperty.split(columnNameDelimiter)); + } + + // all column types + if (columnTypeProperty.isEmpty()) { + columnTypes = Collections.emptyList(); + } else { + columnTypes = TypeInfoUtils.getTypeInfosFromTypeString(columnTypeProperty); + } + + LOG.debug("columns: {}, {}", columnNameProperty, columnNames); + LOG.debug("types: {}, {} ", columnTypeProperty, columnTypes); + + assert (columnNames.size() == columnTypes.size()); + + rowTypeInfo = (StructTypeInfo) TypeInfoFactory.getStructTypeInfo(columnNames, columnTypes); + schema = rowTypeInfo; + LOG.debug("schema : {}", schema); + cachedObjectInspector = (StandardStructObjectInspector) TypeInfoUtils.getStandardJavaObjectInspectorFromTypeInfo(rowTypeInfo); + + jsonFactory = new JsonFactory(); + tsParser = new TimestampParser( + HiveStringUtils.splitAndUnEscape(tbl.getProperty(serdeConstants.TIMESTAMP_FORMATS))); + } + + /** + * Takes JSON string in Text form, and has to return an object representation above + * it that's readable by the corresponding object inspector. + * For this implementation, since we're using the jackson parser, we can construct + * our own object implementation, and we use HCatRecord for it + */ + @Override + public Object deserialize(Writable blob) throws SerDeException { + + Text t = (Text) blob; + JsonParser p; + List r = new ArrayList<>(Collections.nCopies(columnNames.size(), null)); + try { + p = jsonFactory.createJsonParser(new ByteArrayInputStream((t.getBytes()))); + if (p.nextToken() != JsonToken.START_OBJECT) { + throw new IOException("Start token not found where expected"); + } + JsonToken token; + while (((token = p.nextToken()) != JsonToken.END_OBJECT) && (token != null)) { + // iterate through each token, and create appropriate object here. + populateRecord(r, token, p, schema); + } + } catch (JsonParseException e) { + LOG.warn("Error [{}] parsing json text [{}].", e, t); + throw new SerDeException(e); + } catch (IOException e) { + LOG.warn("Error [{}] parsing json text [{}].", e, t); + throw new SerDeException(e); + } + + return r; + } + + private void populateRecord(List r, JsonToken token, JsonParser p, StructTypeInfo s) throws IOException { + if (token != JsonToken.FIELD_NAME) { + throw new IOException("Field name expected"); + } + String fieldName = p.getText(); + int fpos = s.getAllStructFieldNames().indexOf(fieldName); + if (fpos == -1) { + fpos = getPositionFromHiveInternalColumnName(fieldName); + LOG.debug("NPE finding position for field [{}] in schema [{}]," + + " attempting to check if it is an internal column name like _col0", fieldName, s); + if (fpos == -1) { + skipValue(p); + return; // unknown field, we return. We'll continue from the next field onwards. + } + // If we get past this, then the column name did match the hive pattern for an internal + // column name, such as _col0, etc, so it *MUST* match the schema for the appropriate column. + // This means people can't use arbitrary column names such as _col0, and expect us to ignore it + // if we find it. 
+ if (!fieldName.equalsIgnoreCase(getHiveInternalColumnName(fpos))) { + LOG.error("Hive internal column name {} and position " + + "encoding {} for the column name are at odds", fieldName, fpos); + throw new IOException("Hive internal column name (" + fieldName + + ") and position encoding (" + fpos + + ") for the column name are at odds"); + } + // If we reached here, then we were successful at finding an alternate internal + // column mapping, and we're about to proceed. + } + Object currField = extractCurrentField(p, s.getStructFieldTypeInfo(fieldName), false); + r.set(fpos, currField); + } + + public String getHiveInternalColumnName(int fpos) { + return HiveConf.getColumnInternalName(fpos); + } + + public int getPositionFromHiveInternalColumnName(String internalName) { + // return HiveConf.getPositionFromInternalName(fieldName); + // The above line should have been all the implementation that + // we need, but due to a bug in that impl which recognizes + // only single-digit columns, we need another impl here. + Pattern internalPattern = Pattern.compile("_col([0-9]+)"); + Matcher m = internalPattern.matcher(internalName); + if (!m.matches()) { + return -1; + } else { + return Integer.parseInt(m.group(1)); + } + } + + /** + * Utility method to extract (and forget) the next value token from the JsonParser, + * as a whole. The reason this function gets called is to yank out the next value altogether, + * because it corresponds to a field name that we do not recognize, and thus, do not have + * a schema/type for. Thus, this field is to be ignored. + * + * @throws IOException + * @throws JsonParseException + */ + private void skipValue(JsonParser p) throws JsonParseException, IOException { + JsonToken valueToken = p.nextToken(); + + if ((valueToken == JsonToken.START_ARRAY) || (valueToken == JsonToken.START_OBJECT)) { + // if the currently read token is a beginning of an array or object, move stream forward + // skipping any child tokens till we're at the corresponding END_ARRAY or END_OBJECT token + p.skipChildren(); + } + // At the end of this function, the stream should be pointing to the last token that + // corresponds to the value being skipped. This way, the next call to nextToken + // will advance it to the next field name. + } + + /** + * Utility method to extract current expected field from given JsonParser + * isTokenCurrent is a boolean variable also passed in, which determines + * if the JsonParser is already at the token we expect to read next, or + * needs advancing to the next before we read. + */ + private Object extractCurrentField(JsonParser p, TypeInfo fieldTypeInfo, + boolean isTokenCurrent) throws IOException { + Object val = null; + JsonToken valueToken; + if (isTokenCurrent) { + valueToken = p.getCurrentToken(); + } else { + valueToken = p.nextToken(); + } + + switch (fieldTypeInfo.getCategory()) { + case PRIMITIVE: + PrimitiveObjectInspector.PrimitiveCategory primitiveCategory = PrimitiveObjectInspector.PrimitiveCategory.UNKNOWN; + if (fieldTypeInfo instanceof PrimitiveTypeInfo) { + primitiveCategory = ((PrimitiveTypeInfo) fieldTypeInfo).getPrimitiveCategory(); + } + switch (primitiveCategory) { + case INT: + val = (valueToken == JsonToken.VALUE_NULL) ? null : p.getIntValue(); + break; + case BYTE: + val = (valueToken == JsonToken.VALUE_NULL) ? null : p.getByteValue(); + break; + case SHORT: + val = (valueToken == JsonToken.VALUE_NULL) ? null : p.getShortValue(); + break; + case LONG: + val = (valueToken == JsonToken.VALUE_NULL) ? 
null : p.getLongValue(); + break; + case BOOLEAN: + String bval = (valueToken == JsonToken.VALUE_NULL) ? null : p.getText(); + if (bval != null) { + val = Boolean.valueOf(bval); + } else { + val = null; + } + break; + case FLOAT: + val = (valueToken == JsonToken.VALUE_NULL) ? null : p.getFloatValue(); + break; + case DOUBLE: + val = (valueToken == JsonToken.VALUE_NULL) ? null : p.getDoubleValue(); + break; + case STRING: + val = (valueToken == JsonToken.VALUE_NULL) ? null : p.getText(); + break; + case BINARY: + String b = (valueToken == JsonToken.VALUE_NULL) ? null : p.getText(); + if (b != null) { + try { + String t = Text.decode(b.getBytes(), 0, b.getBytes().length); + return t.getBytes(); + } catch (CharacterCodingException e) { + LOG.warn("Error generating json binary type from object.", e); + return null; + } + } else { + val = null; + } + break; + case DATE: + val = (valueToken == JsonToken.VALUE_NULL) ? null : Date.valueOf(p.getText()); + break; + case TIMESTAMP: + val = (valueToken == JsonToken.VALUE_NULL) ? null : tsParser.parseTimestamp(p.getText()); + break; + case DECIMAL: + val = (valueToken == JsonToken.VALUE_NULL) ? null : HiveDecimal.create(p.getText()); + break; + case VARCHAR: + int vLen = ((BaseCharTypeInfo) fieldTypeInfo).getLength(); + val = (valueToken == JsonToken.VALUE_NULL) ? null : new HiveVarchar(p.getText(), vLen); + break; + case CHAR: + int cLen = ((BaseCharTypeInfo) fieldTypeInfo).getLength(); + val = (valueToken == JsonToken.VALUE_NULL) ? null : new HiveChar(p.getText(), cLen); + break; + } + break; + case LIST: + if (valueToken == JsonToken.VALUE_NULL) { + val = null; + break; + } + if (valueToken != JsonToken.START_ARRAY) { + throw new IOException("Start of Array expected"); + } + List arr = new ArrayList(); + while ((valueToken = p.nextToken()) != JsonToken.END_ARRAY) { + arr.add(extractCurrentField(p, ((ListTypeInfo)fieldTypeInfo).getListElementTypeInfo(), true)); + } + val = arr; + break; + case MAP: + if (valueToken == JsonToken.VALUE_NULL) { + val = null; + break; + } + if (valueToken != JsonToken.START_OBJECT) { + throw new IOException("Start of Object expected"); + } + Map map = new LinkedHashMap(); + while ((valueToken = p.nextToken()) != JsonToken.END_OBJECT) { + Object k = getObjectOfCorrespondingPrimitiveType(p.getCurrentName(), + (PrimitiveTypeInfo) ((MapTypeInfo)fieldTypeInfo).getMapKeyTypeInfo()); + Object v = extractCurrentField(p, ((MapTypeInfo) fieldTypeInfo).getMapValueTypeInfo(), false); + map.put(k, v); + } + val = map; + break; + case STRUCT: + if (valueToken == JsonToken.VALUE_NULL) { + val = null; + break; + } + if (valueToken != JsonToken.START_OBJECT) { + throw new IOException("Start of Object expected"); + } + ArrayList subSchema = ((StructTypeInfo)fieldTypeInfo).getAllStructFieldTypeInfos(); + int sz = subSchema.size(); + List struct = new ArrayList(Collections.nCopies(sz, null)); + while ((valueToken = p.nextToken()) != JsonToken.END_OBJECT) { + populateRecord(struct, valueToken, p, ((StructTypeInfo) fieldTypeInfo)); + } + val = struct; + break; + default: + LOG.error("Unknown type found: " + fieldTypeInfo); + return null; + } + return val; + } + + private Object getObjectOfCorrespondingPrimitiveType(String s, PrimitiveTypeInfo mapKeyType) + throws IOException { + switch (mapKeyType.getPrimitiveCategory()) { + case INT: + return Integer.valueOf(s); + case BYTE: + return Byte.valueOf(s); + case SHORT: + return Short.valueOf(s); + case LONG: + return Long.valueOf(s); + case BOOLEAN: + return (s.equalsIgnoreCase("true")); + case 
FLOAT: + return Float.valueOf(s); + case DOUBLE: + return Double.valueOf(s); + case STRING: + return s; + case BINARY: + try { + String t = Text.decode(s.getBytes(), 0, s.getBytes().length); + return t.getBytes(); + } catch (CharacterCodingException e) { + LOG.warn("Error generating json binary type from object.", e); + return null; + } + case DATE: + return Date.valueOf(s); + case TIMESTAMP: + return Timestamp.valueOf(s); + case DECIMAL: + return HiveDecimal.create(s); + case VARCHAR: + return new HiveVarchar(s, ((BaseCharTypeInfo) mapKeyType).getLength()); + case CHAR: + return new HiveChar(s, ((BaseCharTypeInfo) mapKeyType).getLength()); + } + throw new IOException("Could not convert from string to map type " + mapKeyType.getTypeName()); + } + + /** + * Given an object and object inspector pair, traverse the object + * and generate a Text representation of the object. + */ + @Override + public Writable serialize(Object obj, ObjectInspector objInspector) + throws SerDeException { + StringBuilder sb = new StringBuilder(); + try { + + StructObjectInspector soi = (StructObjectInspector) objInspector; + List structFields = soi.getAllStructFieldRefs(); + assert (columnNames.size() == structFields.size()); + if (obj == null) { + sb.append("null"); + } else { + sb.append(SerDeUtils.LBRACE); + for (int i = 0; i < structFields.size(); i++) { + if (i > 0) { + sb.append(SerDeUtils.COMMA); + } + appendWithQuotes(sb, columnNames.get(i)); + sb.append(SerDeUtils.COLON); + buildJSONString(sb, soi.getStructFieldData(obj, structFields.get(i)), + structFields.get(i).getFieldObjectInspector()); + } + sb.append(SerDeUtils.RBRACE); + } + + } catch (IOException e) { + LOG.warn("Error generating json text from object.", e); + throw new SerDeException(e); + } + return new Text(sb.toString()); + } + + private static StringBuilder appendWithQuotes(StringBuilder sb, String value) { + return sb == null ? null : sb.append(SerDeUtils.QUOTE).append(value).append(SerDeUtils.QUOTE); + } + + // TODO : code section copied over from SerDeUtils because of non-standard json production there + // should use quotes for all field names. We should fix this there, and then remove this copy. + // See http://jackson.codehaus.org/1.7.3/javadoc/org/codehaus/jackson/JsonParser.Feature.html#ALLOW_UNQUOTED_FIELD_NAMES + // for details - trying to enable Jackson to ignore that doesn't seem to work(compilation failure + // when attempting to use that feature, so having to change the production itself. + // Also, throws IOException when Binary is detected. + private static void buildJSONString(StringBuilder sb, Object o, ObjectInspector oi) throws IOException { + + switch (oi.getCategory()) { + case PRIMITIVE: { + PrimitiveObjectInspector poi = (PrimitiveObjectInspector) oi; + if (o == null) { + sb.append("null"); + } else { + switch (poi.getPrimitiveCategory()) { + case BOOLEAN: { + boolean b = ((BooleanObjectInspector) poi).get(o); + sb.append(b ? 
"true" : "false"); + break; + } + case BYTE: { + sb.append(((ByteObjectInspector) poi).get(o)); + break; + } + case SHORT: { + sb.append(((ShortObjectInspector) poi).get(o)); + break; + } + case INT: { + sb.append(((IntObjectInspector) poi).get(o)); + break; + } + case LONG: { + sb.append(((LongObjectInspector) poi).get(o)); + break; + } + case FLOAT: { + sb.append(((FloatObjectInspector) poi).get(o)); + break; + } + case DOUBLE: { + sb.append(((DoubleObjectInspector) poi).get(o)); + break; + } + case STRING: { + String s = + SerDeUtils.escapeString(((StringObjectInspector) poi).getPrimitiveJavaObject(o)); + appendWithQuotes(sb, s); + break; + } + case BINARY: + byte[] b = ((BinaryObjectInspector) oi).getPrimitiveJavaObject(o); + Text txt = new Text(); + txt.set(b, 0, b.length); + appendWithQuotes(sb, SerDeUtils.escapeString(txt.toString())); + break; + case DATE: + Date d = ((DateObjectInspector) poi).getPrimitiveJavaObject(o); + appendWithQuotes(sb, d.toString()); + break; + case TIMESTAMP: { + Timestamp t = ((TimestampObjectInspector) poi).getPrimitiveJavaObject(o); + appendWithQuotes(sb, t.toString()); + break; + } + case DECIMAL: + sb.append(((HiveDecimalObjectInspector) poi).getPrimitiveJavaObject(o)); + break; + case VARCHAR: { + String s = SerDeUtils.escapeString( + ((HiveVarcharObjectInspector) poi).getPrimitiveJavaObject(o).toString()); + appendWithQuotes(sb, s); + break; + } + case CHAR: { + //this should use HiveChar.getPaddedValue() but it's protected; currently (v0.13) + // HiveChar.toString() returns getPaddedValue() + String s = SerDeUtils.escapeString( + ((HiveCharObjectInspector) poi).getPrimitiveJavaObject(o).toString()); + appendWithQuotes(sb, s); + break; + } + default: + throw new RuntimeException("Unknown primitive type: " + poi.getPrimitiveCategory()); + } + } + break; + } + case LIST: { + ListObjectInspector loi = (ListObjectInspector) oi; + ObjectInspector listElementObjectInspector = loi + .getListElementObjectInspector(); + List olist = loi.getList(o); + if (olist == null) { + sb.append("null"); + } else { + sb.append(SerDeUtils.LBRACKET); + for (int i = 0; i < olist.size(); i++) { + if (i > 0) { + sb.append(SerDeUtils.COMMA); + } + buildJSONString(sb, olist.get(i), listElementObjectInspector); + } + sb.append(SerDeUtils.RBRACKET); + } + break; + } + case MAP: { + MapObjectInspector moi = (MapObjectInspector) oi; + ObjectInspector mapKeyObjectInspector = moi.getMapKeyObjectInspector(); + ObjectInspector mapValueObjectInspector = moi + .getMapValueObjectInspector(); + Map omap = moi.getMap(o); + if (omap == null) { + sb.append("null"); + } else { + sb.append(SerDeUtils.LBRACE); + boolean first = true; + for (Object entry : omap.entrySet()) { + if (first) { + first = false; + } else { + sb.append(SerDeUtils.COMMA); + } + Map.Entry e = (Map.Entry) entry; + StringBuilder keyBuilder = new StringBuilder(); + buildJSONString(keyBuilder, e.getKey(), mapKeyObjectInspector); + String keyString = keyBuilder.toString().trim(); + if ((!keyString.isEmpty()) && (keyString.charAt(0) != SerDeUtils.QUOTE)) { + appendWithQuotes(sb, keyString); + } else { + sb.append(keyString); + } + sb.append(SerDeUtils.COLON); + buildJSONString(sb, e.getValue(), mapValueObjectInspector); + } + sb.append(SerDeUtils.RBRACE); + } + break; + } + case STRUCT: { + StructObjectInspector soi = (StructObjectInspector) oi; + List structFields = soi.getAllStructFieldRefs(); + if (o == null) { + sb.append("null"); + } else { + sb.append(SerDeUtils.LBRACE); + for (int i = 0; i < structFields.size(); i++) { + 
if (i > 0) { + sb.append(SerDeUtils.COMMA); + } + appendWithQuotes(sb, structFields.get(i).getFieldName()); + sb.append(SerDeUtils.COLON); + buildJSONString(sb, soi.getStructFieldData(o, structFields.get(i)), + structFields.get(i).getFieldObjectInspector()); + } + sb.append(SerDeUtils.RBRACE); + } + break; + } + case UNION: { + UnionObjectInspector uoi = (UnionObjectInspector) oi; + if (o == null) { + sb.append("null"); + } else { + sb.append(SerDeUtils.LBRACE); + sb.append(uoi.getTag(o)); + sb.append(SerDeUtils.COLON); + buildJSONString(sb, uoi.getField(o), + uoi.getObjectInspectors().get(uoi.getTag(o))); + sb.append(SerDeUtils.RBRACE); + } + break; + } + default: + throw new RuntimeException("Unknown type in ObjectInspector!"); + } + } + + + /** + * Returns an object inspector for the specified schema that + * is capable of reading in the object representation of the JSON string + */ + @Override + public ObjectInspector getObjectInspector() throws SerDeException { + return cachedObjectInspector; + } + + @Override + public Class getSerializedClass() { + return Text.class; + } + + @Override + public SerDeStats getSerDeStats() { + // no support for statistics yet + return null; + } + +} diff --git a/streaming/pom.xml b/streaming/pom.xml index b58ec01..7b02e8b 100644 --- a/streaming/pom.xml +++ b/streaming/pom.xml @@ -56,12 +56,6 @@ ${project.version} - org.apache.hive.hcatalog - hive-hcatalog-core - true - ${project.version} - - org.apache.commons commons-lang3 true diff --git a/streaming/src/java/org/apache/hive/streaming/AbstractRecordWriter.java b/streaming/src/java/org/apache/hive/streaming/AbstractRecordWriter.java index 25998ae..05d8616 100644 --- a/streaming/src/java/org/apache/hive/streaming/AbstractRecordWriter.java +++ b/streaming/src/java/org/apache/hive/streaming/AbstractRecordWriter.java @@ -19,12 +19,18 @@ package org.apache.hive.streaming; -import org.apache.hadoop.security.UserGroupInformation; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Properties; + import org.apache.hadoop.fs.Path; import org.apache.hadoop.hive.common.JavaUtils; import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.metastore.HiveMetaStoreUtils; import org.apache.hadoop.hive.metastore.IMetaStoreClient; import org.apache.hadoop.hive.metastore.api.FieldSchema; import org.apache.hadoop.hive.metastore.api.MetaException; @@ -38,162 +44,230 @@ import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils; import org.apache.hadoop.hive.serde2.objectinspector.StructField; import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.SubStructObjectInspector; import org.apache.hadoop.util.ReflectionUtils; -import org.apache.hive.hcatalog.common.HCatUtil; import org.apache.thrift.TException; - -import java.io.IOException; - -import java.security.PrivilegedExceptionAction; -import java.util.ArrayList; -import java.util.Collections; -import java.util.HashSet; -import java.util.List; -import java.util.Properties; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; public abstract class AbstractRecordWriter implements RecordWriter { - static final private Logger LOG = LoggerFactory.getLogger(AbstractRecordWriter.class.getName()); + private static final Logger LOG = 
LoggerFactory.getLogger(AbstractRecordWriter.class.getName()); - private final HiveConf conf; - private final HiveEndPoint endPoint; - final Table tbl; + protected HiveConf conf; + private StreamingConnection conn; + Table tbl; - private final IMetaStoreClient msClient; - final List bucketIds; - private ArrayList updaters = null; + private Map> updaters = new HashMap<>(); + // input OI includes table columns + partition columns + private StructObjectInspector inputRowObjectInspector; + private ObjectInspector outputRowObjectInspector; + // input columns include table columns + partition columns + List inputColumns = new ArrayList<>(); + List inputTypes = new ArrayList<>(); - private final int totalBuckets; + private List partitionColumns = new ArrayList<>(); + private ObjectInspector[] partitionObjInspectors = null; + private StructField[] partitionStructFields = null; + private Object[] partitionFieldData; + + private ObjectInspector[] bucketObjInspectors = null; + private StructField[] bucketStructFields = null; + private Object[] bucketFieldData; + private List bucketIds = new ArrayList<>(); + private int totalBuckets; /** * Indicates whether target table is bucketed */ - private final boolean isBucketed; - - private final Path partitionPath; - - private final AcidOutputFormat outf; - private Object[] bucketFieldData; // Pre-allocated in constructor. Updated on each write. + private boolean isBucketed; + private AcidOutputFormat outf; private Long curBatchMinWriteId; private Long curBatchMaxWriteId; - private static final class TableWriterPair { - private final Table tbl; - private final Path partitionPath; - TableWriterPair(Table t, Path p) { - tbl = t; - partitionPath = p; - } - } - /** - * @deprecated As of release 1.3/2.1. Replaced by {@link #AbstractRecordWriter(HiveEndPoint, HiveConf, StreamingConnection)} - */ - protected AbstractRecordWriter(HiveEndPoint endPoint, HiveConf conf) - throws ConnectionError, StreamingException { - this(endPoint, conf, null); - } - protected AbstractRecordWriter(HiveEndPoint endPoint2, HiveConf conf, StreamingConnection conn) - throws StreamingException { - this.endPoint = endPoint2; - this.conf = conf!=null ? conf - : HiveEndPoint.createHiveConf(DelimitedInputWriter.class, endPoint.metaStoreUri); + @Override + public void init(StreamingConnection conn) throws StreamingException { try { - msClient = HCatUtil.getHiveMetastoreClient(this.conf); - UserGroupInformation ugi = conn != null ? conn.getUserGroupInformation() : null; - if (ugi == null) { - this.tbl = msClient.getTable(endPoint.database, endPoint.table); - this.partitionPath = getPathForEndPoint(msClient, endPoint); - } else { - TableWriterPair twp = ugi.doAs( - new PrivilegedExceptionAction() { - @Override - public TableWriterPair run() throws Exception { - return new TableWriterPair(msClient.getTable(endPoint.database, endPoint.table), - getPathForEndPoint(msClient, endPoint)); - } - }); - this.tbl = twp.tbl; - this.partitionPath = twp.partitionPath; - } - this.isBucketed = tbl.getSd().getNumBuckets() > 0; - /** - * For unbucketed tables we have exactly 1 RecrodUpdater for each AbstractRecordWriter which - * ends up writing to a file bucket_000000 - * See also {@link #getBucket(Object)} - */ - this.totalBuckets = isBucketed ? 
tbl.getSd().getNumBuckets() : 1; - if(isBucketed) { - this.bucketIds = getBucketColIDs(tbl.getSd().getBucketCols(), tbl.getSd().getCols()); - this.bucketFieldData = new Object[bucketIds.size()]; - } - else { - bucketIds = Collections.emptyList(); + this.conn = conn; + this.conf = conn.getHiveConf(); + final IMetaStoreClient msClient = HiveMetaStoreUtils.getHiveMetastoreClient(this.conf); + this.tbl = msClient.getTable(conn.getDatabase(), conn.getTable()); + final List tableColumns = getCols(tbl); + final List tableColTypes = getColTypes(tbl); + this.partitionColumns = getPartitionCols(tbl); + final List partitionTypes = getPartitionColTypes(tbl); + this.inputColumns = new ArrayList<>(tableColumns); + this.inputTypes = new ArrayList<>(tableColTypes); + if (conn.isPartitionedTable() && conn.isDynamicPartitioning()) { + this.inputColumns.addAll(partitionColumns); + this.inputTypes.addAll(partitionTypes); } String outFormatName = this.tbl.getSd().getOutputFormat(); - outf = (AcidOutputFormat) ReflectionUtils.newInstance(JavaUtils.loadClass(outFormatName), conf); - } catch(InterruptedException e) { - throw new StreamingException(endPoint2.toString(), e); + this.outf = (AcidOutputFormat) ReflectionUtils.newInstance(JavaUtils.loadClass(outFormatName), conf); } catch (MetaException | NoSuchObjectException e) { - throw new ConnectionError(endPoint2, e); + throw new ConnectionError(conn, e); } catch (TException | ClassNotFoundException | IOException e) { throw new StreamingException(e.getMessage(), e); } + + try { + final AbstractSerDe serDe = createSerde(); + this.inputRowObjectInspector = (StructObjectInspector) serDe.getObjectInspector(); + if (conn.isPartitionedTable() && conn.isDynamicPartitioning()) { + preparePartitioningFields(); + int dpStartCol = inputRowObjectInspector.getAllStructFieldRefs().size() - tbl.getPartitionKeys().size(); + this.outputRowObjectInspector = new SubStructObjectInspector(inputRowObjectInspector, 0, dpStartCol); + } else { + this.outputRowObjectInspector = inputRowObjectInspector; + } + prepareBucketingFields(); + } catch (SerDeException e) { + throw new StreamingException("Unable to create SerDe", e); + } + } + + private ArrayList getCols(Table table) { + List cols = table.getSd().getCols(); + ArrayList colNames = new ArrayList<>(cols.size()); + for (FieldSchema col : cols) { + colNames.add(col.getName().toLowerCase()); + } + return colNames; + } + + private ArrayList getColTypes(Table table) { + List cols = table.getSd().getCols(); + ArrayList colNames = new ArrayList<>(cols.size()); + for (FieldSchema col : cols) { + colNames.add(col.getType().toLowerCase()); + } + return colNames; + } + + private void prepareBucketingFields() { + this.isBucketed = tbl.getSd().getNumBuckets() > 0; + /* + * For unbucketed tables we have exactly 1 RecordUpdater for each AbstractRecordWriter which + * ends up writing to a file bucket_000000 + * See also {@link #getBucket(Object)} + */ + this.totalBuckets = isBucketed ? 
tbl.getSd().getNumBuckets() : 1; + if (isBucketed) { + this.bucketIds = getBucketColIDs(tbl.getSd().getBucketCols(), tbl.getSd().getCols()); + this.bucketFieldData = new Object[bucketIds.size()]; + this.bucketObjInspectors = getObjectInspectorsForBucketedCols(bucketIds, inputRowObjectInspector); + this.bucketStructFields = new StructField[bucketIds.size()]; + List allFields = inputRowObjectInspector.getAllStructFieldRefs(); + for (int i = 0; i < bucketIds.size(); i++) { + bucketStructFields[i] = allFields.get(bucketIds.get(i)); + } + } + } + + private void preparePartitioningFields() { + final int numPartitions = tbl.getPartitionKeys().size(); + this.partitionFieldData = new Object[numPartitions]; + this.partitionObjInspectors = new ObjectInspector[numPartitions]; + int startIdx = inputRowObjectInspector.getAllStructFieldRefs().size() - numPartitions; + int endIdx = inputRowObjectInspector.getAllStructFieldRefs().size(); + int j = 0; + for (int i = startIdx; i < endIdx; i++) { + StructField structField = inputRowObjectInspector.getAllStructFieldRefs().get(i); + partitionObjInspectors[j++] = structField.getFieldObjectInspector(); + } + this.partitionStructFields = new StructField[partitionColumns.size()]; + for (int i = 0; i < partitionColumns.size(); i++) { + String partCol = partitionColumns.get(i); + partitionStructFields[i] = inputRowObjectInspector.getStructFieldRef(partCol); + } } /** * used to tag error messages to provide some breadcrumbs */ - String getWatermark() { - return partitionPath + " writeIds[" + curBatchMinWriteId + "," + curBatchMaxWriteId + "]"; + private String getWatermark(String partition) { + return partition + " writeIds[" + curBatchMinWriteId + "," + curBatchMaxWriteId + "]"; } + // return the column numbers of the bucketed columns private List getBucketColIDs(List bucketCols, List cols) { - ArrayList result = new ArrayList(bucketCols.size()); - HashSet bucketSet = new HashSet(bucketCols); + ArrayList result = new ArrayList<>(bucketCols.size()); + HashSet bucketSet = new HashSet<>(bucketCols); for (int i = 0; i < cols.size(); i++) { - if( bucketSet.contains(cols.get(i).getName()) ) { + if (bucketSet.contains(cols.get(i).getName())) { result.add(i); } } return result; } - /** - * Get the SerDe for the Objects created by {@link #encode}. This is public so that test - * frameworks can use it. - * @return serde - * @throws SerializationError - */ - public abstract AbstractSerDe getSerde() throws SerializationError; + public abstract AbstractSerDe createSerde() throws SerializationError; /** * Encode a record as an Object that Hive can read with the ObjectInspector associated with the - * serde returned by {@link #getSerde}. This is public so that test frameworks can use it. + * serde returned by {@link #createSerde}. This is public so that test frameworks can use it. 
+ * * @param record record to be deserialized * @return deserialized record as an Object - * @throws SerializationError + * @throws SerializationError - any error during serialization or deserialization of record */ public abstract Object encode(byte[] record) throws SerializationError; - protected abstract ObjectInspector[] getBucketObjectInspectors(); - protected abstract StructObjectInspector getRecordObjectInspector(); - protected abstract StructField[] getBucketStructFields(); - // returns the bucket number to which the record belongs to - protected int getBucket(Object row) throws SerializationError { - if(!isBucketed) { + private int getBucket(Object row) { + if (!isBucketed) { return 0; } - ObjectInspector[] inspectors = getBucketObjectInspectors(); Object[] bucketFields = getBucketFields(row); - return ObjectInspectorUtils.getBucketNumber(bucketFields, inspectors, totalBuckets); + return ObjectInspectorUtils.getBucketNumber(bucketFields, bucketObjInspectors, totalBuckets); + } + + private ArrayList getPartitionCols(Table table) { + List cols = table.getPartitionKeys(); + ArrayList colNames = new ArrayList<>(cols.size()); + for (FieldSchema col : cols) { + colNames.add(col.getName().toLowerCase()); + } + return colNames; + } + + private ArrayList getPartitionColTypes(Table table) { + List cols = table.getPartitionKeys(); + ArrayList colNames = new ArrayList<>(cols.size()); + for (FieldSchema col : cols) { + colNames.add(col.getType().toLowerCase()); + } + return colNames; + } + + private List getPartitionValues(final Object row) { + if (!conn.isPartitionedTable()) { + return null; + } + List partitionValues = new ArrayList<>(); + if (conn.isPartitionedTable() && conn.isDynamicPartitioning()) { + Object[] partitionFields = getPartitionFields(row); + for (int i = 0; i < partitionObjInspectors.length; i++) { + ObjectInspector oi = partitionObjInspectors[i]; + Object field = partitionFields[i]; + Object partitionValue = ObjectInspectorUtils.copyToStandardObject(field, oi, ObjectInspectorUtils + .ObjectInspectorCopyOption.WRITABLE); + partitionValues.add(partitionValue.toString()); + } + } else { + partitionValues = conn.getStaticPartitionValues(); + } + return partitionValues; } @Override public void flush() throws StreamingIOFailure { try { - for (RecordUpdater updater : updaters) { - if (updater != null) { - updater.flush(); + for (Map.Entry> entry : updaters.entrySet()) { + LOG.info("Flushing record updater for partitions: {}", entry.getKey()); + for (RecordUpdater updater : entry.getValue()) { + if (updater != null) { + updater.flush(); + } } } } catch (IOException e) { @@ -201,124 +275,137 @@ public void flush() throws StreamingIOFailure { } } - @Override - public void clear() throws StreamingIOFailure { - } - /** * Creates a new record updater for the new batch + * * @param minWriteId smallest writeid in the batch * @param maxWriteID largest writeid in the batch - * @throws StreamingIOFailure if failed to create record updater */ @Override - public void newBatch(Long minWriteId, Long maxWriteID) - throws StreamingIOFailure, SerializationError { + public void newBatch(Long minWriteId, Long maxWriteID) { curBatchMinWriteId = minWriteId; curBatchMaxWriteId = maxWriteID; - updaters = new ArrayList(totalBuckets); - for (int bucket = 0; bucket < totalBuckets; bucket++) { - updaters.add(bucket, null);//so that get(i) returns null rather than ArrayOutOfBounds - } } @Override public void closeBatch() throws StreamingIOFailure { boolean haveError = false; - for (RecordUpdater updater : 
updaters) { - if (updater != null) { - try { - //try not to leave any files open - updater.close(false); - } catch (Exception ex) { - haveError = true; - LOG.error("Unable to close " + updater + " due to: " + ex.getMessage(), ex); + String partition = null; + for (Map.Entry> entry : updaters.entrySet()) { + partition = entry.getKey(); + LOG.info("Closing updater for partitions: {}", partition); + for (RecordUpdater updater : entry.getValue()) { + if (updater != null) { + try { + //try not to leave any files open + updater.close(false); + } catch (Exception ex) { + haveError = true; + LOG.error("Unable to close " + updater + " due to: " + ex.getMessage(), ex); + } } } + entry.getValue().clear(); } updaters.clear(); - if(haveError) { - throw new StreamingIOFailure("Encountered errors while closing (see logs) " + getWatermark()); + if (haveError) { + throw new StreamingIOFailure("Encountered errors while closing (see logs) " + getWatermark(partition)); } } - protected static ObjectInspector[] getObjectInspectorsForBucketedCols(List bucketIds - , StructObjectInspector recordObjInspector) - throws SerializationError { + private static ObjectInspector[] getObjectInspectorsForBucketedCols(List bucketIds + , StructObjectInspector recordObjInspector) { ObjectInspector[] result = new ObjectInspector[bucketIds.size()]; for (int i = 0; i < bucketIds.size(); i++) { int bucketId = bucketIds.get(i); result[i] = - recordObjInspector.getAllStructFieldRefs().get( bucketId ).getFieldObjectInspector(); + recordObjInspector.getAllStructFieldRefs().get(bucketId).getFieldObjectInspector(); } return result; } - - private Object[] getBucketFields(Object row) throws SerializationError { - StructObjectInspector recordObjInspector = getRecordObjectInspector(); - StructField[] bucketStructFields = getBucketStructFields(); + private Object[] getBucketFields(Object row) { for (int i = 0; i < bucketIds.size(); i++) { - bucketFieldData[i] = recordObjInspector.getStructFieldData(row, bucketStructFields[i]); + bucketFieldData[i] = inputRowObjectInspector.getStructFieldData(row, bucketStructFields[i]); } return bucketFieldData; } - private RecordUpdater createRecordUpdater(int bucketId, Long minWriteId, Long maxWriteID) - throws IOException, SerializationError { + private Object[] getPartitionFields(Object row) { + for (int i = 0; i < partitionFieldData.length; i++) { + partitionFieldData[i] = inputRowObjectInspector.getStructFieldData(row, partitionStructFields[i]); + } + return partitionFieldData; + } + + @Override + public void write(final long writeId, final byte[] record) throws StreamingException { try { - // Initialize table properties from the table parameters. This is required because the table - // may define certain table parameters that may be required while writing. The table parameter - // 'transactional_properties' is one such example. 
- Properties tblProperties = new Properties(); - tblProperties.putAll(tbl.getParameters()); - return outf.getRecordUpdater(partitionPath, - new AcidOutputFormat.Options(conf) - .inspector(getSerde().getObjectInspector()) - .bucket(bucketId) - .tableProperties(tblProperties) - .minimumWriteId(minWriteId) - .maximumWriteId(maxWriteID) - .statementId(-1) - .finalDestination(partitionPath)); - } catch (SerDeException e) { - throw new SerializationError("Failed to get object inspector from Serde " - + getSerde().getClass().getName(), e); + Object encodedRow = encode(record); + int bucket = getBucket(encodedRow); + List partitionValues = getPartitionValues(encodedRow); + getRecordUpdater(partitionValues, bucket).insert(writeId, encodedRow); + } catch (IOException e) { + throw new StreamingIOFailure("Error writing record in transaction write id (" + + writeId + ")", e); } } - RecordUpdater getRecordUpdater(int bucketId) throws StreamingIOFailure, SerializationError { - RecordUpdater recordUpdater = updaters.get(bucketId); + private RecordUpdater createRecordUpdater(final Path partitionPath, int bucketId, Long minWriteId, + Long maxWriteID) + throws IOException { + // Initialize table properties from the table parameters. This is required because the table + // may define certain table parameters that may be required while writing. The table parameter + // 'transactional_properties' is one such example. + Properties tblProperties = new Properties(); + tblProperties.putAll(tbl.getParameters()); + return outf.getRecordUpdater(partitionPath, + new AcidOutputFormat.Options(conf) + .inspector(outputRowObjectInspector) + .bucket(bucketId) + .tableProperties(tblProperties) + .minimumWriteId(minWriteId) + .maximumWriteId(maxWriteID) + .statementId(-1) + .finalDestination(partitionPath)); + } + + private RecordUpdater getRecordUpdater(List partitionVals, int bucketId) throws StreamingIOFailure { + RecordUpdater recordUpdater; + String partKey = partitionVals == null ? conn.getDatabase() + "." + conn.getTable() : partitionVals.toString(); + Path partitionPath; + try { + // for static partitions, the partition might already exist or will be created by HiveEndpoint. 
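+ // Updaters are cached per partition key (db.table for unpartitioned tables); each cached list holds one RecordUpdater slot per bucket.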
+ if (!updaters.containsKey(partKey) && conn.isDynamicPartitioning()) { + conn.createPartitionIfNotExists(partitionVals); + LOG.info("Created dynamic partition for values: {}", partitionVals); + } + partitionPath = conn.getPathForPartition(partitionVals); + updaters.computeIfAbsent(partKey, k -> initializeBuckets()); + recordUpdater = updaters.get(partKey).get(bucketId); + } catch (StreamingException e) { + throw new StreamingIOFailure("Unable to create partition: " + partitionVals + " for " + conn, e); + } if (recordUpdater == null) { try { - recordUpdater = createRecordUpdater(bucketId, curBatchMinWriteId, curBatchMaxWriteId); + recordUpdater = createRecordUpdater(partitionPath, bucketId, curBatchMinWriteId, curBatchMaxWriteId); } catch (IOException e) { - String errMsg = "Failed creating RecordUpdater for " + getWatermark(); + String errMsg = "Failed creating RecordUpdater for " + getWatermark(partitionPath.toString()); LOG.error(errMsg, e); throw new StreamingIOFailure(errMsg, e); } - updaters.set(bucketId, recordUpdater); + List partitionUpdaters = updaters.get(partKey); + partitionUpdaters.set(bucketId, recordUpdater); } return recordUpdater; } - private Path getPathForEndPoint(IMetaStoreClient msClient, HiveEndPoint endPoint) - throws StreamingException { - try { - String location; - if(endPoint.partitionVals==null || endPoint.partitionVals.isEmpty() ) { - location = msClient.getTable(endPoint.database,endPoint.table) - .getSd().getLocation(); - } else { - location = msClient.getPartition(endPoint.database, endPoint.table, - endPoint.partitionVals).getSd().getLocation(); - } - return new Path(location); - } catch (TException e) { - throw new StreamingException(e.getMessage() - + ". Unable to get path for end point: " - + endPoint.partitionVals, e); + private List initializeBuckets() { + List result = new ArrayList<>(totalBuckets); + for (int bucket = 0; bucket < totalBuckets; bucket++) { + result.add(bucket, null); //so that get(i) returns null rather than ArrayOutOfBounds } + return result; } } diff --git a/streaming/src/java/org/apache/hive/streaming/ConnectionError.java b/streaming/src/java/org/apache/hive/streaming/ConnectionError.java index 668bffb..9b6a6aa 100644 --- a/streaming/src/java/org/apache/hive/streaming/ConnectionError.java +++ b/streaming/src/java/org/apache/hive/streaming/ConnectionError.java @@ -20,15 +20,15 @@ public class ConnectionError extends StreamingException { - public ConnectionError(String msg) { + ConnectionError(String msg) { super(msg); } - public ConnectionError(String msg, Exception innerEx) { + ConnectionError(String msg, Exception innerEx) { super(msg, innerEx); } - public ConnectionError(HiveEndPoint endPoint, Exception innerEx) { + ConnectionError(StreamingConnection endPoint, Exception innerEx) { super("Error connecting to " + endPoint + (innerEx == null ? "" : ": " + innerEx.getMessage()), innerEx); } diff --git a/streaming/src/java/org/apache/hive/streaming/ConnectionInfo.java b/streaming/src/java/org/apache/hive/streaming/ConnectionInfo.java new file mode 100644 index 0000000..87c56a9 --- /dev/null +++ b/streaming/src/java/org/apache/hive/streaming/ConnectionInfo.java @@ -0,0 +1,83 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hive.streaming; + +import java.util.List; + +/** + * Helper interface to get connection related information. + */ +public interface ConnectionInfo { + + /** + * Get metastore URI that metastore client uses. + * + * @return - metastore URI used by client + */ + String getMetastoreUri(); + + /** + * Get the database used by streaming connection. + * + * @return - database + */ + String getDatabase(); + + /** + * Get the table used by streaming connection. + * + * @return - table + */ + String getTable(); + + /** + * Get any static partitions specified during streaming connection creation. + * + * @return - static partition values + */ + List getStaticPartitionValues(); + + /** + * Get if the specified table is partitioned table or not. + * + * @return - true if partitioned table else false + */ + boolean isPartitionedTable(); + + /** + * Get if static partitioning is used. + * + * @return - true if static partitioning case else false + */ + boolean isStaticPartitioning(); + + /** + * Get if dynamic partitioning is used. + * + * @return - true if dynamic partitioning case else false + */ + boolean isDynamicPartitioning(); + + /** + * Get agent info that is set during streaming connection. + * + * @return - agent info + */ + String getAgentInfo(); +} diff --git a/streaming/src/java/org/apache/hive/streaming/DelimitedInputWriter.java b/streaming/src/java/org/apache/hive/streaming/DelimitedInputWriter.java deleted file mode 100644 index 898b3f9..0000000 --- a/streaming/src/java/org/apache/hive/streaming/DelimitedInputWriter.java +++ /dev/null @@ -1,331 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.hive.streaming; - - -import com.google.common.annotations.VisibleForTesting; -import org.apache.hadoop.hive.metastore.utils.MetaStoreUtils; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import org.apache.hadoop.hive.conf.HiveConf; -import org.apache.hadoop.hive.metastore.api.FieldSchema; -import org.apache.hadoop.hive.metastore.api.Table; -import org.apache.hadoop.hive.serde2.AbstractSerDe; -import org.apache.hadoop.hive.serde2.SerDeException; -import org.apache.hadoop.hive.serde2.SerDeUtils; -import org.apache.hadoop.hive.serde2.lazy.LazySerDeParameters; -import org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe; -import org.apache.hadoop.hive.serde2.lazy.objectinspector.LazySimpleStructObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.StructField; -import org.apache.hadoop.io.BytesWritable; - -import java.io.IOException; -import java.io.UnsupportedEncodingException; -import java.util.ArrayList; -import java.util.List; -import java.util.Properties; - -/** - * Streaming Writer handles delimited input (eg. CSV). - * Delimited input is parsed & reordered to match column order in table - * Uses Lazy Simple Serde to process delimited input - */ -public class DelimitedInputWriter extends AbstractRecordWriter { - private final boolean reorderingNeeded; - private String delimiter; - private char serdeSeparator; - private int[] fieldToColMapping; - private final ArrayList tableColumns; - private LazySimpleSerDe serde = null; - - private final LazySimpleStructObjectInspector recordObjInspector; - private final ObjectInspector[] bucketObjInspectors; - private final StructField[] bucketStructFields; - - static final private Logger LOG = LoggerFactory.getLogger(DelimitedInputWriter.class.getName()); - - /** Constructor. Uses default separator of the LazySimpleSerde - * @param colNamesForFields Column name assignment for input fields. nulls or empty - * strings in the array indicates the fields to be skipped - * @param delimiter input field delimiter - * @param endPoint Hive endpoint - * @throws ConnectionError Problem talking to Hive - * @throws ClassNotFoundException Serde class not found - * @throws SerializationError Serde initialization/interaction failed - * @throws StreamingException Problem acquiring file system path for partition - * @throws InvalidColumn any element in colNamesForFields refers to a non existing column - */ - public DelimitedInputWriter(String[] colNamesForFields, String delimiter, - HiveEndPoint endPoint, StreamingConnection conn) - throws ClassNotFoundException, ConnectionError, SerializationError, - InvalidColumn, StreamingException { - this(colNamesForFields, delimiter, endPoint, null, conn); - } - /** Constructor. Uses default separator of the LazySimpleSerde - * @param colNamesForFields Column name assignment for input fields. nulls or empty - * strings in the array indicates the fields to be skipped - * @param delimiter input field delimiter - * @param endPoint Hive endpoint - * @param conf a Hive conf object. Can be null if not using advanced hive settings. 
- * @throws ConnectionError Problem talking to Hive - * @throws ClassNotFoundException Serde class not found - * @throws SerializationError Serde initialization/interaction failed - * @throws StreamingException Problem acquiring file system path for partition - * @throws InvalidColumn any element in colNamesForFields refers to a non existing column - */ - public DelimitedInputWriter(String[] colNamesForFields, String delimiter, - HiveEndPoint endPoint, HiveConf conf, StreamingConnection conn) - throws ClassNotFoundException, ConnectionError, SerializationError, - InvalidColumn, StreamingException { - this(colNamesForFields, delimiter, endPoint, conf, - (char) LazySerDeParameters.DefaultSeparators[0], conn); - } - /** - * Constructor. Allows overriding separator of the LazySimpleSerde - * @param colNamesForFields Column name assignment for input fields - * @param delimiter input field delimiter - * @param endPoint Hive endpoint - * @param conf a Hive conf object. Set to null if not using advanced hive settings. - * @param serdeSeparator separator used when encoding data that is fed into the - * LazySimpleSerde. Ensure this separator does not occur - * in the field data - * @param conn connection this Writer is to be used with - * @throws ConnectionError Problem talking to Hive - * @throws ClassNotFoundException Serde class not found - * @throws SerializationError Serde initialization/interaction failed - * @throws StreamingException Problem acquiring file system path for partition - * @throws InvalidColumn any element in colNamesForFields refers to a non existing column - */ - public DelimitedInputWriter(String[] colNamesForFields, String delimiter, - HiveEndPoint endPoint, HiveConf conf, char serdeSeparator, StreamingConnection conn) - throws ClassNotFoundException, ConnectionError, SerializationError, - InvalidColumn, StreamingException { - super(endPoint, conf, conn); - this.tableColumns = getCols(tbl); - this.serdeSeparator = serdeSeparator; - this.delimiter = delimiter; - this.fieldToColMapping = getFieldReordering(colNamesForFields, getTableColumns()); - this.reorderingNeeded = isReorderingNeeded(delimiter, getTableColumns()); - LOG.debug("Field reordering needed = " + this.reorderingNeeded + ", for endpoint " + endPoint); - this.serdeSeparator = serdeSeparator; - this.serde = createSerde(tbl, conf, serdeSeparator); - - // get ObjInspectors for entire record and bucketed cols - try { - this.recordObjInspector = (LazySimpleStructObjectInspector) serde.getObjectInspector(); - this.bucketObjInspectors = getObjectInspectorsForBucketedCols(bucketIds, recordObjInspector); - } catch (SerDeException e) { - throw new SerializationError("Unable to get ObjectInspector for bucket columns", e); - } - - // get StructFields for bucketed cols - bucketStructFields = new StructField[bucketIds.size()]; - List allFields = recordObjInspector.getAllStructFieldRefs(); - for (int i = 0; i < bucketIds.size(); i++) { - bucketStructFields[i] = allFields.get(bucketIds.get(i)); - } - } - /** - * @deprecated As of release 1.3/2.1. Replaced by {@link #DelimitedInputWriter(String[], String, HiveEndPoint, StreamingConnection)} - */ - public DelimitedInputWriter(String[] colNamesForFields, String delimiter, - HiveEndPoint endPoint) - throws ClassNotFoundException, ConnectionError, SerializationError, - InvalidColumn, StreamingException { - this(colNamesForFields, delimiter, endPoint, null, null); - } - /** - * @deprecated As of release 1.3/2.1. 
Replaced by {@link #DelimitedInputWriter(String[], String, HiveEndPoint, HiveConf, StreamingConnection)} - */ - public DelimitedInputWriter(String[] colNamesForFields, String delimiter, - HiveEndPoint endPoint, HiveConf conf) - throws ClassNotFoundException, ConnectionError, SerializationError, - InvalidColumn, StreamingException { - this(colNamesForFields, delimiter, endPoint, conf, - (char) LazySerDeParameters.DefaultSeparators[0], null); - } - /** - * @deprecated As of release 1.3/2.1. Replaced by {@link #DelimitedInputWriter(String[], String, HiveEndPoint, HiveConf, char, StreamingConnection)} - */ - public DelimitedInputWriter(String[] colNamesForFields, String delimiter, - HiveEndPoint endPoint, HiveConf conf, char serdeSeparator) - throws ClassNotFoundException, StreamingException { - this(colNamesForFields, delimiter, endPoint, conf, serdeSeparator, null); - } - - private boolean isReorderingNeeded(String delimiter, ArrayList tableColumns) { - return !( delimiter.equals(String.valueOf(getSerdeSeparator())) - && areFieldsInColOrder(fieldToColMapping) - && tableColumns.size()>=fieldToColMapping.length ); - } - - private static boolean areFieldsInColOrder(int[] fieldToColMapping) { - for(int i=0; i tableColNames) - throws InvalidColumn { - int[] result = new int[ colNamesForFields.length ]; - for(int i=0; itableColNames.size()) { - throw new InvalidColumn("Number of field names exceeds the number of columns in table"); - } - return result; - } - - // Reorder fields in record based on the order of columns in the table - protected byte[] reorderFields(byte[] record) throws UnsupportedEncodingException { - if(!reorderingNeeded) { - return record; - } - String[] reorderedFields = new String[getTableColumns().size()]; - String decoded = new String(record); - String[] fields = decoded.split(delimiter,-1); - for (int i=0; i getTableColumns() { - return tableColumns; - } - - @Override - public void write(long writeId, byte[] record) - throws SerializationError, StreamingIOFailure { - try { - byte[] orderedFields = reorderFields(record); - Object encodedRow = encode(orderedFields); - int bucket = getBucket(encodedRow); - getRecordUpdater(bucket).insert(writeId, encodedRow); - } catch (IOException e) { - throw new StreamingIOFailure("Error writing record in transaction write id (" - + writeId + ")", e); - } - } - - @Override - public AbstractSerDe getSerde() { - return serde; - } - - protected LazySimpleStructObjectInspector getRecordObjectInspector() { - return recordObjInspector; - } - - @Override - protected StructField[] getBucketStructFields() { - return bucketStructFields; - } - - protected ObjectInspector[] getBucketObjectInspectors() { - return bucketObjInspectors; - } - - @Override - public Object encode(byte[] record) throws SerializationError { - try { - BytesWritable blob = new BytesWritable(); - blob.set(record, 0, record.length); - return serde.deserialize(blob); - } catch (SerDeException e) { - throw new SerializationError("Unable to convert byte[] record into Object", e); - } - } - - /** - * Creates LazySimpleSerde - * @return - * @throws SerializationError if serde could not be initialized - * @param tbl - */ - protected static LazySimpleSerDe createSerde(Table tbl, HiveConf conf, char serdeSeparator) - throws SerializationError { - try { - Properties tableProps = MetaStoreUtils.getTableMetadata(tbl); - tableProps.setProperty("field.delim", String.valueOf(serdeSeparator)); - LazySimpleSerDe serde = new LazySimpleSerDe(); - SerDeUtils.initializeSerDe(serde, conf, tableProps, 
null); - return serde; - } catch (SerDeException e) { - throw new SerializationError("Error initializing serde", e); - } - } - - private ArrayList getCols(Table table) { - List cols = table.getSd().getCols(); - ArrayList colNames = new ArrayList(cols.size()); - for (FieldSchema col : cols) { - colNames.add(col.getName().toLowerCase()); - } - return colNames; - } - - public char getSerdeSeparator() { - return serdeSeparator; - } -} diff --git a/streaming/src/java/org/apache/hive/streaming/HeartBeatFailure.java b/streaming/src/java/org/apache/hive/streaming/HeartBeatFailure.java index b1f9520..88db646 100644 --- a/streaming/src/java/org/apache/hive/streaming/HeartBeatFailure.java +++ b/streaming/src/java/org/apache/hive/streaming/HeartBeatFailure.java @@ -25,9 +25,17 @@ private Collection abortedTxns; private Collection nosuchTxns; - public HeartBeatFailure(Collection abortedTxns, Set nosuchTxns) { + HeartBeatFailure(Collection abortedTxns, Set nosuchTxns) { super("Heart beat error. InvalidTxns: " + nosuchTxns + ". AbortedTxns: " + abortedTxns); this.abortedTxns = abortedTxns; this.nosuchTxns = nosuchTxns; } + + public Collection getAbortedTxns() { + return abortedTxns; + } + + public Collection getNosuchTxns() { + return nosuchTxns; + } } diff --git a/streaming/src/java/org/apache/hive/streaming/HiveEndPoint.java b/streaming/src/java/org/apache/hive/streaming/HiveEndPoint.java deleted file mode 100644 index b04e137..0000000 --- a/streaming/src/java/org/apache/hive/streaming/HiveEndPoint.java +++ /dev/null @@ -1,1117 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.hive.streaming; - -import java.io.IOException; -import java.security.PrivilegedExceptionAction; -import java.util.ArrayList; -import java.util.Collection; -import java.util.Collections; -import java.util.List; - -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.hive.cli.CliSessionState; -import org.apache.hadoop.hive.common.JavaUtils; -import org.apache.hadoop.hive.conf.HiveConf; -import org.apache.hadoop.hive.metastore.IMetaStoreClient; -import org.apache.hadoop.hive.metastore.LockComponentBuilder; -import org.apache.hadoop.hive.metastore.LockRequestBuilder; -import org.apache.hadoop.hive.metastore.Warehouse; -import org.apache.hadoop.hive.metastore.api.DataOperationType; -import org.apache.hadoop.hive.metastore.api.FieldSchema; -import org.apache.hadoop.hive.metastore.api.HeartbeatTxnRangeResponse; -import org.apache.hadoop.hive.metastore.api.LockRequest; -import org.apache.hadoop.hive.metastore.api.LockResponse; -import org.apache.hadoop.hive.metastore.api.LockState; -import org.apache.hadoop.hive.metastore.api.MetaException; -import org.apache.hadoop.hive.metastore.api.NoSuchObjectException; -import org.apache.hadoop.hive.metastore.api.NoSuchTxnException; -import org.apache.hadoop.hive.metastore.api.Table; -import org.apache.hadoop.hive.metastore.api.TxnAbortedException; -import org.apache.hadoop.hive.metastore.api.TxnToWriteId; -import org.apache.hadoop.hive.ql.DriverFactory; -import org.apache.hadoop.hive.ql.IDriver; -import org.apache.hadoop.hive.ql.io.AcidUtils; -import org.apache.hadoop.hive.ql.session.SessionState; -import org.apache.hadoop.security.UserGroupInformation; -import org.apache.hive.hcatalog.common.HCatUtil; -import org.apache.thrift.TException; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * Information about the hive end point (i.e. table or partition) to write to. - * A light weight object that does NOT internally hold on to resources such as - * network connections. It can be stored in Hashed containers such as sets and hash tables. - */ -public class HiveEndPoint { - public final String metaStoreUri; - public final String database; - public final String table; - public final ArrayList partitionVals; - - - static final private Logger LOG = LoggerFactory.getLogger(HiveEndPoint.class.getName()); - - /** - * - * @param metaStoreUri URI of the metastore to connect to eg: thrift://localhost:9083 - * @param database Name of the Hive database - * @param table Name of table to stream to - * @param partitionVals Indicates the specific partition to stream to. Can be null or empty List - * if streaming to a table without partitions. The order of values in this - * list must correspond exactly to the order of partition columns specified - * during the table creation. E.g. For a table partitioned by - * (continent string, country string), partitionVals could be the list - * ("Asia", "India"). - */ - public HiveEndPoint(String metaStoreUri - , String database, String table, List partitionVals) { - this.metaStoreUri = metaStoreUri; - if (database==null) { - throw new IllegalArgumentException("Database cannot be null for HiveEndPoint"); - } - this.database = database; - this.table = table; - if (table==null) { - throw new IllegalArgumentException("Table cannot be null for HiveEndPoint"); - } - this.partitionVals = partitionVals==null ? new ArrayList() - : new ArrayList( partitionVals ); - } - - - /** - * @deprecated As of release 1.3/2.1. 
Replaced by {@link #newConnection(boolean, String)} - */ - @Deprecated - public StreamingConnection newConnection(final boolean createPartIfNotExists) - throws ConnectionError, InvalidPartition, InvalidTable, PartitionCreationFailed - , ImpersonationFailed , InterruptedException { - return newConnection(createPartIfNotExists, null, null, null); - } - /** - * @deprecated As of release 1.3/2.1. Replaced by {@link #newConnection(boolean, HiveConf, String)} - */ - @Deprecated - public StreamingConnection newConnection(final boolean createPartIfNotExists, HiveConf conf) - throws ConnectionError, InvalidPartition, InvalidTable, PartitionCreationFailed - , ImpersonationFailed , InterruptedException { - return newConnection(createPartIfNotExists, conf, null, null); - } - /** - * @deprecated As of release 1.3/2.1. Replaced by {@link #newConnection(boolean, HiveConf, UserGroupInformation, String)} - */ - @Deprecated - public StreamingConnection newConnection(final boolean createPartIfNotExists, final HiveConf conf, - final UserGroupInformation authenticatedUser) - throws ConnectionError, InvalidPartition, - InvalidTable, PartitionCreationFailed, ImpersonationFailed , InterruptedException { - return newConnection(createPartIfNotExists, conf, authenticatedUser, null); - } - /** - * Acquire a new connection to MetaStore for streaming - * @param createPartIfNotExists If true, the partition specified in the endpoint - * will be auto created if it does not exist - * @param agentInfo should uniquely identify the process/entity that is using this batch. This - * should be something that can be correlated with calling application log files - * and/or monitoring consoles. - * @return - * @throws ConnectionError if problem connecting - * @throws InvalidPartition if specified partition is not valid (createPartIfNotExists = false) - * @throws ImpersonationFailed if not able to impersonate 'proxyUser' - * @throws PartitionCreationFailed if failed to create partition - * @throws InterruptedException - */ - public StreamingConnection newConnection(final boolean createPartIfNotExists, String agentInfo) - throws ConnectionError, InvalidPartition, InvalidTable, PartitionCreationFailed - , ImpersonationFailed , InterruptedException { - return newConnection(createPartIfNotExists, null, null, agentInfo); - } - - /** - * Acquire a new connection to MetaStore for streaming - * @param createPartIfNotExists If true, the partition specified in the endpoint - * will be auto created if it does not exist - * @param conf HiveConf object, set it to null if not using advanced hive settings. - * @param agentInfo should uniquely identify the process/entity that is using this batch. This - * should be something that can be correlated with calling application log files - * and/or monitoring consoles. - * @return - * @throws ConnectionError if problem connecting - * @throws InvalidPartition if specified partition is not valid (createPartIfNotExists = false) - * @throws ImpersonationFailed if not able to impersonate 'proxyUser' - * @throws PartitionCreationFailed if failed to create partition - * @throws InterruptedException - */ - public StreamingConnection newConnection(final boolean createPartIfNotExists, HiveConf conf, String agentInfo) - throws ConnectionError, InvalidPartition, InvalidTable, PartitionCreationFailed - , ImpersonationFailed , InterruptedException { - return newConnection(createPartIfNotExists, conf, null, agentInfo); - } - - /** - * Acquire a new connection to MetaStore for streaming. 
To connect using Kerberos, - * 'authenticatedUser' argument should have been used to do a kerberos login. Additionally the - * 'hive.metastore.kerberos.principal' setting should be set correctly either in hive-site.xml or - * in the 'conf' argument (if not null). If using hive-site.xml, it should be in classpath. - * - * @param createPartIfNotExists If true, the partition specified in the endpoint - * will be auto created if it does not exist - * @param conf HiveConf object to be used for the connection. Can be null. - * @param authenticatedUser UserGroupInformation object obtained from successful authentication. - * Uses non-secure mode if this argument is null. - * @param agentInfo should uniquely identify the process/entity that is using this batch. This - * should be something that can be correlated with calling application log files - * and/or monitoring consoles. - * @return - * @throws ConnectionError if there is a connection problem - * @throws InvalidPartition if specified partition is not valid (createPartIfNotExists = false) - * @throws ImpersonationFailed if not able to impersonate 'username' - * @throws PartitionCreationFailed if failed to create partition - * @throws InterruptedException - */ - public StreamingConnection newConnection(final boolean createPartIfNotExists, final HiveConf conf, - final UserGroupInformation authenticatedUser, final String agentInfo) - throws ConnectionError, InvalidPartition, - InvalidTable, PartitionCreationFailed, ImpersonationFailed , InterruptedException { - - if( authenticatedUser==null ) { - return newConnectionImpl(authenticatedUser, createPartIfNotExists, conf, agentInfo); - } - - try { - return authenticatedUser.doAs ( - new PrivilegedExceptionAction() { - @Override - public StreamingConnection run() - throws ConnectionError, InvalidPartition, InvalidTable - , PartitionCreationFailed { - return newConnectionImpl(authenticatedUser, createPartIfNotExists, conf, agentInfo); - } - } - ); - } catch (IOException e) { - throw new ConnectionError("Failed to connect as : " + authenticatedUser.getShortUserName(), e); - } - } - - private StreamingConnection newConnectionImpl(UserGroupInformation ugi, - boolean createPartIfNotExists, HiveConf conf, String agentInfo) - throws ConnectionError, InvalidPartition, InvalidTable - , PartitionCreationFailed { - return new ConnectionImpl(this, ugi, conf, createPartIfNotExists, agentInfo); - } - - private static UserGroupInformation getUserGroupInfo(String user) - throws ImpersonationFailed { - try { - return UserGroupInformation.createProxyUser( - user, UserGroupInformation.getLoginUser()); - } catch (IOException e) { - LOG.error("Unable to get UserGroupInfo for user : " + user, e); - throw new ImpersonationFailed(user,e); - } - } - - - @Override - public boolean equals(Object o) { - if (this == o) { - return true; - } - if (o == null || getClass() != o.getClass()) { - return false; - } - - HiveEndPoint endPoint = (HiveEndPoint) o; - - if (database != null - ? !database.equals(endPoint.database) - : endPoint.database != null ) { - return false; - } - if (metaStoreUri != null - ? !metaStoreUri.equals(endPoint.metaStoreUri) - : endPoint.metaStoreUri != null ) { - return false; - } - if (!partitionVals.equals(endPoint.partitionVals)) { - return false; - } - if (table != null ? !table.equals(endPoint.table) : endPoint.table != null) { - return false; - } - - return true; - } - - @Override - public int hashCode() { - int result = metaStoreUri != null ? 
metaStoreUri.hashCode() : 0; - result = 31 * result + (database != null ? database.hashCode() : 0); - result = 31 * result + (table != null ? table.hashCode() : 0); - result = 31 * result + partitionVals.hashCode(); - return result; - } - - @Override - public String toString() { - return "{" + - "metaStoreUri='" + metaStoreUri + '\'' + - ", database='" + database + '\'' + - ", table='" + table + '\'' + - ", partitionVals=" + partitionVals + " }"; - } - - - private static class ConnectionImpl implements StreamingConnection { - private final IMetaStoreClient msClient; - private final IMetaStoreClient heartbeaterMSClient; - private final HiveEndPoint endPt; - private final UserGroupInformation ugi; - private final String username; - private final boolean secureMode; - private final String agentInfo; - - /** - * @param endPoint end point to connect to - * @param ugi on behalf of whom streaming is done. cannot be null - * @param conf HiveConf object - * @param createPart create the partition if it does not exist - * @throws ConnectionError if there is trouble connecting - * @throws InvalidPartition if specified partition does not exist (and createPart=false) - * @throws InvalidTable if specified table does not exist - * @throws PartitionCreationFailed if createPart=true and not able to create partition - */ - private ConnectionImpl(HiveEndPoint endPoint, UserGroupInformation ugi, - HiveConf conf, boolean createPart, String agentInfo) - throws ConnectionError, InvalidPartition, InvalidTable - , PartitionCreationFailed { - this.endPt = endPoint; - this.ugi = ugi; - this.agentInfo = agentInfo; - this.username = ugi==null ? System.getProperty("user.name") : ugi.getShortUserName(); - if (conf==null) { - conf = HiveEndPoint.createHiveConf(this.getClass(), endPoint.metaStoreUri); - } - else { - overrideConfSettings(conf); - } - this.secureMode = ugi==null ? false : ugi.hasKerberosCredentials(); - this.msClient = getMetaStoreClient(endPoint, conf, secureMode); - // We use a separate metastore client for heartbeat calls to ensure heartbeat RPC calls are - // isolated from the other transaction related RPC calls. 
- this.heartbeaterMSClient = getMetaStoreClient(endPoint, conf, secureMode); - checkEndPoint(endPoint, msClient); - if (createPart && !endPoint.partitionVals.isEmpty()) { - createPartitionIfNotExists(endPoint, msClient, conf); - } - } - - /** - * Checks the validity of endpoint - * @param endPoint the HiveEndPoint to be checked - * @param msClient the metastore client - * @throws InvalidTable - */ - private void checkEndPoint(HiveEndPoint endPoint, IMetaStoreClient msClient) - throws InvalidTable, ConnectionError { - Table t; - try { - t = msClient.getTable(endPoint.database, endPoint.table); - } catch (Exception e) { - LOG.warn("Unable to check the endPoint: " + endPoint, e); - throw new InvalidTable(endPoint.database, endPoint.table, e); - } - // 1 - check that the table is Acid - if (!AcidUtils.isFullAcidTable(t)) { - LOG.error("HiveEndPoint " + endPoint + " must use an acid table"); - throw new InvalidTable(endPoint.database, endPoint.table, "is not an Acid table"); - } - - // 2 - check if partitionvals are legitimate - if (t.getPartitionKeys() != null && !t.getPartitionKeys().isEmpty() - && endPoint.partitionVals.isEmpty()) { - // Invalid if table is partitioned, but endPoint's partitionVals is empty - String errMsg = "HiveEndPoint " + endPoint + " doesn't specify any partitions for " + - "partitioned table"; - LOG.error(errMsg); - throw new ConnectionError(errMsg); - } - if ((t.getPartitionKeys() == null || t.getPartitionKeys().isEmpty()) - && !endPoint.partitionVals.isEmpty()) { - // Invalid if table is not partitioned, but endPoint's partitionVals is not empty - String errMsg = "HiveEndPoint" + endPoint + " specifies partitions for unpartitioned table"; - LOG.error(errMsg); - throw new ConnectionError(errMsg); - } - } - - /** - * Close connection - */ - @Override - public void close() { - if (ugi==null) { - msClient.close(); - heartbeaterMSClient.close(); - return; - } - try { - ugi.doAs ( - new PrivilegedExceptionAction() { - @Override - public Void run() throws Exception { - msClient.close(); - heartbeaterMSClient.close(); - return null; - } - } ); - try { - FileSystem.closeAllForUGI(ugi); - } catch (IOException exception) { - LOG.error("Could not clean up file-system handles for UGI: " + ugi, exception); - } - } catch (IOException e) { - LOG.error("Error closing connection to " + endPt, e); - } catch (InterruptedException e) { - LOG.error("Interrupted when closing connection to " + endPt, e); - } - } - - @Override - public UserGroupInformation getUserGroupInformation() { - return ugi; - } - - /** - * Acquires a new batch of transactions from Hive. - * - * @param numTransactions is a hint from client indicating how many transactions client needs. - * @param recordWriter Used to write record. The same writer instance can - * be shared with another TransactionBatch (to the same endpoint) - * only after the first TransactionBatch has been closed. - * Writer will be closed when the TransactionBatch is closed. 
- * @return - * @throws StreamingIOFailure if failed to create new RecordUpdater for batch - * @throws TransactionBatchUnAvailable if failed to acquire a new Transaction batch - * @throws ImpersonationFailed failed to run command as proxyUser - * @throws InterruptedException - */ - @Override - public TransactionBatch fetchTransactionBatch(final int numTransactions, - final RecordWriter recordWriter) - throws StreamingException, TransactionBatchUnAvailable, ImpersonationFailed - , InterruptedException { - if (ugi==null) { - return fetchTransactionBatchImpl(numTransactions, recordWriter); - } - try { - return ugi.doAs ( - new PrivilegedExceptionAction() { - @Override - public TransactionBatch run() throws StreamingException, InterruptedException { - return fetchTransactionBatchImpl(numTransactions, recordWriter); - } - } - ); - } catch (IOException e) { - throw new ImpersonationFailed("Failed to fetch Txn Batch as user '" + ugi.getShortUserName() - + "' when acquiring Transaction Batch on endPoint " + endPt, e); - } - } - - private TransactionBatch fetchTransactionBatchImpl(int numTransactions, - RecordWriter recordWriter) - throws StreamingException, TransactionBatchUnAvailable, InterruptedException { - return new TransactionBatchImpl(username, ugi, endPt, numTransactions, msClient, - heartbeaterMSClient, recordWriter, agentInfo); - } - - - private static void createPartitionIfNotExists(HiveEndPoint ep, - IMetaStoreClient msClient, HiveConf conf) - throws InvalidTable, PartitionCreationFailed { - if (ep.partitionVals.isEmpty()) { - return; - } - SessionState localSession = null; - if(SessionState.get() == null) { - localSession = SessionState.start(new CliSessionState(conf)); - } - IDriver driver = DriverFactory.newDriver(conf); - - try { - if (LOG.isDebugEnabled()) { - LOG.debug("Attempting to create partition (if not existent) " + ep); - } - - List partKeys = msClient.getTable(ep.database, ep.table) - .getPartitionKeys(); - runDDL(driver, "use " + ep.database); - String query = "alter table " + ep.table + " add if not exists partition " - + partSpecStr(partKeys, ep.partitionVals); - runDDL(driver, query); - } catch (MetaException e) { - LOG.error("Failed to create partition : " + ep, e); - throw new PartitionCreationFailed(ep, e); - } catch (NoSuchObjectException e) { - LOG.error("Failed to create partition : " + ep, e); - throw new InvalidTable(ep.database, ep.table); - } catch (TException e) { - LOG.error("Failed to create partition : " + ep, e); - throw new PartitionCreationFailed(ep, e); - } catch (QueryFailedException e) { - LOG.error("Failed to create partition : " + ep, e); - throw new PartitionCreationFailed(ep, e); - } finally { - driver.close(); - try { - if(localSession != null) { - localSession.close(); - } - } catch (IOException e) { - LOG.warn("Error closing SessionState used to run Hive DDL."); - } - } - } - - private static boolean runDDL(IDriver driver, String sql) throws QueryFailedException { - if (LOG.isDebugEnabled()) { - LOG.debug("Running Hive Query: " + sql); - } - driver.run(sql); - return true; - } - - private static String partSpecStr(List partKeys, ArrayList partVals) { - if (partKeys.size()!=partVals.size()) { - throw new IllegalArgumentException("Partition values:" + partVals + - ", does not match the partition Keys in table :" + partKeys ); - } - StringBuilder buff = new StringBuilder(partKeys.size()*20); - buff.append(" ( "); - int i=0; - for (FieldSchema schema : partKeys) { - buff.append(schema.getName()); - buff.append("='"); - 
buff.append(partVals.get(i)); - buff.append("'"); - if (i!=partKeys.size()-1) { - buff.append(","); - } - ++i; - } - buff.append(" )"); - return buff.toString(); - } - - private static IMetaStoreClient getMetaStoreClient(HiveEndPoint endPoint, HiveConf conf, boolean secureMode) - throws ConnectionError { - - if (endPoint.metaStoreUri!= null) { - conf.setVar(HiveConf.ConfVars.METASTOREURIS, endPoint.metaStoreUri); - } - if(secureMode) { - conf.setBoolVar(HiveConf.ConfVars.METASTORE_USE_THRIFT_SASL,true); - } - try { - return HCatUtil.getHiveMetastoreClient(conf); - } catch (MetaException e) { - throw new ConnectionError("Error connecting to Hive Metastore URI: " - + endPoint.metaStoreUri + ". " + e.getMessage(), e); - } catch (IOException e) { - throw new ConnectionError("Error connecting to Hive Metastore URI: " - + endPoint.metaStoreUri + ". " + e.getMessage(), e); - } - } - } // class ConnectionImpl - - private static class TransactionBatchImpl implements TransactionBatch { - private final String username; - private final UserGroupInformation ugi; - private final HiveEndPoint endPt; - private final IMetaStoreClient msClient; - private final IMetaStoreClient heartbeaterMSClient; - private final RecordWriter recordWriter; - private final List txnToWriteIds; - - //volatile because heartbeat() may be in a "different" thread; updates of this are "piggybacking" - private volatile int currentTxnIndex = -1; - private final String partNameForLock; - //volatile because heartbeat() may be in a "different" thread - private volatile TxnState state; - private LockRequest lockRequest = null; - /** - * once any operation on this batch encounters a system exception - * (e.g. IOException on write) it's safest to assume that we can't write to the - * file backing this batch any more. 
This guards important public methods - */ - private volatile boolean isClosed = false; - private final String agentInfo; - /** - * Tracks the state of each transaction - */ - private final TxnState[] txnStatus; - /** - * ID of the last txn used by {@link #beginNextTransactionImpl()} - */ - private long lastTxnUsed; - - /** - * Represents a batch of transactions acquired from MetaStore - * - * @throws StreamingException if failed to create new RecordUpdater for batch - * @throws TransactionBatchUnAvailable if failed to acquire a new Transaction batch - */ - private TransactionBatchImpl(final String user, UserGroupInformation ugi, HiveEndPoint endPt, - final int numTxns, final IMetaStoreClient msClient, - final IMetaStoreClient heartbeaterMSClient, RecordWriter recordWriter, String agentInfo) - throws StreamingException, TransactionBatchUnAvailable, InterruptedException { - boolean success = false; - try { - if ( endPt.partitionVals!=null && !endPt.partitionVals.isEmpty() ) { - Table tableObj = msClient.getTable(endPt.database, endPt.table); - List partKeys = tableObj.getPartitionKeys(); - partNameForLock = Warehouse.makePartName(partKeys, endPt.partitionVals); - } else { - partNameForLock = null; - } - this.username = user; - this.ugi = ugi; - this.endPt = endPt; - this.msClient = msClient; - this.heartbeaterMSClient = heartbeaterMSClient; - this.recordWriter = recordWriter; - this.agentInfo = agentInfo; - - List txnIds = openTxnImpl(msClient, user, numTxns, ugi); - txnToWriteIds = allocateWriteIdsImpl(msClient, txnIds, ugi); - assert(txnToWriteIds.size() == numTxns); - - txnStatus = new TxnState[numTxns]; - for(int i = 0; i < txnStatus.length; i++) { - assert(txnToWriteIds.get(i).getTxnId() == txnIds.get(i)); - txnStatus[i] = TxnState.OPEN;//Open matches Metastore state - } - this.state = TxnState.INACTIVE; - - // The Write Ids returned for the transaction batch is also sequential - recordWriter.newBatch(txnToWriteIds.get(0).getWriteId(), txnToWriteIds.get(numTxns-1).getWriteId()); - success = true; - } catch (TException e) { - throw new TransactionBatchUnAvailable(endPt, e); - } catch (IOException e) { - throw new TransactionBatchUnAvailable(endPt, e); - } - finally { - //clean up if above throws - markDead(success); - } - } - - private List openTxnImpl(final IMetaStoreClient msClient, final String user, final int numTxns, UserGroupInformation ugi) - throws IOException, TException, InterruptedException { - if(ugi==null) { - return msClient.openTxns(user, numTxns).getTxn_ids(); - } - return (List) ugi.doAs(new PrivilegedExceptionAction() { - @Override - public Object run() throws Exception { - return msClient.openTxns(user, numTxns).getTxn_ids(); - } - }); - } - - private List allocateWriteIdsImpl(final IMetaStoreClient msClient, - final List txnIds, UserGroupInformation ugi) - throws IOException, TException, InterruptedException { - if(ugi==null) { - return msClient.allocateTableWriteIdsBatch(txnIds, endPt.database, endPt.table); - } - return (List) ugi.doAs(new PrivilegedExceptionAction() { - @Override - public Object run() throws Exception { - return msClient.allocateTableWriteIdsBatch(txnIds, endPt.database, endPt.table); - } - }); - } - - @Override - public String toString() { - if (txnToWriteIds==null || txnToWriteIds.isEmpty()) { - return "{}"; - } - StringBuilder sb = new StringBuilder(" TxnStatus["); - for(TxnState state : txnStatus) { - //'state' should not be null - future proofing - sb.append(state == null ? 
"N" : state); - } - sb.append("] LastUsed ").append(JavaUtils.txnIdToString(lastTxnUsed)); - return "TxnId/WriteIds=[" + txnToWriteIds.get(0).getTxnId() - + "/" + txnToWriteIds.get(0).getWriteId() - + "..." - + txnToWriteIds.get(txnToWriteIds.size()-1).getTxnId() - + "/" + txnToWriteIds.get(txnToWriteIds.size()-1).getWriteId() - + "] on endPoint = " + endPt + "; " + sb; - } - - /** - * Activate the next available transaction in the current transaction batch - * @throws TransactionError failed to switch to next transaction - */ - @Override - public void beginNextTransaction() throws TransactionError, ImpersonationFailed, - InterruptedException { - checkIsClosed(); - if (ugi==null) { - beginNextTransactionImpl(); - return; - } - try { - ugi.doAs ( - new PrivilegedExceptionAction() { - @Override - public Void run() throws TransactionError { - beginNextTransactionImpl(); - return null; - } - } - ); - } catch (IOException e) { - throw new ImpersonationFailed("Failed switching to next Txn as user '" + username + - "' in Txn batch :" + this, e); - } - } - - private void beginNextTransactionImpl() throws TransactionError { - state = TxnState.INACTIVE;//clear state from previous txn - - if ((currentTxnIndex + 1) >= txnToWriteIds.size()) { - throw new InvalidTrasactionState("No more transactions available in" + - " current batch for end point : " + endPt); - } - ++currentTxnIndex; - state = TxnState.OPEN; - lastTxnUsed = getCurrentTxnId(); - lockRequest = createLockRequest(endPt, partNameForLock, username, getCurrentTxnId(), agentInfo); - try { - LockResponse res = msClient.lock(lockRequest); - if (res.getState() != LockState.ACQUIRED) { - throw new TransactionError("Unable to acquire lock on " + endPt); - } - } catch (TException e) { - throw new TransactionError("Unable to acquire lock on " + endPt, e); - } - } - - /** - * Get Id of currently open transaction. - * @return -1 if there is no open TX - */ - @Override - public Long getCurrentTxnId() { - if (currentTxnIndex >= 0) { - return txnToWriteIds.get(currentTxnIndex).getTxnId(); - } - return -1L; - } - - /** - * Get Id of currently open transaction. - * @return -1 if there is no open TX - */ - @Override - public Long getCurrentWriteId() { - if (currentTxnIndex >= 0) { - return txnToWriteIds.get(currentTxnIndex).getWriteId(); - } - return -1L; - } - - /** - * get state of current transaction - * @return - */ - @Override - public TxnState getCurrentTransactionState() { - return state; - } - - /** - * Remaining transactions are the ones that are not committed or aborted or active. - * Active transaction is not considered part of remaining txns. - * @return number of transactions remaining this batch. 
- */ - @Override - public int remainingTransactions() { - if (currentTxnIndex>=0) { - return txnToWriteIds.size() - currentTxnIndex -1; - } - return txnToWriteIds.size(); - } - - - /** - * Write record using RecordWriter - * @param record the data to be written - * @throws StreamingIOFailure I/O failure - * @throws SerializationError serialization error - * @throws ImpersonationFailed error writing on behalf of proxyUser - * @throws InterruptedException - */ - @Override - public void write(final byte[] record) - throws StreamingException, InterruptedException { - write(Collections.singletonList(record)); - } - private void checkIsClosed() throws IllegalStateException { - if(isClosed) { - throw new IllegalStateException("TransactionBatch " + toString() + " has been closed()"); - } - } - /** - * A transaction batch opens a single HDFS file and writes multiple transaction to it. If there is any issue - * with the write, we can't continue to write to the same file any as it may be corrupted now (at the tail). - * This ensures that a client can't ignore these failures and continue to write. - */ - private void markDead(boolean success) { - if(success) { - return; - } - isClosed = true;//also ensures that heartbeat() is no-op since client is likely doing it async - try { - abort(true);//abort all remaining txns - } - catch(Exception ex) { - LOG.error("Fatal error on " + toString() + "; cause " + ex.getMessage(), ex); - } - try { - closeImpl(); - } - catch (Exception ex) { - LOG.error("Fatal error on " + toString() + "; cause " + ex.getMessage(), ex); - } - } - - - /** - * Write records using RecordWriter - * @param records collection of rows to be written - * @throws StreamingException serialization error - * @throws ImpersonationFailed error writing on behalf of proxyUser - * @throws InterruptedException - */ - @Override - public void write(final Collection records) - throws StreamingException, InterruptedException, - ImpersonationFailed { - checkIsClosed(); - boolean success = false; - try { - if (ugi == null) { - writeImpl(records); - } else { - ugi.doAs( - new PrivilegedExceptionAction() { - @Override - public Void run() throws StreamingException { - writeImpl(records); - return null; - } - } - ); - } - success = true; - } catch(SerializationError ex) { - //this exception indicates that a {@code record} could not be parsed and the - //caller can decide whether to drop it or send it to dead letter queue. - //rolling back the txn and retrying won't help since the tuple will be exactly the same - //when it's replayed. - success = true; - throw ex; - } catch(IOException e){ - throw new ImpersonationFailed("Failed writing as user '" + username + - "' to endPoint :" + endPt + ". 
Transaction Id: " - + getCurrentTxnId(), e); - } - finally { - markDead(success); - } - } - - private void writeImpl(Collection records) - throws StreamingException { - for (byte[] record : records) { - recordWriter.write(getCurrentWriteId(), record); - } - } - - - /** - * Commit the currently open transaction - * @throws TransactionError - * @throws StreamingIOFailure if flushing records failed - * @throws ImpersonationFailed if - * @throws InterruptedException - */ - @Override - public void commit() throws TransactionError, StreamingException, - ImpersonationFailed, InterruptedException { - checkIsClosed(); - boolean success = false; - try { - if (ugi == null) { - commitImpl(); - } - else { - ugi.doAs( - new PrivilegedExceptionAction() { - @Override - public Void run() throws StreamingException { - commitImpl(); - return null; - } - } - ); - } - success = true; - } catch (IOException e) { - throw new ImpersonationFailed("Failed committing Txn ID " + getCurrentTxnId() + " as user '" - + username + "'on endPoint :" + endPt + ". Transaction Id: ", e); - } - finally { - markDead(success); - } - } - - private void commitImpl() throws TransactionError, StreamingException { - try { - recordWriter.flush(); - msClient.commitTxn(txnToWriteIds.get(currentTxnIndex).getTxnId()); - state = TxnState.COMMITTED; - txnStatus[currentTxnIndex] = TxnState.COMMITTED; - } catch (NoSuchTxnException e) { - throw new TransactionError("Invalid transaction id : " - + getCurrentTxnId(), e); - } catch (TxnAbortedException e) { - throw new TransactionError("Aborted transaction cannot be committed" - , e); - } catch (TException e) { - throw new TransactionError("Unable to commit transaction" - + getCurrentTxnId(), e); - } - } - - /** - * Abort the currently open transaction - * @throws TransactionError - */ - @Override - public void abort() throws TransactionError, StreamingException - , ImpersonationFailed, InterruptedException { - if(isClosed) { - /** - * isDead is only set internally by this class. {@link #markDead(boolean)} will abort all - * remaining txns, so make this no-op to make sure that a well-behaved client that calls abort() - * error doesn't get misleading errors - */ - return; - } - abort(false); - } - private void abort(final boolean abortAllRemaining) throws TransactionError, StreamingException - , ImpersonationFailed, InterruptedException { - if (ugi==null) { - abortImpl(abortAllRemaining); - return; - } - try { - ugi.doAs ( - new PrivilegedExceptionAction() { - @Override - public Void run() throws StreamingException { - abortImpl(abortAllRemaining); - return null; - } - } - ); - } catch (IOException e) { - throw new ImpersonationFailed("Failed aborting Txn " + getCurrentTxnId() + " as user '" - + username + "' on endPoint :" + endPt, e); - } - } - - private void abortImpl(boolean abortAllRemaining) throws TransactionError, StreamingException { - try { - if(abortAllRemaining) { - //when last txn finished (abort/commit) the currentTxnIndex is pointing at that txn - //so we need to start from next one, if any. Also if batch was created but - //fetchTransactionBatch() was never called, we want to start with first txn - int minOpenTxnIndex = Math.max(currentTxnIndex + - (state == TxnState.ABORTED || state == TxnState.COMMITTED ? 
1 : 0), 0); - for(currentTxnIndex = minOpenTxnIndex; - currentTxnIndex < txnToWriteIds.size(); currentTxnIndex++) { - msClient.rollbackTxn(txnToWriteIds.get(currentTxnIndex).getTxnId()); - txnStatus[currentTxnIndex] = TxnState.ABORTED; - } - currentTxnIndex--;//since the loop left it == txnToWriteIds.size() - } - else { - if (getCurrentTxnId() > 0) { - msClient.rollbackTxn(getCurrentTxnId()); - txnStatus[currentTxnIndex] = TxnState.ABORTED; - } - } - state = TxnState.ABORTED; - recordWriter.clear(); - } catch (NoSuchTxnException e) { - throw new TransactionError("Unable to abort invalid transaction id : " - + getCurrentTxnId(), e); - } catch (TException e) { - throw new TransactionError("Unable to abort transaction id : " - + getCurrentTxnId(), e); - } - } - - @Override - public void heartbeat() throws StreamingException, HeartBeatFailure { - if(isClosed) { - return; - } - if(state != TxnState.OPEN && currentTxnIndex >= txnToWriteIds.size() - 1) { - //here means last txn in the batch is resolved but the close() hasn't been called yet so - //there is nothing to heartbeat - return; - } - //if here after commit()/abort() but before next beginNextTransaction(), currentTxnIndex still - //points at the last txn which we don't want to heartbeat - Long first = txnToWriteIds.get(state == TxnState.OPEN ? currentTxnIndex : currentTxnIndex + 1).getTxnId(); - Long last = txnToWriteIds.get(txnToWriteIds.size()-1).getTxnId(); - try { - HeartbeatTxnRangeResponse resp = heartbeaterMSClient.heartbeatTxnRange(first, last); - if (!resp.getAborted().isEmpty() || !resp.getNosuch().isEmpty()) { - throw new HeartBeatFailure(resp.getAborted(), resp.getNosuch()); - } - } catch (TException e) { - throw new StreamingException("Failure to heartbeat on ids (" + first + "src/gen/thrift" - + last + ") on end point : " + endPt ); - } - } - - @Override - public boolean isClosed() { - return isClosed; - } - /** - * Close the TransactionBatch. This will abort any still open txns in this batch. - * @throws StreamingIOFailure I/O failure when closing transaction batch - */ - @Override - public void close() throws StreamingException, ImpersonationFailed, InterruptedException { - if(isClosed) { - return; - } - isClosed = true; - abortImpl(true);//abort proactively so that we don't wait for timeout - closeImpl();//perhaps we should add a version of RecordWriter.closeBatch(boolean abort) which - //will call RecordUpdater.close(boolean abort) - } - private void closeImpl() throws StreamingException, InterruptedException{ - state = TxnState.INACTIVE; - if(ugi == null) { - recordWriter.closeBatch(); - return; - } - try { - ugi.doAs ( - new PrivilegedExceptionAction() { - @Override - public Void run() throws StreamingException { - recordWriter.closeBatch(); - return null; - } - } - ); - try { - FileSystem.closeAllForUGI(ugi); - } catch (IOException exception) { - LOG.error("Could not clean up file-system handles for UGI: " + ugi, exception); - } - } catch (IOException e) { - throw new ImpersonationFailed("Failed closing Txn Batch as user '" + username + - "' on endPoint :" + endPt, e); - } - } - - private static LockRequest createLockRequest(final HiveEndPoint hiveEndPoint, - String partNameForLock, String user, long txnId, String agentInfo) { - LockRequestBuilder rqstBuilder = agentInfo == null ? 
- new LockRequestBuilder() : new LockRequestBuilder(agentInfo); - rqstBuilder.setUser(user); - rqstBuilder.setTransactionId(txnId); - - LockComponentBuilder lockCompBuilder = new LockComponentBuilder() - .setDbName(hiveEndPoint.database) - .setTableName(hiveEndPoint.table) - .setShared() - .setOperationType(DataOperationType.INSERT); - if (partNameForLock!=null && !partNameForLock.isEmpty() ) { - lockCompBuilder.setPartitionName(partNameForLock); - } - rqstBuilder.addLockComponent(lockCompBuilder.build()); - - return rqstBuilder.build(); - } - } // class TransactionBatchImpl - - static HiveConf createHiveConf(Class clazz, String metaStoreUri) { - HiveConf conf = new HiveConf(clazz); - if (metaStoreUri!= null) { - setHiveConf(conf, HiveConf.ConfVars.METASTOREURIS, metaStoreUri); - } - HiveEndPoint.overrideConfSettings(conf); - return conf; - } - - private static void overrideConfSettings(HiveConf conf) { - setHiveConf(conf, HiveConf.ConfVars.HIVE_TXN_MANAGER, - "org.apache.hadoop.hive.ql.lockmgr.DbTxnManager"); - setHiveConf(conf, HiveConf.ConfVars.HIVE_SUPPORT_CONCURRENCY, true); - setHiveConf(conf, HiveConf.ConfVars.METASTORE_EXECUTE_SET_UGI, true); - // Avoids creating Tez Client sessions internally as it takes much longer currently - setHiveConf(conf, HiveConf.ConfVars.HIVE_EXECUTION_ENGINE, "mr"); - } - - private static void setHiveConf(HiveConf conf, HiveConf.ConfVars var, String value) { - if( LOG.isDebugEnabled() ) { - LOG.debug("Overriding HiveConf setting : " + var + " = " + value); - } - conf.setVar(var, value); - } - - private static void setHiveConf(HiveConf conf, HiveConf.ConfVars var, boolean value) { - if( LOG.isDebugEnabled() ) { - LOG.debug("Overriding HiveConf setting : " + var + " = " + value); - } - conf.setBoolVar(var, value); - } - -} // class HiveEndPoint diff --git a/streaming/src/java/org/apache/hive/streaming/HiveStreamingConnection.java b/streaming/src/java/org/apache/hive/streaming/HiveStreamingConnection.java new file mode 100644 index 0000000..aebbebe --- /dev/null +++ b/streaming/src/java/org/apache/hive/streaming/HiveStreamingConnection.java @@ -0,0 +1,1071 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hive.streaming; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collection; +import java.util.List; +import java.util.concurrent.Executors; +import java.util.concurrent.ScheduledExecutorService; +import java.util.concurrent.ThreadFactory; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicReference; + +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.cli.CliSessionState; +import org.apache.hadoop.hive.common.JavaUtils; +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.metastore.HiveMetaStoreUtils; +import org.apache.hadoop.hive.metastore.IMetaStoreClient; +import org.apache.hadoop.hive.metastore.LockComponentBuilder; +import org.apache.hadoop.hive.metastore.LockRequestBuilder; +import org.apache.hadoop.hive.metastore.Warehouse; +import org.apache.hadoop.hive.metastore.api.AlreadyExistsException; +import org.apache.hadoop.hive.metastore.api.DataOperationType; +import org.apache.hadoop.hive.metastore.api.FieldSchema; +import org.apache.hadoop.hive.metastore.api.HeartbeatTxnRangeResponse; +import org.apache.hadoop.hive.metastore.api.LockRequest; +import org.apache.hadoop.hive.metastore.api.LockResponse; +import org.apache.hadoop.hive.metastore.api.LockState; +import org.apache.hadoop.hive.metastore.api.MetaException; +import org.apache.hadoop.hive.metastore.api.NoSuchObjectException; +import org.apache.hadoop.hive.metastore.api.NoSuchTxnException; +import org.apache.hadoop.hive.metastore.api.Table; +import org.apache.hadoop.hive.metastore.api.TxnAbortedException; +import org.apache.hadoop.hive.metastore.api.TxnToWriteId; +import org.apache.hadoop.hive.metastore.conf.MetastoreConf; +import org.apache.hadoop.hive.ql.DriverFactory; +import org.apache.hadoop.hive.ql.IDriver; +import org.apache.hadoop.hive.ql.io.AcidUtils; +import org.apache.hadoop.hive.ql.lockmgr.DbTxnManager; +import org.apache.hadoop.hive.ql.processors.CommandProcessorResponse; +import org.apache.hadoop.hive.ql.session.SessionState; +import org.apache.hadoop.security.UserGroupInformation; +import org.apache.thrift.TException; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.google.common.annotations.VisibleForTesting; +import com.google.common.base.Preconditions; +import com.google.common.util.concurrent.ThreadFactoryBuilder; + +/** + * Streaming connection implementation for Hive. To create a streaming connection, use the builder API + * to create the record writer first, followed by the connection itself. Once the connection is created, clients can + * start a transaction, keep writing using the connection, and commit and close it when done. + * + * To connect to the correct metastore, the HiveConf object has to be created from hive-site.xml/HIVE_CONF_DIR, or, if it is + * created manually, the metastore URI has to be set on it correctly. If no HiveConf object is specified, + * "thrift://localhost:9083" will be used as the default.
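+ *
+ * A minimal HiveConf sketch (illustrative only; "metastore-host" is a placeholder, not something
+ * defined by this patch):
+ *   HiveConf hiveConf = new HiveConf();
+ *   // point the client at the desired metastore instead of the localhost default
+ *   hiveConf.set(MetastoreConf.ConfVars.THRIFT_URIS.getHiveName(), "thrift://metastore-host:9083");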
+ * + * Example usage: + * StrictDelimitedInputWriter writer = StrictDelimitedInputWriter.newBuilder() + * .withFieldDelimiter(',') + * .build(); + * StreamingConnection connection = HiveStreamingConnection.newBuilder() + * .withDatabase("default") + * .withTable("src") + * .withAgentInfo("nifi-agent") + * .withRecordWriter(writer) + * .withHiveConf(hiveConf) + * .connect(); + * connection.beginNextTransaction(); + * connection.write("key1,val1".getBytes()); + * connection.write("key2,val2".getBytes()); + * connection.commit(); + * connection.beginNextTransaction(); + * connection.write("key3,val3".getBytes()); + * connection.write("key4,val4".getBytes()); + * connection.commit(); + * connection.close(); + */ +public class HiveStreamingConnection implements StreamingConnection { + private static final Logger LOG = LoggerFactory.getLogger(HiveStreamingConnection.class.getName()); + + private static final String DEFAULT_METASTORE_URI = "thrift://localhost:9083"; + private static final int DEFAULT_TRANSACTION_BATCH_SIZE = 10; + + // fields populated from builder + private String database; + private String table; + private List staticPartitionValues; + private String agentInfo; + private final int transactionBatchSize; + private RecordWriter recordWriter; + // used by heartbeat thread to protect against swapping of current transaction batch + private final Object txnBatchLock = new Object(); + private final AtomicReference currentTxnBatchRef = new AtomicReference<>(); + private TransactionBatchImpl currentTransactionBatch; + private HiveConf conf; + + // internal fields + private boolean isPartitionedTable; + private final IMetaStoreClient msClient; + private final IMetaStoreClient heartbeatMSClient; + private final String username; + private final boolean secureMode; + private Table t = null; + private SessionState localSession = null; + private IDriver driver = null; + private UserGroupInformation loggedInUser; + private final String metastoreUri; + + private HiveStreamingConnection(Builder builder) throws StreamingException { + this.database = builder.database; + this.table = builder.table; + this.staticPartitionValues = builder.staticPartitionValues; + try { + loggedInUser = UserGroupInformation.getLoginUser(); + } catch (IOException e) { + LOG.warn("Unable to get logged in user via UGI. err: {}", e.getMessage()); + loggedInUser = null; + } + if (loggedInUser == null) { + this.username = System.getProperty("user.name"); + this.secureMode = false; + } else { + this.username = loggedInUser.getShortUserName(); + this.secureMode = loggedInUser.hasKerberosCredentials(); + } + this.agentInfo = builder.agentInfo == null ? username : builder.agentInfo; + this.conf = builder.hiveConf; + if (conf == null) { + conf = HiveStreamingConnection.createHiveConf(this.getClass(), DEFAULT_METASTORE_URI); + } else { + overrideConfSettings(conf); + } + this.metastoreUri = conf.get(MetastoreConf.ConfVars.THRIFT_URIS.getHiveName()); + this.msClient = getMetaStoreClient(); + // We use a separate metastore client for heartbeat calls to ensure heartbeat RPC calls are + // isolated from the other transaction related RPC calls. 
+ this.heartbeatMSClient = getMetaStoreClient(); + this.transactionBatchSize = builder.transactionBatchSize; + this.recordWriter = builder.recordWriter; + checkEndPoint(); + createPartitionIfNotExists(staticPartitionValues); + + // start heartbeat thread + ThreadFactory threadFactory = new ThreadFactoryBuilder().setDaemon(true) + .setNameFormat("HiveStreamingConnection-Heartbeat-Thread") + .build(); + ScheduledExecutorService scheduledExecutorService = Executors.newSingleThreadScheduledExecutor(threadFactory); + long heartBeatInterval = conf.getTimeVar( + HiveConf.ConfVars.HIVE_STREAMING_CONNECTION_CLIENT_HEARTBEAT_INTERVAL, TimeUnit.MILLISECONDS); + Runnable runnable = new HeartbeatRunnable(txnBatchLock, currentTxnBatchRef); + scheduledExecutorService.scheduleWithFixedDelay(runnable, heartBeatInterval, heartBeatInterval, TimeUnit + .MILLISECONDS); + final String msg = "STREAMING CONNECTION INFO" + "\n" + + "metastore-uri: " + metastoreUri + "\n" + + "database: " + database + "\n" + + "table: " + table + "\n" + + "partitioned-table: " + isPartitionedTable() + "\n" + + "static-partitioning: " + isStaticPartitioning() + "\n" + + "dynamic-partitioning: " + isDynamicPartitioning() + "\n" + + "username: " + username + "\n" + + "secure-mode: " + secureMode + "\n" + + "record-writer: " + recordWriter.getClass().getSimpleName() + "\n" + + "heartbeat-interval: " + heartBeatInterval + " ms\n"; + LOG.info(msg); + } + + public static Builder newBuilder() { + return new Builder(); + } + + public static class Builder { + private String database; + private String table; + private List staticPartitionValues; + private String agentInfo; + private HiveConf hiveConf; + private int transactionBatchSize = DEFAULT_TRANSACTION_BATCH_SIZE; + private RecordWriter recordWriter; + + /** + * Specify the database to use for the streaming connection. + * + * @param database - db name + * @return - builder + */ + public Builder withDatabase(final String database) { + this.database = database; + return this; + } + + /** + * Specify the table to use for the streaming connection. + * + * @param table - table name + * @return - builder + */ + public Builder withTable(final String table) { + this.table = table; + return this; + } + + /** + * Specify the static partition values to use for the streaming connection. + * + * @param staticPartitionValues - static partition values + * @return - builder + */ + public Builder withStaticPartitionValues(final List staticPartitionValues) { + this.staticPartitionValues = staticPartitionValues == null ? null : new ArrayList<>(staticPartitionValues); + return this; + } + + /** + * Specify the agent info to use for the streaming connection. + * + * @param agentInfo - agent info + * @return - builder + */ + public Builder withAgentInfo(final String agentInfo) { + this.agentInfo = agentInfo; + return this; + } + + /** + * Specify the hive configuration object to use for the streaming connection. + * Generate this object by pointing to an already existing hive-site.xml or HIVE_CONF_DIR. + * Make sure the metastore URI has been set correctly, otherwise thrift://localhost:9083 will be + * used as the default. + * + * @param hiveConf - hive conf object + * @return - builder + */ + public Builder withHiveConf(final HiveConf hiveConf) { + this.hiveConf = hiveConf; + return this; + } + + /** + * Transaction batch size to use (default value is 10). This is expert level configuration. + * For every transaction batch a delta directory will be created, which will impact + * when compaction will trigger.
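+ * For example (an illustrative sketch only; the database, table, writer, conf and batch size
+ * below are placeholders mirroring the class-level example):
+ *   StreamingConnection connection = HiveStreamingConnection.newBuilder()
+ *       .withDatabase("default")
+ *       .withTable("src")
+ *       .withRecordWriter(writer)
+ *       .withHiveConf(hiveConf)
+ *       .withTransactionBatchSize(100)  // one write-id range (and delta directory) per 100-transaction batch
+ *       .connect();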
+ * + * @param transactionBatchSize - transaction batch size + * @return - builder + */ + public Builder withTransactionBatchSize(final int transactionBatchSize) { + this.transactionBatchSize = transactionBatchSize; + return this; + } + + /** + * Record writer to use for writing records to the destination table. + * + * @param recordWriter - record writer + * @return - builder + */ + public Builder withRecordWriter(final RecordWriter recordWriter) { + this.recordWriter = recordWriter; + return this; + } + + /** + * Returns a streaming connection to Hive. + * + * @return - hive streaming connection + * @throws StreamingException - thrown if connection cannot be made + */ + public HiveStreamingConnection connect() throws StreamingException { + Preconditions.checkNotNull(database, "Database cannot be null for streaming connection"); + Preconditions.checkNotNull(table, "Table cannot be null for streaming connection"); + Preconditions.checkNotNull(recordWriter, "Record writer cannot be null"); + HiveStreamingConnection conn = newConnectionImpl(this); + recordWriter.init(conn); + return conn; + } + } + + private void setPartitionedTable(boolean isPartitionedTable) { + this.isPartitionedTable = isPartitionedTable; + } + + private static HiveStreamingConnection newConnectionImpl(Builder builder) throws StreamingException { + return new HiveStreamingConnection(builder); + } + + @Override + public String toString() { + return "{ metaStoreUri: " + metastoreUri + ", database: " + database + ", table: " + table + " }"; + } + + @VisibleForTesting + public String toTransactionString() { + return currentTransactionBatch == null ? "" : currentTransactionBatch.toString(); + } + + @Override + public void createPartitionIfNotExists(final List partitionVals) throws StreamingException { + if (partitionVals == null || partitionVals.isEmpty()) { + return; + } + // local session and driver get closed only when the connection is closed (this is to avoid the overhead of + // closing and opening a session and driver for the dynamic partitioning case) + List partKeys = t.getPartitionKeys(); + if (localSession == null && SessionState.get() == null) { + localSession = SessionState.start(new CliSessionState(conf)); + } + + driver = DriverFactory.newDriver(conf); + if (LOG.isDebugEnabled()) { + LOG.debug("Attempting to create partition (if it does not exist) " + this); + } + + String query; + try { + msClient.getPartition(database, table, partitionVals); + LOG.info("Partition {} already exists", partitionVals); + return; + } catch (MetaException e) { + throw new StreamingException("Unable to get partition: " + partitionVals, e); + } catch (NoSuchObjectException e) { + // partition does not exist, will be created below + LOG.info("Partition {} does not exist. Proceeding to create it..", partitionVals); + } catch (TException e) { + throw new StreamingException("Unable to get partition: " + partitionVals, e); + } + // adding "if not exists" will request an EXCLUSIVE lock on the table but for streaming ingest we request a shared + // lock on the table. see createLockRequest(). An exclusive lock would block concurrent writers to the + // partition. So "if not exists" is not used here and any partition-already-exists exception is ignored. + query = "alter table " + database + "." + table + " add partition " + partSpecStr(partKeys, partitionVals); + try { + runDDL(driver, query); + } catch (PartitionCreationFailed e) { + if (e.getCause() instanceof AlreadyExistsException) { + LOG.info("Partition {} already exists..
Ignoring..", partitionVals); + } else { + throw e; + } + } + } + + private void runDDL(IDriver driver, String sql) throws PartitionCreationFailed { + if (LOG.isDebugEnabled()) { + LOG.debug("Running Hive Query: " + sql); + } + CommandProcessorResponse response = driver.run(sql); + if (response.getResponseCode() != 0) { + throw new PartitionCreationFailed(this, response.getException()); + } + } + + private String partSpecStr(List partKeys, List partVals) { + if (partKeys.size() != partVals.size()) { + throw new IllegalArgumentException("Partition values:" + partVals + + ", does not match the partition Keys in table :" + partKeys); + } + StringBuilder buff = new StringBuilder(partKeys.size() * 20); + buff.append(" ( "); + int i = 0; + for (FieldSchema schema : partKeys) { + buff.append(schema.getName()); + buff.append("='"); + buff.append(partVals.get(i)); + buff.append("'"); + if (i != partKeys.size() - 1) { + buff.append(","); + } + ++i; + } + buff.append(" )"); + return buff.toString(); + } + + /** + * Returns the partition path, or the table path when the specified partitionVals is null or empty. + * + * @param partitionVals - partition values + * @return - Path to partition or table + * @throws StreamingException - thrown if any metastore exception is thrown + */ + @Override + public Path getPathForPartition(List partitionVals) throws StreamingException { + try { + String location; + if (partitionVals == null || partitionVals.isEmpty()) { + location = msClient.getTable(database, table) + .getSd().getLocation(); + } else { + location = msClient.getPartition(database, table, + partitionVals).getSd().getLocation(); + } + return new Path(location); + } catch (TException e) { + throw new StreamingException(e.getMessage() + + ". Unable to get path for partition: " + partitionVals, e); + } + } + + private void checkEndPoint() throws InvalidTable, ConnectionError { + try { + t = msClient.getTable(database, table); + } catch (Exception e) { + LOG.warn("Unable to check the endPoint: " + this, e); + throw new InvalidTable(database, table, e); + } + // 1 - check that the table is Acid + if (!AcidUtils.isFullAcidTable(t)) { + LOG.error("Streaming connection " + this + " must use an acid table"); + throw new InvalidTable(database, table, "is not an Acid table"); + } + + if (t.getPartitionKeys() != null && !t.getPartitionKeys().isEmpty()) { + setPartitionedTable(true); + } else { + setPartitionedTable(false); + } + + // partition values are specified on non-partitioned table + if (!isPartitionedTable() && + (staticPartitionValues != null && !staticPartitionValues.isEmpty())) { + // Invalid if table is not partitioned, but static partition values are not empty + String errMsg = "Streaming connection " + this + " specifies partitions for an unpartitioned table"; + LOG.error(errMsg); + throw new ConnectionError(errMsg); + } + } + + private static class HeartbeatRunnable implements Runnable { + private final Object txnBatchLock; + private final AtomicReference currentTxnBatchRef; + + HeartbeatRunnable(final Object txnBatchLock, final AtomicReference currentTxnBatchRef) { + this.txnBatchLock = txnBatchLock; + this.currentTxnBatchRef = currentTxnBatchRef; + } + + @Override + public void run() { + synchronized (txnBatchLock) { + TransactionBatchImpl transactionBatch = currentTxnBatchRef.get(); + if (transactionBatch == null) { + // no transaction batch has been opened yet, so there is nothing to heartbeat + return; + } + try { + transactionBatch.heartbeat(); + LOG.info("Heartbeat sent for: {}", transactionBatch); + } catch (StreamingException e) { + LOG.warn("Unable to heartbeat transaction batch: {} to metastore", transactionBatch, e); + } + } + } + } +
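+
+  /*
+   * A minimal client-side sketch of the transaction lifecycle this method drives (illustrative
+   * only, mirroring the class-level example; the record payload is a placeholder):
+   *
+   *   connection.beginNextTransaction();
+   *   connection.write("key1,val1".getBytes());
+   *   connection.commit();          // or connection.abort() on failure
+   *
+   * Transaction-batch rollover is handled internally: when the current batch is exhausted it is
+   * closed and a fresh batch is fetched before the next transaction is opened, and the heartbeat
+   * thread is re-pointed at the new batch via currentTxnBatchRef.
+   */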
@Override + public void beginNextTransaction() throws StreamingException { + synchronized (txnBatchLock) { + if (currentTransactionBatch == null) { + currentTransactionBatch = getTransactionBatch(recordWriter); + currentTxnBatchRef.set(currentTransactionBatch); + LOG.info("Opened new transaction batch {}", currentTransactionBatch); + } + + if (currentTransactionBatch.isClosed()) { + throw new IllegalStateException("Cannot begin next transaction on a closed streaming connection"); + } + + if (currentTransactionBatch.remainingTransactions() == 0) { + LOG.info("Transaction batch {} is done. Rolling over to next transaction batch.", + currentTransactionBatch); + currentTransactionBatch.close(); + currentTransactionBatch = getTransactionBatch(recordWriter); + // we have rolled over to next transaction batch, update the reference so that heartbeat thread can heartbeat + // correctly using the new transaction range + currentTxnBatchRef.set(currentTransactionBatch); + LOG.info("Rolled over to new transaction batch {}", currentTransactionBatch); + } + currentTransactionBatch.beginNextTransaction(); + } + } + + @Override + public Long getCurrentTxnId() { + Preconditions.checkNotNull(currentTransactionBatch, "Transaction cannot be null. missing beginTransaction()?"); + return currentTransactionBatch.getCurrentTxnId(); + } + + @Override + public Long getCurrentWriteId() { + Preconditions.checkNotNull(currentTransactionBatch, "Transaction cannot be null. missing beginTransaction()?"); + return currentTransactionBatch.getCurrentWriteId(); + } + + @Override + public TxnState getCurrentTransactionState() { + Preconditions.checkNotNull(currentTransactionBatch, "Transaction cannot be null. missing beginTransaction()?"); + return currentTransactionBatch.getCurrentTransactionState(); + } + + @Override + public void commit() throws StreamingException { + Preconditions.checkNotNull(currentTransactionBatch, "Transaction cannot be null. missing beginTransaction()?"); + currentTransactionBatch.commit(); + } + + @Override + public void abort() throws StreamingException { + Preconditions.checkNotNull(currentTransactionBatch, "Transaction cannot be null. missing beginTransaction()?"); + currentTransactionBatch.abort(); + } + + @Override + public int remainingTransactions() { + Preconditions.checkNotNull(currentTransactionBatch, "Transaction cannot be null. missing beginTransaction()?"); + return currentTransactionBatch.remainingTransactions(); + } + + @Override + public void write(final byte[] record) throws StreamingException { + Preconditions.checkNotNull(currentTransactionBatch, "Transaction cannot be null. missing beginTransaction()?"); + currentTransactionBatch.write(record); + } + + @Override + public void write(final Collection records) throws StreamingException { + Preconditions.checkNotNull(currentTransactionBatch, "Transaction cannot be null. 
missing beginTransaction()?"); + currentTransactionBatch.write(records); + } + + /** + * Close the connection. + */ + @Override + public void close() { + try { + if (currentTransactionBatch != null) { + currentTransactionBatch.close(); + } + if (driver != null) { + driver.close(); + } + if (localSession != null) { + localSession.close(); + } + } catch (IOException e) { + LOG.warn("Error closing SessionState used to run Hive DDL."); + } catch (StreamingException e) { + LOG.error("Unable to close current transaction batch: " + currentTransactionBatch, e); + } + msClient.close(); + heartbeatMSClient.close(); + if (loggedInUser != null) { + try { + FileSystem.closeAllForUGI(loggedInUser); + } catch (IOException exception) { + LOG.error("Could not clean up file-system handles for UGI: " + loggedInUser, exception); + } + } + } + + @Override + public boolean isClosed() { + return currentTransactionBatch.isClosed(); + } + + private TransactionBatchImpl getTransactionBatch(final RecordWriter recordWriter) + throws StreamingException { + Preconditions.checkNotNull(recordWriter, "Record writer cannot be null"); + return fetchTransactionBatchImpl(transactionBatchSize, recordWriter); + } + + private TransactionBatchImpl fetchTransactionBatchImpl(int numTransactions, + RecordWriter recordWriter) + throws StreamingException { + return new TransactionBatchImpl(username, this, numTransactions, msClient, + heartbeatMSClient, recordWriter, agentInfo); + } + + private IMetaStoreClient getMetaStoreClient() throws ConnectionError { + if (metastoreUri != null) { + conf.set(MetastoreConf.ConfVars.THRIFT_URIS.getHiveName(), metastoreUri); + } + if (secureMode) { + conf.setBoolean(MetastoreConf.ConfVars.USE_THRIFT_SASL.getHiveName(), true); + } + try { + return HiveMetaStoreUtils.getHiveMetastoreClient(conf); + } catch (MetaException | IOException e) { + throw new ConnectionError("Error connecting to Hive Metastore URI: " + + metastoreUri + ". " + e.getMessage(), e); + } + } + + private static class TransactionBatchImpl implements TransactionBatch { + private final String username; + private final HiveStreamingConnection conn; + private final IMetaStoreClient msClient; + private final IMetaStoreClient heartbeaterMSClient; + private final RecordWriter recordWriter; + private final String partNameForLock; + private final List txnToWriteIds; + private final ArrayList singleRecord = new ArrayList<>(1); + + //volatile because heartbeat() may be in a "different" thread; updates of this are "piggybacking" + private volatile int currentTxnIndex = -1; + //volatile because heartbeat() may be in a "different" thread + private volatile TxnState state; + private LockRequest lockRequest = null; + /** + * once any operation on this batch encounters a system exception + * (e.g. IOException on write) it's safest to assume that we can't write to the + * file backing this batch any more.
This guards important public methods + */ + private volatile boolean isClosed = false; + private final String agentInfo; + /** + * Tracks the state of each transaction + */ + private final TxnState[] txnStatus; + /** + * ID of the last txn used by {@link #beginNextTransactionImpl()} + */ + private long lastTxnUsed; + + /** + * Represents a batch of transactions acquired from MetaStore + * + * @throws StreamingException if failed to create new RecordUpdater for batch + */ + private TransactionBatchImpl(final String user, HiveStreamingConnection conn, + final int numTxns, final IMetaStoreClient msClient, + final IMetaStoreClient heartbeaterMSClient, RecordWriter recordWriter, String agentInfo) + throws StreamingException { + boolean success = false; + try { + if (conn.isPartitionedTable() && !conn.isDynamicPartitioning()) { + Table tableObj = msClient.getTable(conn.database, conn.table); + List partKeys = tableObj.getPartitionKeys(); + partNameForLock = Warehouse.makePartName(partKeys, conn.staticPartitionValues); + } else { + partNameForLock = null; + } + this.username = user; + this.conn = conn; + this.msClient = msClient; + this.heartbeaterMSClient = heartbeaterMSClient; + this.recordWriter = recordWriter; + this.agentInfo = agentInfo; + + List txnIds = openTxnImpl(msClient, user, numTxns); + txnToWriteIds = allocateWriteIdsImpl(msClient, txnIds); + assert (txnToWriteIds.size() == numTxns); + + txnStatus = new TxnState[numTxns]; + for (int i = 0; i < txnStatus.length; i++) { + assert (txnToWriteIds.get(i).getTxnId() == txnIds.get(i)); + txnStatus[i] = TxnState.OPEN;//Open matches Metastore state + } + this.state = TxnState.INACTIVE; + + // The Write Ids returned for the transaction batch is also sequential + recordWriter.newBatch(txnToWriteIds.get(0).getWriteId(), txnToWriteIds.get(numTxns - 1).getWriteId()); + singleRecord.add(null); + success = true; + } catch (TException e) { + throw new StreamingException(conn.toString(), e); + } finally { + //clean up if above throws + markDead(success); + } + } + + private List openTxnImpl(final IMetaStoreClient msClient, final String user, final int numTxns) + throws TException { + return msClient.openTxns(user, numTxns).getTxn_ids(); + } + + private List allocateWriteIdsImpl(final IMetaStoreClient msClient, + final List txnIds) throws TException { + return msClient.allocateTableWriteIdsBatch(txnIds, conn.database, conn.table); + } + + @Override + public String toString() { + if (txnToWriteIds == null || txnToWriteIds.isEmpty()) { + return "{}"; + } + StringBuilder sb = new StringBuilder(" TxnStatus["); + for (TxnState state : txnStatus) { + //'state' should not be null - future proofing + sb.append(state == null ? "N" : state); + } + sb.append("] LastUsed ").append(JavaUtils.txnIdToString(lastTxnUsed)); + return "TxnId/WriteIds=[" + txnToWriteIds.get(0).getTxnId() + + "/" + txnToWriteIds.get(0).getWriteId() + + "..." 
+ + txnToWriteIds.get(txnToWriteIds.size() - 1).getTxnId() + + "/" + txnToWriteIds.get(txnToWriteIds.size() - 1).getWriteId() + + "] on endPoint = " + conn + "; " + sb; + } + + @Override + public void beginNextTransaction() throws StreamingException { + checkIsClosed(); + beginNextTransactionImpl(); + } + + private void beginNextTransactionImpl() throws TransactionError { + state = TxnState.INACTIVE;//clear state from previous txn + + if ((currentTxnIndex + 1) >= txnToWriteIds.size()) { + throw new InvalidTransactionState("No more transactions available in" + + " the current batch for connection: " + conn + " user: " + username); + } + ++currentTxnIndex; + state = TxnState.OPEN; + lastTxnUsed = getCurrentTxnId(); + lockRequest = createLockRequest(conn, partNameForLock, username, getCurrentTxnId(), agentInfo); + try { + LockResponse res = msClient.lock(lockRequest); + if (res.getState() != LockState.ACQUIRED) { + throw new TransactionError("Unable to acquire lock on " + conn); + } + } catch (TException e) { + throw new TransactionError("Unable to acquire lock on " + conn, e); + } + } + + @Override + public Long getCurrentTxnId() { + if (currentTxnIndex >= 0) { + return txnToWriteIds.get(currentTxnIndex).getTxnId(); + } + return -1L; + } + + @Override + public Long getCurrentWriteId() { + if (currentTxnIndex >= 0) { + return txnToWriteIds.get(currentTxnIndex).getWriteId(); + } + return -1L; + } + + @Override + public TxnState getCurrentTransactionState() { + return state; + } + + @Override + public int remainingTransactions() { + if (currentTxnIndex >= 0) { + return txnToWriteIds.size() - currentTxnIndex - 1; + } + return txnToWriteIds.size(); + } + + + @Override + public void write(final byte[] record) throws StreamingException { + singleRecord.set(0, record); + write(singleRecord); + } + + private void checkIsClosed() throws IllegalStateException { + if (isClosed) { + throw new IllegalStateException("TransactionBatch " + toString() + " has been closed()"); + } + } + + /** + * A transaction batch opens a single HDFS file and writes multiple transactions to it. If there is any issue + * with the write, we can't continue to write to the same file any more as it may be corrupted now (at the tail). + * This ensures that a client can't ignore these failures and continue to write. + */ + private void markDead(boolean success) { + if (success) { + return; + } + isClosed = true;//also ensures that heartbeat() is no-op since client is likely doing it async + try { + abort(true);//abort all remaining txns + } catch (Exception ex) { + LOG.error("Fatal error on " + toString() + "; cause " + ex.getMessage(), ex); + } + try { + closeImpl(); + } catch (Exception ex) { + LOG.error("Fatal error on " + toString() + "; cause " + ex.getMessage(), ex); + } + } + + + @Override + public void write(final Collection records) throws StreamingException { + checkIsClosed(); + boolean success = false; + try { + writeImpl(records); + success = true; + } catch (SerializationError ex) { + //this exception indicates that a {@code record} could not be parsed and the + //caller can decide whether to drop it or send it to a dead letter queue. + //rolling back the txn and retrying won't help since the tuple will be exactly the same + //when it's replayed.
+ success = true; + throw ex; + } finally { + markDead(success); + } + } + + private void writeImpl(Collection records) + throws StreamingException { + for (byte[] record : records) { + recordWriter.write(getCurrentWriteId(), record); + } + } + + + @Override + public void commit() throws StreamingException { + checkIsClosed(); + boolean success = false; + try { + commitImpl(); + success = true; + } finally { + markDead(success); + } + } + + private void commitImpl() throws StreamingException { + try { + recordWriter.flush(); + msClient.commitTxn(txnToWriteIds.get(currentTxnIndex).getTxnId()); + state = TxnState.COMMITTED; + txnStatus[currentTxnIndex] = TxnState.COMMITTED; + } catch (NoSuchTxnException e) { + throw new TransactionError("Invalid transaction id : " + + getCurrentTxnId(), e); + } catch (TxnAbortedException e) { + throw new TransactionError("Aborted transaction cannot be committed" + , e); + } catch (TException e) { + throw new TransactionError("Unable to commit transaction " + + getCurrentTxnId(), e); + } + } + + @Override + public void abort() throws StreamingException { + if (isClosed) { + /* + * isDead is only set internally by this class. {@link #markDead(boolean)} will abort all + * remaining txns, so make this no-op to make sure that a well-behaved client that calls abort() + * doesn't get misleading errors + */ + return; + } + abort(false); + } + + private void abort(final boolean abortAllRemaining) throws StreamingException { + abortImpl(abortAllRemaining); + } + + private void abortImpl(boolean abortAllRemaining) throws StreamingException { + try { + if (abortAllRemaining) { + //when last txn finished (abort/commit) the currentTxnIndex is pointing at that txn + //so we need to start from next one, if any. Also if batch was created but + //fetchTransactionBatch() was never called, we want to start with first txn + int minOpenTxnIndex = Math.max(currentTxnIndex + + (state == TxnState.ABORTED || state == TxnState.COMMITTED ? 1 : 0), 0); + for (currentTxnIndex = minOpenTxnIndex; + currentTxnIndex < txnToWriteIds.size(); currentTxnIndex++) { + msClient.rollbackTxn(txnToWriteIds.get(currentTxnIndex).getTxnId()); + txnStatus[currentTxnIndex] = TxnState.ABORTED; + } + currentTxnIndex--;//since the loop left it == txnToWriteIds.size() + } else { + if (getCurrentTxnId() > 0) { + msClient.rollbackTxn(getCurrentTxnId()); + txnStatus[currentTxnIndex] = TxnState.ABORTED; + } + } + state = TxnState.ABORTED; + } catch (NoSuchTxnException e) { + throw new TransactionError("Unable to abort invalid transaction id : " + + getCurrentTxnId(), e); + } catch (TException e) { + throw new TransactionError("Unable to abort transaction id : " + + getCurrentTxnId(), e); + } + } + + public void heartbeat() throws StreamingException { + if (isClosed) { + return; + } + if (state != TxnState.OPEN && currentTxnIndex >= txnToWriteIds.size() - 1) { + //here means last txn in the batch is resolved but the close() hasn't been called yet so + //there is nothing to heartbeat + return; + } + //if here after commit()/abort() but before next beginNextTransaction(), currentTxnIndex still + //points at the last txn which we don't want to heartbeat + Long first = txnToWriteIds.get(state == TxnState.OPEN ?
currentTxnIndex : currentTxnIndex + 1).getTxnId(); + Long last = txnToWriteIds.get(txnToWriteIds.size() - 1).getTxnId(); + try { + HeartbeatTxnRangeResponse resp = heartbeaterMSClient.heartbeatTxnRange(first, last); + if (!resp.getAborted().isEmpty() || !resp.getNosuch().isEmpty()) { + throw new HeartBeatFailure(resp.getAborted(), resp.getNosuch()); + } + } catch (TException e) { + throw new StreamingException("Failure to heartbeat on ids (" + first + ", " + + last + ") on end point : " + conn); + } + } + + @Override + public boolean isClosed() { + return isClosed; + } + + /** + * Close the TransactionBatch. This will abort any still open txns in this batch. + * + * @throws StreamingException - failure when closing transaction batch + */ + @Override + public void close() throws StreamingException { + if (isClosed) { + return; + } + isClosed = true; + abortImpl(true);//abort proactively so that we don't wait for timeout + closeImpl();//perhaps we should add a version of RecordWriter.closeBatch(boolean abort) which + //will call RecordUpdater.close(boolean abort) + } + + private void closeImpl() throws StreamingException { + state = TxnState.INACTIVE; + recordWriter.closeBatch(); + } + + static LockRequest createLockRequest(final HiveStreamingConnection connection, + String partNameForLock, String user, long txnId, String agentInfo) { + LockRequestBuilder rqstBuilder = new LockRequestBuilder(agentInfo); + rqstBuilder.setUser(user); + rqstBuilder.setTransactionId(txnId); + + LockComponentBuilder lockCompBuilder = new LockComponentBuilder() + .setDbName(connection.database) + .setTableName(connection.table) + .setShared() + .setOperationType(DataOperationType.INSERT); + if (connection.isDynamicPartitioning()) { + lockCompBuilder.setIsDynamicPartitionWrite(true); + } + if (partNameForLock != null && !partNameForLock.isEmpty()) { + lockCompBuilder.setPartitionName(partNameForLock); + } + rqstBuilder.addLockComponent(lockCompBuilder.build()); + + return rqstBuilder.build(); + } + } // class TransactionBatchImpl + + private static HiveConf createHiveConf(Class clazz, String metaStoreUri) { + HiveConf conf = new HiveConf(clazz); + if (metaStoreUri != null) { + conf.set(MetastoreConf.ConfVars.THRIFT_URIS.getHiveName(), metaStoreUri); + } + HiveStreamingConnection.overrideConfSettings(conf); + return conf; + } + + private static void overrideConfSettings(HiveConf conf) { + setHiveConf(conf, HiveConf.ConfVars.HIVE_TXN_MANAGER, DbTxnManager.class.getName()); + setHiveConf(conf, HiveConf.ConfVars.HIVE_SUPPORT_CONCURRENCY); + setHiveConf(conf, MetastoreConf.ConfVars.EXECUTE_SET_UGI.getHiveName()); + // Avoids creating Tez Client sessions internally as it takes much longer currently + setHiveConf(conf, HiveConf.ConfVars.HIVE_EXECUTION_ENGINE, "mr"); + setHiveConf(conf, HiveConf.ConfVars.DYNAMICPARTITIONINGMODE, "nonstrict"); + } + + private static void setHiveConf(HiveConf conf, HiveConf.ConfVars var, String value) { + if (LOG.isDebugEnabled()) { + LOG.debug("Overriding HiveConf setting : " + var + " = " + value); + } + conf.setVar(var, value); + } + + private static void setHiveConf(HiveConf conf, HiveConf.ConfVars var) { + if (LOG.isDebugEnabled()) { + LOG.debug("Overriding HiveConf setting : " + var + " = " + true); + } + conf.setBoolVar(var, true); + } + + private static void setHiveConf(HiveConf conf, String var) { + if (LOG.isDebugEnabled()) { + LOG.debug("Overriding HiveConf setting : " + var + " = " + true); + } + conf.setBoolean(var, true); + } + + @Override + public HiveConf
getHiveConf() { + return conf; + } + + @Override + public String getMetastoreUri() { + return metastoreUri; + } + + @Override + public String getDatabase() { + return database; + } + + @Override + public String getTable() { + return table; + } + + @Override + public List getStaticPartitionValues() { + return staticPartitionValues; + } + + @Override + public String getAgentInfo() { + return agentInfo; + } + + @Override + public boolean isPartitionedTable() { + return isPartitionedTable; + } + + @Override + public boolean isStaticPartitioning() { + return isPartitionedTable() && (staticPartitionValues != null && !staticPartitionValues.isEmpty()); + } + + @Override + public boolean isDynamicPartitioning() { + return isPartitionedTable() && (staticPartitionValues == null || staticPartitionValues.isEmpty()); + } +} diff --git a/streaming/src/java/org/apache/hive/streaming/ImpersonationFailed.java b/streaming/src/java/org/apache/hive/streaming/ImpersonationFailed.java deleted file mode 100644 index 23e17e7..0000000 --- a/streaming/src/java/org/apache/hive/streaming/ImpersonationFailed.java +++ /dev/null @@ -1,25 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hive.streaming; - -public class ImpersonationFailed extends StreamingException { - public ImpersonationFailed(String username, Exception e) { - super("Failed to impersonate user " + username, e); - } -} diff --git a/streaming/src/java/org/apache/hive/streaming/InvalidColumn.java b/streaming/src/java/org/apache/hive/streaming/InvalidColumn.java deleted file mode 100644 index 0011b14..0000000 --- a/streaming/src/java/org/apache/hive/streaming/InvalidColumn.java +++ /dev/null @@ -1,26 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License.
- */ - -package org.apache.hive.streaming; - -public class InvalidColumn extends StreamingException { - - public InvalidColumn(String msg) { - super(msg); - } -} diff --git a/streaming/src/java/org/apache/hive/streaming/InvalidPartition.java b/streaming/src/java/org/apache/hive/streaming/InvalidPartition.java deleted file mode 100644 index f1f9804..0000000 --- a/streaming/src/java/org/apache/hive/streaming/InvalidPartition.java +++ /dev/null @@ -1,28 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hive.streaming; - -public class InvalidPartition extends StreamingException { - - public InvalidPartition(String partitionName, String partitionValue) { - super("Invalid partition: Name=" + partitionName + - ", Value=" + partitionValue); - } - -} diff --git a/streaming/src/java/org/apache/hive/streaming/InvalidTable.java b/streaming/src/java/org/apache/hive/streaming/InvalidTable.java index ef1c91d..5c60160 100644 --- a/streaming/src/java/org/apache/hive/streaming/InvalidTable.java +++ b/streaming/src/java/org/apache/hive/streaming/InvalidTable.java @@ -24,15 +24,11 @@ private static String makeMsg(String db, String table) { return "Invalid table db:" + db + ", table:" + table; } - public InvalidTable(String db, String table) { - super(makeMsg(db,table), null); - } - - public InvalidTable(String db, String table, String msg) { + InvalidTable(String db, String table, String msg) { super(makeMsg(db, table) + ": " + msg, null); } - public InvalidTable(String db, String table, Exception inner) { + InvalidTable(String db, String table, Exception inner) { super(makeMsg(db, table) + ": " + inner.getMessage(), inner); } } diff --git a/streaming/src/java/org/apache/hive/streaming/InvalidTransactionState.java b/streaming/src/java/org/apache/hive/streaming/InvalidTransactionState.java new file mode 100644 index 0000000..9d92dfa --- /dev/null +++ b/streaming/src/java/org/apache/hive/streaming/InvalidTransactionState.java @@ -0,0 +1,25 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hive.streaming; + +public class InvalidTransactionState extends TransactionError { + InvalidTransactionState(String msg) { + super(msg); + } +} diff --git a/streaming/src/java/org/apache/hive/streaming/InvalidTrasactionState.java b/streaming/src/java/org/apache/hive/streaming/InvalidTrasactionState.java deleted file mode 100644 index 762f5f8..0000000 --- a/streaming/src/java/org/apache/hive/streaming/InvalidTrasactionState.java +++ /dev/null @@ -1,26 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hive.streaming; - -public class InvalidTrasactionState extends TransactionError { - public InvalidTrasactionState(String msg) { - super(msg); - } - -} diff --git a/streaming/src/java/org/apache/hive/streaming/PartitionCreationFailed.java b/streaming/src/java/org/apache/hive/streaming/PartitionCreationFailed.java index 5f9aca6..e464399 100644 --- a/streaming/src/java/org/apache/hive/streaming/PartitionCreationFailed.java +++ b/streaming/src/java/org/apache/hive/streaming/PartitionCreationFailed.java @@ -19,7 +19,7 @@ package org.apache.hive.streaming; public class PartitionCreationFailed extends StreamingException { - public PartitionCreationFailed(HiveEndPoint endPoint, Exception cause) { - super("Failed to create partition " + endPoint, cause); + PartitionCreationFailed(StreamingConnection connection, Throwable cause) { + super("Failed to create partition " + connection, cause); } } diff --git a/streaming/src/java/org/apache/hive/streaming/PartitionHandler.java b/streaming/src/java/org/apache/hive/streaming/PartitionHandler.java new file mode 100644 index 0000000..33e6d24 --- /dev/null +++ b/streaming/src/java/org/apache/hive/streaming/PartitionHandler.java @@ -0,0 +1,43 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hive.streaming; + +import java.util.List; + +import org.apache.hadoop.fs.Path; + +public interface PartitionHandler { + + /** + * Creates a partition if it does not exist. + * + * @param partitionValues - partition values + * @throws StreamingException - any metastore related exceptions + */ + void createPartitionIfNotExists(List partitionValues) throws StreamingException; + + /** + * Return partition location or table location. + * + * @param partitionValues - partition values + * @return - table location when partitionValues is null or empty else partition location + * @throws StreamingException - any metastore related exceptions + */ + Path getPathForPartition(List partitionValues) throws StreamingException; +} diff --git a/streaming/src/java/org/apache/hive/streaming/QueryFailedException.java b/streaming/src/java/org/apache/hive/streaming/QueryFailedException.java deleted file mode 100644 index ccd3ae0..0000000 --- a/streaming/src/java/org/apache/hive/streaming/QueryFailedException.java +++ /dev/null @@ -1,28 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hive.streaming; - -public class QueryFailedException extends StreamingException { - String query; - - public QueryFailedException(String query, Exception e) { - super("Query failed: " + query + ". Due to :" + e.getMessage(), e); - this.query = query; - } -} diff --git a/streaming/src/java/org/apache/hive/streaming/RecordWriter.java b/streaming/src/java/org/apache/hive/streaming/RecordWriter.java index dc6d70e..ca226d7 100644 --- a/streaming/src/java/org/apache/hive/streaming/RecordWriter.java +++ b/streaming/src/java/org/apache/hive/streaming/RecordWriter.java @@ -21,23 +21,38 @@ public interface RecordWriter { - /** Writes using a hive RecordUpdater + /** + * Initialize record writer. + * + * @throws StreamingException - thrown when initialization failed + */ + void init(StreamingConnection connection) throws StreamingException; + + /** + * Writes using a hive RecordUpdater * * @param writeId the write ID of the table mapping to Txn in which the write occurs - * @param record the record to be written + * @param record the record to be written */ void write(long writeId, byte[] record) throws StreamingException; - /** Flush records from buffer. Invoked by TransactionBatch.commit() */ + /** + * Flush records from buffer. Invoked by TransactionBatch.commit() + */ void flush() throws StreamingException; - /** Clear bufferred writes. Invoked by TransactionBatch.abort() */ - void clear() throws StreamingException; - - /** Acquire a new RecordUpdater. 
Invoked when - * StreamingConnection.fetchTransactionBatch() is called */ - void newBatch(Long minWriteId, Long maxWriteID) throws StreamingException; + /** + * Acquire a new RecordUpdater. + * + * @param minWriteId - min write id + * @param maxWriteID - max write id + */ + void newBatch(Long minWriteId, Long maxWriteID); - /** Close the RecordUpdater. Invoked by TransactionBatch.close() */ + /** + * Close the RecordUpdater. Invoked by TransactionBatch.close() + * + * @throws StreamingException - thrown when record writer cannot be closed. + */ void closeBatch() throws StreamingException; } diff --git a/streaming/src/java/org/apache/hive/streaming/SerializationError.java b/streaming/src/java/org/apache/hive/streaming/SerializationError.java index a57ba00..1473ff8 100644 --- a/streaming/src/java/org/apache/hive/streaming/SerializationError.java +++ b/streaming/src/java/org/apache/hive/streaming/SerializationError.java @@ -20,7 +20,7 @@ public class SerializationError extends StreamingException { - public SerializationError(String msg, Exception e) { + SerializationError(String msg, Exception e) { super(msg,e); } } diff --git a/streaming/src/java/org/apache/hive/streaming/StreamingConnection.java b/streaming/src/java/org/apache/hive/streaming/StreamingConnection.java index 2f760ea..9731c49 100644 --- a/streaming/src/java/org/apache/hive/streaming/StreamingConnection.java +++ b/streaming/src/java/org/apache/hive/streaming/StreamingConnection.java @@ -18,40 +18,18 @@ package org.apache.hive.streaming; -import org.apache.hadoop.security.UserGroupInformation; - -/** - * Represents a connection to a HiveEndPoint. Used to acquire transaction batches. - * Note: the expectation is that there is at most 1 TransactionBatch outstanding for any given - * StreamingConnection. Violating this may result in "out of sequence response". - */ -public interface StreamingConnection { - - /** - * Acquires a new batch of transactions from Hive. - - * @param numTransactionsHint is a hint from client indicating how many transactions client needs. - * @param writer Used to write record. The same writer instance can - * be shared with another TransactionBatch (to the same endpoint) - * only after the first TransactionBatch has been closed. - * Writer will be closed when the TransactionBatch is closed. - * @return - * @throws ConnectionError - * @throws InvalidPartition - * @throws StreamingException - * @return a batch of transactions - */ - public TransactionBatch fetchTransactionBatch(int numTransactionsHint, - RecordWriter writer) - throws ConnectionError, StreamingException, InterruptedException; +import org.apache.hadoop.hive.conf.HiveConf; +public interface StreamingConnection extends ConnectionInfo, PartitionHandler, TransactionBatch { /** - * Close connection + * Returns hive configuration object used during connection creation. + * + * @return - hive conf */ - public void close(); + HiveConf getHiveConf(); /** - * @return UserGroupInformation associated with this connection or {@code null} if there is none + * Closes streaming connection. 
*/ - UserGroupInformation getUserGroupInformation(); + void close(); } diff --git a/streaming/src/java/org/apache/hive/streaming/StreamingException.java b/streaming/src/java/org/apache/hive/streaming/StreamingException.java index a7f84c1..1af5c6a 100644 --- a/streaming/src/java/org/apache/hive/streaming/StreamingException.java +++ b/streaming/src/java/org/apache/hive/streaming/StreamingException.java @@ -19,7 +19,7 @@ package org.apache.hive.streaming; public class StreamingException extends Exception { - public StreamingException(String msg, Exception cause) { + public StreamingException(String msg, Throwable cause) { super(msg, cause); } public StreamingException(String msg) { diff --git a/streaming/src/java/org/apache/hive/streaming/StreamingIOFailure.java b/streaming/src/java/org/apache/hive/streaming/StreamingIOFailure.java index 0dfbfa7..090167d 100644 --- a/streaming/src/java/org/apache/hive/streaming/StreamingIOFailure.java +++ b/streaming/src/java/org/apache/hive/streaming/StreamingIOFailure.java @@ -21,11 +21,11 @@ public class StreamingIOFailure extends StreamingException { - public StreamingIOFailure(String msg, Exception cause) { + StreamingIOFailure(String msg, Exception cause) { super(msg, cause); } - public StreamingIOFailure(String msg) { + StreamingIOFailure(String msg) { super(msg); } } diff --git a/streaming/src/java/org/apache/hive/streaming/StrictDelimitedInputWriter.java b/streaming/src/java/org/apache/hive/streaming/StrictDelimitedInputWriter.java new file mode 100644 index 0000000..6ea0319 --- /dev/null +++ b/streaming/src/java/org/apache/hive/streaming/StrictDelimitedInputWriter.java @@ -0,0 +1,108 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hive.streaming; + + +import java.util.Properties; + +import org.apache.hadoop.hive.metastore.utils.MetaStoreUtils; +import org.apache.hadoop.hive.serde.serdeConstants; +import org.apache.hadoop.hive.serde2.SerDeException; +import org.apache.hadoop.hive.serde2.SerDeUtils; +import org.apache.hadoop.hive.serde2.lazy.LazySerDeParameters; +import org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe; +import org.apache.hadoop.io.BytesWritable; + +import com.google.common.base.Joiner; + +/** + * Streaming Writer handles delimited input (eg. CSV). + * Delimited input is parsed to extract partition values, bucketing info and is forwarded to record updater. 
+ * Uses Lazy Simple SerDe to process delimited input + */ +public class StrictDelimitedInputWriter extends AbstractRecordWriter { + private char fieldDelimiter; + private char collectionDelimiter; + private char mapKeyDelimiter; + private LazySimpleSerDe serde; + + private StrictDelimitedInputWriter(Builder builder) { + this.fieldDelimiter = builder.fieldDelimiter; + this.collectionDelimiter = builder.collectionDelimiter; + this.mapKeyDelimiter = builder.mapKeyDelimiter; + } + + public static Builder newBuilder() { + return new Builder(); + } + + public static class Builder { + private char fieldDelimiter = (char) LazySerDeParameters.DefaultSeparators[0]; + private char collectionDelimiter = (char) LazySerDeParameters.DefaultSeparators[1]; + private char mapKeyDelimiter = (char) LazySerDeParameters.DefaultSeparators[2]; + + public Builder withFieldDelimiter(final char fieldDelimiter) { + this.fieldDelimiter = fieldDelimiter; + return this; + } + + public Builder withCollectionDelimiter(final char collectionDelimiter) { + this.collectionDelimiter = collectionDelimiter; + return this; + } + + public Builder withMapKeyDelimiter(final char mapKeyDelimiter) { + this.mapKeyDelimiter = mapKeyDelimiter; + return this; + } + + public StrictDelimitedInputWriter build() { + return new StrictDelimitedInputWriter(this); + } + } + + @Override + public Object encode(byte[] record) throws SerializationError { + try { + BytesWritable blob = new BytesWritable(); + blob.set(record, 0, record.length); + return serde.deserialize(blob); + } catch (SerDeException e) { + throw new SerializationError("Unable to convert byte[] record into Object", e); + } + } + + @Override + public LazySimpleSerDe createSerde() throws SerializationError { + try { + Properties tableProps = MetaStoreUtils.getTableMetadata(tbl); + tableProps.setProperty(serdeConstants.LIST_COLUMNS, Joiner.on(",").join(inputColumns)); + tableProps.setProperty(serdeConstants.LIST_COLUMN_TYPES, Joiner.on(":").join(inputTypes)); + tableProps.setProperty(serdeConstants.FIELD_DELIM, String.valueOf(fieldDelimiter)); + tableProps.setProperty(serdeConstants.COLLECTION_DELIM, String.valueOf(collectionDelimiter)); + tableProps.setProperty(serdeConstants.MAPKEY_DELIM, String.valueOf(mapKeyDelimiter)); + LazySimpleSerDe serde = new LazySimpleSerDe(); + SerDeUtils.initializeSerDe(serde, conf, tableProps, null); + this.serde = serde; + return serde; + } catch (SerDeException e) { + throw new SerializationError("Error initializing serde", e); + } + } +} diff --git a/streaming/src/java/org/apache/hive/streaming/StrictJsonWriter.java b/streaming/src/java/org/apache/hive/streaming/StrictJsonWriter.java index 0077913..570aaed 100644 --- a/streaming/src/java/org/apache/hive/streaming/StrictJsonWriter.java +++ b/streaming/src/java/org/apache/hive/streaming/StrictJsonWriter.java @@ -18,131 +18,43 @@ package org.apache.hive.streaming; -import org.apache.hadoop.hive.conf.HiveConf; -import org.apache.hadoop.hive.metastore.api.Table; +import java.util.Properties; + import org.apache.hadoop.hive.metastore.utils.MetaStoreUtils; -import org.apache.hadoop.hive.serde2.AbstractSerDe; +import org.apache.hadoop.hive.serde2.JsonSerDe; import org.apache.hadoop.hive.serde2.SerDeException; import org.apache.hadoop.hive.serde2.SerDeUtils; -import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.StructField; import org.apache.hadoop.io.Text; -import org.apache.hive.hcatalog.data.HCatRecordObjectInspector; -import 
org.apache.hive.hcatalog.data.JsonSerDe; - -import java.io.IOException; -import java.util.List; -import java.util.Properties; /** * Streaming Writer handles utf8 encoded Json (Strict syntax). - * Uses org.apache.hive.hcatalog.data.JsonSerDe to process Json input + * Uses org.apache.hadoop.hive.serde2.JsonSerDe to process Json input */ public class StrictJsonWriter extends AbstractRecordWriter { private JsonSerDe serde; - private final HCatRecordObjectInspector recordObjInspector; - private final ObjectInspector[] bucketObjInspectors; - private final StructField[] bucketStructFields; - - /** - * @deprecated As of release 1.3/2.1. Replaced by {@link #StrictJsonWriter(HiveEndPoint, HiveConf, StreamingConnection)} - */ - public StrictJsonWriter(HiveEndPoint endPoint) - throws ConnectionError, SerializationError, StreamingException { - this(endPoint, null, null); - } - - /** - * @deprecated As of release 1.3/2.1. Replaced by {@link #StrictJsonWriter(HiveEndPoint, HiveConf, StreamingConnection)} - */ - public StrictJsonWriter(HiveEndPoint endPoint, HiveConf conf) throws StreamingException { - this(endPoint, conf, null); - } - /** - * @param endPoint the end point to write to - * @throws ConnectionError - * @throws SerializationError - * @throws StreamingException - */ - public StrictJsonWriter(HiveEndPoint endPoint, StreamingConnection conn) - throws ConnectionError, SerializationError, StreamingException { - this(endPoint, null, conn); - } - /** - * @param endPoint the end point to write to - * @param conf a Hive conf object. Should be null if not using advanced Hive settings. - * @param conn connection this Writer is to be used with - * @throws ConnectionError - * @throws SerializationError - * @throws StreamingException - */ - public StrictJsonWriter(HiveEndPoint endPoint, HiveConf conf, StreamingConnection conn) - throws ConnectionError, SerializationError, StreamingException { - super(endPoint, conf, conn); - this.serde = createSerde(tbl, conf); - // get ObjInspectors for entire record and bucketed cols - try { - recordObjInspector = ( HCatRecordObjectInspector ) serde.getObjectInspector(); - this.bucketObjInspectors = getObjectInspectorsForBucketedCols(bucketIds, recordObjInspector); - } catch (SerDeException e) { - throw new SerializationError("Unable to get ObjectInspector for bucket columns", e); - } - - // get StructFields for bucketed cols - bucketStructFields = new StructField[bucketIds.size()]; - List allFields = recordObjInspector.getAllStructFieldRefs(); - for (int i = 0; i < bucketIds.size(); i++) { - bucketStructFields[i] = allFields.get(bucketIds.get(i)); - } - } - - @Override - public AbstractSerDe getSerde() { - return serde; - } - - protected HCatRecordObjectInspector getRecordObjectInspector() { - return recordObjInspector; + public static Builder newBuilder() { + return new Builder(); } - @Override - protected StructField[] getBucketStructFields() { - return bucketStructFields; - } - - protected ObjectInspector[] getBucketObjectInspectors() { - return bucketObjInspectors; - } - - - @Override - public void write(long writeId, byte[] record) - throws StreamingIOFailure, SerializationError { - try { - Object encodedRow = encode(record); - int bucket = getBucket(encodedRow); - getRecordUpdater(bucket).insert(writeId, encodedRow); - } catch (IOException e) { - throw new StreamingIOFailure("Error writing record in transaction write id(" - + writeId + ")", e); + public static class Builder { + public StrictJsonWriter build() { + return new StrictJsonWriter(); } - } /** * Creates 
JsonSerDe - * @param tbl used to create serde - * @param conf used to create serde - * @return + * * @throws SerializationError if serde could not be initialized */ - private static JsonSerDe createSerde(Table tbl, HiveConf conf) - throws SerializationError { + @Override + public JsonSerDe createSerde() throws SerializationError { try { Properties tableProps = MetaStoreUtils.getTableMetadata(tbl); JsonSerDe serde = new JsonSerDe(); SerDeUtils.initializeSerDe(serde, conf, tableProps, null); + this.serde = serde; return serde; } catch (SerDeException e) { throw new SerializationError("Error initializing serde " + JsonSerDe.class.getName(), e); @@ -158,5 +70,4 @@ public Object encode(byte[] utf8StrRecord) throws SerializationError { throw new SerializationError("Unable to convert byte[] record into Object", e); } } - } diff --git a/streaming/src/java/org/apache/hive/streaming/StrictRegexWriter.java b/streaming/src/java/org/apache/hive/streaming/StrictRegexWriter.java index c0b7324..db25176 100644 --- a/streaming/src/java/org/apache/hive/streaming/StrictRegexWriter.java +++ b/streaming/src/java/org/apache/hive/streaming/StrictRegexWriter.java @@ -18,24 +18,14 @@ package org.apache.hive.streaming; -import java.io.IOException; -import java.util.ArrayList; -import java.util.List; import java.util.Properties; import org.apache.commons.lang.StringUtils; -import org.apache.hadoop.hive.conf.HiveConf; -import org.apache.hadoop.hive.metastore.api.FieldSchema; -import org.apache.hadoop.hive.metastore.api.Table; import org.apache.hadoop.hive.metastore.utils.MetaStoreUtils; import org.apache.hadoop.hive.serde.serdeConstants; -import org.apache.hadoop.hive.serde2.AbstractSerDe; import org.apache.hadoop.hive.serde2.RegexSerDe; import org.apache.hadoop.hive.serde2.SerDeException; import org.apache.hadoop.hive.serde2.SerDeUtils; -import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.StructField; -import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; import org.apache.hadoop.io.Text; /** @@ -43,138 +33,56 @@ * org.apache.hadoop.hive.serde2.RegexSerDe */ public class StrictRegexWriter extends AbstractRecordWriter { + private String regex; private RegexSerDe serde; - private final StructObjectInspector recordObjInspector; - private final ObjectInspector[] bucketObjInspectors; - private final StructField[] bucketStructFields; - /** - * @param endPoint the end point to write to - * @param conn connection this Writer is to be used with - * @throws ConnectionError - * @throws SerializationError - * @throws StreamingException - */ - public StrictRegexWriter(HiveEndPoint endPoint, StreamingConnection conn) - throws ConnectionError, SerializationError, StreamingException { - this(null, endPoint, null, conn); + private StrictRegexWriter(final Builder builder) { + this.regex = builder.regex; } - /** - * @param endPoint the end point to write to - * @param conf a Hive conf object. Should be null if not using advanced Hive settings. 
- * @param conn connection this Writer is to be used with - * @throws ConnectionError - * @throws SerializationError - * @throws StreamingException - */ - public StrictRegexWriter(HiveEndPoint endPoint, HiveConf conf, StreamingConnection conn) - throws ConnectionError, SerializationError, StreamingException { - this(null, endPoint, conf, conn); + public static Builder newBuilder() { + return new Builder(); } - /** - * @param regex to parse the data - * @param endPoint the end point to write to - * @param conf a Hive conf object. Should be null if not using advanced Hive settings. - * @param conn connection this Writer is to be used with - * @throws ConnectionError - * @throws SerializationError - * @throws StreamingException - */ - public StrictRegexWriter(String regex, HiveEndPoint endPoint, HiveConf conf, StreamingConnection conn) - throws ConnectionError, SerializationError, StreamingException { - super(endPoint, conf, conn); - this.serde = createSerde(tbl, conf, regex); - // get ObjInspectors for entire record and bucketed cols - try { - recordObjInspector = (StructObjectInspector) serde.getObjectInspector(); - this.bucketObjInspectors = getObjectInspectorsForBucketedCols(bucketIds, recordObjInspector); - } catch (SerDeException e) { - throw new SerializationError("Unable to get ObjectInspector for bucket columns", e); - } + public static class Builder { + private String regex; - // get StructFields for bucketed cols - bucketStructFields = new StructField[bucketIds.size()]; - List allFields = recordObjInspector.getAllStructFieldRefs(); - for (int i = 0; i < bucketIds.size(); i++) { - bucketStructFields[i] = allFields.get(bucketIds.get(i)); + public Builder withRegex(final String regex) { + this.regex = regex; + return this; } - } - @Override - public AbstractSerDe getSerde() { - return serde; - } - - @Override - protected StructObjectInspector getRecordObjectInspector() { - return recordObjInspector; - } - - @Override - protected StructField[] getBucketStructFields() { - return bucketStructFields; - } - - @Override - protected ObjectInspector[] getBucketObjectInspectors() { - return bucketObjInspectors; - } - - - @Override - public void write(long writeId, byte[] record) - throws StreamingIOFailure, SerializationError { - try { - Object encodedRow = encode(record); - int bucket = getBucket(encodedRow); - getRecordUpdater(bucket).insert(writeId, encodedRow); - } catch (IOException e) { - throw new StreamingIOFailure("Error writing record in transaction write id(" - + writeId + ")", e); + public StrictRegexWriter build() { + return new StrictRegexWriter(this); } } /** * Creates RegexSerDe * - * @param tbl used to create serde - * @param conf used to create serde - * @param regex used to create serde - * @return * @throws SerializationError if serde could not be initialized */ - private static RegexSerDe createSerde(Table tbl, HiveConf conf, String regex) - throws SerializationError { + @Override + public RegexSerDe createSerde() throws SerializationError { try { Properties tableProps = MetaStoreUtils.getTableMetadata(tbl); tableProps.setProperty(RegexSerDe.INPUT_REGEX, regex); - ArrayList tableColumns = getCols(tbl); - tableProps.setProperty(serdeConstants.LIST_COLUMNS, StringUtils.join(tableColumns, ",")); + tableProps.setProperty(serdeConstants.LIST_COLUMNS, StringUtils.join(inputColumns, ",")); RegexSerDe serde = new RegexSerDe(); SerDeUtils.initializeSerDe(serde, conf, tableProps, null); + this.serde = serde; return serde; } catch (SerDeException e) { throw new 
SerializationError("Error initializing serde " + RegexSerDe.class.getName(), e); } } - private static ArrayList getCols(Table table) { - List cols = table.getSd().getCols(); - ArrayList colNames = new ArrayList(cols.size()); - for (FieldSchema col : cols) { - colNames.add(col.getName().toLowerCase()); - } - return colNames; - } - /** * Encode Utf8 encoded string bytes using RegexSerDe * - * @param utf8StrRecord + * @param utf8StrRecord - serialized record * @return The encoded object - * @throws SerializationError + * @throws SerializationError - in case of any deserialization error */ @Override public Object encode(byte[] utf8StrRecord) throws SerializationError { diff --git a/streaming/src/java/org/apache/hive/streaming/TransactionBatch.java b/streaming/src/java/org/apache/hive/streaming/TransactionBatch.java index 2b05771..eb85c0e 100644 --- a/streaming/src/java/org/apache/hive/streaming/TransactionBatch.java +++ b/streaming/src/java/org/apache/hive/streaming/TransactionBatch.java @@ -23,103 +23,101 @@ /** * Represents a set of Transactions returned by Hive. Supports opening, writing to - * and commiting/aborting each transaction. The interface is designed to ensure + * and committing/aborting each transaction. The interface is designed to ensure * transactions in a batch are used up sequentially. To stream to the same HiveEndPoint * concurrently, create separate StreamingConnections. - * - * Note on thread safety: At most 2 threads can run through a given TransactionBatch at the same - * time. One thread may call {@link #heartbeat()} and the other all other methods. - * Violating this may result in "out of sequence response". - * */ -public interface TransactionBatch { +public interface TransactionBatch { enum TxnState { INACTIVE("I"), OPEN("O"), COMMITTED("C"), ABORTED("A"); private final String code; + TxnState(String code) { this.code = code; - }; + } + public String toString() { return code; } } /** - * Activate the next available transaction in the current transaction batch. - * @throws StreamingException if not able to switch to next Txn - * @throws InterruptedException if call in interrupted + * Begins a transaction batch. If the current transaction batch is exhausted then a new transaction batch is created. + * + * @throws StreamingException - if connection is closed already or if failed to create transaction batch */ - void beginNextTransaction() throws StreamingException, InterruptedException; + void beginNextTransaction() throws StreamingException; /** * Get Id of currently open transaction. - * @return transaction id + * + * @return - transaction id */ Long getCurrentTxnId(); - /** * Get write Id mapping to currently open transaction. - * @return write id + * + * @return - write id */ Long getCurrentWriteId(); /** - * get state of current transaction. + * Get state of current transaction. + * + * @return - transaction state */ TxnState getCurrentTransactionState(); /** * Commit the currently open transaction. - * @throws StreamingException if there are errors committing - * @throws InterruptedException if call in interrupted + * + * @throws StreamingException - if there are errors committing */ - void commit() throws StreamingException, InterruptedException; + void commit() throws StreamingException; /** * Abort the currently open transaction. 
- * @throws StreamingException if there are errors - * @throws InterruptedException if call in interrupted + * + * @throws StreamingException - if there are errors */ - void abort() throws StreamingException, InterruptedException; + void abort() throws StreamingException; /** * Remaining transactions are the ones that are not committed or aborted or open. * Current open transaction is not considered part of remaining txns. - * @return number of transactions remaining this batch. + * + * @return - number of transactions remaining this batch. */ int remainingTransactions(); - /** - * Write record using RecordWriter. - * @param record the data to be written - * @throws StreamingException if there are errors when writing - * @throws InterruptedException if call in interrupted + * Write record using RecordWriter. + * + * @param record - the data to be written + * @throws StreamingException - if there are errors when writing */ - void write(byte[] record) throws StreamingException, InterruptedException; + void write(byte[] record) throws StreamingException; /** - * Write records using RecordWriter. - * @throws StreamingException if there are errors when writing - * @throws InterruptedException if call in interrupted + * Write records using RecordWriter. + * + * @throws StreamingException - if there are errors when writing */ - void write(Collection records) throws StreamingException, InterruptedException; - + void write(Collection records) throws StreamingException; /** - * Issues a heartbeat to hive metastore on the current and remaining txn ids - * to keep them from expiring. - * @throws StreamingException if there are errors + * Close the TransactionBatch. + * + * @throws StreamingException - if there are errors closing batch */ - void heartbeat() throws StreamingException; + void close() throws StreamingException; /** - * Close the TransactionBatch. - * @throws StreamingException if there are errors closing batch - * @throws InterruptedException if call in interrupted + * Returns true if transaction batch is closed already. + * + * @return - true if batch is closed else false */ - void close() throws StreamingException, InterruptedException; boolean isClosed(); } diff --git a/streaming/src/java/org/apache/hive/streaming/TransactionBatchUnAvailable.java b/streaming/src/java/org/apache/hive/streaming/TransactionBatchUnAvailable.java deleted file mode 100644 index a8c8cd4..0000000 --- a/streaming/src/java/org/apache/hive/streaming/TransactionBatchUnAvailable.java +++ /dev/null @@ -1,25 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.hive.streaming; - -public class TransactionBatchUnAvailable extends StreamingException { - public TransactionBatchUnAvailable(HiveEndPoint ep, Exception e) { - super("Unable to acquire transaction batch on end point: " + ep, e); - } -} diff --git a/streaming/src/java/org/apache/hive/streaming/TransactionError.java b/streaming/src/java/org/apache/hive/streaming/TransactionError.java index a331b20..ae56e7c 100644 --- a/streaming/src/java/org/apache/hive/streaming/TransactionError.java +++ b/streaming/src/java/org/apache/hive/streaming/TransactionError.java @@ -19,11 +19,11 @@ package org.apache.hive.streaming; public class TransactionError extends StreamingException { - public TransactionError(String msg, Exception e) { + TransactionError(String msg, Exception e) { super(msg + (e == null ? "" : ": " + e.getMessage()), e); } - public TransactionError(String msg) { + TransactionError(String msg) { super(msg); } } diff --git a/streaming/src/test/org/apache/hive/streaming/TestDelimitedInputWriter.java b/streaming/src/test/org/apache/hive/streaming/TestDelimitedInputWriter.java deleted file mode 100644 index f0843a1..0000000 --- a/streaming/src/test/org/apache/hive/streaming/TestDelimitedInputWriter.java +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.hive.streaming; - -import java.util.ArrayList; -import java.util.Arrays; - -import org.junit.Test; - -import com.google.common.collect.Lists; - -import junit.framework.Assert; - -public class TestDelimitedInputWriter { - @Test - public void testFieldReordering() throws Exception { - - ArrayList colNames = Lists.newArrayList(new String[]{"col1", "col2", "col3", "col4", "col5"}); - {//1) test dropping fields - first middle & last - String[] fieldNames = {null, "col2", null, "col4", null}; - int[] mapping = DelimitedInputWriter.getFieldReordering(fieldNames, colNames); - Assert.assertTrue(Arrays.equals(mapping, new int[]{-1, 1, -1, 3, -1})); - } - - {//2) test reordering - String[] fieldNames = {"col5", "col4", "col3", "col2", "col1"}; - int[] mapping = DelimitedInputWriter.getFieldReordering(fieldNames, colNames); - Assert.assertTrue( Arrays.equals(mapping, new int[]{4,3,2,1,0}) ); - } - - {//3) test bad field names - String[] fieldNames = {"xyz", "abc", "col3", "col4", "as"}; - try { - DelimitedInputWriter.getFieldReordering(fieldNames, colNames); - Assert.fail(); - } catch (InvalidColumn e) { - // should throw - } - } - - {//4) test few field names - String[] fieldNames = {"col3", "col4"}; - int[] mapping = DelimitedInputWriter.getFieldReordering(fieldNames, colNames); - Assert.assertTrue( Arrays.equals(mapping, new int[]{2,3}) ); - } - - {//5) test extra field names - String[] fieldNames = {"col5", "col4", "col3", "col2", "col1", "col1"}; - try { - DelimitedInputWriter.getFieldReordering(fieldNames, colNames); - Assert.fail(); - } catch (InvalidColumn e) { - //show throw - } - } - } -} diff --git a/streaming/src/test/org/apache/hive/streaming/TestStreaming.java b/streaming/src/test/org/apache/hive/streaming/TestStreaming.java index 6f63bfb..eb4e27f 100644 --- a/streaming/src/test/org/apache/hive/streaming/TestStreaming.java +++ b/streaming/src/test/org/apache/hive/streaming/TestStreaming.java @@ -53,12 +53,10 @@ import org.apache.hadoop.hive.conf.Validator; import org.apache.hadoop.hive.metastore.HiveMetaStoreClient; import org.apache.hadoop.hive.metastore.IMetaStoreClient; -import org.apache.hadoop.hive.metastore.api.FieldSchema; import org.apache.hadoop.hive.metastore.api.GetOpenTxnsInfoResponse; import org.apache.hadoop.hive.metastore.api.LockState; import org.apache.hadoop.hive.metastore.api.LockType; import org.apache.hadoop.hive.metastore.api.MetaException; -import org.apache.hadoop.hive.metastore.api.NoSuchObjectException; import org.apache.hadoop.hive.metastore.api.Partition; import org.apache.hadoop.hive.metastore.api.ShowLocksRequest; import org.apache.hadoop.hive.metastore.api.ShowLocksResponse; @@ -88,13 +86,11 @@ import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableIntObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableLongObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableStringObjectInspector; -import org.apache.hadoop.hive.shims.Utils; import org.apache.hadoop.io.NullWritable; import org.apache.hadoop.mapred.InputFormat; import org.apache.hadoop.mapred.InputSplit; import org.apache.hadoop.mapred.JobConf; import org.apache.hadoop.mapred.Reporter; -import org.apache.hadoop.security.UserGroupInformation; import org.apache.orc.impl.OrcAcidUtils; import org.apache.orc.tools.FileDump; import org.apache.thrift.TException; @@ -113,6 +109,7 @@ public static class RawFileSystem extends RawLocalFileSystem { private static final URI NAME; + static { try { NAME = new 
URI("raw:///"); @@ -126,12 +123,16 @@ public URI getUri() { return NAME; } + @Override + public String getScheme() { + return "raw"; + } @Override public FileStatus getFileStatus(Path path) throws IOException { File file = pathToFile(path); if (!file.exists()) { - throw new FileNotFoundException("Can't find " + path); + throw new FileNotFoundException("Can'table find " + path); } // get close enough short mod = 0; @@ -145,32 +146,30 @@ public FileStatus getFileStatus(Path path) throws IOException { mod |= 0111; } return new FileStatus(file.length(), file.isDirectory(), 1, 1024, - file.lastModified(), file.lastModified(), - FsPermission.createImmutable(mod), "owen", "users", path); + file.lastModified(), file.lastModified(), + FsPermission.createImmutable(mod), "owen", "users", path); } } private static final String COL1 = "id"; private static final String COL2 = "msg"; - private final HiveConf conf; + private static HiveConf conf = null; private IDriver driver; private final IMetaStoreClient msClient; - final String metaStoreURI = null; - // partitioned table private final static String dbName = "testing"; private final static String tblName = "alerts"; - private final static String[] fieldNames = new String[]{COL1,COL2}; - List partitionVals; + private final static String[] fieldNames = new String[]{COL1, COL2}; + static List partitionVals; private static Path partLoc; private static Path partLoc2; // unpartitioned table private final static String dbName2 = "testing2"; private final static String tblName2 = "alerts"; - private final static String[] fieldNames2 = new String[]{COL1,COL2}; + private final static String[] fieldNames2 = new String[]{COL1, COL2}; // for bucket join testing @@ -199,13 +198,9 @@ public TestStreaming() throws Exception { conf = new HiveConf(this.getClass()); conf.set("fs.raw.impl", RawFileSystem.class.getName()); - conf - .setVar(HiveConf.ConfVars.HIVE_AUTHORIZATION_MANAGER, - "org.apache.hadoop.hive.ql.security.authorization.plugin.sqlstd.SQLStdHiveAuthorizerFactory"); + conf.setVar(HiveConf.ConfVars.HIVE_AUTHORIZATION_MANAGER, + "org.apache.hadoop.hive.ql.security.authorization.plugin.sqlstd.SQLStdHiveAuthorizerFactory"); TxnDbUtil.setConfValues(conf); - if (metaStoreURI!=null) { - conf.setVar(HiveConf.ConfVars.METASTOREURIS, metaStoreURI); - } conf.setBoolVar(HiveConf.ConfVars.METASTORE_EXECUTE_SET_UGI, true); conf.setBoolVar(HiveConf.ConfVars.HIVE_SUPPORT_CONCURRENCY, true); dbFolder.create(); @@ -227,12 +222,13 @@ public void setup() throws Exception { // drop and recreate the necessary databases and tables dropDB(msClient, dbName); - String[] colNames = new String[] {COL1, COL2}; - String[] colTypes = new String[] {serdeConstants.INT_TYPE_NAME, serdeConstants.STRING_TYPE_NAME}; - String[] bucketCols = new String[] {COL1}; + String[] colNames = new String[]{COL1, COL2}; + String[] colTypes = new String[]{serdeConstants.INT_TYPE_NAME, serdeConstants.STRING_TYPE_NAME}; + String[] bucketCols = new String[]{COL1}; String loc1 = dbFolder.newFolder(dbName + ".db").toString(); String[] partNames = new String[]{"Continent", "Country"}; - partLoc = createDbAndTable(driver, dbName, tblName, partitionVals, colNames, colTypes, bucketCols, partNames, loc1, 1); + partLoc = createDbAndTable(driver, dbName, tblName, partitionVals, colNames, colTypes, bucketCols, partNames, loc1, + 1); dropDB(msClient, dbName2); String loc2 = dbFolder.newFolder(dbName2 + ".db").toString(); @@ -247,19 +243,11 @@ public void setup() throws Exception { } @After - public void cleanup() throws 
Exception { + public void cleanup() { msClient.close(); driver.close(); } - private static List getPartitionKeys() { - List fields = new ArrayList(); - // Defining partition names in unsorted order - fields.add(new FieldSchema("continent", serdeConstants.STRING_TYPE_NAME, "")); - fields.add(new FieldSchema("country", serdeConstants.STRING_TYPE_NAME, "")); - return fields; - } - private void createStoreSales(String dbName, String loc) throws Exception { String dbUri = "raw://" + new Path(loc).toUri().toString(); String tableLoc = dbUri + Path.SEPARATOR + "store_sales"; @@ -299,43 +287,48 @@ private void createStoreSales(String dbName, String loc) throws Exception { ")\n" + " partitioned by (dt string)\n" + "clustered by (ss_store_sk, ss_promo_sk)\n" + - "INTO 4 BUCKETS stored as orc " + " location '" + tableLoc + "'" + " TBLPROPERTIES ('orc.compress'='NONE', 'transactional'='true')"); + "INTO 4 BUCKETS stored as orc " + " location '" + tableLoc + "'" + + " TBLPROPERTIES ('orc.compress'='NONE', 'transactional'='true')"); Assert.assertTrue(success); success = runDDL(driver, "alter table store_sales add partition(dt='2015')"); Assert.assertTrue(success); } + /** * make sure it works with table where bucket col is not 1st col + * * @throws Exception */ @Test public void testBucketingWhereBucketColIsNotFirstCol() throws Exception { List partitionVals = new ArrayList(); partitionVals.add("2015"); - HiveEndPoint endPt = new HiveEndPoint(metaStoreURI, "testing5", "store_sales", partitionVals); - StreamingConnection connection = endPt.newConnection(false, "UT_" + Thread.currentThread().getName()); - DelimitedInputWriter writer = new DelimitedInputWriter(new String[] {"ss_sold_date_sk","ss_sold_time_sk", "ss_item_sk", - "ss_customer_sk", "ss_cdemo_sk", "ss_hdemo_sk", "ss_addr_sk", "ss_store_sk", "ss_promo_sk", "ss_ticket_number", "ss_quantity", - "ss_wholesale_cost", "ss_list_price", "ss_sales_price", "ss_ext_discount_amt", "ss_ext_sales_price", "ss_ext_wholesale_cost", - "ss_ext_list_price", "ss_ext_tax", "ss_coupon_amt", "ss_net_paid", "ss_net_paid_inc_tax", "ss_net_profit"},",", endPt, connection); - - TransactionBatch txnBatch = connection.fetchTransactionBatch(2, writer); - txnBatch.beginNextTransaction(); - + StrictDelimitedInputWriter writer = StrictDelimitedInputWriter.newBuilder() + .withFieldDelimiter(',') + .build(); + StreamingConnection connection = HiveStreamingConnection.newBuilder() + .withDatabase("testing5") + .withTable("store_sales") + .withStaticPartitionValues(partitionVals) + .withAgentInfo("UT_" + Thread.currentThread().getName()) + .withRecordWriter(writer) + .withHiveConf(conf) + .connect(); + + connection.beginNextTransaction(); StringBuilder row = new StringBuilder(); - for(int i = 0; i < 10; i++) { - for(int ints = 0; ints < 11; ints++) { + for (int i = 0; i < 10; i++) { + for (int ints = 0; ints < 11; ints++) { row.append(ints).append(','); } - for(int decs = 0; decs < 12; decs++) { + for (int decs = 0; decs < 12; decs++) { row.append(i + 0.1).append(','); } row.setLength(row.length() - 1); - txnBatch.write(row.toString().getBytes()); + connection.write(row.toString().getBytes()); } - txnBatch.commit(); - txnBatch.close(); + connection.commit(); connection.close(); ArrayList res = queryTable(driver, "select row__id.bucketid, * from testing5.store_sales"); @@ -350,30 +343,36 @@ public void testBucketingWhereBucketColIsNotFirstCol() throws Exception { @Test public void testNoBuckets() throws Exception { queryTable(driver, "drop table if exists default.streamingnobuckets"); 
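+ // streaming ingest only requires a transactional (ACID) target table; the explicit transactional_properties setting from the old test is not needed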
- //todo: why does it need transactional_properties? - queryTable(driver, "create table default.streamingnobuckets (a string, b string) stored as orc TBLPROPERTIES('transactional'='true', 'transactional_properties'='default')"); + queryTable(driver, "create table default.streamingnobuckets (a string, b string) stored as orc " + + "TBLPROPERTIES('transactional'='true')"); queryTable(driver, "insert into default.streamingnobuckets values('foo','bar')"); List rs = queryTable(driver, "select * from default.streamingnobuckets"); Assert.assertEquals(1, rs.size()); Assert.assertEquals("foo\tbar", rs.get(0)); - HiveEndPoint endPt = new HiveEndPoint(metaStoreURI, "default", "streamingnobuckets", null); - String[] colNames1 = new String[] { "a", "b" }; - StreamingConnection connection = endPt.newConnection(false, "UT_" + Thread.currentThread().getName()); - DelimitedInputWriter wr = new DelimitedInputWriter(colNames1,",", endPt, connection); - - TransactionBatch txnBatch = connection.fetchTransactionBatch(2, wr); - txnBatch.beginNextTransaction(); - txnBatch.write("a1,b2".getBytes()); - txnBatch.write("a3,b4".getBytes()); - txnBatch.commit(); - txnBatch.beginNextTransaction(); - txnBatch.write("a5,b6".getBytes()); - txnBatch.write("a7,b8".getBytes()); - txnBatch.commit(); - txnBatch.close(); + StrictDelimitedInputWriter wr = StrictDelimitedInputWriter.newBuilder() + .withFieldDelimiter(',') + .build(); + StreamingConnection connection = HiveStreamingConnection.newBuilder() + .withDatabase("default") + .withTable("streamingnobuckets") + .withAgentInfo("UT_" + Thread.currentThread().getName()) + .withTransactionBatchSize(2) + .withRecordWriter(wr) + .withHiveConf(conf) + .connect(); + + connection.beginNextTransaction(); + connection.write("a1,b2".getBytes()); + connection.write("a3,b4".getBytes()); + connection.commit(); + connection.beginNextTransaction(); + connection.write("a5,b6".getBytes()); + connection.write("a7,b8".getBytes()); + connection.commit(); + connection.close(); Assert.assertEquals("", 0, BucketCodec.determineVersion(536870912).decodeWriterId(536870912)); - rs = queryTable(driver,"select ROW__ID, a, b, INPUT__FILE__NAME from default.streamingnobuckets order by ROW__ID"); + rs = queryTable(driver, "select ROW__ID, a, b, INPUT__FILE__NAME from default.streamingnobuckets order by ROW__ID"); Assert.assertTrue(rs.get(0), rs.get(0).startsWith("{\"writeid\":1,\"bucketid\":536870912,\"rowid\":0}\tfoo\tbar")); Assert.assertTrue(rs.get(0), rs.get(0).endsWith("streamingnobuckets/delta_0000001_0000001_0000/bucket_00000")); @@ -397,7 +396,7 @@ public void testNoBuckets() throws Exception { queryTable(driver, "alter table default.streamingnobuckets compact 'major'"); runWorker(conf); - rs = queryTable(driver,"select ROW__ID, a, b, INPUT__FILE__NAME from default.streamingnobuckets order by ROW__ID"); + rs = queryTable(driver, "select ROW__ID, a, b, INPUT__FILE__NAME from default.streamingnobuckets order by ROW__ID"); Assert.assertTrue(rs.get(0), rs.get(0).startsWith("{\"writeid\":1,\"bucketid\":536870912,\"rowid\":0}\tfoo\tbar")); Assert.assertTrue(rs.get(0), rs.get(0).endsWith("streamingnobuckets/base_0000005/bucket_00000")); @@ -409,6 +408,152 @@ public void testNoBuckets() throws Exception { Assert.assertTrue(rs.get(3), rs.get(3).endsWith("streamingnobuckets/base_0000005/bucket_00000")); } + @Test + public void testAllTypesDelimitedWriter() throws Exception { + queryTable(driver, "drop table if exists default.alltypes"); + queryTable(driver, + "create table if not exists default.alltypes ( bo 
boolean, ti tinyint, si smallint, i int, bi bigint, " + + "f float, d double, de decimal(10,3), ts timestamp, da date, s string, c char(5), vc varchar(5), " + + "m map, l array, st struct ) " + + "stored as orc TBLPROPERTIES('transactional'='true')"); + StrictDelimitedInputWriter wr = StrictDelimitedInputWriter.newBuilder() + .withFieldDelimiter('|') + .withCollectionDelimiter(',') + .withMapKeyDelimiter(':') + .build(); + StreamingConnection connection = HiveStreamingConnection.newBuilder() + .withDatabase("default") + .withTable("alltypes") + .withAgentInfo("UT_" + Thread.currentThread().getName()) + .withTransactionBatchSize(2) + .withRecordWriter(wr) + .withHiveConf(conf) + .connect(); + + String row1 = "true|10|100|1000|10000|4.0|20.0|4.2222|1969-12-31 " + + "15:59:58.174|1970-01-01|string|hello|hello|k1:v1|100,200|10,foo"; + String row2 = "false|20|200|2000|20000|8.0|40.0|2.2222|1970-12-31 15:59:58.174|1971-01-01|abcd|world|world|" + + "k4:v4|200,300|20,bar"; + connection.beginNextTransaction(); + connection.write(row1.getBytes()); + connection.write(row2.getBytes()); + connection.commit(); + connection.close(); + + List rs = queryTable(driver, "select ROW__ID, bo, ti, si, i, bi, f, d, de, ts, da, s, c, vc, m, l, st," + + " INPUT__FILE__NAME from default.alltypes order by ROW__ID"); + Assert.assertEquals(2, rs.size()); + String gotRow1 = rs.get(0); + String expectedPrefixRow1 = "{\"writeid\":1,\"bucketid\":536870912," + + "\"rowid\":0}\ttrue\t10\t100\t1000\t10000\t4.0\t20.0\t4.222\t1969-12-31 15:59:58.174\t1970-01-01\tstring" + + "\thello\thello\t{\"k1\":\"v1\"}\t[100,200]\t{\"c1\":10,\"c2\":\"foo\"}"; + String expectedSuffixRow1 = "alltypes/delta_0000001_0000002/bucket_00000"; + String gotRow2 = rs.get(1); + String expectedPrefixRow2 = "{\"writeid\":1,\"bucketid\":536870912," + + "\"rowid\":1}\tfalse\t20\t200\t2000\t20000\t8.0\t40.0\t2.222\t1970-12-31 15:59:58.174\t1971-01-01\tabcd" + + "\tworld\tworld\t{\"k4\":\"v4\"}\t[200,300]\t{\"c1\":20,\"c2\":\"bar\"}"; + String expectedSuffixRow2 = "alltypes/delta_0000001_0000002/bucket_00000"; + Assert.assertTrue(gotRow1, gotRow1.startsWith(expectedPrefixRow1)); + Assert.assertTrue(gotRow1, gotRow1.endsWith(expectedSuffixRow1)); + Assert.assertTrue(gotRow2, gotRow2.startsWith(expectedPrefixRow2)); + Assert.assertTrue(gotRow2, gotRow2.endsWith(expectedSuffixRow2)); + } + + @Test + public void testAutoRollTransactionBatch() throws Exception { + queryTable(driver, "drop table if exists default.streamingnobuckets"); + queryTable(driver, "create table default.streamingnobuckets (a string, b string) stored as orc " + + "TBLPROPERTIES('transactional'='true')"); + queryTable(driver, "insert into default.streamingnobuckets values('foo','bar')"); + List rs = queryTable(driver, "select * from default.streamingnobuckets"); + Assert.assertEquals(1, rs.size()); + Assert.assertEquals("foo\tbar", rs.get(0)); + StrictDelimitedInputWriter wr = StrictDelimitedInputWriter.newBuilder() + .withFieldDelimiter(',') + .build(); + StreamingConnection connection = HiveStreamingConnection.newBuilder() + .withDatabase("default") + .withTable("streamingnobuckets") + .withAgentInfo("UT_" + Thread.currentThread().getName()) + .withTransactionBatchSize(2) + .withRecordWriter(wr) + .withHiveConf(conf) + .connect(); + + connection.beginNextTransaction(); + connection.write("a1,b2".getBytes()); + connection.write("a3,b4".getBytes()); + connection.commit(); + connection.beginNextTransaction(); + connection.write("a5,b6".getBytes()); + connection.write("a7,b8".getBytes()); + 
connection.commit(); + // should have rolled over to next transaction batch + connection.beginNextTransaction(); + connection.write("a9,b10".getBytes()); + connection.write("a11,b12".getBytes()); + connection.commit(); + connection.beginNextTransaction(); + connection.write("a13,b14".getBytes()); + connection.write("a15,b16".getBytes()); + connection.commit(); + connection.close(); + + Assert.assertEquals("", 0, BucketCodec.determineVersion(536870912).decodeWriterId(536870912)); + rs = queryTable(driver, "select ROW__ID, a, b, INPUT__FILE__NAME from default.streamingnobuckets order by ROW__ID"); + + Assert.assertTrue(rs.get(0), rs.get(0).startsWith("{\"writeid\":1,\"bucketid\":536870912,\"rowid\":0}\tfoo\tbar")); + Assert.assertTrue(rs.get(0), rs.get(0).endsWith("streamingnobuckets/delta_0000001_0000001_0000/bucket_00000")); + Assert.assertTrue(rs.get(1), rs.get(1).startsWith("{\"writeid\":2,\"bucketid\":536870912,\"rowid\":0}\ta1\tb2")); + Assert.assertTrue(rs.get(1), rs.get(1).endsWith("streamingnobuckets/delta_0000002_0000003/bucket_00000")); + Assert.assertTrue(rs.get(2), rs.get(2).startsWith("{\"writeid\":2,\"bucketid\":536870912,\"rowid\":1}\ta3\tb4")); + Assert.assertTrue(rs.get(2), rs.get(2).endsWith("streamingnobuckets/delta_0000002_0000003/bucket_00000")); + Assert.assertTrue(rs.get(3), rs.get(3).startsWith("{\"writeid\":3,\"bucketid\":536870912,\"rowid\":0}\ta5\tb6")); + Assert.assertTrue(rs.get(3), rs.get(3).endsWith("streamingnobuckets/delta_0000002_0000003/bucket_00000")); + Assert.assertTrue(rs.get(4), rs.get(4).startsWith("{\"writeid\":3,\"bucketid\":536870912,\"rowid\":1}\ta7\tb8")); + Assert.assertTrue(rs.get(4), rs.get(4).endsWith("streamingnobuckets/delta_0000002_0000003/bucket_00000")); + + Assert.assertTrue(rs.get(5), rs.get(5).startsWith("{\"writeid\":4,\"bucketid\":536870912,\"rowid\":0}\ta9\tb10")); + Assert.assertTrue(rs.get(5), rs.get(5).endsWith("streamingnobuckets/delta_0000004_0000005/bucket_00000")); + Assert.assertTrue(rs.get(6), rs.get(6).startsWith("{\"writeid\":4,\"bucketid\":536870912,\"rowid\":1}\ta11\tb12")); + Assert.assertTrue(rs.get(6), rs.get(6).endsWith("streamingnobuckets/delta_0000004_0000005/bucket_00000")); + Assert.assertTrue(rs.get(7), rs.get(7).startsWith("{\"writeid\":5,\"bucketid\":536870912,\"rowid\":0}\ta13\tb14")); + Assert.assertTrue(rs.get(7), rs.get(7).endsWith("streamingnobuckets/delta_0000004_0000005/bucket_00000")); + Assert.assertTrue(rs.get(8), rs.get(8).startsWith("{\"writeid\":5,\"bucketid\":536870912,\"rowid\":1}\ta15\tb16")); + Assert.assertTrue(rs.get(8), rs.get(8).endsWith("streamingnobuckets/delta_0000004_0000005/bucket_00000")); + + queryTable(driver, "update default.streamingnobuckets set a=0, b=0 where a='a7'"); + queryTable(driver, "delete from default.streamingnobuckets where a='a1'"); + queryTable(driver, "update default.streamingnobuckets set a=0, b=0 where a='a15'"); + queryTable(driver, "delete from default.streamingnobuckets where a='a9'"); + rs = queryTable(driver, "select a, b from default.streamingnobuckets order by a, b"); + int row = 0; + Assert.assertEquals("at row=" + row, "0\t0", rs.get(row++)); + Assert.assertEquals("at row=" + row, "0\t0", rs.get(row++)); + Assert.assertEquals("at row=" + row, "a11\tb12", rs.get(row++)); + Assert.assertEquals("at row=" + row, "a13\tb14", rs.get(row++)); + Assert.assertEquals("at row=" + row, "a3\tb4", rs.get(row++)); + Assert.assertEquals("at row=" + row, "a5\tb6", rs.get(row++)); + Assert.assertEquals("at row=" + row, "foo\tbar", rs.get(row++)); + + 
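// Reviewer note (illustrative only, not part of the patch): withTransactionBatchSize(2) makes the
// connection allocate write ids two at a time, which is exactly what the INPUT__FILE__NAME
// assertions above encode -- the first auto-allocated batch lands in delta_0000002_0000003 and the
// rolled-over batch in delta_0000004_0000005. Assuming the zero-padded delta_<min>_<max> naming
// visible in those assertions, the expected directory suffix can be derived like this:
private static String expectedDeltaDir(long minWriteId, long maxWriteId) {
  // 7-digit zero padding, matching the paths asserted above
  return String.format("delta_%07d_%07d", minWriteId, maxWriteId);
}
// e.g. expectedDeltaDir(2, 3) -> "delta_0000002_0000003"
//      expectedDeltaDir(4, 5) -> "delta_0000004_0000005"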
queryTable(driver, "alter table default.streamingnobuckets compact 'major'"); + runWorker(conf); + rs = queryTable(driver, "select ROW__ID, a, b, INPUT__FILE__NAME from default.streamingnobuckets order by ROW__ID"); + + Assert.assertTrue(rs.get(0), rs.get(0).startsWith("{\"writeid\":1,\"bucketid\":536870912,\"rowid\":0}\tfoo\tbar")); + Assert.assertTrue(rs.get(0), rs.get(0).endsWith("streamingnobuckets/base_0000009/bucket_00000")); + Assert.assertTrue(rs.get(1), rs.get(1).startsWith("{\"writeid\":2,\"bucketid\":536870912,\"rowid\":1}\ta3\tb4")); + Assert.assertTrue(rs.get(1), rs.get(1).endsWith("streamingnobuckets/base_0000009/bucket_00000")); + Assert.assertTrue(rs.get(2), rs.get(2).startsWith("{\"writeid\":3,\"bucketid\":536870912,\"rowid\":0}\ta5\tb6")); + Assert.assertTrue(rs.get(2), rs.get(2).endsWith("streamingnobuckets/base_0000009/bucket_00000")); + Assert.assertTrue(rs.get(3), rs.get(3).startsWith("{\"writeid\":4,\"bucketid\":536870912,\"rowid\":1}\ta11\tb12")); + Assert.assertTrue(rs.get(3), rs.get(3).endsWith("streamingnobuckets/base_0000009/bucket_00000")); + Assert.assertTrue(rs.get(4), rs.get(4).startsWith("{\"writeid\":5,\"bucketid\":536870912,\"rowid\":0}\ta13\tb14")); + Assert.assertTrue(rs.get(4), rs.get(4).endsWith("streamingnobuckets/base_0000009/bucket_00000")); + Assert.assertTrue(rs.get(5), rs.get(5).startsWith("{\"writeid\":6,\"bucketid\":536870912,\"rowid\":0}\t0\t0")); + Assert.assertTrue(rs.get(5), rs.get(5).endsWith("streamingnobuckets/base_0000009/bucket_00000")); + } + /** * this is a clone from TestTxnStatement2.... */ @@ -429,57 +574,63 @@ public void testStreamBucketingMatchesRegularBucketing() throws Exception { int bucketCount = 100; String dbUri = "raw://" + new Path(dbFolder.newFolder().toString()).toUri().toString(); - String tableLoc = "'" + dbUri + Path.SEPARATOR + "streamedtable" + "'"; + String tableLoc = "'" + dbUri + Path.SEPARATOR + "streamedtable" + "'"; String tableLoc2 = "'" + dbUri + Path.SEPARATOR + "finaltable" + "'"; String tableLoc3 = "'" + dbUri + Path.SEPARATOR + "nobucket" + "'"; runDDL(driver, "create database testBucketing3"); runDDL(driver, "use testBucketing3"); runDDL(driver, "create table streamedtable ( key1 string,key2 int,data string ) clustered by ( key1,key2 ) into " - + bucketCount + " buckets stored as orc location " + tableLoc + " TBLPROPERTIES ('transactional'='true')") ; -// In 'nobucket' table we capture bucketid from streamedtable to workaround a hive bug that prevents joins two identically bucketed tables - runDDL(driver, "create table nobucket ( bucketid int, key1 string,key2 int,data string ) location " + tableLoc3) ; - runDDL(driver, "create table finaltable ( bucketid int, key1 string,key2 int,data string ) clustered by ( key1,key2 ) into " - + bucketCount + " buckets stored as orc location " + tableLoc2 + " TBLPROPERTIES ('transactional'='true')"); - - - String[] records = new String[] { - "PSFAHYLZVC,29,EPNMA", - "PPPRKWAYAU,96,VUTEE", - "MIAOFERCHI,3,WBDSI", - "CEGQAZOWVN,0,WCUZL", - "XWAKMNSVQF,28,YJVHU", - "XBWTSAJWME,2,KDQFO", - "FUVLQTAXAY,5,LDSDG", - "QTQMDJMGJH,6,QBOMA", - "EFLOTLWJWN,71,GHWPS", - "PEQNAOJHCM,82,CAAFI", - "MOEKQLGZCP,41,RUACR", - "QZXMCOPTID,37,LFLWE", - "EYALVWICRD,13,JEZLC", - "VYWLZAYTXX,16,DMVZX", - "OSALYSQIXR,47,HNZVE", - "JGKVHKCEGQ,25,KSCJB", - "WQFMMYDHET,12,DTRWA", - "AJOVAYZKZQ,15,YBKFO", - "YAQONWCUAU,31,QJNHZ", - "DJBXUEUOEB,35,IYCBL" + + bucketCount + " buckets stored as orc location " + tableLoc + " TBLPROPERTIES ('transactional'='true')"); + // In 'nobucket' table we 
capture bucketid from streamedtable to workaround a hive bug that prevents joins two identically bucketed tables + runDDL(driver, "create table nobucket ( bucketid int, key1 string,key2 int,data string ) location " + tableLoc3); + runDDL(driver, + "create table finaltable ( bucketid int, key1 string,key2 int,data string ) clustered by ( key1,key2 ) into " + + bucketCount + " buckets stored as orc location " + tableLoc2 + " TBLPROPERTIES ('transactional'='true')"); + + + String[] records = new String[]{ + "PSFAHYLZVC,29,EPNMA", + "PPPRKWAYAU,96,VUTEE", + "MIAOFERCHI,3,WBDSI", + "CEGQAZOWVN,0,WCUZL", + "XWAKMNSVQF,28,YJVHU", + "XBWTSAJWME,2,KDQFO", + "FUVLQTAXAY,5,LDSDG", + "QTQMDJMGJH,6,QBOMA", + "EFLOTLWJWN,71,GHWPS", + "PEQNAOJHCM,82,CAAFI", + "MOEKQLGZCP,41,RUACR", + "QZXMCOPTID,37,LFLWE", + "EYALVWICRD,13,JEZLC", + "VYWLZAYTXX,16,DMVZX", + "OSALYSQIXR,47,HNZVE", + "JGKVHKCEGQ,25,KSCJB", + "WQFMMYDHET,12,DTRWA", + "AJOVAYZKZQ,15,YBKFO", + "YAQONWCUAU,31,QJNHZ", + "DJBXUEUOEB,35,IYCBL" }; - HiveEndPoint endPt = new HiveEndPoint(metaStoreURI, "testBucketing3", "streamedtable", null); - String[] colNames1 = new String[] { "key1", "key2", "data" }; - DelimitedInputWriter wr = new DelimitedInputWriter(colNames1,",", endPt); - StreamingConnection connection = endPt.newConnection(false, "UT_" + Thread.currentThread().getName()); + String[] colNames1 = new String[]{"key1", "key2", "data"}; + StrictDelimitedInputWriter wr = StrictDelimitedInputWriter.newBuilder() + .withFieldDelimiter(',') + .build(); + StreamingConnection connection = HiveStreamingConnection.newBuilder() + .withDatabase("testBucketing3") + .withTable("streamedtable") + .withAgentInfo("UT_" + Thread.currentThread().getName()) + .withRecordWriter(wr) + .withHiveConf(conf) + .connect(); - TransactionBatch txnBatch = connection.fetchTransactionBatch(2, wr); - txnBatch.beginNextTransaction(); + connection.beginNextTransaction(); for (String record : records) { - txnBatch.write(record.toString().getBytes()); + connection.write(record.toString().getBytes()); } - txnBatch.commit(); - txnBatch.close(); + connection.commit(); connection.close(); ArrayList res1 = queryTable(driver, "select row__id.bucketid, * from streamedtable order by key2"); @@ -489,7 +640,8 @@ public void testStreamBucketingMatchesRegularBucketing() throws Exception { driver.run("insert into nobucket select row__id.bucketid,* from streamedtable"); runDDL(driver, " insert into finaltable select * from nobucket"); - ArrayList res2 = queryTable(driver, "select row__id.bucketid,* from finaltable where row__id.bucketid<>bucketid"); + ArrayList res2 = queryTable(driver, + "select row__id.bucketid,* from finaltable where row__id.bucketid<>bucketid"); for (String s : res2) { LOG.error(s); } @@ -505,29 +657,41 @@ public void testTableValidation() throws Exception { String tbl1 = "validation1"; String tbl2 = "validation2"; - String tableLoc = "'" + dbUri + Path.SEPARATOR + tbl1 + "'"; + String tableLoc = "'" + dbUri + Path.SEPARATOR + tbl1 + "'"; String tableLoc2 = "'" + dbUri + Path.SEPARATOR + tbl2 + "'"; runDDL(driver, "create database testBucketing3"); runDDL(driver, "use testBucketing3"); runDDL(driver, "create table " + tbl1 + " ( key1 string, data string ) clustered by ( key1 ) into " - + bucketCount + " buckets stored as orc location " + tableLoc + " TBLPROPERTIES ('transactional'='false')") ; + + bucketCount + " buckets stored as orc location " + tableLoc + " TBLPROPERTIES ('transactional'='false')"); runDDL(driver, "create table " + tbl2 + " ( key1 string, data string 
) clustered by ( key1 ) into " - + bucketCount + " buckets stored as orc location " + tableLoc2 + " TBLPROPERTIES ('transactional'='false')") ; - + + bucketCount + " buckets stored as orc location " + tableLoc2 + " TBLPROPERTIES ('transactional'='false')"); + StrictDelimitedInputWriter writer = StrictDelimitedInputWriter.newBuilder() + .withFieldDelimiter(',') + .build(); try { - HiveEndPoint endPt = new HiveEndPoint(metaStoreURI, "testBucketing3", "validation1", null); - endPt.newConnection(false, "UT_" + Thread.currentThread().getName()); + HiveStreamingConnection.newBuilder() + .withDatabase("testBucketing3") + .withTable("validation1") + .withAgentInfo("UT_" + Thread.currentThread().getName()) + .withRecordWriter(writer) + .withHiveConf(conf) + .connect(); Assert.assertTrue("InvalidTable exception was not thrown", false); } catch (InvalidTable e) { // expecting this exception } try { - HiveEndPoint endPt = new HiveEndPoint(metaStoreURI, "testBucketing3", "validation2", null); - endPt.newConnection(false, "UT_" + Thread.currentThread().getName()); + HiveStreamingConnection.newBuilder() + .withDatabase("testBucketing3") + .withTable("validation2") + .withAgentInfo("UT_" + Thread.currentThread().getName()) + .withRecordWriter(writer) + .withHiveConf(conf) + .connect(); Assert.assertTrue("InvalidTable exception was not thrown", false); } catch (InvalidTable e) { // expecting this exception @@ -540,7 +704,7 @@ public void testTableValidation() throws Exception { */ @Deprecated private void checkDataWritten(Path partitionPath, long minTxn, long maxTxn, int buckets, int numExpectedFiles, - String... records) throws Exception { + String... records) throws Exception { ValidWriteIdList writeIds = msClient.getValidWriteIds(AcidUtils.getFullTableName(dbName, tblName)); AcidUtils.Directory dir = AcidUtils.getAcidState(partitionPath, conf, writeIds); Assert.assertEquals(0, dir.getObsolete().size()); @@ -578,7 +742,7 @@ private void checkDataWritten(Path partitionPath, long minTxn, long maxTxn, int InputSplit[] splits = inf.getSplits(job, buckets); Assert.assertEquals(numExpectedFiles, splits.length); org.apache.hadoop.mapred.RecordReader rr = - inf.getRecordReader(splits[0], job, Reporter.NULL); + inf.getRecordReader(splits[0], job, Reporter.NULL); NullWritable key = rr.createKey(); OrcStruct value = rr.createValue(); @@ -588,12 +752,13 @@ private void checkDataWritten2(Path partitionPath, long minTxn, long maxTxn, int } Assert.assertEquals(false, rr.next(key, value)); } + /** * @param validationQuery query to read from table to compare data against {@code records} - * @param records expected data. each row is CVS list of values + * @param records expected data. each row is CSV list of values */ private void checkDataWritten2(Path partitionPath, long minTxn, long maxTxn, int numExpectedFiles, - String validationQuery, boolean vectorize, String...
records) throws Exception { ValidWriteIdList txns = msClient.getValidWriteIds(AcidUtils.getFullTableName(dbName, tblName)); AcidUtils.Directory dir = AcidUtils.getAcidState(partitionPath, conf, txns); Assert.assertEquals(0, dir.getObsolete().size()); @@ -619,12 +784,13 @@ private void checkDataWritten2(Path partitionPath, long minTxn, long maxTxn, int Assert.assertEquals(minTxn, min); Assert.assertEquals(maxTxn, max); boolean isVectorizationEnabled = conf.getBoolVar(HiveConf.ConfVars.HIVE_VECTORIZATION_ENABLED); - if(vectorize) { + if (vectorize) { conf.setBoolVar(HiveConf.ConfVars.HIVE_VECTORIZATION_ENABLED, true); } String currStrategy = conf.getVar(HiveConf.ConfVars.HIVE_ORC_SPLIT_STRATEGY); - for(String strategy : ((Validator.StringSet)HiveConf.ConfVars.HIVE_ORC_SPLIT_STRATEGY.getValidator()).getExpected()) { + for (String strategy : ((Validator.StringSet) HiveConf.ConfVars.HIVE_ORC_SPLIT_STRATEGY.getValidator()) + .getExpected()) { //run it with each split strategy - make sure there are differences conf.setVar(HiveConf.ConfVars.HIVE_ORC_SPLIT_STRATEGY, strategy.toUpperCase()); List actualResult = queryTable(driver, validationQuery); @@ -649,30 +815,39 @@ private void checkNothingWritten(Path partitionPath) throws Exception { @Test public void testEndpointConnection() throws Exception { // For partitioned table, partitionVals are specified - HiveEndPoint endPt = new HiveEndPoint(metaStoreURI, dbName, tblName, partitionVals); - StreamingConnection connection = endPt.newConnection(false, "UT_" + Thread.currentThread().getName()); //shouldn't throw + StrictDelimitedInputWriter writer = StrictDelimitedInputWriter.newBuilder() + .withFieldDelimiter(',') + .build(); + StreamingConnection connection = HiveStreamingConnection.newBuilder() + .withDatabase(dbName) + .withTable(tblName) + .withStaticPartitionValues(partitionVals) + .withAgentInfo("UT_" + Thread.currentThread().getName()) + .withRecordWriter(writer) + .withHiveConf(conf) + .connect(); connection.close(); // For unpartitioned table, partitionVals are not specified - endPt = new HiveEndPoint(metaStoreURI, dbName2, tblName2, null); - endPt.newConnection(false, "UT_" + Thread.currentThread().getName()).close(); // should not throw - - // For partitioned table, partitionVals are not specified - try { - endPt = new HiveEndPoint(metaStoreURI, dbName, tblName, null); - connection = endPt.newConnection(true, "UT_" + Thread.currentThread().getName()); - Assert.assertTrue("ConnectionError was not thrown", false); - connection.close(); - } catch (ConnectionError e) { - // expecting this exception - String errMsg = "doesn't specify any partitions for partitioned table"; - Assert.assertTrue(e.toString().endsWith(errMsg)); - } + connection = HiveStreamingConnection.newBuilder() + .withDatabase(dbName2) + .withTable(tblName2) + .withAgentInfo("UT_" + Thread.currentThread().getName()) + .withRecordWriter(writer) + .withHiveConf(conf) + .connect(); + connection.close(); // For unpartitioned table, partition values are specified try { - endPt = new HiveEndPoint(metaStoreURI, dbName2, tblName2, partitionVals); - connection = endPt.newConnection(false, "UT_" + Thread.currentThread().getName()); + connection = HiveStreamingConnection.newBuilder() + .withDatabase(dbName2) + .withTable(tblName2) + .withStaticPartitionValues(partitionVals) + .withAgentInfo("UT_" + Thread.currentThread().getName()) + .withRecordWriter(writer) + .withHiveConf(conf) + .connect(); Assert.assertTrue("ConnectionError was not thrown", false); connection.close(); } catch 
(ConnectionError e) { @@ -688,68 +863,84 @@ public void testAddPartition() throws Exception { newPartVals.add(PART1_CONTINENT); newPartVals.add("Nepal"); - HiveEndPoint endPt = new HiveEndPoint(metaStoreURI, dbName, tblName - , newPartVals); - - // Ensure partition is absent - try { - msClient.getPartition(endPt.database, endPt.table, endPt.partitionVals); - Assert.assertTrue("Partition already exists", false); - } catch (NoSuchObjectException e) { - // expect this exception - } + StrictDelimitedInputWriter writer = StrictDelimitedInputWriter.newBuilder() + .withFieldDelimiter(',') + .build(); + StreamingConnection connection = HiveStreamingConnection.newBuilder() + .withDatabase(dbName) + .withTable(tblName) + .withStaticPartitionValues(newPartVals) + .withAgentInfo("UT_" + Thread.currentThread().getName()) + .withRecordWriter(writer) + .withHiveConf(conf) + .connect(); // Create partition - Assert.assertNotNull(endPt.newConnection(true, "UT_" + Thread.currentThread().getName())); + Assert.assertNotNull(connection); // Ensure partition is present - Partition p = msClient.getPartition(endPt.database, endPt.table, endPt.partitionVals); + Partition p = msClient.getPartition(dbName, tblName, partitionVals); Assert.assertNotNull("Did not find added partition", p); } @Test public void testTransactionBatchEmptyCommit() throws Exception { // 1) to partitioned table - HiveEndPoint endPt = new HiveEndPoint(metaStoreURI, dbName, tblName, - partitionVals); - StreamingConnection connection = endPt.newConnection(false, "UT_" + Thread.currentThread().getName()); - DelimitedInputWriter writer = new DelimitedInputWriter(fieldNames,",", endPt, connection); - - TransactionBatch txnBatch = connection.fetchTransactionBatch(10, writer); - - txnBatch.beginNextTransaction(); - txnBatch.commit(); + StrictDelimitedInputWriter writer = StrictDelimitedInputWriter.newBuilder() + .withFieldDelimiter(',') + .build(); + StreamingConnection connection = HiveStreamingConnection.newBuilder() + .withDatabase(dbName) + .withTable(tblName) + .withStaticPartitionValues(partitionVals) + .withAgentInfo("UT_" + Thread.currentThread().getName()) + .withRecordWriter(writer) + .withHiveConf(conf) + .connect(); + connection.beginNextTransaction(); + connection.commit(); Assert.assertEquals(TransactionBatch.TxnState.COMMITTED - , txnBatch.getCurrentTransactionState()); - txnBatch.close(); + , connection.getCurrentTransactionState()); connection.close(); // 2) To unpartitioned table - endPt = new HiveEndPoint(metaStoreURI, dbName2, tblName2, null); - writer = new DelimitedInputWriter(fieldNames2,",", endPt); - connection = endPt.newConnection(false, "UT_" + Thread.currentThread().getName()); - - txnBatch = connection.fetchTransactionBatch(10, writer); - txnBatch.beginNextTransaction(); - txnBatch.commit(); + writer = StrictDelimitedInputWriter.newBuilder() + .withFieldDelimiter(',') + .build(); + connection = HiveStreamingConnection.newBuilder() + .withDatabase(dbName2) + .withTable(tblName2) + .withAgentInfo("UT_" + Thread.currentThread().getName()) + .withRecordWriter(writer) + .withHiveConf(conf) + .connect(); + + connection.beginNextTransaction(); + connection.commit(); Assert.assertEquals(TransactionBatch.TxnState.COMMITTED - , txnBatch.getCurrentTransactionState()); - txnBatch.close(); + , connection.getCurrentTransactionState()); connection.close(); } /** * check that transactions that have not heartbeated and timedout get properly aborted + * * @throws Exception */ @Test public void testTimeOutReaper() throws Exception { - 
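// Reviewer note (illustrative only, not part of the patch): the explicit txnBatch.heartbeat()
// calls disappear in this patch; the builder-based connection is presumably expected to heartbeat
// its open transactions itself, paced by the new hive.streaming.connection.client.heartbeat.interval
// setting (testHeartbeat below sets it to 100ms). This test only times out because the reaper is
// made to run immediately and HIVE_TXN_TIMEOUT is shrunk to 1ms. A client that genuinely needs
// long-open transactions would instead just tune the interval, roughly (clientConf is hypothetical):
HiveConf clientConf = new HiveConf(conf);   // copy, so the shared test conf stays untouched
clientConf.setTimeVar(HiveConf.ConfVars.HIVE_STREAMING_CONNECTION_CLIENT_HEARTBEAT_INTERVAL,
    30, TimeUnit.SECONDS);                  // heartbeat every 30s while transactions stay open
StreamingConnection longLived = HiveStreamingConnection.newBuilder()
    .withDatabase(dbName2)
    .withTable(tblName2)
    .withAgentInfo("UT_" + Thread.currentThread().getName())
    .withRecordWriter(StrictDelimitedInputWriter.newBuilder().withFieldDelimiter(',').build())
    .withHiveConf(clientConf)
    .connect();
longLived.beginNextTransaction();
// ... write and commit at leisure; no manual heartbeat call is needed in the new API.
longLived.close();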
HiveEndPoint endPt = new HiveEndPoint(metaStoreURI, dbName2, tblName2, null); - DelimitedInputWriter writer = new DelimitedInputWriter(fieldNames2,",", endPt); - StreamingConnection connection = endPt.newConnection(false, "UT_" + Thread.currentThread().getName()); - - TransactionBatch txnBatch = connection.fetchTransactionBatch(5, writer); - txnBatch.beginNextTransaction(); + StrictDelimitedInputWriter writer = StrictDelimitedInputWriter.newBuilder() + .withFieldDelimiter(',') + .build(); + StreamingConnection connection = HiveStreamingConnection.newBuilder() + .withDatabase(dbName2) + .withTable(tblName2) + .withAgentInfo("UT_" + Thread.currentThread().getName()) + .withRecordWriter(writer) + .withHiveConf(conf) + .connect(); + + connection.beginNextTransaction(); conf.setTimeVar(HiveConf.ConfVars.HIVE_TIMEDOUT_TXN_REAPER_START, 0, TimeUnit.SECONDS); //ensure txn timesout conf.setTimeVar(HiveConf.ConfVars.HIVE_TXN_TIMEOUT, 1, TimeUnit.MILLISECONDS); @@ -758,347 +949,385 @@ public void testTimeOutReaper() throws Exception { houseKeeperService.run(); try { //should fail because the TransactionBatch timed out - txnBatch.commit(); - } - catch(TransactionError e) { + connection.commit(); + } catch (TransactionError e) { Assert.assertTrue("Expected aborted transaction", e.getCause() instanceof TxnAbortedException); } - txnBatch.close(); - txnBatch = connection.fetchTransactionBatch(10, writer); - txnBatch.beginNextTransaction(); - txnBatch.commit(); - txnBatch.beginNextTransaction(); + connection.close(); + connection = HiveStreamingConnection.newBuilder() + .withDatabase(dbName2) + .withTable(tblName2) + .withAgentInfo("UT_" + Thread.currentThread().getName()) + .withRecordWriter(writer) + .withHiveConf(conf) + .connect(); + connection.beginNextTransaction(); + connection.commit(); + connection.beginNextTransaction(); houseKeeperService.run(); try { //should fail because the TransactionBatch timed out - txnBatch.commit(); - } - catch(TransactionError e) { + connection.commit(); + } catch (TransactionError e) { Assert.assertTrue("Expected aborted transaction", e.getCause() instanceof TxnAbortedException); } - txnBatch.close(); connection.close(); } @Test public void testHeartbeat() throws Exception { - HiveEndPoint endPt = new HiveEndPoint(metaStoreURI, dbName2, tblName2, null); - StreamingConnection connection = endPt.newConnection(false, "UT_" + Thread.currentThread().getName()); - DelimitedInputWriter writer = new DelimitedInputWriter(fieldNames2,",", endPt, connection); - - TransactionBatch txnBatch = connection.fetchTransactionBatch(5, writer); - txnBatch.beginNextTransaction(); - //todo: this should ideally check Transaction heartbeat as well, but heartbeat - //timestamp is not reported yet - //GetOpenTxnsInfoResponse txnresp = msClient.showTxns(); - ShowLocksRequest request = new ShowLocksRequest(); - request.setDbname(dbName2); - request.setTablename(tblName2); - ShowLocksResponse response = msClient.showLocks(request); - Assert.assertEquals("Wrong nubmer of locks: " + response, 1, response.getLocks().size()); - ShowLocksResponseElement lock = response.getLocks().get(0); - long acquiredAt = lock.getAcquiredat(); - long heartbeatAt = lock.getLastheartbeat(); - txnBatch.heartbeat(); - response = msClient.showLocks(request); - Assert.assertEquals("Wrong number of locks2: " + response, 1, response.getLocks().size()); - lock = response.getLocks().get(0); - Assert.assertEquals("Acquired timestamp didn't match", acquiredAt, lock.getAcquiredat()); - Assert.assertTrue("Expected new heartbeat (" 
+ lock.getLastheartbeat() + - ") == old heartbeat(" + heartbeatAt +")", lock.getLastheartbeat() == heartbeatAt); - txnBatch.close(); - int txnBatchSize = 200; - txnBatch = connection.fetchTransactionBatch(txnBatchSize, writer); - for(int i = 0; i < txnBatchSize; i++) { - txnBatch.beginNextTransaction(); - if(i % 47 == 0) { - txnBatch.heartbeat(); - } - if(i % 10 == 0) { - txnBatch.abort(); - } - else { - txnBatch.commit(); - } - if(i % 37 == 0) { - txnBatch.heartbeat(); + int transactionBatch = 20; + conf.setTimeVar(HiveConf.ConfVars.HIVE_STREAMING_CONNECTION_CLIENT_HEARTBEAT_INTERVAL, 100, TimeUnit.MILLISECONDS); + StrictDelimitedInputWriter writer = StrictDelimitedInputWriter.newBuilder() + .withFieldDelimiter(',') + .build(); + StreamingConnection connection = HiveStreamingConnection.newBuilder() + .withDatabase(dbName2) + .withTable(tblName2) + .withAgentInfo("UT_" + Thread.currentThread().getName()) + .withTransactionBatchSize(transactionBatch) + .withRecordWriter(writer) + .withHiveConf(conf) + .connect(); + try { + connection.beginNextTransaction(); + ShowLocksRequest request = new ShowLocksRequest(); + request.setDbname(dbName2); + request.setTablename(tblName2); + ShowLocksResponse response = msClient.showLocks(request); + Assert.assertEquals("Wrong number of locks: " + response, 1, response.getLocks().size()); + ShowLocksResponseElement lock = response.getLocks().get(0); + long acquiredAt = lock.getAcquiredat(); + long heartbeatAt = lock.getLastheartbeat(); + response = msClient.showLocks(request); + Assert.assertEquals("Wrong number of locks2: " + response, 1, response.getLocks().size()); + lock = response.getLocks().get(0); + Assert.assertEquals("Acquired timestamp didn't match", acquiredAt, lock.getAcquiredat()); + Assert.assertTrue("Expected new heartbeat (" + lock.getLastheartbeat() + + ") == old heartbeat(" + heartbeatAt + ")", lock.getLastheartbeat() == heartbeatAt); + for (int i = 0; i < transactionBatch * 3; i++) { + connection.beginNextTransaction(); + if (i % 10 == 0) { + connection.abort(); + } else { + connection.commit(); + } + Thread.sleep(10); } + } finally { + conf.unset(HiveConf.ConfVars.HIVE_STREAMING_CONNECTION_CLIENT_HEARTBEAT_INTERVAL.varname); + connection.close(); } - } + @Test public void testTransactionBatchEmptyAbort() throws Exception { // 1) to partitioned table - HiveEndPoint endPt = new HiveEndPoint(metaStoreURI, dbName, tblName, - partitionVals); - StreamingConnection connection = endPt.newConnection(true, "UT_" + Thread.currentThread().getName()); - DelimitedInputWriter writer = new DelimitedInputWriter(fieldNames,",", endPt, connection); - - TransactionBatch txnBatch = connection.fetchTransactionBatch(10, writer); - txnBatch.beginNextTransaction(); - txnBatch.abort(); + StrictDelimitedInputWriter writer = StrictDelimitedInputWriter.newBuilder() + .withFieldDelimiter(',') + .build(); + StreamingConnection connection = HiveStreamingConnection.newBuilder() + .withDatabase(dbName) + .withTable(tblName) + .withStaticPartitionValues(partitionVals) + .withAgentInfo("UT_" + Thread.currentThread().getName()) + .withRecordWriter(writer) + .withHiveConf(conf) + .connect(); + + connection.beginNextTransaction(); + connection.abort(); Assert.assertEquals(TransactionBatch.TxnState.ABORTED - , txnBatch.getCurrentTransactionState()); - txnBatch.close(); + , connection.getCurrentTransactionState()); connection.close(); // 2) to unpartitioned table - endPt = new HiveEndPoint(metaStoreURI, dbName2, tblName2, null); - writer = new
DelimitedInputWriter(fieldNames,",", endPt); - connection = endPt.newConnection(true, "UT_" + Thread.currentThread().getName()); - - txnBatch = connection.fetchTransactionBatch(10, writer); - txnBatch.beginNextTransaction(); - txnBatch.abort(); + writer = StrictDelimitedInputWriter.newBuilder() + .withFieldDelimiter(',') + .build(); + connection = HiveStreamingConnection.newBuilder() + .withDatabase(dbName2) + .withTable(tblName2) + .withAgentInfo("UT_" + Thread.currentThread().getName()) + .withRecordWriter(writer) + .withHiveConf(conf) + .connect(); + + connection.beginNextTransaction(); + connection.abort(); Assert.assertEquals(TransactionBatch.TxnState.ABORTED - , txnBatch.getCurrentTransactionState()); - txnBatch.close(); + , connection.getCurrentTransactionState()); connection.close(); } @Test public void testTransactionBatchCommit_Delimited() throws Exception { - testTransactionBatchCommit_Delimited(null); - } - @Test - public void testTransactionBatchCommit_DelimitedUGI() throws Exception { - testTransactionBatchCommit_Delimited(Utils.getUGI()); - } - private void testTransactionBatchCommit_Delimited(UserGroupInformation ugi) throws Exception { - HiveEndPoint endPt = new HiveEndPoint(metaStoreURI, dbName, tblName, - partitionVals); - StreamingConnection connection = endPt.newConnection(true, conf, ugi, "UT_" + Thread.currentThread().getName()); - DelimitedInputWriter writer = new DelimitedInputWriter(fieldNames,",", endPt, conf, connection); + StrictDelimitedInputWriter writer = StrictDelimitedInputWriter.newBuilder() + .withFieldDelimiter(',') + .build(); + + StreamingConnection connection = HiveStreamingConnection.newBuilder() + .withDatabase(dbName) + .withTable(tblName) + .withStaticPartitionValues(partitionVals) + .withAgentInfo("UT_" + Thread.currentThread().getName()) + .withHiveConf(conf) + .withRecordWriter(writer) + .connect(); // 1st Txn - TransactionBatch txnBatch = connection.fetchTransactionBatch(10, writer); - txnBatch.beginNextTransaction(); + connection.beginNextTransaction(); Assert.assertEquals(TransactionBatch.TxnState.OPEN - , txnBatch.getCurrentTransactionState()); - txnBatch.write("1,Hello streaming".getBytes()); - txnBatch.commit(); + , connection.getCurrentTransactionState()); + connection.write("1,Hello streaming".getBytes()); + connection.commit(); checkDataWritten(partLoc, 1, 10, 1, 1, "{1, Hello streaming}"); Assert.assertEquals(TransactionBatch.TxnState.COMMITTED - , txnBatch.getCurrentTransactionState()); + , connection.getCurrentTransactionState()); // 2nd Txn - txnBatch.beginNextTransaction(); + connection.beginNextTransaction(); Assert.assertEquals(TransactionBatch.TxnState.OPEN - , txnBatch.getCurrentTransactionState()); - txnBatch.write("2,Welcome to streaming".getBytes()); + , connection.getCurrentTransactionState()); + connection.write("2,Welcome to streaming".getBytes()); // data should not be visible checkDataWritten(partLoc, 1, 10, 1, 1, "{1, Hello streaming}"); - txnBatch.commit(); + connection.commit(); checkDataWritten(partLoc, 1, 10, 1, 1, "{1, Hello streaming}", "{2, Welcome to streaming}"); - txnBatch.close(); - Assert.assertEquals(TransactionBatch.TxnState.INACTIVE - , txnBatch.getCurrentTransactionState()); - - connection.close(); + Assert.assertEquals(TransactionBatch.TxnState.INACTIVE + , connection.getCurrentTransactionState()); - // To Unpartitioned table - endPt = new HiveEndPoint(metaStoreURI, dbName2, tblName2, null); - connection = endPt.newConnection(true, conf, ugi, "UT_" + Thread.currentThread().getName()); - writer = new 
DelimitedInputWriter(fieldNames,",", endPt, conf, connection); + // To Unpartitioned table + writer = StrictDelimitedInputWriter.newBuilder() + .withFieldDelimiter(',') + .build(); + connection = HiveStreamingConnection.newBuilder() + .withDatabase(dbName2) + .withTable(tblName2) + .withAgentInfo("UT_" + Thread.currentThread().getName()) + .withHiveConf(conf) + .withRecordWriter(writer) + .connect(); // 1st Txn - txnBatch = connection.fetchTransactionBatch(10, writer); - txnBatch.beginNextTransaction(); + connection.beginNextTransaction(); Assert.assertEquals(TransactionBatch.TxnState.OPEN - , txnBatch.getCurrentTransactionState()); - txnBatch.write("1,Hello streaming".getBytes()); - txnBatch.commit(); + , connection.getCurrentTransactionState()); + connection.write("1,Hello streaming".getBytes()); + connection.commit(); Assert.assertEquals(TransactionBatch.TxnState.COMMITTED - , txnBatch.getCurrentTransactionState()); + , connection.getCurrentTransactionState()); connection.close(); } @Test public void testTransactionBatchCommit_Regex() throws Exception { - testTransactionBatchCommit_Regex(null); - } - @Test - public void testTransactionBatchCommit_RegexUGI() throws Exception { - testTransactionBatchCommit_Regex(Utils.getUGI()); - } - private void testTransactionBatchCommit_Regex(UserGroupInformation ugi) throws Exception { - HiveEndPoint endPt = new HiveEndPoint(metaStoreURI, dbName, tblName, - partitionVals); - StreamingConnection connection = endPt.newConnection(true, conf, ugi, "UT_" + Thread.currentThread().getName()); String regex = "([^,]*),(.*)"; - StrictRegexWriter writer = new StrictRegexWriter(regex, endPt, conf, connection); + StrictRegexWriter writer = StrictRegexWriter.newBuilder() + .withRegex(regex) + .build(); + StreamingConnection connection = HiveStreamingConnection.newBuilder() + .withDatabase(dbName) + .withTable(tblName) + .withStaticPartitionValues(partitionVals) + .withAgentInfo("UT_" + Thread.currentThread().getName()) + .withHiveConf(conf) + .withRecordWriter(writer) + .connect(); // 1st Txn - TransactionBatch txnBatch = connection.fetchTransactionBatch(10, writer); - txnBatch.beginNextTransaction(); + connection.beginNextTransaction(); Assert.assertEquals(TransactionBatch.TxnState.OPEN - , txnBatch.getCurrentTransactionState()); - txnBatch.write("1,Hello streaming".getBytes()); - txnBatch.commit(); + , connection.getCurrentTransactionState()); + connection.write("1,Hello streaming".getBytes()); + connection.commit(); checkDataWritten(partLoc, 1, 10, 1, 1, "{1, Hello streaming}"); Assert.assertEquals(TransactionBatch.TxnState.COMMITTED - , txnBatch.getCurrentTransactionState()); + , connection.getCurrentTransactionState()); // 2nd Txn - txnBatch.beginNextTransaction(); + connection.beginNextTransaction(); Assert.assertEquals(TransactionBatch.TxnState.OPEN - , txnBatch.getCurrentTransactionState()); - txnBatch.write("2,Welcome to streaming".getBytes()); + , connection.getCurrentTransactionState()); + connection.write("2,Welcome to streaming".getBytes()); // data should not be visible checkDataWritten(partLoc, 1, 10, 1, 1, "{1, Hello streaming}"); - txnBatch.commit(); + connection.commit(); checkDataWritten(partLoc, 1, 10, 1, 1, "{1, Hello streaming}", "{2, Welcome to streaming}"); - txnBatch.close(); - Assert.assertEquals(TransactionBatch.TxnState.INACTIVE - , txnBatch.getCurrentTransactionState()); - - connection.close(); - + Assert.assertEquals(TransactionBatch.TxnState.INACTIVE + , connection.getCurrentTransactionState()); // To Unpartitioned table - endPt = 
new HiveEndPoint(metaStoreURI, dbName2, tblName2, null); - connection = endPt.newConnection(true, conf, ugi, "UT_" + Thread.currentThread().getName()); regex = "([^:]*):(.*)"; - writer = new StrictRegexWriter(regex, endPt, conf, connection); + writer = StrictRegexWriter.newBuilder() + .withRegex(regex) + .build(); + + connection = HiveStreamingConnection.newBuilder() + .withDatabase(dbName2) + .withTable(tblName2) + .withAgentInfo("UT_" + Thread.currentThread().getName()) + .withHiveConf(conf) + .withRecordWriter(writer) + .connect(); // 1st Txn - txnBatch = connection.fetchTransactionBatch(10, writer); - txnBatch.beginNextTransaction(); + connection.beginNextTransaction(); Assert.assertEquals(TransactionBatch.TxnState.OPEN - , txnBatch.getCurrentTransactionState()); - txnBatch.write("1:Hello streaming".getBytes()); - txnBatch.commit(); + , connection.getCurrentTransactionState()); + connection.write("1:Hello streaming".getBytes()); + connection.commit(); Assert.assertEquals(TransactionBatch.TxnState.COMMITTED - , txnBatch.getCurrentTransactionState()); + , connection.getCurrentTransactionState()); connection.close(); } @Test public void testTransactionBatchCommit_Json() throws Exception { - HiveEndPoint endPt = new HiveEndPoint(metaStoreURI, dbName, tblName, - partitionVals); - StreamingConnection connection = endPt.newConnection(true, "UT_" + Thread.currentThread().getName()); - StrictJsonWriter writer = new StrictJsonWriter(endPt, connection); + StrictJsonWriter writer = StrictJsonWriter.newBuilder() + .build(); + StreamingConnection connection = HiveStreamingConnection.newBuilder() + .withDatabase(dbName) + .withTable(tblName) + .withStaticPartitionValues(partitionVals) + .withAgentInfo("UT_" + Thread.currentThread().getName()) + .withRecordWriter(writer) + .withHiveConf(conf) + .connect(); // 1st Txn - TransactionBatch txnBatch = connection.fetchTransactionBatch(10, writer); - txnBatch.beginNextTransaction(); + connection.beginNextTransaction(); Assert.assertEquals(TransactionBatch.TxnState.OPEN - , txnBatch.getCurrentTransactionState()); + , connection.getCurrentTransactionState()); String rec1 = "{\"id\" : 1, \"msg\": \"Hello streaming\"}"; - txnBatch.write(rec1.getBytes()); - txnBatch.commit(); + connection.write(rec1.getBytes()); + connection.commit(); checkDataWritten(partLoc, 1, 10, 1, 1, "{1, Hello streaming}"); Assert.assertEquals(TransactionBatch.TxnState.COMMITTED - , txnBatch.getCurrentTransactionState()); + , connection.getCurrentTransactionState()); - txnBatch.close(); + connection.close(); Assert.assertEquals(TransactionBatch.TxnState.INACTIVE - , txnBatch.getCurrentTransactionState()); + , connection.getCurrentTransactionState()); - connection.close(); List rs = queryTable(driver, "select * from " + dbName + "." 
+ tblName); Assert.assertEquals(1, rs.size()); } @Test public void testRemainingTransactions() throws Exception { - HiveEndPoint endPt = new HiveEndPoint(metaStoreURI, dbName, tblName, - partitionVals); - DelimitedInputWriter writer = new DelimitedInputWriter(fieldNames,",", endPt); - StreamingConnection connection = endPt.newConnection(true, "UT_" + Thread.currentThread().getName()); - + StrictDelimitedInputWriter writer = StrictDelimitedInputWriter.newBuilder() + .withFieldDelimiter(',') + .build(); + StreamingConnection connection = HiveStreamingConnection.newBuilder() + .withDatabase(dbName) + .withTable(tblName) + .withStaticPartitionValues(partitionVals) + .withAgentInfo("UT_" + Thread.currentThread().getName()) + .withRecordWriter(writer) + .withHiveConf(conf) + .connect(); + connection.beginNextTransaction(); // 1) test with txn.Commit() - TransactionBatch txnBatch = connection.fetchTransactionBatch(10, writer); - int batch=0; - int initialCount = txnBatch.remainingTransactions(); - while (txnBatch.remainingTransactions()>0) { - txnBatch.beginNextTransaction(); - Assert.assertEquals(--initialCount, txnBatch.remainingTransactions()); - for (int rec=0; rec<2; ++rec) { + int batch = 0; + int initialCount = connection.remainingTransactions(); + while (connection.remainingTransactions() > 0) { + connection.beginNextTransaction(); + Assert.assertEquals(--initialCount, connection.remainingTransactions()); + for (int rec = 0; rec < 2; ++rec) { Assert.assertEquals(TransactionBatch.TxnState.OPEN - , txnBatch.getCurrentTransactionState()); - txnBatch.write((batch * rec + ",Hello streaming").getBytes()); + , connection.getCurrentTransactionState()); + connection.write((batch * rec + ",Hello streaming").getBytes()); } - txnBatch.commit(); + connection.commit(); Assert.assertEquals(TransactionBatch.TxnState.COMMITTED - , txnBatch.getCurrentTransactionState()); + , connection.getCurrentTransactionState()); ++batch; } - Assert.assertEquals(0, txnBatch.remainingTransactions()); - txnBatch.close(); + Assert.assertEquals(0, connection.remainingTransactions()); + connection.close(); Assert.assertEquals(TransactionBatch.TxnState.INACTIVE - , txnBatch.getCurrentTransactionState()); - + , connection.getCurrentTransactionState()); + + connection = HiveStreamingConnection.newBuilder() + .withDatabase(dbName) + .withTable(tblName) + .withStaticPartitionValues(partitionVals) + .withAgentInfo("UT_" + Thread.currentThread().getName()) + .withRecordWriter(writer) + .withHiveConf(conf) + .connect(); // 2) test with txn.Abort() - txnBatch = connection.fetchTransactionBatch(10, writer); - batch=0; - initialCount = txnBatch.remainingTransactions(); - while (txnBatch.remainingTransactions()>0) { - txnBatch.beginNextTransaction(); - Assert.assertEquals(--initialCount,txnBatch.remainingTransactions()); - for (int rec=0; rec<2; ++rec) { + connection.beginNextTransaction(); + batch = 0; + initialCount = connection.remainingTransactions(); + while (connection.remainingTransactions() > 0) { + connection.beginNextTransaction(); + Assert.assertEquals(--initialCount, connection.remainingTransactions()); + for (int rec = 0; rec < 2; ++rec) { Assert.assertEquals(TransactionBatch.TxnState.OPEN - , txnBatch.getCurrentTransactionState()); - txnBatch.write((batch * rec + ",Hello streaming").getBytes()); + , connection.getCurrentTransactionState()); + connection.write((batch * rec + ",Hello streaming").getBytes()); } - txnBatch.abort(); + connection.abort(); Assert.assertEquals(TransactionBatch.TxnState.ABORTED - , 
txnBatch.getCurrentTransactionState()); + , connection.getCurrentTransactionState()); ++batch; } - Assert.assertEquals(0, txnBatch.remainingTransactions()); - txnBatch.close(); + Assert.assertEquals(0, connection.remainingTransactions()); + connection.close(); Assert.assertEquals(TransactionBatch.TxnState.INACTIVE - , txnBatch.getCurrentTransactionState()); - - connection.close(); + , connection.getCurrentTransactionState()); } @Test public void testTransactionBatchAbort() throws Exception { - - HiveEndPoint endPt = new HiveEndPoint(metaStoreURI, dbName, tblName, - partitionVals); - StreamingConnection connection = endPt.newConnection(false, "UT_" + Thread.currentThread().getName()); - DelimitedInputWriter writer = new DelimitedInputWriter(fieldNames,",", endPt, connection); - - - TransactionBatch txnBatch = connection.fetchTransactionBatch(10, writer); - txnBatch.beginNextTransaction(); - txnBatch.write("1,Hello streaming".getBytes()); - txnBatch.write("2,Welcome to streaming".getBytes()); - txnBatch.abort(); + StrictDelimitedInputWriter writer = StrictDelimitedInputWriter.newBuilder() + .withFieldDelimiter(',') + .build(); + + StreamingConnection connection = HiveStreamingConnection.newBuilder() + .withDatabase(dbName) + .withTable(tblName) + .withStaticPartitionValues(partitionVals) + .withAgentInfo("UT_" + Thread.currentThread().getName()) + .withRecordWriter(writer) + .withHiveConf(conf) + .connect(); + + connection.beginNextTransaction(); + connection.write("1,Hello streaming".getBytes()); + connection.write("2,Welcome to streaming".getBytes()); + connection.abort(); checkNothingWritten(partLoc); Assert.assertEquals(TransactionBatch.TxnState.ABORTED - , txnBatch.getCurrentTransactionState()); + , connection.getCurrentTransactionState()); - txnBatch.close(); connection.close(); checkNothingWritten(partLoc); @@ -1109,123 +1338,157 @@ public void testTransactionBatchAbort() throws Exception { @Test public void testTransactionBatchAbortAndCommit() throws Exception { String agentInfo = "UT_" + Thread.currentThread().getName(); - HiveEndPoint endPt = new HiveEndPoint(metaStoreURI, dbName, tblName, - partitionVals); - StreamingConnection connection = endPt.newConnection(false, agentInfo); - DelimitedInputWriter writer = new DelimitedInputWriter(fieldNames,",", endPt, connection); - - TransactionBatch txnBatch = connection.fetchTransactionBatch(10, writer); - txnBatch.beginNextTransaction(); - txnBatch.write("1,Hello streaming".getBytes()); - txnBatch.write("2,Welcome to streaming".getBytes()); + StrictDelimitedInputWriter writer = StrictDelimitedInputWriter.newBuilder() + .withFieldDelimiter(',') + .build(); + + StreamingConnection connection = HiveStreamingConnection.newBuilder() + .withDatabase(dbName) + .withTable(tblName) + .withStaticPartitionValues(partitionVals) + .withAgentInfo(agentInfo) + .withRecordWriter(writer) + .withHiveConf(conf) + .connect(); + + connection.beginNextTransaction(); + connection.write("1,Hello streaming".getBytes()); + connection.write("2,Welcome to streaming".getBytes()); ShowLocksResponse resp = msClient.showLocks(new ShowLocksRequest()); Assert.assertEquals("LockCount", 1, resp.getLocksSize()); Assert.assertEquals("LockType", LockType.SHARED_READ, resp.getLocks().get(0).getType()); Assert.assertEquals("LockState", LockState.ACQUIRED, resp.getLocks().get(0).getState()); Assert.assertEquals("AgentInfo", agentInfo, resp.getLocks().get(0).getAgentInfo()); - txnBatch.abort(); + connection.abort(); checkNothingWritten(partLoc); 
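// Illustrative extra check (not part of the patch): the same invariant can be confirmed through
// SQL, since data from aborted streaming transactions must never become visible to readers.
// queryTable() is the helper used throughout this class, id/msg is the schema the other tests
// query against, and the check assumes setUp() left dbName.tblName empty before this test ran.
List<String> abortedRows = queryTable(driver, "select id, msg from " + dbName + "." + tblName);
Assert.assertTrue("rows from aborted transactions must not be readable", abortedRows.isEmpty());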
Assert.assertEquals(TransactionBatch.TxnState.ABORTED - , txnBatch.getCurrentTransactionState()); + , connection.getCurrentTransactionState()); - txnBatch.beginNextTransaction(); - txnBatch.write("1,Hello streaming".getBytes()); - txnBatch.write("2,Welcome to streaming".getBytes()); - txnBatch.commit(); + connection.beginNextTransaction(); + connection.write("1,Hello streaming".getBytes()); + connection.write("2,Welcome to streaming".getBytes()); + connection.commit(); checkDataWritten(partLoc, 1, 10, 1, 1, "{1, Hello streaming}", - "{2, Welcome to streaming}"); + "{2, Welcome to streaming}"); - txnBatch.close(); connection.close(); } @Test public void testMultipleTransactionBatchCommits() throws Exception { - HiveEndPoint endPt = new HiveEndPoint(metaStoreURI, dbName, tblName, - partitionVals); - DelimitedInputWriter writer = new DelimitedInputWriter(fieldNames,",", endPt); - StreamingConnection connection = endPt.newConnection(true, "UT_" + Thread.currentThread().getName()); - - TransactionBatch txnBatch = connection.fetchTransactionBatch(10, writer); - txnBatch.beginNextTransaction(); - txnBatch.write("1,Hello streaming".getBytes()); - txnBatch.commit(); + StrictDelimitedInputWriter writer = StrictDelimitedInputWriter.newBuilder() + .withFieldDelimiter(',') + .build(); + + StreamingConnection connection = HiveStreamingConnection.newBuilder() + .withDatabase(dbName) + .withTable(tblName) + .withStaticPartitionValues(partitionVals) + .withAgentInfo("UT_" + Thread.currentThread().getName()) + .withRecordWriter(writer) + .withHiveConf(conf) + .connect(); + + connection.beginNextTransaction(); + connection.write("1,Hello streaming".getBytes()); + connection.commit(); String validationQuery = "select id, msg from " + dbName + "." + tblName + " order by id, msg"; checkDataWritten2(partLoc, 1, 10, 1, validationQuery, false, "1\tHello streaming"); - txnBatch.beginNextTransaction(); - txnBatch.write("2,Welcome to streaming".getBytes()); - txnBatch.commit(); + connection.beginNextTransaction(); + connection.write("2,Welcome to streaming".getBytes()); + connection.commit(); - checkDataWritten2(partLoc, 1, 10, 1, validationQuery, true, "1\tHello streaming", - "2\tWelcome to streaming"); + checkDataWritten2(partLoc, 1, 10, 1, validationQuery, true, "1\tHello streaming", + "2\tWelcome to streaming"); - txnBatch.close(); + connection.close(); + connection = HiveStreamingConnection.newBuilder() + .withDatabase(dbName) + .withTable(tblName) + .withStaticPartitionValues(partitionVals) + .withAgentInfo("UT_" + Thread.currentThread().getName()) + .withRecordWriter(writer) + .withHiveConf(conf) + .connect(); // 2nd Txn Batch - txnBatch = connection.fetchTransactionBatch(10, writer); - txnBatch.beginNextTransaction(); - txnBatch.write("3,Hello streaming - once again".getBytes()); - txnBatch.commit(); + connection.beginNextTransaction(); + connection.write("3,Hello streaming - once again".getBytes()); + connection.commit(); - checkDataWritten2(partLoc, 1, 20, 2, validationQuery, false, "1\tHello streaming", - "2\tWelcome to streaming", "3\tHello streaming - once again"); + checkDataWritten2(partLoc, 1, 20, 2, validationQuery, false, "1\tHello streaming", + "2\tWelcome to streaming", "3\tHello streaming - once again"); - txnBatch.beginNextTransaction(); - txnBatch.write("4,Welcome to streaming - once again".getBytes()); - txnBatch.commit(); + connection.beginNextTransaction(); + connection.write("4,Welcome to streaming - once again".getBytes()); + connection.commit(); - checkDataWritten2(partLoc, 1, 20, 2, 
validationQuery, true, "1\tHello streaming", - "2\tWelcome to streaming", "3\tHello streaming - once again", - "4\tWelcome to streaming - once again"); + checkDataWritten2(partLoc, 1, 20, 2, validationQuery, true, "1\tHello streaming", + "2\tWelcome to streaming", "3\tHello streaming - once again", + "4\tWelcome to streaming - once again"); Assert.assertEquals(TransactionBatch.TxnState.COMMITTED - , txnBatch.getCurrentTransactionState()); - - txnBatch.close(); + , connection.getCurrentTransactionState()); connection.close(); } @Test public void testInterleavedTransactionBatchCommits() throws Exception { - HiveEndPoint endPt = new HiveEndPoint(metaStoreURI, dbName, tblName, - partitionVals); - DelimitedInputWriter writer = new DelimitedInputWriter(fieldNames, ",", endPt); - StreamingConnection connection = endPt.newConnection(false, "UT_" + Thread.currentThread().getName()); + StrictDelimitedInputWriter writer = StrictDelimitedInputWriter.newBuilder() + .withFieldDelimiter(',') + .build(); + + StreamingConnection connection = HiveStreamingConnection.newBuilder() + .withDatabase(dbName) + .withTable(tblName) + .withStaticPartitionValues(partitionVals) + .withAgentInfo("UT_" + Thread.currentThread().getName()) + .withRecordWriter(writer) + .withHiveConf(conf) + .connect(); // Acquire 1st Txn Batch - TransactionBatch txnBatch1 = connection.fetchTransactionBatch(10, writer); - txnBatch1.beginNextTransaction(); + connection.beginNextTransaction(); // Acquire 2nd Txn Batch - DelimitedInputWriter writer2 = new DelimitedInputWriter(fieldNames, ",", endPt); - TransactionBatch txnBatch2 = connection.fetchTransactionBatch(10, writer2); - txnBatch2.beginNextTransaction(); + StrictDelimitedInputWriter writer2 = StrictDelimitedInputWriter.newBuilder() + .withFieldDelimiter(',') + .build(); + StreamingConnection connection2 = HiveStreamingConnection.newBuilder() + .withDatabase(dbName) + .withTable(tblName) + .withStaticPartitionValues(partitionVals) + .withAgentInfo("UT_" + Thread.currentThread().getName()) + .withRecordWriter(writer2) + .withHiveConf(conf) + .connect(); + connection2.beginNextTransaction(); // Interleaved writes to both batches - txnBatch1.write("1,Hello streaming".getBytes()); - txnBatch2.write("3,Hello streaming - once again".getBytes()); + connection.write("1,Hello streaming".getBytes()); + connection2.write("3,Hello streaming - once again".getBytes()); checkNothingWritten(partLoc); - txnBatch2.commit(); + connection2.commit(); String validationQuery = "select id, msg from " + dbName + "." 
+ tblName + " order by id, msg"; checkDataWritten2(partLoc, 11, 20, 1, validationQuery, true, "3\tHello streaming - once again"); - txnBatch1.commit(); + connection.commit(); /*now both batches have committed (but not closed) so we for each primary file we expect a side file to exist and indicate the true length of primary file*/ FileSystem fs = partLoc.getFileSystem(conf); AcidUtils.Directory dir = AcidUtils.getAcidState(partLoc, conf, - msClient.getValidWriteIds(AcidUtils.getFullTableName(dbName, tblName))); - for(AcidUtils.ParsedDelta pd : dir.getCurrentDirectories()) { - for(FileStatus stat : fs.listStatus(pd.getPath(), AcidUtils.bucketFileFilter)) { + msClient.getValidWriteIds(AcidUtils.getFullTableName(dbName, tblName))); + for (AcidUtils.ParsedDelta pd : dir.getCurrentDirectories()) { + for (FileStatus stat : fs.listStatus(pd.getPath(), AcidUtils.bucketFileFilter)) { Path lengthFile = OrcAcidUtils.getSideFile(stat.getPath()); Assert.assertTrue(lengthFile + " missing", fs.exists(lengthFile)); long lengthFileSize = fs.getFileStatus(lengthFile).getLen(); @@ -1237,20 +1500,20 @@ public void testInterleavedTransactionBatchCommits() throws Exception { } } checkDataWritten2(partLoc, 1, 20, 2, - validationQuery, false,"1\tHello streaming", "3\tHello streaming - once again"); + validationQuery, false, "1\tHello streaming", "3\tHello streaming - once again"); - txnBatch1.beginNextTransaction(); - txnBatch1.write("2,Welcome to streaming".getBytes()); + connection.beginNextTransaction(); + connection.write("2,Welcome to streaming".getBytes()); - txnBatch2.beginNextTransaction(); - txnBatch2.write("4,Welcome to streaming - once again".getBytes()); + connection2.beginNextTransaction(); + connection2.write("4,Welcome to streaming - once again".getBytes()); //here each batch has written data and committed (to bucket0 since table only has 1 bucket) //so each of 2 deltas has 1 bucket0 and 1 bucket0_flush_length. Furthermore, each bucket0 //has now received more data(logically - it's buffered) but it is not yet committed. 
//lets check that side files exist, etc dir = AcidUtils.getAcidState(partLoc, conf, msClient.getValidWriteIds(AcidUtils.getFullTableName(dbName, tblName))); - for(AcidUtils.ParsedDelta pd : dir.getCurrentDirectories()) { - for(FileStatus stat : fs.listStatus(pd.getPath(), AcidUtils.bucketFileFilter)) { + for (AcidUtils.ParsedDelta pd : dir.getCurrentDirectories()) { + for (FileStatus stat : fs.listStatus(pd.getPath(), AcidUtils.bucketFileFilter)) { Path lengthFile = OrcAcidUtils.getSideFile(stat.getPath()); Assert.assertTrue(lengthFile + " missing", fs.exists(lengthFile)); long lengthFileSize = fs.getFileStatus(lengthFile).getLen(); @@ -1262,103 +1525,97 @@ public void testInterleavedTransactionBatchCommits() throws Exception { } } checkDataWritten2(partLoc, 1, 20, 2, - validationQuery, true,"1\tHello streaming", "3\tHello streaming - once again"); + validationQuery, true, "1\tHello streaming", "3\tHello streaming - once again"); - txnBatch1.commit(); + connection.commit(); checkDataWritten2(partLoc, 1, 20, 2, validationQuery, false, "1\tHello streaming", - "2\tWelcome to streaming", - "3\tHello streaming - once again"); + "2\tWelcome to streaming", + "3\tHello streaming - once again"); - txnBatch2.commit(); + connection2.commit(); checkDataWritten2(partLoc, 1, 20, 2, validationQuery, true, "1\tHello streaming", - "2\tWelcome to streaming", - "3\tHello streaming - once again", - "4\tWelcome to streaming - once again"); + "2\tWelcome to streaming", + "3\tHello streaming - once again", + "4\tWelcome to streaming - once again"); Assert.assertEquals(TransactionBatch.TxnState.COMMITTED - , txnBatch1.getCurrentTransactionState()); + , connection.getCurrentTransactionState()); Assert.assertEquals(TransactionBatch.TxnState.COMMITTED - , txnBatch2.getCurrentTransactionState()); - - txnBatch1.close(); - txnBatch2.close(); + , connection2.getCurrentTransactionState()); connection.close(); + connection2.close(); } private static class WriterThd extends Thread { private final StreamingConnection conn; - private final DelimitedInputWriter writer; private final String data; private Throwable error; - WriterThd(HiveEndPoint ep, String data) throws Exception { + WriterThd(String data) throws Exception { super("Writer_" + data); - writer = new DelimitedInputWriter(fieldNames, ",", ep); - conn = ep.newConnection(false, "UT_" + Thread.currentThread().getName()); + RecordWriter writer = StrictDelimitedInputWriter.newBuilder() + .withFieldDelimiter(',') + .build(); + StreamingConnection connection = HiveStreamingConnection.newBuilder() + .withDatabase(dbName) + .withTable(tblName) + .withStaticPartitionValues(partitionVals) + .withRecordWriter(writer) + .withHiveConf(conf) + .connect(); + this.conn = connection; this.data = data; - setUncaughtExceptionHandler(new UncaughtExceptionHandler() { - @Override - public void uncaughtException(Thread thread, Throwable throwable) { - error = throwable; - LOG.error("Thread " + thread.getName() + " died: " + throwable.getMessage(), throwable); - } + setUncaughtExceptionHandler((thread, throwable) -> { + error = throwable; + LOG.error(((HiveStreamingConnection) connection).toTransactionString()); + LOG.error("Thread " + thread.getName() + " died: " + throwable.getMessage(), throwable); }); } @Override public void run() { - TransactionBatch txnBatch = null; try { - txnBatch = conn.fetchTransactionBatch(10, writer); - while (txnBatch.remainingTransactions() > 0) { - txnBatch.beginNextTransaction(); - txnBatch.write(data.getBytes()); - txnBatch.write(data.getBytes()); - 
txnBatch.commit(); + for (int i = 0; i < 10; i++) { + conn.beginNextTransaction(); + conn.write(data.getBytes()); + conn.write(data.getBytes()); + conn.commit(); } // while } catch (Exception e) { throw new RuntimeException(e); } finally { - if (txnBatch != null) { + if (conn != null) { try { - txnBatch.close(); + conn.close(); } catch (Exception e) { LOG.error("txnBatch.close() failed: " + e.getMessage(), e); - conn.close(); } } - try { - conn.close(); - } catch (Exception e) { - LOG.error("conn.close() failed: " + e.getMessage(), e); - } - } } } @Test public void testConcurrentTransactionBatchCommits() throws Exception { - final HiveEndPoint ep = new HiveEndPoint(metaStoreURI, dbName, tblName, partitionVals); List writers = new ArrayList(3); - writers.add(new WriterThd(ep, "1,Matrix")); - writers.add(new WriterThd(ep, "2,Gandhi")); - writers.add(new WriterThd(ep, "3,Silence")); + writers.add(new WriterThd("1,Matrix")); + writers.add(new WriterThd("2,Gandhi")); + writers.add(new WriterThd("3,Silence")); - for(WriterThd w : writers) { + for (WriterThd w : writers) { w.start(); } - for(WriterThd w : writers) { + for (WriterThd w : writers) { w.join(); } - for(WriterThd w : writers) { - if(w.error != null) { + for (WriterThd w : writers) { + if (w.error != null) { Assert.assertFalse("Writer thread" + w.getName() + " died: " + w.error.getMessage() + " See log file for stack trace", true); } @@ -1369,11 +1626,11 @@ public void testConcurrentTransactionBatchCommits() throws Exception { private ArrayList dumpBucket(Path orcFile) throws IOException { org.apache.hadoop.fs.FileSystem fs = org.apache.hadoop.fs.FileSystem.getLocal(new Configuration()); Reader reader = OrcFile.createReader(orcFile, - OrcFile.readerOptions(conf).filesystem(fs)); + OrcFile.readerOptions(conf).filesystem(fs)); RecordReader rows = reader.rows(); StructObjectInspector inspector = (StructObjectInspector) reader - .getObjectInspector(); + .getObjectInspector(); System.out.format("Found Bucket File : %s \n", orcFile.getName()); ArrayList result = new ArrayList(); @@ -1395,7 +1652,7 @@ public void testConcurrentTransactionBatchCommits() throws Exception { WritableLongObjectInspector f1ins = (WritableLongObjectInspector) fields.get(1).getFieldObjectInspector(); WritableIntObjectInspector f2ins = (WritableIntObjectInspector) fields.get(2).getFieldObjectInspector(); WritableLongObjectInspector f3ins = (WritableLongObjectInspector) fields.get(3).getFieldObjectInspector(); - WritableLongObjectInspector f4ins = (WritableLongObjectInspector) fields.get(4).getFieldObjectInspector(); + WritableLongObjectInspector f4ins = (WritableLongObjectInspector) fields.get(4).getFieldObjectInspector(); StructObjectInspector f5ins = (StructObjectInspector) fields.get(5).getFieldObjectInspector(); int f0 = f0ins.get(inspector.getStructFieldData(row, fields.get(0))); @@ -1405,7 +1662,7 @@ public void testConcurrentTransactionBatchCommits() throws Exception { long f4 = f4ins.get(inspector.getStructFieldData(row, fields.get(4))); SampleRec f5 = deserializeInner(inspector.getStructFieldData(row, fields.get(5)), f5ins); - return new Object[] {f0, f1, f2, f3, f4, f5}; + return new Object[]{f0, f1, f2, f3, f4, f5}; } // Assumes row schema => string,int,string @@ -1430,49 +1687,67 @@ public void testBucketing() throws Exception { // 1) Create two bucketed tables String dbLocation = dbFolder.newFolder(dbName3).getCanonicalPath() + ".db"; - dbLocation = dbLocation.replaceAll("\\\\","/"); // for windows paths + dbLocation = dbLocation.replaceAll("\\\\", "/"); 
// for windows paths String[] colNames = "key1,key2,data".split(","); String[] colTypes = "string,int,string".split(","); String[] bucketNames = "key1,key2".split(","); int bucketCount = 4; createDbAndTable(driver, dbName3, tblName3, null, colNames, colTypes, bucketNames - , null, dbLocation, bucketCount); + , null, dbLocation, bucketCount); String dbLocation2 = dbFolder.newFolder(dbName4).getCanonicalPath() + ".db"; - dbLocation2 = dbLocation2.replaceAll("\\\\","/"); // for windows paths + dbLocation2 = dbLocation2.replaceAll("\\\\", "/"); // for windows paths String[] colNames2 = "key3,key4,data2".split(","); String[] colTypes2 = "string,int,string".split(","); String[] bucketNames2 = "key3,key4".split(","); createDbAndTable(driver, dbName4, tblName4, null, colNames2, colTypes2, bucketNames2 - , null, dbLocation2, bucketCount); + , null, dbLocation2, bucketCount); // 2) Insert data into both tables - HiveEndPoint endPt = new HiveEndPoint(metaStoreURI, dbName3, tblName3, null); - StreamingConnection connection = endPt.newConnection(false, agentInfo); - DelimitedInputWriter writer = new DelimitedInputWriter(colNames,",", endPt, connection); + StrictDelimitedInputWriter writer = StrictDelimitedInputWriter.newBuilder() + .withFieldDelimiter(',') + .build(); + + StreamingConnection connection = HiveStreamingConnection.newBuilder() + .withDatabase(dbName3) + .withTable(tblName3) + .withAgentInfo(agentInfo) + .withRecordWriter(writer) + .withHiveConf(conf) + .connect(); + + connection.beginNextTransaction(); + connection.write("name0,1,Hello streaming".getBytes()); + connection.write("name2,2,Welcome to streaming".getBytes()); + connection.write("name4,2,more Streaming unlimited".getBytes()); + connection.write("name5,2,even more Streaming unlimited".getBytes()); + connection.commit(); + connection.close(); + + + StrictDelimitedInputWriter writer2 = StrictDelimitedInputWriter.newBuilder() + .withFieldDelimiter(',') + .build(); - TransactionBatch txnBatch = connection.fetchTransactionBatch(2, writer); - txnBatch.beginNextTransaction(); - txnBatch.write("name0,1,Hello streaming".getBytes()); - txnBatch.write("name2,2,Welcome to streaming".getBytes()); - txnBatch.write("name4,2,more Streaming unlimited".getBytes()); - txnBatch.write("name5,2,even more Streaming unlimited".getBytes()); - txnBatch.commit(); + StreamingConnection connection2 = HiveStreamingConnection.newBuilder() + .withDatabase(dbName4) + .withTable(tblName4) + .withAgentInfo("UT_" + Thread.currentThread().getName()) + .withRecordWriter(writer2) + .withHiveConf(conf) + .connect(); - HiveEndPoint endPt2 = new HiveEndPoint(metaStoreURI, dbName4, tblName4, null); - StreamingConnection connection2 = endPt2.newConnection(false, agentInfo); - DelimitedInputWriter writer2 = new DelimitedInputWriter(colNames2,",", endPt2, connection); - TransactionBatch txnBatch2 = connection2.fetchTransactionBatch(2, writer2); - txnBatch2.beginNextTransaction(); + connection2.beginNextTransaction(); - txnBatch2.write("name5,2,fact3".getBytes()); // bucket 0 - txnBatch2.write("name8,2,fact3".getBytes()); // bucket 1 - txnBatch2.write("name0,1,fact1".getBytes()); // bucket 2 + connection2.write("name5,2,fact3".getBytes()); // bucket 0 + connection2.write("name8,2,fact3".getBytes()); // bucket 1 + connection2.write("name0,1,fact1".getBytes()); // bucket 2 - txnBatch2.commit(); + connection2.commit(); + connection2.close(); // 3 Check data distribution in buckets HashMap> actual1 = dumpAllBuckets(dbLocation, tblName3); @@ -1486,10 +1761,11 @@ public void 
testBucketing() throws Exception { Assert.assertEquals("number of buckets does not match expectation", actual1.values().size(), 3); Assert.assertEquals("records in bucket does not match expectation", actual1.get(0).size(), 2); Assert.assertEquals("records in bucket does not match expectation", actual1.get(1).size(), 1); - Assert.assertTrue("bucket 2 shouldn't have been created", actual1.get(2) == null); + Assert.assertTrue("bucket 2 shouldn't have been created", actual1.get(2) == null); Assert.assertEquals("records in bucket does not match expectation", actual1.get(3).size(), 1); } - private void runCmdOnDriver(String cmd) throws QueryFailedException { + + private void runCmdOnDriver(String cmd) { boolean t = runDDL(driver, cmd); Assert.assertTrue(cmd + " failed", t); } @@ -1503,35 +1779,41 @@ public void testFileDump() throws Exception { // 1) Create two bucketed tables String dbLocation = dbFolder.newFolder(dbName3).getCanonicalPath() + ".db"; - dbLocation = dbLocation.replaceAll("\\\\","/"); // for windows paths + dbLocation = dbLocation.replaceAll("\\\\", "/"); // for windows paths String[] colNames = "key1,key2,data".split(","); String[] colTypes = "string,int,string".split(","); String[] bucketNames = "key1,key2".split(","); int bucketCount = 4; createDbAndTable(driver, dbName3, tblName3, null, colNames, colTypes, bucketNames - , null, dbLocation, bucketCount); + , null, dbLocation, bucketCount); String dbLocation2 = dbFolder.newFolder(dbName4).getCanonicalPath() + ".db"; - dbLocation2 = dbLocation2.replaceAll("\\\\","/"); // for windows paths + dbLocation2 = dbLocation2.replaceAll("\\\\", "/"); // for windows paths String[] colNames2 = "key3,key4,data2".split(","); String[] colTypes2 = "string,int,string".split(","); String[] bucketNames2 = "key3,key4".split(","); createDbAndTable(driver, dbName4, tblName4, null, colNames2, colTypes2, bucketNames2 - , null, dbLocation2, bucketCount); - - + , null, dbLocation2, bucketCount); + + StrictDelimitedInputWriter writer = StrictDelimitedInputWriter.newBuilder() + .withFieldDelimiter(',') + .build(); + + StreamingConnection connection = HiveStreamingConnection.newBuilder() + .withDatabase(dbName3) + .withTable(tblName3) + .withAgentInfo(agentInfo) + .withHiveConf(conf) + .withRecordWriter(writer) + .connect(); // 2) Insert data into both tables - HiveEndPoint endPt = new HiveEndPoint(metaStoreURI, dbName3, tblName3, null); - StreamingConnection connection = endPt.newConnection(false, agentInfo); - DelimitedInputWriter writer = new DelimitedInputWriter(colNames,",", endPt, connection); - - TransactionBatch txnBatch = connection.fetchTransactionBatch(2, writer); - txnBatch.beginNextTransaction(); - txnBatch.write("name0,1,Hello streaming".getBytes()); - txnBatch.write("name2,2,Welcome to streaming".getBytes()); - txnBatch.write("name4,2,more Streaming unlimited".getBytes()); - txnBatch.write("name5,2,even more Streaming unlimited".getBytes()); - txnBatch.commit(); + connection.beginNextTransaction(); + connection.write("name0,1,Hello streaming".getBytes()); + connection.write("name2,2,Welcome to streaming".getBytes()); + connection.write("name4,2,more Streaming unlimited".getBytes()); + connection.write("name5,2,even more Streaming unlimited".getBytes()); + connection.commit(); + connection.close(); PrintStream origErr = System.err; ByteArrayOutputStream myErr = new ByteArrayOutputStream(); @@ -1549,18 +1831,27 @@ public void testFileDump() throws Exception { // for writes (transaction batch not closed yet) Assert.assertEquals(false,
errDump.contains("is still open for writes.")); - HiveEndPoint endPt2 = new HiveEndPoint(metaStoreURI, dbName4, tblName4, null); - DelimitedInputWriter writer2 = new DelimitedInputWriter(colNames2,",", endPt2); - StreamingConnection connection2 = endPt2.newConnection(false, agentInfo); - TransactionBatch txnBatch2 = connection2.fetchTransactionBatch(2, writer2); - txnBatch2.beginNextTransaction(); + StrictDelimitedInputWriter writer2 = StrictDelimitedInputWriter.newBuilder() + .withFieldDelimiter(',') + .build(); + + StreamingConnection connection2 = HiveStreamingConnection.newBuilder() + .withDatabase(dbName4) + .withTable(tblName4) + .withAgentInfo(agentInfo) + .withRecordWriter(writer2) + .withHiveConf(conf) + .connect(); + + connection2.beginNextTransaction(); - txnBatch2.write("name5,2,fact3".getBytes()); // bucket 0 - txnBatch2.write("name8,2,fact3".getBytes()); // bucket 1 - txnBatch2.write("name0,1,fact1".getBytes()); // bucket 2 + connection2.write("name5,2,fact3".getBytes()); // bucket 0 + connection2.write("name8,2,fact3".getBytes()); // bucket 1 + connection2.write("name0,1,fact1".getBytes()); // bucket 2 // no data for bucket 3 -- expect 0 length bucket file - txnBatch2.commit(); + connection2.commit(); + connection2.close(); origErr = System.err; myErr = new ByteArrayOutputStream(); @@ -1584,27 +1875,33 @@ public void testFileDumpCorruptDataFiles() throws Exception { // 1) Create two bucketed tables String dbLocation = dbFolder.newFolder(dbName3).getCanonicalPath() + ".db"; - dbLocation = dbLocation.replaceAll("\\\\","/"); // for windows paths + dbLocation = dbLocation.replaceAll("\\\\", "/"); // for windows paths String[] colNames = "key1,key2,data".split(","); String[] colTypes = "string,int,string".split(","); String[] bucketNames = "key1,key2".split(","); int bucketCount = 4; createDbAndTable(driver, dbName3, tblName3, null, colNames, colTypes, bucketNames - , null, dbLocation, bucketCount); + , null, dbLocation, bucketCount); // 2) Insert data into both tables - HiveEndPoint endPt = new HiveEndPoint(metaStoreURI, dbName3, tblName3, null); - StreamingConnection connection = endPt.newConnection(false, "UT_" + Thread.currentThread().getName()); - DelimitedInputWriter writer = new DelimitedInputWriter(colNames,",", endPt, connection); - + StrictDelimitedInputWriter writer = StrictDelimitedInputWriter.newBuilder() + .withFieldDelimiter(',') + .build(); + + StreamingConnection connection = HiveStreamingConnection.newBuilder() + .withDatabase(dbName3) + .withTable(tblName3) + .withAgentInfo("UT_" + Thread.currentThread().getName()) + .withRecordWriter(writer) + .withHiveConf(conf) + .connect(); // we need side file for this test, so we create 2 txn batch and test with only one - TransactionBatch txnBatch = connection.fetchTransactionBatch(2, writer); - txnBatch.beginNextTransaction(); - txnBatch.write("name0,1,Hello streaming".getBytes()); - txnBatch.write("name2,2,Welcome to streaming".getBytes()); - txnBatch.write("name4,2,more Streaming unlimited".getBytes()); - txnBatch.write("name5,2,even more Streaming unlimited".getBytes()); - txnBatch.commit(); + connection.beginNextTransaction(); + connection.write("name0,1,Hello streaming".getBytes()); + connection.write("name2,2,Welcome to streaming".getBytes()); + connection.write("name4,2,more Streaming unlimited".getBytes()); + connection.write("name5,2,even more Streaming unlimited".getBytes()); + connection.commit(); // intentionally corrupt some files Path path = new Path(dbLocation); @@ -1617,7 +1914,7 @@ public void 
testFileDumpCorruptDataFiles() throws Exception { } else if (file.contains("bucket_00001")) { corruptDataFile(file, conf, -1); } else if (file.contains("bucket_00002")) { - Assert.assertFalse("bucket 2 shouldn't have been created", true); + Assert.assertFalse("bucket 2 shouldn't have been created", true); } else if (file.contains("bucket_00003")) { corruptDataFile(file, conf, 100); } @@ -1669,17 +1966,17 @@ public void testFileDumpCorruptDataFiles() throws Exception { Assert.assertEquals(false, errDump.contains("file(s) are corrupted")); Assert.assertEquals(false, errDump.contains("is still open for writes.")); - // after recovery there shouldn't be any *_flush_length files + // after recovery there shouldn't be any *_flush_length files files = FileDump.getAllFilesInPath(path, conf); for (String file : files) { Assert.assertEquals(false, file.contains("_flush_length")); } - txnBatch.close(); + connection.close(); } private void corruptDataFile(final String file, final Configuration conf, final int addRemoveBytes) - throws Exception { + throws Exception { Path bPath = new Path(file); Path cPath = new Path(bPath.getParent(), bPath.getName() + ".corrupt"); FileSystem fs = bPath.getFileSystem(conf); @@ -1702,48 +1999,55 @@ public void testFileDumpCorruptSideFiles() throws Exception { // 1) Create two bucketed tables String dbLocation = dbFolder.newFolder(dbName3).getCanonicalPath() + ".db"; - dbLocation = dbLocation.replaceAll("\\\\","/"); // for windows paths + dbLocation = dbLocation.replaceAll("\\\\", "/"); // for windows paths String[] colNames = "key1,key2,data".split(","); String[] colTypes = "string,int,string".split(","); String[] bucketNames = "key1,key2".split(","); int bucketCount = 4; createDbAndTable(driver, dbName3, tblName3, null, colNames, colTypes, bucketNames - , null, dbLocation, bucketCount); + , null, dbLocation, bucketCount); // 2) Insert data into both tables - HiveEndPoint endPt = new HiveEndPoint(metaStoreURI, dbName3, tblName3, null); - StreamingConnection connection = endPt.newConnection(false, "UT_" + Thread.currentThread().getName()); - DelimitedInputWriter writer = new DelimitedInputWriter(colNames,",", endPt, connection); - - TransactionBatch txnBatch = connection.fetchTransactionBatch(2, writer); - txnBatch.beginNextTransaction(); - txnBatch.write("name0,1,Hello streaming".getBytes()); - txnBatch.write("name2,2,Welcome to streaming".getBytes()); - txnBatch.write("name4,2,more Streaming unlimited".getBytes()); - txnBatch.write("name5,2,even more Streaming unlimited".getBytes()); - txnBatch.write("name6,3,aHello streaming".getBytes()); - txnBatch.commit(); - - Map> offsetMap = new HashMap>(); + StrictDelimitedInputWriter writer = StrictDelimitedInputWriter.newBuilder() + .withFieldDelimiter(',') + .build(); + + StreamingConnection connection = HiveStreamingConnection.newBuilder() + .withDatabase(dbName3) + .withTable(tblName3) + .withAgentInfo("UT_" + Thread.currentThread().getName()) + .withRecordWriter(writer) + .withHiveConf(conf) + .connect(); + + connection.beginNextTransaction(); + connection.write("name0,1,Hello streaming".getBytes()); + connection.write("name2,2,Welcome to streaming".getBytes()); + connection.write("name4,2,more Streaming unlimited".getBytes()); + connection.write("name5,2,even more Streaming unlimited".getBytes()); + connection.write("name6,3,aHello streaming".getBytes()); + connection.commit(); + + Map> offsetMap = new HashMap>(); recordOffsets(conf, dbLocation, offsetMap); - txnBatch.beginNextTransaction(); - 
txnBatch.write("name01,11,-Hello streaming".getBytes()); - txnBatch.write("name21,21,-Welcome to streaming".getBytes()); - txnBatch.write("name41,21,-more Streaming unlimited".getBytes()); - txnBatch.write("name51,21,-even more Streaming unlimited".getBytes()); - txnBatch.write("name02,12,--Hello streaming".getBytes()); - txnBatch.write("name22,22,--Welcome to streaming".getBytes()); - txnBatch.write("name42,22,--more Streaming unlimited".getBytes()); - txnBatch.write("name52,22,--even more Streaming unlimited".getBytes()); - txnBatch.write("name7,4,aWelcome to streaming".getBytes()); - txnBatch.write("name8,5,amore Streaming unlimited".getBytes()); - txnBatch.write("name9,6,aeven more Streaming unlimited".getBytes()); - txnBatch.write("name10,7,bHello streaming".getBytes()); - txnBatch.write("name11,8,bWelcome to streaming".getBytes()); - txnBatch.write("name12,9,bmore Streaming unlimited".getBytes()); - txnBatch.write("name13,10,beven more Streaming unlimited".getBytes()); - txnBatch.commit(); + connection.beginNextTransaction(); + connection.write("name01,11,-Hello streaming".getBytes()); + connection.write("name21,21,-Welcome to streaming".getBytes()); + connection.write("name41,21,-more Streaming unlimited".getBytes()); + connection.write("name51,21,-even more Streaming unlimited".getBytes()); + connection.write("name02,12,--Hello streaming".getBytes()); + connection.write("name22,22,--Welcome to streaming".getBytes()); + connection.write("name42,22,--more Streaming unlimited".getBytes()); + connection.write("name52,22,--even more Streaming unlimited".getBytes()); + connection.write("name7,4,aWelcome to streaming".getBytes()); + connection.write("name8,5,amore Streaming unlimited".getBytes()); + connection.write("name9,6,aeven more Streaming unlimited".getBytes()); + connection.write("name10,7,bHello streaming".getBytes()); + connection.write("name11,8,bWelcome to streaming".getBytes()); + connection.write("name12,9,bmore Streaming unlimited".getBytes()); + connection.write("name13,10,beven more Streaming unlimited".getBytes()); + connection.commit(); recordOffsets(conf, dbLocation, offsetMap); @@ -1820,18 +2124,18 @@ public void testFileDumpCorruptSideFiles() throws Exception { Assert.assertEquals(false, errDump.contains("file(s) are corrupted")); Assert.assertEquals(false, errDump.contains("is still open for writes.")); - // after recovery there shouldn't be any *_flush_length files + // after recovery there shouldn'table be any *_flush_length files files = FileDump.getAllFilesInPath(path, conf); for (String file : files) { Assert.assertEquals(false, file.contains("_flush_length")); } - txnBatch.close(); + connection.close(); } private void corruptSideFile(final String file, final HiveConf conf, - final Map> offsetMap, final String key, final int numEntries) - throws IOException { + final Map> offsetMap, final String key, final int numEntries) + throws IOException { Path dataPath = new Path(file); Path sideFilePath = OrcAcidUtils.getSideFile(dataPath); Path cPath = new Path(sideFilePath.getParent(), sideFilePath.getName() + ".corrupt"); @@ -1850,7 +2154,7 @@ private void corruptSideFile(final String file, final HiveConf conf, } else if (numEntries > 0) { int firstRun = Math.min(offsets.size(), numEntries); // add original entries - for (int i=0; i < firstRun; i++) { + for (int i = 0; i < firstRun; i++) { fdos.writeLong(offsets.get(i)); } @@ -1866,17 +2170,17 @@ private void corruptSideFile(final String file, final HiveConf conf, fs.rename(cPath, sideFilePath); } - private byte[] 
longToBytes(long x) { + private byte[] longToBytes(long x) { ByteBuffer buffer = ByteBuffer.allocate(8); buffer.putLong(x); return buffer.array(); } private void recordOffsets(final HiveConf conf, final String dbLocation, - final Map> offsetMap) throws IOException { + final Map> offsetMap) throws IOException { Path path = new Path(dbLocation); Collection files = FileDump.getAllFilesInPath(path, conf); - for (String file: files) { + for (String file : files) { Path bPath = new Path(file); FileSystem fs = bPath.getFileSystem(conf); FileStatus fileStatus = fs.getFileStatus(bPath); @@ -1931,131 +2235,151 @@ public void testErrorHandling() throws Exception { String agentInfo = "UT_" + Thread.currentThread().getName(); runCmdOnDriver("create database testErrors"); runCmdOnDriver("use testErrors"); - runCmdOnDriver("create table T(a int, b int) clustered by (b) into 2 buckets stored as orc TBLPROPERTIES ('transactional'='true')"); - - HiveEndPoint endPt = new HiveEndPoint(metaStoreURI, "testErrors", "T", null); - StreamingConnection connection = endPt.newConnection(false, agentInfo); - DelimitedInputWriter innerWriter = new DelimitedInputWriter("a,b".split(","),",", endPt, connection); + runCmdOnDriver( + "create table T(a int, b int) clustered by (b) into 2 buckets stored as orc TBLPROPERTIES ('transactional'='true')"); + + StrictDelimitedInputWriter innerWriter = StrictDelimitedInputWriter.newBuilder() + .withFieldDelimiter(',') + .build(); + + HiveStreamingConnection connection = HiveStreamingConnection.newBuilder() + .withDatabase("testErrors") + .withTable("T") + .withAgentInfo(agentInfo) + .withTransactionBatchSize(2) + .withRecordWriter(innerWriter) + .withHiveConf(conf) + .connect(); + connection.beginNextTransaction(); FaultyWriter writer = new FaultyWriter(innerWriter); - TransactionBatch txnBatch = connection.fetchTransactionBatch(2, writer); - txnBatch.close(); - txnBatch.heartbeat();//this is no-op on closed batch - txnBatch.abort();//ditto + connection.close(); + Exception expectedEx = null; + connection.abort();//ditto GetOpenTxnsInfoResponse r = msClient.showTxns(); - Assert.assertEquals("HWM didn't match", 17, r.getTxn_high_water_mark()); + Assert.assertEquals("HWM didn't match", 17, r.getTxn_high_water_mark()); List ti = r.getOpen_txns(); Assert.assertEquals("wrong status ti(0)", TxnState.ABORTED, ti.get(0).getState()); Assert.assertEquals("wrong status ti(1)", TxnState.ABORTED, ti.get(1).getState()); - Exception expectedEx = null; + try { - txnBatch.beginNextTransaction(); - } - catch(IllegalStateException ex) { + connection.beginNextTransaction(); + } catch (IllegalStateException ex) { expectedEx = ex; } Assert.assertTrue("beginNextTransaction() should have failed", - expectedEx != null && expectedEx.getMessage().contains("has been closed()")); + expectedEx != null && expectedEx.getMessage().contains("Cannot begin next transaction on a " + + "closed streaming connection")); expectedEx = null; try { - txnBatch.write("name0,1,Hello streaming".getBytes()); - } - catch(IllegalStateException ex) { + connection.write("name0,1,Hello streaming".getBytes()); + } catch (IllegalStateException ex) { expectedEx = ex; } - Assert.assertTrue("write() should have failed", + Assert.assertTrue("write() should have failed", expectedEx != null && expectedEx.getMessage().contains("has been closed()")); expectedEx = null; try { - txnBatch.commit(); - } - catch(IllegalStateException ex) { + connection.commit(); + } catch (IllegalStateException ex) { expectedEx = ex; } Assert.assertTrue("commit() 
should have failed", expectedEx != null && expectedEx.getMessage().contains("has been closed()")); - txnBatch = connection.fetchTransactionBatch(2, writer); - txnBatch.beginNextTransaction(); - txnBatch.write("name2,2,Welcome to streaming".getBytes()); - txnBatch.write("name4,2,more Streaming unlimited".getBytes()); - txnBatch.write("name5,2,even more Streaming unlimited".getBytes()); - txnBatch.commit(); + connection = HiveStreamingConnection.newBuilder() + .withDatabase("testErrors") + .withTable("T") + .withAgentInfo(agentInfo) + .withTransactionBatchSize(2) + .withRecordWriter(writer) + .withHiveConf(conf) + .connect(); + connection.beginNextTransaction(); + connection.write("name2,2,Welcome to streaming".getBytes()); + connection.write("name4,2,more Streaming unlimited".getBytes()); + connection.write("name5,2,even more Streaming unlimited".getBytes()); + connection.commit(); //test toString() - String s = txnBatch.toString(); - Assert.assertTrue("Actual: " + s, s.contains("LastUsed " + JavaUtils.txnIdToString(txnBatch.getCurrentTxnId()))); + String s = connection.toTransactionString(); + Assert.assertTrue("Actual: " + s, s.contains("LastUsed " + JavaUtils.txnIdToString(connection.getCurrentTxnId()))); Assert.assertTrue("Actual: " + s, s.contains("TxnStatus[CO]")); expectedEx = null; - txnBatch.beginNextTransaction(); + connection.beginNextTransaction(); writer.enableErrors(); try { - txnBatch.write("name6,2,Doh!".getBytes()); - } - catch(StreamingIOFailure ex) { + connection.write("name6,2,Doh!".getBytes()); + } catch (StreamingIOFailure ex) { expectedEx = ex; - txnBatch.getCurrentTransactionState(); - txnBatch.getCurrentTxnId();//test it doesn't throw ArrayIndexOutOfBounds... + connection.getCurrentTransactionState(); + connection.getCurrentTxnId();//test it doesn'table throw ArrayIndexOutOfBounds... } Assert.assertTrue("Wrong exception: " + (expectedEx != null ? 
expectedEx.getMessage() : "?"), expectedEx != null && expectedEx.getMessage().contains("Simulated fault occurred")); expectedEx = null; try { - txnBatch.commit(); - } - catch(IllegalStateException ex) { + connection.commit(); + } catch (IllegalStateException ex) { expectedEx = ex; } Assert.assertTrue("commit() should have failed", expectedEx != null && expectedEx.getMessage().contains("has been closed()")); //test toString() - s = txnBatch.toString(); - Assert.assertTrue("Actual: " + s, s.contains("LastUsed " + JavaUtils.txnIdToString(txnBatch.getCurrentTxnId()))); + s = connection.toTransactionString(); + Assert.assertTrue("Actual: " + s, s.contains("LastUsed " + JavaUtils.txnIdToString(connection.getCurrentTxnId()))); Assert.assertTrue("Actual: " + s, s.contains("TxnStatus[CA]")); r = msClient.showTxns(); - Assert.assertEquals("HWM didn't match", 19, r.getTxn_high_water_mark()); + Assert.assertEquals("HWM didn't match", 19, r.getTxn_high_water_mark()); ti = r.getOpen_txns(); Assert.assertEquals("wrong status ti(0)", TxnState.ABORTED, ti.get(0).getState()); Assert.assertEquals("wrong status ti(1)", TxnState.ABORTED, ti.get(1).getState()); //txnid 3 was committed and thus not open Assert.assertEquals("wrong status ti(2)", TxnState.ABORTED, ti.get(2).getState()); + connection.close(); writer.disableErrors(); - txnBatch = connection.fetchTransactionBatch(2, writer); - txnBatch.beginNextTransaction(); - txnBatch.write("name2,2,Welcome to streaming".getBytes()); + connection = HiveStreamingConnection.newBuilder() + .withDatabase("testErrors") + .withTable("T") + .withAgentInfo(agentInfo) + .withTransactionBatchSize(2) + .withRecordWriter(writer) + .withHiveConf(conf) + .connect(); + connection.beginNextTransaction(); + connection.write("name2,2,Welcome to streaming".getBytes()); writer.enableErrors(); expectedEx = null; try { - txnBatch.commit(); - } - catch(StreamingIOFailure ex) { + connection.commit(); + } catch (StreamingIOFailure ex) { expectedEx = ex; } Assert.assertTrue("Wrong exception: " + (expectedEx != null ? 
expectedEx.getMessage() : "?"), expectedEx != null && expectedEx.getMessage().contains("Simulated fault occurred")); r = msClient.showTxns(); - Assert.assertEquals("HWM didn't match", 21, r.getTxn_high_water_mark()); + Assert.assertEquals("HWM didn't match", 21, r.getTxn_high_water_mark()); ti = r.getOpen_txns(); Assert.assertEquals("wrong status ti(3)", TxnState.ABORTED, ti.get(3).getState()); Assert.assertEquals("wrong status ti(4)", TxnState.ABORTED, ti.get(4).getState()); - txnBatch.abort(); + connection.abort(); } - // assumes un partitioned table + // assumes unpartitioned table // returns a map > private HashMap> dumpAllBuckets(String dbLocation, String tableName) - throws IOException { + throws IOException { HashMap> result = new HashMap>(); for (File deltaDir : new File(dbLocation + "/" + tableName).listFiles()) { - if(!deltaDir.getName().startsWith("delta")) { + if (!deltaDir.getName().startsWith("delta")) { continue; } File[] bucketFiles = deltaDir.listFiles(new FileFilter() { @@ -2066,11 +2390,11 @@ public boolean accept(File pathname) { } }); for (File bucketFile : bucketFiles) { - if(bucketFile.toString().endsWith("length")) { + if (bucketFile.toString().endsWith("length")) { continue; } Integer bucketNum = getBucketNumber(bucketFile); - ArrayList recs = dumpBucket(new Path(bucketFile.toString())); + ArrayList recs = dumpBucket(new Path(bucketFile.toString())); result.put(bucketNum, recs); } } @@ -2081,14 +2405,14 @@ public boolean accept(File pathname) { private Integer getBucketNumber(File bucketFile) { String fname = bucketFile.getName(); int start = fname.indexOf('_'); - String number = fname.substring(start+1, fname.length()); + String number = fname.substring(start + 1, fname.length()); return Integer.parseInt(number); } // delete db and all tables in it public static void dropDB(IMetaStoreClient client, String databaseName) { try { - for (String table : client.listTableNamesByFilter(databaseName, "", (short)-1)) { + for (String table : client.listTableNamesByFilter(databaseName, "", (short) -1)) { client.dropTable(databaseName, table, true, true); } client.dropDatabase(databaseName); @@ -2098,15 +2422,14 @@ public static void dropDB(IMetaStoreClient client, String databaseName) { } - ///////// -------- UTILS ------- ///////// // returns Path of the partition created (if any) else Path of table - public static Path createDbAndTable(IDriver driver, String databaseName, - String tableName, List partVals, - String[] colNames, String[] colTypes, - String[] bucketCols, - String[] partNames, String dbLocation, int bucketCount) - throws Exception { + private static Path createDbAndTable(IDriver driver, String databaseName, + String tableName, List partVals, + String[] colNames, String[] colTypes, + String[] bucketCols, + String[] partNames, String dbLocation, int bucketCount) + throws Exception { String dbUri = "raw://" + new Path(dbLocation).toUri().toString(); String tableLoc = dbUri + Path.SEPARATOR + tableName; @@ -2114,24 +2437,24 @@ public static Path createDbAndTable(IDriver driver, String databaseName, runDDL(driver, "create database IF NOT EXISTS " + databaseName + " location '" + dbUri + "'"); runDDL(driver, "use " + databaseName); String crtTbl = "create table " + tableName + - " ( " + getTableColumnsStr(colNames,colTypes) + " )" + - getPartitionStmtStr(partNames) + - " clustered by ( " + join(bucketCols, ",") + " )" + - " into " + bucketCount + " buckets " + - " stored as orc " + - " location '" + tableLoc + "'" + - " TBLPROPERTIES ('transactional'='true') "; 
+ " ( " + getTableColumnsStr(colNames, colTypes) + " )" + + getPartitionStmtStr(partNames) + + " clustered by ( " + join(bucketCols, ",") + " )" + + " into " + bucketCount + " buckets " + + " stored as orc " + + " location '" + tableLoc + "'" + + " TBLPROPERTIES ('transactional'='true') "; runDDL(driver, crtTbl); - if(partNames!=null && partNames.length!=0) { + if (partNames != null && partNames.length != 0) { return addPartition(driver, tableName, partVals, partNames); } return new Path(tableLoc); } private static Path addPartition(IDriver driver, String tableName, List partVals, String[] partNames) - throws Exception { + throws Exception { String partSpec = getPartsSpec(partNames, partVals); - String addPart = "alter table " + tableName + " add partition ( " + partSpec + " )"; + String addPart = "alter table " + tableName + " add partition ( " + partSpec + " )"; runDDL(driver, addPart); return getPartitionPath(driver, tableName, partSpec); } @@ -2140,15 +2463,15 @@ private static Path getPartitionPath(IDriver driver, String tableName, String pa ArrayList res = queryTable(driver, "describe extended " + tableName + " PARTITION (" + partSpec + ")"); String partInfo = res.get(res.size() - 1); int start = partInfo.indexOf("location:") + "location:".length(); - int end = partInfo.indexOf(",",start); - return new Path( partInfo.substring(start,end) ); + int end = partInfo.indexOf(",", start); + return new Path(partInfo.substring(start, end)); } private static String getTableColumnsStr(String[] colNames, String[] colTypes) { StringBuilder sb = new StringBuilder(); - for (int i=0; i < colNames.length; ++i) { - sb.append(colNames[i] + " " + colTypes[i]); - if (i partVals) { StringBuilder sb = new StringBuilder(); - for (int i=0; i < partVals.size(); ++i) { - sb.append(partNames[i] + " = '" + partVals.get(i) + "'"); - if(i < partVals.size()-1) { + for (int i = 0; i < partVals.size(); ++i) { + sb.append(partNames[i]).append(" = '").append(partVals.get(i)).append("'"); + if (i < partVals.size() - 1) { sb.append(","); } } @@ -2183,28 +2506,33 @@ private static String getPartsSpec(String[] partNames, List partVals) { } private static String join(String[] values, String delimiter) { - if(values==null) { + if (values == null) { return null; } StringBuilder strbuf = new StringBuilder(); boolean first = true; - for (Object value : values) { - if (!first) { strbuf.append(delimiter); } else { first = false; } + for (Object value : values) { + if (!first) { + strbuf.append(delimiter); + } else { + first = false; + } strbuf.append(value.toString()); } return strbuf.toString(); } + private static String getPartitionStmtStr(String[] partNames) { - if ( partNames == null || partNames.length == 0) { + if (partNames == null || partNames.length == 0) { return ""; } return " partitioned by (" + getTablePartsStr(partNames) + " )"; } - private static boolean runDDL(IDriver driver, String sql) throws QueryFailedException { + private static boolean runDDL(IDriver driver, String sql) { LOG.debug(sql); System.out.println(sql); //LOG.debug("Running Hive Query: "+ sql); @@ -2217,9 +2545,9 @@ private static boolean runDDL(IDriver driver, String sql) throws QueryFailedExce } - public static ArrayList queryTable(IDriver driver, String query) throws IOException { + private static ArrayList queryTable(IDriver driver, String query) throws IOException { CommandProcessorResponse cpr = driver.run(query); - if(cpr.getResponseCode() != 0) { + if (cpr.getResponseCode() != 0) { throw new RuntimeException(query + " failed: " + cpr); } 
ArrayList res = new ArrayList(); @@ -2270,12 +2598,13 @@ public int hashCode() { @Override public String toString() { return " { " + - "'" + field1 + '\'' + - "," + field2 + - ",'" + field3 + '\'' + - " }"; + "'" + field1 + '\'' + + "," + field2 + + ",'" + field3 + '\'' + + " }"; } } + /** * This is test-only wrapper around the real RecordWriter. * It can simulate faults from lower levels to test error handling logic. @@ -2288,24 +2617,29 @@ private FaultyWriter(RecordWriter delegate) { assert delegate != null; this.delegate = delegate; } + + @Override + public void init(final StreamingConnection connection) throws StreamingException { + delegate.init(connection); + } + @Override public void write(long writeId, byte[] record) throws StreamingException { delegate.write(writeId, record); produceFault(); } + @Override public void flush() throws StreamingException { delegate.flush(); produceFault(); } + @Override - public void clear() throws StreamingException { - delegate.clear(); - } - @Override - public void newBatch(Long minTxnId, Long maxTxnID) throws StreamingException { + public void newBatch(Long minTxnId, Long maxTxnID) { delegate.newBatch(minTxnId, maxTxnID); } + @Override public void closeBatch() throws StreamingException { delegate.closeBatch(); @@ -2313,16 +2647,19 @@ public void closeBatch() throws StreamingException { /** * allows testing of "unexpected" errors + * * @throws StreamingIOFailure */ private void produceFault() throws StreamingIOFailure { - if(shouldThrow) { + if (shouldThrow) { throw new StreamingIOFailure("Simulated fault occurred"); } } + void enableErrors() { shouldThrow = true; } + void disableErrors() { shouldThrow = false; } diff --git a/streaming/src/test/org/apache/hive/streaming/TestStreamingDynamicPartitioning.java b/streaming/src/test/org/apache/hive/streaming/TestStreamingDynamicPartitioning.java new file mode 100644 index 0000000..e2bc3ac --- /dev/null +++ b/streaming/src/test/org/apache/hive/streaming/TestStreamingDynamicPartitioning.java @@ -0,0 +1,565 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hive.streaming; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.net.URI; +import java.net.URISyntaxException; +import java.util.ArrayList; +import java.util.List; + +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.RawLocalFileSystem; +import org.apache.hadoop.fs.permission.FsPermission; +import org.apache.hadoop.hive.cli.CliSessionState; +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.metastore.HiveMetaStoreClient; +import org.apache.hadoop.hive.metastore.IMetaStoreClient; +import org.apache.hadoop.hive.metastore.txn.TxnDbUtil; +import org.apache.hadoop.hive.ql.DriverFactory; +import org.apache.hadoop.hive.ql.IDriver; +import org.apache.hadoop.hive.ql.processors.CommandProcessorResponse; +import org.apache.hadoop.hive.ql.session.SessionState; +import org.apache.hadoop.hive.serde.serdeConstants; +import org.apache.thrift.TException; +import org.junit.After; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + + +public class TestStreamingDynamicPartitioning { + private static final Logger LOG = LoggerFactory.getLogger(TestStreamingDynamicPartitioning.class); + + public static class RawFileSystem extends RawLocalFileSystem { + private static final URI NAME; + + static { + try { + NAME = new URI("raw:///"); + } catch (URISyntaxException se) { + throw new IllegalArgumentException("bad uri", se); + } + } + + @Override + public URI getUri() { + return NAME; + } + + @Override + public String getScheme() { + return "raw"; + } + + @Override + public FileStatus getFileStatus(Path path) throws IOException { + File file = pathToFile(path); + if (!file.exists()) { + throw new FileNotFoundException("Can't find " + path); + } + // get close enough + short mod = 0; + if (file.canRead()) { + mod |= 0444; + } + if (file.canWrite()) { + mod |= 0200; + } + if (file.canExecute()) { + mod |= 0111; + } + return new FileStatus(file.length(), file.isDirectory(), 1, 1024, + file.lastModified(), file.lastModified(), + FsPermission.createImmutable(mod), "owen", "users", path); + } + } + + private final HiveConf conf; + private IDriver driver; + private final IMetaStoreClient msClient; + + private static final String COL1 = "id"; + private static final String COL2 = "msg"; + + // partitioned table + private final static String dbName = "testing"; + private final static String tblName = "alerts"; + private final static String[] fieldNames = new String[]{COL1, COL2}; + private String[] allNames = new String[]{COL1, COL2, "Continent", "Country"}; + + // unpartitioned table + private final static String dbName2 = "testing2"; + private Path tableLoc; + @Rule + public TemporaryFolder dbFolder = new TemporaryFolder(); + + + public TestStreamingDynamicPartitioning() throws Exception { + conf = new HiveConf(this.getClass()); + conf.set("fs.raw.impl", RawFileSystem.class.getName()); + conf + .setVar(HiveConf.ConfVars.HIVE_AUTHORIZATION_MANAGER, + "org.apache.hadoop.hive.ql.security.authorization.plugin.sqlstd.SQLStdHiveAuthorizerFactory"); + TxnDbUtil.setConfValues(conf); + conf.setBoolVar(HiveConf.ConfVars.METASTORE_EXECUTE_SET_UGI, true); + conf.setBoolVar(HiveConf.ConfVars.HIVE_SUPPORT_CONCURRENCY, true); + 
conf.setVar(HiveConf.ConfVars.DYNAMICPARTITIONINGMODE, "nonstrict"); + dbFolder.create(); + + + //1) Start from a clean slate (metastore) + TxnDbUtil.cleanDb(conf); + TxnDbUtil.prepDb(conf); + + //2) obtain metastore clients + msClient = new HiveMetaStoreClient(conf); + } + + @Before + public void setup() throws Exception { + SessionState.start(new CliSessionState(conf)); + driver = DriverFactory.newDriver(conf); + driver.setMaxRows(200002);//make sure Driver returns all results + // drop and recreate the necessary databases and tables + dropDB(msClient, dbName); + + String[] partNames = new String[]{"Continent", "Country"}; + String[] colNames = new String[]{COL1, COL2}; + String[] colTypes = new String[]{serdeConstants.INT_TYPE_NAME, serdeConstants.STRING_TYPE_NAME}; + + String[] bucketCols = new String[]{COL1}; + String loc1 = dbFolder.newFolder(dbName + ".db").toString(); + tableLoc = createDbAndTable(driver, dbName, tblName, null, colNames, colTypes, bucketCols, partNames, loc1, 1); + + dropDB(msClient, dbName2); + String loc2 = dbFolder.newFolder(dbName2 + ".db").toString(); + String loc3 = dbFolder.newFolder("testing5.db").toString(); + createStoreSales("testing5", loc3); + + runDDL(driver, "drop table testBucketing3.streamedtable"); + runDDL(driver, "drop table testBucketing3.finaltable"); + runDDL(driver, "drop table testBucketing3.nobucket"); + } + + @After + public void cleanup() throws Exception { + msClient.close(); + driver.close(); + } + + private void createStoreSales(String dbName, String loc) throws Exception { + String dbUri = "raw://" + new Path(loc).toUri().toString(); + String tableLoc = dbUri + Path.SEPARATOR + "store_sales"; + + boolean success = runDDL(driver, "create database IF NOT EXISTS " + dbName + " location '" + dbUri + "'"); + Assert.assertTrue(success); + success = runDDL(driver, "use " + dbName); + Assert.assertTrue(success); + + success = runDDL(driver, "drop table if exists store_sales"); + Assert.assertTrue(success); + success = runDDL(driver, "create table store_sales\n" + + "(\n" + + " ss_sold_date_sk int,\n" + + " ss_sold_time_sk int,\n" + + " ss_item_sk int,\n" + + " ss_customer_sk int,\n" + + " ss_cdemo_sk int,\n" + + " ss_hdemo_sk int,\n" + + " ss_addr_sk int,\n" + + " ss_store_sk int,\n" + + " ss_promo_sk int,\n" + + " ss_ticket_number int,\n" + + " ss_quantity int,\n" + + " ss_wholesale_cost decimal(7,2),\n" + + " ss_list_price decimal(7,2),\n" + + " ss_sales_price decimal(7,2),\n" + + " ss_ext_discount_amt decimal(7,2),\n" + + " ss_ext_sales_price decimal(7,2),\n" + + " ss_ext_wholesale_cost decimal(7,2),\n" + + " ss_ext_list_price decimal(7,2),\n" + + " ss_ext_tax decimal(7,2),\n" + + " ss_coupon_amt decimal(7,2),\n" + + " ss_net_paid decimal(7,2),\n" + + " ss_net_paid_inc_tax decimal(7,2),\n" + + " ss_net_profit decimal(7,2)\n" + + ")\n" + + " partitioned by (dt string)\n" + + "clustered by (ss_store_sk, ss_promo_sk)\n" + + "INTO 4 BUCKETS stored as orc " + " location '" + tableLoc + "'" + + " TBLPROPERTIES ('orc.compress'='NONE', 'transactional'='true')"); + Assert.assertTrue(success); + + success = runDDL(driver, "alter table store_sales add partition(dt='2015')"); + Assert.assertTrue(success); + } + + @Test + public void testDynamicPartitioning() throws Exception { + StrictDelimitedInputWriter writer = StrictDelimitedInputWriter.newBuilder() + .withFieldDelimiter(',') + .build(); + HiveStreamingConnection connection = HiveStreamingConnection.newBuilder() + .withDatabase("testing5") + .withTable("store_sales") + .withAgentInfo("UT_" + 
Thread.currentThread().getName()) + .withRecordWriter(writer) + .withHiveConf(conf) + .connect(); + connection.beginNextTransaction(); + + for (int i = 0; i < 10; i++) { + StringBuilder row = new StringBuilder(); + for (int ints = 0; ints < 11; ints++) { + row.append(ints).append(','); + } + for (int decs = 0; decs < 12; decs++) { + row.append(i + 0.1).append(','); + } + row.append("2018-04-").append(i); + connection.write(row.toString().getBytes()); + } + connection.commit(); + connection.close(); + + List partitions = queryTable(driver, "show partitions testing5.store_sales"); + // 1 static partition created during setup + 10 dynamic partitions + assertEquals(11, partitions.size()); + // ignore the first static partition + for (int i = 1; i < partitions.size(); i++) { + assertEquals("dt=2018-04-" + (i - 1), partitions.get(i)); + } + + ArrayList res = queryTable(driver, "select row__id.bucketid, * from testing5.store_sales"); + for (String re : res) { + System.out.println(re); + assertEquals(true, re.contains("2018-04-")); + } + } + + // stream data into streaming table with N buckets, then copy the data into another bucketed table + // check if bucketing in both was done in the same way + @Test + public void testDPStreamBucketingMatchesRegularBucketing() throws Exception { + int bucketCount = 100; + + String dbUri = "raw://" + new Path(dbFolder.newFolder().toString()).toUri().toString(); + String tableLoc = "'" + dbUri + Path.SEPARATOR + "streamedtable" + "'"; + String tableLoc2 = "'" + dbUri + Path.SEPARATOR + "finaltable" + "'"; + String tableLoc3 = "'" + dbUri + Path.SEPARATOR + "nobucket" + "'"; + + runDDL(driver, "create database testBucketing3"); + runDDL(driver, "use testBucketing3"); + runDDL(driver, "create table streamedtable ( key1 string,key2 int,data string ) partitioned by (year " + + "int) clustered by " + "( " + "key1,key2 ) into " + + bucketCount + " buckets stored as orc location " + tableLoc + " TBLPROPERTIES ('transactional'='true')"); + // In 'nobucket' table we capture bucketid from streamedtable to workaround a hive bug that prevents joins two identically bucketed tables + runDDL(driver, "create table nobucket ( bucketid int, key1 string,key2 int,data string ) partitioned by " + + "(year int) location " + tableLoc3); + runDDL(driver, "create table finaltable ( bucketid int, key1 string,key2 int,data string ) partitioned " + + "by (year int) clustered by ( key1,key2 ) into " + + bucketCount + " buckets stored as orc location " + tableLoc2 + " TBLPROPERTIES ('transactional'='true')"); + + + String[] records = new String[]{ + "PSFAHYLZVC,29,EPNMA,2017", + "PPPRKWAYAU,96,VUTEE,2017", + "MIAOFERCHI,3,WBDSI,2017", + "CEGQAZOWVN,0,WCUZL,2017", + "XWAKMNSVQF,28,YJVHU,2017", + "XBWTSAJWME,2,KDQFO,2017", + "FUVLQTAXAY,5,LDSDG,2017", + "QTQMDJMGJH,6,QBOMA,2018", + "EFLOTLWJWN,71,GHWPS,2018", + "PEQNAOJHCM,82,CAAFI,2018", + "MOEKQLGZCP,41,RUACR,2018", + "QZXMCOPTID,37,LFLWE,2018", + "EYALVWICRD,13,JEZLC,2018", + "VYWLZAYTXX,16,DMVZX,2018", + "OSALYSQIXR,47,HNZVE,2018", + "JGKVHKCEGQ,25,KSCJB,2018", + "WQFMMYDHET,12,DTRWA,2018", + "AJOVAYZKZQ,15,YBKFO,2018", + "YAQONWCUAU,31,QJNHZ,2018", + "DJBXUEUOEB,35,IYCBL,2018" + }; + + + String[] colNames1 = new String[]{"key1", "key2", "data", "year"}; + StrictDelimitedInputWriter wr = StrictDelimitedInputWriter.newBuilder() + .withFieldDelimiter(',') + .build(); + + HiveStreamingConnection connection = HiveStreamingConnection.newBuilder() + .withDatabase("testBucketing3") + .withTable("streamedtable") + .withAgentInfo("UT_" + 
Thread.currentThread().getName()) + .withRecordWriter(wr) + .withHiveConf(conf) + .connect(); + + connection.beginNextTransaction(); + + for (String record : records) { + connection.write(record.getBytes()); + } + + connection.commit(); + connection.close(); + + ArrayList res1 = queryTable(driver, "select row__id.bucketid, * from streamedtable order by key2"); + for (String re : res1) { + System.out.println(re); + assertTrue(re.endsWith("2017") || re.endsWith("2018")); + } + + driver.run("insert into nobucket partition(year) select row__id.bucketid,* from streamedtable"); + ArrayList res = queryTable(driver, "select * from nobucket"); + assertEquals(records.length, res.size()); + runDDL(driver, " insert into finaltable partition(year) select * from nobucket"); + res = queryTable(driver, "select * from finaltable"); + assertEquals(records.length, res.size()); + ArrayList res2 = queryTable(driver, + "select row__id.bucketid,* from finaltable where row__id.bucketid<>bucketid"); + for (String s : res2) { + LOG.error(s); + } + Assert.assertTrue(res2.isEmpty()); + + res2 = queryTable(driver, "select * from finaltable where year=2018"); + assertEquals(13, res2.size()); + for (String s : res2) { + assertTrue(s.endsWith("2018")); + } + + res2 = queryTable(driver, "show partitions finaltable"); + assertEquals(2, res2.size()); + assertEquals("year=2017", res2.get(0)); + assertEquals("year=2018", res2.get(1)); + } + + + @Test + public void testTableValidation() throws Exception { + int bucketCount = 100; + + String dbUri = "raw://" + new Path(dbFolder.newFolder().toString()).toUri().toString(); + String tbl1 = "validation1"; + String tbl2 = "validation2"; + + String tableLoc = "'" + dbUri + Path.SEPARATOR + tbl1 + "'"; + String tableLoc2 = "'" + dbUri + Path.SEPARATOR + tbl2 + "'"; + + runDDL(driver, "create database testBucketing3"); + runDDL(driver, "use testBucketing3"); + + runDDL(driver, "create table " + tbl1 + " ( key1 string, data string ) clustered by ( key1 ) into " + + bucketCount + " buckets stored as orc location " + tableLoc + " TBLPROPERTIES ('transactional'='false')"); + + runDDL(driver, "create table " + tbl2 + " ( key1 string, data string ) clustered by ( key1 ) into " + + bucketCount + " buckets stored as orc location " + tableLoc2 + " TBLPROPERTIES ('transactional'='false')"); + + String[] colNames1 = new String[]{"key1", "data"}; + StrictDelimitedInputWriter wr = StrictDelimitedInputWriter.newBuilder() + .withFieldDelimiter(',') + .build(); + try { + HiveStreamingConnection.newBuilder() + .withDatabase("testBucketing3") + .withTable("validation2") + .withAgentInfo("UT_" + Thread.currentThread().getName()) + .withRecordWriter(wr) + .withHiveConf(conf) + .connect(); + Assert.assertTrue("InvalidTable exception was not thrown", false); + } catch (InvalidTable e) { + // expecting this exception + } + try { + HiveStreamingConnection.newBuilder() + .withDatabase("testBucketing3") + .withTable("validation2") + .withAgentInfo("UT_" + Thread.currentThread().getName()) + .withRecordWriter(wr) + .withHiveConf(conf) + .connect(); + Assert.assertTrue("InvalidTable exception was not thrown", false); + } catch (InvalidTable e) { + // expecting this exception + } + } + + private static boolean runDDL(IDriver driver, String sql) { + LOG.debug(sql); + System.out.println(sql); + CommandProcessorResponse cpr = driver.run(sql); + if (cpr.getResponseCode() == 0) { + return true; + } + LOG.error("Statement: " + sql + " failed: " + cpr); + return false; + } + + + private static ArrayList 
+    CommandProcessorResponse cpr = driver.run(query);
+    if (cpr.getResponseCode() != 0) {
+      throw new RuntimeException(query + " failed: " + cpr);
+    }
+    ArrayList<String> res = new ArrayList<>();
+    driver.getResults(res);
+    return res;
+  }
+
+
+  // delete db and all tables in it
+  public static void dropDB(IMetaStoreClient client, String databaseName) {
+    try {
+      for (String table : client.listTableNamesByFilter(databaseName, "", (short) -1)) {
+        client.dropTable(databaseName, table, true, true);
+      }
+      client.dropDatabase(databaseName);
+    } catch (TException e) {
+      // best-effort cleanup; ignore failures if the database or tables are already gone
+    }
+
+  }
+
+
+  ///////// -------- UTILS ------- /////////
+  // Returns the Path of the partition created (if any), else the Path of the table.
+  private static Path createDbAndTable(IDriver driver, String databaseName,
+                                       String tableName, List<String> partVals,
+                                       String[] colNames, String[] colTypes,
+                                       String[] bucketCols,
+                                       String[] partNames, String dbLocation, int bucketCount)
+      throws Exception {
+
+    String dbUri = "raw://" + new Path(dbLocation).toUri().toString();
+    String tableLoc = dbUri + Path.SEPARATOR + tableName;
+
+    runDDL(driver, "create database IF NOT EXISTS " + databaseName + " location '" + dbUri + "'");
+    runDDL(driver, "use " + databaseName);
+    String crtTbl = "create table " + tableName +
+        " ( " + getTableColumnsStr(colNames, colTypes) + " )" +
+        getPartitionStmtStr(partNames) +
+        " clustered by ( " + join(bucketCols, ",") + " )" +
+        " into " + bucketCount + " buckets " +
+        " stored as orc " +
+        " location '" + tableLoc + "'" +
+        " TBLPROPERTIES ('transactional'='true') ";
+    runDDL(driver, crtTbl);
+    if (partNames != null && partNames.length != 0 && partVals != null) {
+      return addPartition(driver, tableName, partVals, partNames);
+    }
+    return new Path(tableLoc);
+  }
+
+
+  private static Path addPartition(IDriver driver, String tableName, List<String> partVals, String[] partNames)
+      throws Exception {
+    String partSpec = getPartsSpec(partNames, partVals);
+    String addPart = "alter table " + tableName + " add partition ( " + partSpec + " )";
+    runDDL(driver, addPart);
+    return getPartitionPath(driver, tableName, partSpec);
+  }
+
+  private static Path getPartitionPath(IDriver driver, String tableName, String partSpec) throws Exception {
+    ArrayList<String> res = queryTable(driver, "describe extended " + tableName + " PARTITION (" + partSpec + ")");
+    String partInfo = res.get(res.size() - 1);
+    int start = partInfo.indexOf("location:") + "location:".length();
+    int end = partInfo.indexOf(",", start);
+    return new Path(partInfo.substring(start, end));
+  }
+
+  private static String getTableColumnsStr(String[] colNames, String[] colTypes) {
+    StringBuilder sb = new StringBuilder();
+    for (int i = 0; i < colNames.length; ++i) {
+      sb.append(colNames[i]).append(" ").append(colTypes[i]);
+      if (i < colNames.length - 1) {
+        sb.append(",");
+      }
+    }
+    return sb.toString();
+  }
+
+  // converts partNames into "partName1 string, partName2 string"
+  private static String getTablePartsStr(String[] partNames) {
+    if (partNames == null || partNames.length == 0) {
+      return "";
+    }
+    StringBuilder sb = new StringBuilder();
+    for (int i = 0; i < partNames.length; ++i) {
+      sb.append(partNames[i]).append(" string");
+      if (i < partNames.length - 1) {
+        sb.append(",");
+      }
+    }
+    return sb.toString();
+  }
+
+  // converts partNames,partVals into "partName1=val1, partName2=val2"
+  private static String getPartsSpec(String[] partNames, List<String> partVals) {
+    StringBuilder sb = new StringBuilder();
+    for (int i = 0; i < partVals.size(); ++i) {
+      sb.append(partNames[i]).append(" = '").append(partVals.get(i)).append("'");
+      if (i < partVals.size() - 1) {
+        sb.append(",");
+      }
+    }
+    return sb.toString();
+  }
+
+  private static String join(String[] values, String delimiter) {
+    if (values == null) {
+      return null;
+    }
+    StringBuilder strbuf = new StringBuilder();
+
+    boolean first = true;
+
+    for (Object value : values) {
+      if (!first) {
+        strbuf.append(delimiter);
+      } else {
+        first = false;
+      }
+      strbuf.append(value.toString());
+    }
+
+    return strbuf.toString();
+  }
+
+  private static String getPartitionStmtStr(String[] partNames) {
+    if (partNames == null || partNames.length == 0) {
+      return "";
+    }
+    return " partitioned by (" + getTablePartsStr(partNames) + " )";
+  }
+}