diff --git a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java index 6e35653..a6e4127 100644 --- a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java +++ b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java @@ -660,6 +660,16 @@ private static void populateLlapDaemonVarsSet(Set llapDaemonVarsSetLocal @Deprecated METASTORE_CAPABILITY_CHECK("hive.metastore.client.capability.check", true, "Whether to check client capabilities for potentially breaking API usage."), + METASTORE_CLIENT_CACHE_ENABLED("hive.metastore.client.cache.enabled", false, + "Whether to enable metastore client cache"), + METASTORE_CLIENT_CACHE_EXPIRY_TIME("hive.metastore.client.cache.expiry.time", "120s", + new TimeValidator(TimeUnit.SECONDS), "Expiry time for metastore client cache"), + METASTORE_CLIENT_CACHE_INITIAL_CAPACITY("hive.metastore.client.cache.initial.capacity", 50, + "Initial capacity for metastore client cache"), + METASTORE_CLIENT_CACHE_MAX_CAPACITY("hive.metastore.client.cache.max.capacity", 50, + "Max capacity for metastore client cache"), + METASTORE_CLIENT_CACHE_STATS_ENABLED("hive.metastore.client.cache.stats.enabled", false, + "Whether to enable metastore client cache stats"), METASTORE_FASTPATH("hive.metastore.fastpath", false, "Used to avoid all of the proxies and object copies in the metastore. Note, if this is " + "set, you MUST use a local metastore (hive.metastore.uris must be empty) otherwise " + diff --git a/hcatalog/streaming/src/test/org/apache/hive/hcatalog/streaming/TestStreaming.java b/hcatalog/streaming/src/test/org/apache/hive/hcatalog/streaming/TestStreaming.java index 90dbdac..13aa5e9 100644 --- a/hcatalog/streaming/src/test/org/apache/hive/hcatalog/streaming/TestStreaming.java +++ b/hcatalog/streaming/src/test/org/apache/hive/hcatalog/streaming/TestStreaming.java @@ -439,68 +439,75 @@ public void testStreamBucketingMatchesRegularBucketing() throws Exception { String tableLoc = "'" + dbUri + Path.SEPARATOR + "streamedtable" + "'"; String tableLoc2 = "'" + dbUri + Path.SEPARATOR + "finaltable" + "'"; String tableLoc3 = "'" + dbUri + Path.SEPARATOR + "nobucket" + "'"; + // disabling vectorization as this test yields incorrect results with vectorization + conf.setBoolVar(HiveConf.ConfVars.HIVE_VECTORIZATION_ENABLED, false); + try (IDriver driver = DriverFactory.newDriver(conf)) { + runDDL(driver, "create database testBucketing3"); + runDDL(driver, "use testBucketing3"); + runDDL(driver, "create table streamedtable ( key1 string,key2 int,data string ) clustered by ( key1,key2 ) into " + + bucketCount + " buckets stored as orc location " + tableLoc + " TBLPROPERTIES ('transactional'='true')"); + // In 'nobucket' table we capture bucketid from streamedtable to workaround a hive bug that prevents joins two identically bucketed tables + runDDL(driver, "create table nobucket ( bucketid int, key1 string,key2 int,data string ) location " + tableLoc3); + runDDL(driver, + "create table finaltable ( bucketid int, key1 string,key2 int,data string ) clustered by ( key1,key2 ) into " + + bucketCount + " buckets stored as orc location " + tableLoc2 + " TBLPROPERTIES ('transactional'='true')"); + + + String[] records = new String[]{ + "PSFAHYLZVC,29,EPNMA", + "PPPRKWAYAU,96,VUTEE", + "MIAOFERCHI,3,WBDSI", + "CEGQAZOWVN,0,WCUZL", + "XWAKMNSVQF,28,YJVHU", + "XBWTSAJWME,2,KDQFO", + "FUVLQTAXAY,5,LDSDG", + "QTQMDJMGJH,6,QBOMA", + "EFLOTLWJWN,71,GHWPS", + "PEQNAOJHCM,82,CAAFI", + "MOEKQLGZCP,41,RUACR", + "QZXMCOPTID,37,LFLWE", + 
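A note on the HiveConf hunk at the top of this patch: the five new hive.metastore.client.cache.* properties describe a bounded, expiring, optionally stats-enabled client-side cache. The sketch below is only an illustration of how those knobs could drive a Guava cache; the helper class, its method, and the expire-after-access policy are assumptions, not the actual metastore client code.

```java
import java.util.concurrent.TimeUnit;

import com.google.common.cache.Cache;
import com.google.common.cache.CacheBuilder;

import org.apache.hadoop.hive.conf.HiveConf;

// Hypothetical helper: shows how the new cache knobs could be consumed.
// The real consumer of these properties lives in the metastore client code.
public class MetastoreClientCacheSketch {
  public static Cache<Object, Object> buildClientCache(HiveConf conf) {
    CacheBuilder<Object, Object> builder = CacheBuilder.newBuilder()
        .initialCapacity(conf.getIntVar(HiveConf.ConfVars.METASTORE_CLIENT_CACHE_INITIAL_CAPACITY))
        .maximumSize(conf.getIntVar(HiveConf.ConfVars.METASTORE_CLIENT_CACHE_MAX_CAPACITY))
        // expiry policy is a guess; the property only defines the duration
        .expireAfterAccess(
            conf.getTimeVar(HiveConf.ConfVars.METASTORE_CLIENT_CACHE_EXPIRY_TIME, TimeUnit.SECONDS),
            TimeUnit.SECONDS);
    if (conf.getBoolVar(HiveConf.ConfVars.METASTORE_CLIENT_CACHE_STATS_ENABLED)) {
      builder.recordStats(); // expose hit/miss counters when stats are enabled
    }
    return builder.build();
  }
}
```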
"EYALVWICRD,13,JEZLC", + "VYWLZAYTXX,16,DMVZX", + "OSALYSQIXR,47,HNZVE", + "JGKVHKCEGQ,25,KSCJB", + "WQFMMYDHET,12,DTRWA", + "AJOVAYZKZQ,15,YBKFO", + "YAQONWCUAU,31,QJNHZ", + "DJBXUEUOEB,35,IYCBL" + }; + + HiveEndPoint endPt = new HiveEndPoint(metaStoreURI, "testBucketing3", "streamedtable", null); + String[] colNames1 = new String[]{"key1", "key2", "data"}; + DelimitedInputWriter wr = new DelimitedInputWriter(colNames1, ",", endPt); + StreamingConnection connection = endPt.newConnection(false, "UT_" + Thread.currentThread().getName()); + + TransactionBatch txnBatch = connection.fetchTransactionBatch(2, wr); + txnBatch.beginNextTransaction(); - runDDL(driver, "create database testBucketing3"); - runDDL(driver, "use testBucketing3"); - runDDL(driver, "create table streamedtable ( key1 string,key2 int,data string ) clustered by ( key1,key2 ) into " - + bucketCount + " buckets stored as orc location " + tableLoc + " TBLPROPERTIES ('transactional'='true')") ; -// In 'nobucket' table we capture bucketid from streamedtable to workaround a hive bug that prevents joins two identically bucketed tables - runDDL(driver, "create table nobucket ( bucketid int, key1 string,key2 int,data string ) location " + tableLoc3) ; - runDDL(driver, "create table finaltable ( bucketid int, key1 string,key2 int,data string ) clustered by ( key1,key2 ) into " - + bucketCount + " buckets stored as orc location " + tableLoc2 + " TBLPROPERTIES ('transactional'='true')"); - - - String[] records = new String[] { - "PSFAHYLZVC,29,EPNMA", - "PPPRKWAYAU,96,VUTEE", - "MIAOFERCHI,3,WBDSI", - "CEGQAZOWVN,0,WCUZL", - "XWAKMNSVQF,28,YJVHU", - "XBWTSAJWME,2,KDQFO", - "FUVLQTAXAY,5,LDSDG", - "QTQMDJMGJH,6,QBOMA", - "EFLOTLWJWN,71,GHWPS", - "PEQNAOJHCM,82,CAAFI", - "MOEKQLGZCP,41,RUACR", - "QZXMCOPTID,37,LFLWE", - "EYALVWICRD,13,JEZLC", - "VYWLZAYTXX,16,DMVZX", - "OSALYSQIXR,47,HNZVE", - "JGKVHKCEGQ,25,KSCJB", - "WQFMMYDHET,12,DTRWA", - "AJOVAYZKZQ,15,YBKFO", - "YAQONWCUAU,31,QJNHZ", - "DJBXUEUOEB,35,IYCBL" - }; - - HiveEndPoint endPt = new HiveEndPoint(metaStoreURI, "testBucketing3", "streamedtable", null); - String[] colNames1 = new String[] { "key1", "key2", "data" }; - DelimitedInputWriter wr = new DelimitedInputWriter(colNames1,",", endPt); - StreamingConnection connection = endPt.newConnection(false, "UT_" + Thread.currentThread().getName()); - - TransactionBatch txnBatch = connection.fetchTransactionBatch(2, wr); - txnBatch.beginNextTransaction(); - - for (String record : records) { - txnBatch.write(record.toString().getBytes()); - } + for (String record : records) { + txnBatch.write(record.toString().getBytes()); + } - txnBatch.commit(); - txnBatch.close(); - connection.close(); + txnBatch.commit(); + txnBatch.close(); + connection.close(); - ArrayList res1 = queryTable(driver, "select row__id.bucketid, * from streamedtable order by key2"); - for (String re : res1) { - System.out.println(re); - } + ArrayList res1 = queryTable(driver, "select row__id.bucketid, * from streamedtable order by key2"); + for (String re : res1) { + System.out.println(re); + } - driver.run("insert into nobucket select row__id.bucketid,* from streamedtable"); - runDDL(driver, " insert into finaltable select * from nobucket"); - ArrayList res2 = queryTable(driver, "select row__id.bucketid,* from finaltable where row__id.bucketid<>bucketid"); - for (String s : res2) { - LOG.error(s); + driver.run("insert into nobucket select row__id.bucketid,* from streamedtable"); + runDDL(driver, " insert into finaltable select * from nobucket"); + ArrayList res2 = 
queryTable(driver, + "select row__id.bucketid,* from finaltable where row__id.bucketid<>bucketid"); + for (String s : res2) { + LOG.error(s); + } + Assert.assertTrue(res2.isEmpty()); + } finally { + conf.unset(HiveConf.ConfVars.HIVE_VECTORIZATION_ENABLED.varname); } - Assert.assertTrue(res2.isEmpty()); } diff --git a/itests/hive-unit/pom.xml b/itests/hive-unit/pom.xml index 3ae7f2f..b51ebf2 100644 --- a/itests/hive-unit/pom.xml +++ b/itests/hive-unit/pom.xml @@ -76,6 +76,11 @@ ${project.version} + org.apache.hive.hcatalog + hive-hcatalog-streaming + ${project.version} + + org.apache.hive hive-streaming ${project.version} diff --git a/itests/hive-unit/src/test/java/org/apache/hadoop/hive/ql/txn/compactor/TestCompactor.java b/itests/hive-unit/src/test/java/org/apache/hadoop/hive/ql/txn/compactor/TestCompactor.java index 8ee033d..7e17d5d 100644 --- a/itests/hive-unit/src/test/java/org/apache/hadoop/hive/ql/txn/compactor/TestCompactor.java +++ b/itests/hive-unit/src/test/java/org/apache/hadoop/hive/ql/txn/compactor/TestCompactor.java @@ -17,12 +17,16 @@ */ package org.apache.hadoop.hive.ql.txn.compactor; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertNull; + import java.io.File; import java.io.FileNotFoundException; import java.io.FileWriter; import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; +import java.util.Collection; import java.util.List; import java.util.Map; import java.util.Random; @@ -75,11 +79,13 @@ import org.apache.hadoop.hive.ql.session.SessionState; import org.apache.hadoop.mapred.JobConf; import org.apache.hive.hcatalog.common.HCatUtil; -import org.apache.hive.streaming.DelimitedInputWriter; -import org.apache.hive.streaming.HiveEndPoint; +import org.apache.hive.hcatalog.streaming.DelimitedInputWriter; +import org.apache.hive.hcatalog.streaming.HiveEndPoint; +import org.apache.hive.hcatalog.streaming.TransactionBatch; +import org.apache.hive.streaming.HiveStreamingConnection; import org.apache.hive.streaming.StreamingConnection; import org.apache.hive.streaming.StreamingException; -import org.apache.hive.streaming.TransactionBatch; +import org.apache.hive.streaming.StrictDelimitedInputWriter; import org.apache.orc.OrcConf; import org.junit.After; import org.junit.Assert; @@ -87,19 +93,32 @@ import org.junit.Rule; import org.junit.Test; import org.junit.rules.TemporaryFolder; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -/** - */ +@RunWith(Parameterized.class) public class TestCompactor { private static final AtomicInteger salt = new AtomicInteger(new Random().nextInt()); private static final Logger LOG = LoggerFactory.getLogger(TestCompactor.class); private final String TEST_DATA_DIR = HCatUtil.makePathASafeFileName(System.getProperty("java.io.tmpdir") + - File.separator + TestCompactor.class.getCanonicalName() + "-" + System.currentTimeMillis() + "_" + salt.getAndIncrement()); + File.separator + TestCompactor.class.getCanonicalName() + "-" + System.currentTimeMillis() + "_" + + salt.getAndIncrement()); private final String BASIC_FILE_NAME = TEST_DATA_DIR + "/basic.input.data"; private final String TEST_WAREHOUSE_DIR = TEST_DATA_DIR + "/warehouse"; + @Parameterized.Parameters + public static Collection data() { + return Arrays.asList(new Object[][]{{true}, {false}}); + } + + private boolean newStreamingAPI; + + public TestCompactor(boolean newStreamingAPI) { + this.newStreamingAPI = newStreamingAPI; + } + @Rule public 
TemporaryFolder stagingFolder = new TemporaryFolder(); private HiveConf conf; @@ -113,7 +132,7 @@ public void setup() throws Exception { if (f.exists()) { FileUtil.fullyDelete(f); } - if(!(new File(TEST_WAREHOUSE_DIR).mkdirs())) { + if (!(new File(TEST_WAREHOUSE_DIR).mkdirs())) { throw new RuntimeException("Could not create " + TEST_WAREHOUSE_DIR); } @@ -147,25 +166,26 @@ public void setup() throws Exception { } createTestDataFile(BASIC_FILE_NAME, input); } + @After public void tearDown() { conf = null; - if(msClient != null) { + if (msClient != null) { msClient.close(); } - if(driver != null) { + if (driver != null) { driver.close(); } } /** * Simple schema evolution add columns with partitioning. + * * @throws Exception */ @Test public void schemaEvolutionAddColDynamicPartitioningInsert() throws Exception { String tblName = "dpct"; - List colNames = Arrays.asList("a", "b"); executeStatementOnDriver("drop table if exists " + tblName, driver); executeStatementOnDriver("CREATE TABLE " + tblName + "(a INT, b STRING) " + " PARTITIONED BY(ds string)" + @@ -174,7 +194,7 @@ public void schemaEvolutionAddColDynamicPartitioningInsert() throws Exception { // First INSERT round. executeStatementOnDriver("insert into " + tblName + " partition (ds) values (1, 'fred', " + - "'today'), (2, 'wilma', 'yesterday')", driver); + "'today'), (2, 'wilma', 'yesterday')", driver); // ALTER TABLE ... ADD COLUMNS executeStatementOnDriver("ALTER TABLE " + tblName + " ADD COLUMNS(c int)", driver); @@ -191,7 +211,7 @@ public void schemaEvolutionAddColDynamicPartitioningInsert() throws Exception { executeStatementOnDriver("insert into " + tblName + " partition (ds) values " + "(3, 'mark', 1900, 'soon'), (4, 'douglas', 1901, 'last_century'), " + "(5, 'doc', 1902, 'yesterday')", - driver); + driver); // Validate there the new insertions for column c. 
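The TestCompactor changes above also switch the class to JUnit's Parameterized runner, so every test in the class runs twice: once with newStreamingAPI=true and once with false. A minimal standalone illustration of that pattern follows; the class and field names in the sketch are made up for clarity only.

```java
import java.util.Arrays;
import java.util.Collection;

import org.junit.Test;
import org.junit.runner.RunWith;
import org.junit.runners.Parameterized;

// Illustrative only: the JUnit 4 pattern TestCompactor now relies on.
@RunWith(Parameterized.class)
public class ParameterizedSketch {

  @Parameterized.Parameters
  public static Collection<Object[]> data() {
    // one Object[] per run; here the single element toggles the streaming API
    return Arrays.asList(new Object[][]{{true}, {false}});
  }

  private final boolean newStreamingAPI;

  public ParameterizedSketch(boolean newStreamingAPI) {
    this.newStreamingAPI = newStreamingAPI;
  }

  @Test
  public void runsOncePerParameter() {
    // every @Test method in the class executes once per row returned by data()
    System.out.println("newStreamingAPI=" + newStreamingAPI);
  }
}
```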
executeStatementOnDriver("SELECT * FROM " + tblName + " ORDER BY a", driver); @@ -205,7 +225,7 @@ public void schemaEvolutionAddColDynamicPartitioningInsert() throws Exception { Assert.assertEquals("5\tdoc\t1902\tyesterday", valuesReadFromHiveDriver.get(4)); Initiator initiator = new Initiator(); - initiator.setThreadId((int)initiator.getId()); + initiator.setThreadId((int) initiator.getId()); conf.setIntVar(HiveConf.ConfVars.HIVE_COMPACTOR_DELTA_NUM_THRESHOLD, 0); initiator.setConf(conf); AtomicBoolean stop = new AtomicBoolean(); @@ -246,14 +266,13 @@ public void schemaEvolutionAddColDynamicPartitioningInsert() throws Exception { @Test public void schemaEvolutionAddColDynamicPartitioningUpdate() throws Exception { String tblName = "udpct"; - List colNames = Arrays.asList("a", "b"); executeStatementOnDriver("drop table if exists " + tblName, driver); executeStatementOnDriver("CREATE TABLE " + tblName + "(a INT, b STRING) " + " PARTITIONED BY(ds string)" + " CLUSTERED BY(a) INTO 2 BUCKETS" + //currently ACID requires table to be bucketed " STORED AS ORC TBLPROPERTIES ('transactional'='true')", driver); executeStatementOnDriver("insert into " + tblName + " partition (ds) values (1, 'fred', " + - "'today'), (2, 'wilma', 'yesterday')", driver); + "'today'), (2, 'wilma', 'yesterday')", driver); executeStatementOnDriver("update " + tblName + " set b = 'barney'", driver); @@ -280,7 +299,7 @@ public void schemaEvolutionAddColDynamicPartitioningUpdate() throws Exception { executeStatementOnDriver("insert into " + tblName + " partition (ds) values " + "(3, 'mark', 1900, 'soon'), (4, 'douglas', 1901, 'last_century'), " + "(5, 'doc', 1902, 'yesterday')", - driver); + driver); // Validate there the new insertions for column c. executeStatementOnDriver("SELECT * FROM " + tblName + " ORDER BY a", driver); @@ -307,7 +326,7 @@ public void schemaEvolutionAddColDynamicPartitioningUpdate() throws Exception { Assert.assertEquals("5\tdoc\t2000\tyesterday", valuesReadFromHiveDriver.get(4)); Initiator initiator = new Initiator(); - initiator.setThreadId((int)initiator.getId()); + initiator.setThreadId((int) initiator.getId()); // Set to 1 so insert doesn't set it off but update does conf.setIntVar(HiveConf.ConfVars.HIVE_COMPACTOR_DELTA_NUM_THRESHOLD, 1); initiator.setConf(conf); @@ -354,10 +373,10 @@ public void schemaEvolutionAddColDynamicPartitioningUpdate() throws Exception { * 4. insert some data into the table using StreamingAPI * 5. Trigger major compaction (which should update stats) * 6. check that stats have been updated - * @throws Exception - * todo: - * 2. add non-partitioned test - * 4. add a test with sorted table? + * + * @throws Exception todo: + * 2. add non-partitioned test + * 4. add a test with sorted table? 
*/ @Test public void testStatsAfterCompactionPartTbl() throws Exception { @@ -407,7 +426,7 @@ public void testStatsAfterCompactionPartTbl() throws Exception { Map> stats = msClient.getPartitionColumnStatistics(ci.dbname, ci.tableName, Arrays.asList(ci.partName), colNames); List colStats = stats.get(ci.partName); - Assert.assertNotNull("No stats found for partition " + ci.partName, colStats); + assertNotNull("No stats found for partition " + ci.partName, colStats); Assert.assertEquals("Expected column 'a' at index 0", "a", colStats.get(0).getColName()); Assert.assertEquals("Expected column 'b' at index 1", "b", colStats.get(1).getColName()); LongColumnStatsData colAStats = colStats.get(0).getStatsData().getLongStats(); @@ -428,32 +447,58 @@ public void testStatsAfterCompactionPartTbl() throws Exception { LongColumnStatsData colAStatsPart2 = colStats.get(0).getStatsData().getLongStats(); StringColumnStatsData colBStatsPart2 = colStats.get(1).getStatsData().getStringStats(); - - HiveEndPoint endPt = new HiveEndPoint(null, ci.dbname, ci.tableName, Arrays.asList("0")); - DelimitedInputWriter writer = new DelimitedInputWriter(new String[] {"a","b"},",", endPt); + if (newStreamingAPI) { + StrictDelimitedInputWriter writer = StrictDelimitedInputWriter.newBuilder() + .withFieldDelimiter(',') + .build(); + HiveStreamingConnection connection = HiveStreamingConnection.newBuilder() + .withDatabase(ci.dbname) + .withTable(ci.tableName) + .withStaticPartitionValues(Arrays.asList("0")) + .withAgentInfo("UT_" + Thread.currentThread().getName()) + .withHiveConf(conf) + .withRecordWriter(writer) + .connect(); + connection.beginTransaction(); + connection.write("50,Kiev".getBytes()); + connection.write("51,St. Petersburg".getBytes()); + connection.write("44,Boston".getBytes()); + connection.commitTransaction(); + + connection.beginTransaction(); + connection.write("52,Tel Aviv".getBytes()); + connection.write("53,Atlantis".getBytes()); + connection.write("53,Boston".getBytes()); + connection.commitTransaction(); + connection.close(); + } else { + HiveEndPoint endPt = new HiveEndPoint(null, ci.dbname, ci.tableName, Arrays.asList("0")); + DelimitedInputWriter writer = new DelimitedInputWriter(new String[]{"a", "b"}, ",", endPt); /*next call will eventually end up in HiveEndPoint.createPartitionIfNotExists() which makes an operation on Driver * and starts it's own CliSessionState and then closes it, which removes it from ThreadLoacal; * thus the session * created in this class is gone after this; I fixed it in HiveEndPoint*/ - StreamingConnection connection = endPt.newConnection(true, "UT_" + Thread.currentThread().getName()); + org.apache.hive.hcatalog.streaming.StreamingConnection connection = endPt + .newConnection(true, "UT_" + Thread.currentThread().getName()); - TransactionBatch txnBatch = connection.fetchTransactionBatch(2, writer); - txnBatch.beginNextTransaction(); - Assert.assertEquals(TransactionBatch.TxnState.OPEN, txnBatch.getCurrentTransactionState()); - txnBatch.write("50,Kiev".getBytes()); - txnBatch.write("51,St. Petersburg".getBytes()); - txnBatch.write("44,Boston".getBytes()); - txnBatch.commit(); + TransactionBatch txnBatch = connection.fetchTransactionBatch(2, writer); + txnBatch.beginNextTransaction(); + Assert.assertEquals(TransactionBatch.TxnState.OPEN, txnBatch.getCurrentTransactionState()); + txnBatch.write("50,Kiev".getBytes()); + txnBatch.write("51,St. 
Petersburg".getBytes()); + txnBatch.write("44,Boston".getBytes()); + txnBatch.commit(); - txnBatch.beginNextTransaction(); - txnBatch.write("52,Tel Aviv".getBytes()); - txnBatch.write("53,Atlantis".getBytes()); - txnBatch.write("53,Boston".getBytes()); - txnBatch.commit(); + txnBatch.beginNextTransaction(); + txnBatch.write("52,Tel Aviv".getBytes()); + txnBatch.write("53,Atlantis".getBytes()); + txnBatch.write("53,Boston".getBytes()); + txnBatch.commit(); - txnBatch.close(); - connection.close(); + txnBatch.close(); + connection.close(); + } execSelectAndDumpData("select * from " + ci.getFullTableName(), driver, ci.getFullTableName()); //so now we have written some new data to bkt=0 and it shows up @@ -478,7 +523,7 @@ public void testStatsAfterCompactionPartTbl() throws Exception { stats = msClient.getPartitionColumnStatistics(ci.dbname, ci.tableName, Arrays.asList(ci.partName), colNames); colStats = stats.get(ci.partName); - Assert.assertNotNull("No stats found for partition " + ci.partName, colStats); + assertNotNull("No stats found for partition " + ci.partName, colStats); Assert.assertEquals("Expected column 'a' at index 0", "a", colStats.get(0).getColName()); Assert.assertEquals("Expected column 'b' at index 1", "b", colStats.get(1).getColName()); colAStats = colStats.get(0).getStatsData().getLongStats(); @@ -489,7 +534,7 @@ public void testStatsAfterCompactionPartTbl() throws Exception { colBStats = colStats.get(1).getStatsData().getStringStats(); Assert.assertEquals("maxColLen b", 14, colBStats.getMaxColLen()); //cast it to long to get rid of periodic decimal - Assert.assertEquals("avgColLen b", (long)6.1111111111, (long)colBStats.getAvgColLen()); + Assert.assertEquals("avgColLen b", (long) 6.1111111111, (long) colBStats.getAvgColLen()); Assert.assertEquals("numNulls b", 0, colBStats.getNumNulls()); Assert.assertEquals("nunDVs", 8, colBStats.getNumDVs()); @@ -506,17 +551,16 @@ public void testStatsAfterCompactionPartTbl() throws Exception { @Test public void dynamicPartitioningInsert() throws Exception { String tblName = "dpct"; - List colNames = Arrays.asList("a", "b"); executeStatementOnDriver("drop table if exists " + tblName, driver); executeStatementOnDriver("CREATE TABLE " + tblName + "(a INT, b STRING) " + " PARTITIONED BY(ds string)" + " CLUSTERED BY(a) INTO 2 BUCKETS" + //currently ACID requires table to be bucketed " STORED AS ORC TBLPROPERTIES ('transactional'='true')", driver); executeStatementOnDriver("insert into " + tblName + " partition (ds) values (1, 'fred', " + - "'today'), (2, 'wilma', 'yesterday')", driver); + "'today'), (2, 'wilma', 'yesterday')", driver); Initiator initiator = new Initiator(); - initiator.setThreadId((int)initiator.getId()); + initiator.setThreadId((int) initiator.getId()); conf.setIntVar(HiveConf.ConfVars.HIVE_COMPACTOR_DELTA_NUM_THRESHOLD, 0); initiator.setConf(conf); AtomicBoolean stop = new AtomicBoolean(); @@ -543,19 +587,18 @@ public void dynamicPartitioningInsert() throws Exception { @Test public void dynamicPartitioningUpdate() throws Exception { String tblName = "udpct"; - List colNames = Arrays.asList("a", "b"); executeStatementOnDriver("drop table if exists " + tblName, driver); executeStatementOnDriver("CREATE TABLE " + tblName + "(a INT, b STRING) " + " PARTITIONED BY(ds string)" + " CLUSTERED BY(a) INTO 2 BUCKETS" + //currently ACID requires table to be bucketed " STORED AS ORC TBLPROPERTIES ('transactional'='true')", driver); executeStatementOnDriver("insert into " + tblName + " partition (ds) values (1, 'fred', " + - "'today'), 
(2, 'wilma', 'yesterday')", driver); + "'today'), (2, 'wilma', 'yesterday')", driver); executeStatementOnDriver("update " + tblName + " set b = 'barney'", driver); Initiator initiator = new Initiator(); - initiator.setThreadId((int)initiator.getId()); + initiator.setThreadId((int) initiator.getId()); // Set to 1 so insert doesn't set it off but update does conf.setIntVar(HiveConf.ConfVars.HIVE_COMPACTOR_DELTA_NUM_THRESHOLD, 1); initiator.setConf(conf); @@ -583,21 +626,20 @@ public void dynamicPartitioningUpdate() throws Exception { @Test public void dynamicPartitioningDelete() throws Exception { String tblName = "ddpct"; - List colNames = Arrays.asList("a", "b"); executeStatementOnDriver("drop table if exists " + tblName, driver); executeStatementOnDriver("CREATE TABLE " + tblName + "(a INT, b STRING) " + " PARTITIONED BY(ds string)" + " CLUSTERED BY(a) INTO 2 BUCKETS" + //currently ACID requires table to be bucketed " STORED AS ORC TBLPROPERTIES ('transactional'='true')", driver); executeStatementOnDriver("insert into " + tblName + " partition (ds) values (1, 'fred', " + - "'today'), (2, 'wilma', 'yesterday')", driver); + "'today'), (2, 'wilma', 'yesterday')", driver); executeStatementOnDriver("update " + tblName + " set b = 'fred' where a = 1", driver); executeStatementOnDriver("delete from " + tblName + " where b = 'fred'", driver); Initiator initiator = new Initiator(); - initiator.setThreadId((int)initiator.getId()); + initiator.setThreadId((int) initiator.getId()); // Set to 2 so insert and update don't set it off but delete does conf.setIntVar(HiveConf.ConfVars.HIVE_COMPACTOR_DELTA_NUM_THRESHOLD, 2); initiator.setConf(conf); @@ -625,25 +667,22 @@ public void dynamicPartitioningDelete() throws Exception { public void minorCompactWhileStreaming() throws Exception { String dbName = "default"; String tblName = "cws"; - List colNames = Arrays.asList("a", "b"); String columnNamesProperty = "a,b"; String columnTypesProperty = "int:string"; executeStatementOnDriver("drop table if exists " + tblName, driver); executeStatementOnDriver("CREATE TABLE " + tblName + "(a INT, b STRING) " + - " CLUSTERED BY(a) INTO 1 BUCKETS" + //currently ACID requires table to be bucketed - " STORED AS ORC TBLPROPERTIES ('transactional'='true')", driver); + " CLUSTERED BY(a) INTO 1 BUCKETS" + //currently ACID requires table to be bucketed + " STORED AS ORC TBLPROPERTIES ('transactional'='true')", driver); - HiveEndPoint endPt = new HiveEndPoint(null, dbName, tblName, null); - DelimitedInputWriter writer = new DelimitedInputWriter(new String[] {"a","b"},",", endPt); - StreamingConnection connection = endPt.newConnection(false, "UT_" + Thread.currentThread().getName()); + StreamingConnection connection = null; try { // Write a couple of batches for (int i = 0; i < 2; i++) { - writeBatch(connection, writer, false); + writeBatch(dbName, tblName, false); } // Start a third batch, but don't close it. 
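The writeBatch(connection, writer, closeEarly) calls in this test are replaced by writeBatch(dbName, tblName, closeEarly), a helper added near the end of this patch that drives the new org.apache.hive.streaming API and returns the connection only when the batch is left open. A condensed sketch of that write path, using only calls that appear elsewhere in this diff (record values and batch size copied from the tests):

```java
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hive.streaming.HiveStreamingConnection;
import org.apache.hive.streaming.StreamingConnection;
import org.apache.hive.streaming.StreamingException;
import org.apache.hive.streaming.StrictDelimitedInputWriter;

public class NewApiWriteSketch {
  // Condensed from the writeBatch(String, String, boolean) helper in this patch.
  static StreamingConnection writeOneBatch(HiveConf conf, String db, String table,
      boolean closeEarly) throws StreamingException {
    StrictDelimitedInputWriter writer = StrictDelimitedInputWriter.newBuilder()
        .withFieldDelimiter(',')
        .build();
    StreamingConnection connection = HiveStreamingConnection.newBuilder()
        .withDatabase(db)
        .withTable(table)
        .withAgentInfo("UT_" + Thread.currentThread().getName())
        .withHiveConf(conf)
        .withRecordWriter(writer)
        .withTransactionBatchSize(2)   // two transactions per batch, as in the tests
        .connect();
    connection.beginTransaction();
    connection.write("50,Kiev".getBytes());
    connection.commitTransaction();
    if (closeEarly) {
      // leave the second transaction of the batch open; the caller must close()
      return connection;
    }
    connection.beginTransaction();
    connection.write("52,Tel Aviv".getBytes());
    connection.commitTransaction();
    connection.close();
    return null;                        // nothing left open for the caller
  }
}
```

Because the helper returns null when it already closed the connection, the callers in this patch hold the result in a local variable and close it in a finally block only if it is non-null.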
- writeBatch(connection, writer, true); + connection = writeBatch(dbName, tblName, true); // Now, compact TxnStore txnHandler = TxnUtils.getTxnStore(conf); @@ -661,7 +700,7 @@ public void minorCompactWhileStreaming() throws Exception { Table table = msClient.getTable(dbName, tblName); FileSystem fs = FileSystem.get(conf); FileStatus[] stat = - fs.listStatus(new Path(table.getSd().getLocation()), AcidUtils.deltaFileFilter); + fs.listStatus(new Path(table.getSd().getLocation()), AcidUtils.deltaFileFilter); String[] names = new String[stat.length]; Path resultFile = null; for (int i = 0; i < names.length; i++) { @@ -672,15 +711,17 @@ public void minorCompactWhileStreaming() throws Exception { } Arrays.sort(names); String[] expected = new String[]{"delta_0000001_0000002", - "delta_0000001_0000004", "delta_0000003_0000004", "delta_0000005_0000006"}; + "delta_0000001_0000004", "delta_0000003_0000004", "delta_0000005_0000006"}; if (!Arrays.deepEquals(expected, names)) { Assert.fail("Expected: " + Arrays.toString(expected) + ", found: " + Arrays.toString(names)); } checkExpectedTxnsPresent(null, new Path[]{resultFile}, columnNamesProperty, columnTypesProperty, - 0, 1L, 4L, 1); + 0, 1L, 4L, 1); } finally { - connection.close(); + if (connection != null) { + connection.close(); + } } } @@ -688,26 +729,23 @@ public void minorCompactWhileStreaming() throws Exception { public void majorCompactWhileStreaming() throws Exception { String dbName = "default"; String tblName = "cws"; - List colNames = Arrays.asList("a", "b"); String columnNamesProperty = "a,b"; String columnTypesProperty = "int:string"; executeStatementOnDriver("drop table if exists " + tblName, driver); executeStatementOnDriver("CREATE TABLE " + tblName + "(a INT, b STRING) " + - " CLUSTERED BY(a) INTO 1 BUCKETS" + //currently ACID requires table to be bucketed - " STORED AS ORC TBLPROPERTIES ('transactional'='true') ", driver); + " CLUSTERED BY(a) INTO 1 BUCKETS" + //currently ACID requires table to be bucketed + " STORED AS ORC TBLPROPERTIES ('transactional'='true') ", driver); - HiveEndPoint endPt = new HiveEndPoint(null, dbName, tblName, null); - DelimitedInputWriter writer = new DelimitedInputWriter(new String[] {"a","b"},",", endPt); - StreamingConnection connection = endPt.newConnection(false, "UT_" + Thread.currentThread().getName()); + StreamingConnection connection = null; try { // Write a couple of batches for (int i = 0; i < 2; i++) { - writeBatch(connection, writer, false); + writeBatch(dbName, tblName, false); } // Start a third batch, but don't close it. 
this delta will be ignored by compaction since // it has an open txn in it - writeBatch(connection, writer, true); + connection = writeBatch(dbName, tblName, true); runMajorCompaction(dbName, tblName); @@ -716,7 +754,7 @@ public void majorCompactWhileStreaming() throws Exception { Table table = msClient.getTable(dbName, tblName); FileSystem fs = FileSystem.get(conf); FileStatus[] stat = - fs.listStatus(new Path(table.getSd().getLocation()), AcidUtils.baseFileFilter); + fs.listStatus(new Path(table.getSd().getLocation()), AcidUtils.baseFileFilter); if (1 != stat.length) { Assert.fail("Expecting 1 file \"base_0000004\" and found " + stat.length + " files " + Arrays.toString(stat)); } @@ -724,127 +762,187 @@ public void majorCompactWhileStreaming() throws Exception { Assert.assertEquals(name, "base_0000004"); checkExpectedTxnsPresent(stat[0].getPath(), null, columnNamesProperty, columnTypesProperty, 0, 1L, 4L, 1); } finally { - connection.close(); + if (connection != null) { + connection.close(); + } } } @Test public void minorCompactAfterAbort() throws Exception { - String agentInfo = "UT_" + Thread.currentThread().getName(); String dbName = "default"; String tblName = "cws"; - List colNames = Arrays.asList("a", "b"); String columnNamesProperty = "a,b"; String columnTypesProperty = "int:string"; executeStatementOnDriver("drop table if exists " + tblName, driver); executeStatementOnDriver("CREATE TABLE " + tblName + "(a INT, b STRING) " + - " CLUSTERED BY(a) INTO 1 BUCKETS" + //currently ACID requires table to be bucketed - " STORED AS ORC TBLPROPERTIES ('transactional'='true')", driver); - - HiveEndPoint endPt = new HiveEndPoint(null, dbName, tblName, null); - DelimitedInputWriter writer = new DelimitedInputWriter(new String[] {"a","b"},",", endPt); - StreamingConnection connection = endPt.newConnection(false, "UT_" + Thread.currentThread().getName()); - try { - // Write a couple of batches - for (int i = 0; i < 2; i++) { - writeBatch(connection, writer, false); - } - - // Start a third batch, abort everything, don't properly close it - TransactionBatch txnBatch = connection.fetchTransactionBatch(2, writer); - txnBatch.beginNextTransaction(); - txnBatch.abort(); - txnBatch.beginNextTransaction(); - txnBatch.abort(); + " CLUSTERED BY(a) INTO 1 BUCKETS" + //currently ACID requires table to be bucketed + " STORED AS ORC TBLPROPERTIES ('transactional'='true')", driver); - // Now, compact - TxnStore txnHandler = TxnUtils.getTxnStore(conf); - txnHandler.compact(new CompactionRequest(dbName, tblName, CompactionType.MINOR)); - Worker t = new Worker(); - t.setThreadId((int) t.getId()); - t.setConf(conf); - AtomicBoolean stop = new AtomicBoolean(true); - AtomicBoolean looped = new AtomicBoolean(); - t.init(stop, looped); - t.run(); + if (newStreamingAPI) { + StreamingConnection connection = null; + try { + // Write a couple of batches + for (int i = 0; i < 2; i++) { + connection = writeBatch(dbName, tblName, false); + assertNull(connection); + } - // Find the location of the table - IMetaStoreClient msClient = new HiveMetaStoreClient(conf); - Table table = msClient.getTable(dbName, tblName); - FileSystem fs = FileSystem.get(conf); - FileStatus[] stat = - fs.listStatus(new Path(table.getSd().getLocation()), AcidUtils.deltaFileFilter); - String[] names = new String[stat.length]; - Path resultDelta = null; - for (int i = 0; i < names.length; i++) { - names[i] = stat[i].getPath().getName(); - if (names[i].equals("delta_0000001_0000004")) { - resultDelta = stat[i].getPath(); + StrictDelimitedInputWriter 
writer = StrictDelimitedInputWriter.newBuilder() + .withFieldDelimiter(',') + .build(); + StreamingConnection connection2 = HiveStreamingConnection.newBuilder() + .withDatabase(dbName) + .withTable(tblName) + .withAgentInfo("UT_" + Thread.currentThread().getName()) + .withHiveConf(conf) + .withRecordWriter(writer) + .withTransactionBatchSize(2) + .connect(); + // Start a third batch, abortTransaction everything, don't properly close it + connection2.beginTransaction(); + connection2.abortTransaction(); + connection2.close(); + } finally { + if (connection != null) { + connection.close(); } } - Arrays.sort(names); - String[] expected = new String[]{"delta_0000001_0000002", - "delta_0000001_0000004", "delta_0000003_0000004"}; - if (!Arrays.deepEquals(expected, names)) { - Assert.fail("Expected: " + Arrays.toString(expected) + ", found: " + Arrays.toString(names)); + } else { + HiveEndPoint endPt = new HiveEndPoint(null, dbName, tblName, null); + DelimitedInputWriter writer = new DelimitedInputWriter(new String[]{"a", "b"}, ",", endPt); + org.apache.hive.hcatalog.streaming.StreamingConnection connection = endPt + .newConnection(false, "UT_" + Thread.currentThread().getName()); + try { + // Write a couple of batches + for (int i = 0; i < 2; i++) { + writeBatch(connection, writer, false); + } + + // Start a third batch, abort everything, don't properly close it + TransactionBatch txnBatch = connection.fetchTransactionBatch(2, writer); + txnBatch.beginNextTransaction(); + txnBatch.abort(); + txnBatch.beginNextTransaction(); + txnBatch.abort(); + } finally { + connection.close(); } - checkExpectedTxnsPresent(null, new Path[]{resultDelta}, columnNamesProperty, columnTypesProperty, 0, 1L, 4L, 1); - } finally { - connection.close(); } + // Now, compact + TxnStore txnHandler = TxnUtils.getTxnStore(conf); + txnHandler.compact(new CompactionRequest(dbName, tblName, CompactionType.MINOR)); + Worker t = new Worker(); + t.setThreadId((int) t.getId()); + t.setConf(conf); + AtomicBoolean stop = new AtomicBoolean(true); + AtomicBoolean looped = new AtomicBoolean(); + t.init(stop, looped); + t.run(); + + // Find the location of the table + IMetaStoreClient msClient = new HiveMetaStoreClient(conf); + Table table = msClient.getTable(dbName, tblName); + FileSystem fs = FileSystem.get(conf); + FileStatus[] stat = + fs.listStatus(new Path(table.getSd().getLocation()), AcidUtils.deltaFileFilter); + String[] names = new String[stat.length]; + Path resultDelta = null; + for (int i = 0; i < names.length; i++) { + names[i] = stat[i].getPath().getName(); + if (names[i].equals("delta_0000001_0000004")) { + resultDelta = stat[i].getPath(); + } + } + Arrays.sort(names); + String[] expected = new String[]{"delta_0000001_0000002", + "delta_0000001_0000004", "delta_0000003_0000004"}; + if (!Arrays.deepEquals(expected, names)) { + Assert.fail("Expected: " + Arrays.toString(expected) + ", found: " + Arrays.toString(names)); + } + checkExpectedTxnsPresent(null, new Path[]{resultDelta}, columnNamesProperty, columnTypesProperty, 0, 1L, 4L, 1); } @Test public void majorCompactAfterAbort() throws Exception { String dbName = "default"; String tblName = "cws"; - List colNames = Arrays.asList("a", "b"); String columnNamesProperty = "a,b"; String columnTypesProperty = "int:string"; executeStatementOnDriver("drop table if exists " + tblName, driver); executeStatementOnDriver("CREATE TABLE " + tblName + "(a INT, b STRING) " + - " CLUSTERED BY(a) INTO 1 BUCKETS" + //currently ACID requires table to be bucketed - " STORED AS ORC 
TBLPROPERTIES ('transactional'='true')", driver); - - HiveEndPoint endPt = new HiveEndPoint(null, dbName, tblName, null); - DelimitedInputWriter writer = new DelimitedInputWriter(new String[] {"a","b"},",", endPt); - StreamingConnection connection = endPt.newConnection(false, "UT_" + Thread.currentThread().getName()); - try { - // Write a couple of batches - for (int i = 0; i < 2; i++) { - writeBatch(connection, writer, false); - } - - // Start a third batch, but don't close it. - TransactionBatch txnBatch = connection.fetchTransactionBatch(2, writer); - txnBatch.beginNextTransaction(); - txnBatch.abort(); - txnBatch.beginNextTransaction(); - txnBatch.abort(); - + " CLUSTERED BY(a) INTO 1 BUCKETS" + //currently ACID requires table to be bucketed + " STORED AS ORC TBLPROPERTIES ('transactional'='true')", driver); - runMajorCompaction(dbName, tblName); + if (newStreamingAPI) { + StreamingConnection connection = null; + try { + // Write a couple of batches + for (int i = 0; i < 2; i++) { + connection = writeBatch(dbName, tblName, false); + assertNull(connection); + } - // Find the location of the table - IMetaStoreClient msClient = new HiveMetaStoreClient(conf); - Table table = msClient.getTable(dbName, tblName); - FileSystem fs = FileSystem.get(conf); - FileStatus[] stat = - fs.listStatus(new Path(table.getSd().getLocation()), AcidUtils.baseFileFilter); - if (1 != stat.length) { - Assert.fail("majorCompactAfterAbort FileStatus[] stat " + Arrays.toString(stat)); - } - if (1 != stat.length) { - Assert.fail("Expecting 1 file \"base_0000004\" and found " + stat.length + " files " + Arrays.toString(stat)); + StrictDelimitedInputWriter writer = StrictDelimitedInputWriter.newBuilder() + .withFieldDelimiter(',') + .build(); + StreamingConnection connection2 = HiveStreamingConnection.newBuilder() + .withDatabase(dbName) + .withTable(tblName) + .withAgentInfo("UT_" + Thread.currentThread().getName()) + .withHiveConf(conf) + .withRecordWriter(writer) + .withTransactionBatchSize(2) + .connect(); + // Start a third batch, but don't close it. 
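Both branches of majorCompactAfterAbort do the same thing: write two clean batches, then open a third batch and abort it without a clean close. Condensed from the two branches of this hunk, the abort flows of the old and new streaming APIs look like this:

```java
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hive.hcatalog.streaming.DelimitedInputWriter;
import org.apache.hive.hcatalog.streaming.HiveEndPoint;
import org.apache.hive.hcatalog.streaming.TransactionBatch;
import org.apache.hive.streaming.HiveStreamingConnection;
import org.apache.hive.streaming.StreamingConnection;
import org.apache.hive.streaming.StrictDelimitedInputWriter;

public class AbortFlowSketch {
  // Old hcatalog API: transactions are aborted inside a fetched TransactionBatch.
  static void oldApiAbort(String db, String table) throws Exception {
    HiveEndPoint endPt = new HiveEndPoint(null, db, table, null);
    DelimitedInputWriter writer = new DelimitedInputWriter(new String[]{"a", "b"}, ",", endPt);
    org.apache.hive.hcatalog.streaming.StreamingConnection connection =
        endPt.newConnection(false, "UT_" + Thread.currentThread().getName());
    try {
      TransactionBatch txnBatch = connection.fetchTransactionBatch(2, writer);
      txnBatch.beginNextTransaction();
      txnBatch.abort();                 // abort, deliberately skipping txnBatch.close()
    } finally {
      connection.close();
    }
  }

  // New API: transactions are begun and aborted directly on the connection.
  static void newApiAbort(HiveConf conf, String db, String table) throws Exception {
    StrictDelimitedInputWriter writer = StrictDelimitedInputWriter.newBuilder()
        .withFieldDelimiter(',')
        .build();
    StreamingConnection connection = HiveStreamingConnection.newBuilder()
        .withDatabase(db)
        .withTable(table)
        .withAgentInfo("UT_" + Thread.currentThread().getName())
        .withHiveConf(conf)
        .withRecordWriter(writer)
        .withTransactionBatchSize(2)
        .connect();
    connection.beginTransaction();
    connection.abortTransaction();
    connection.close();
  }
}
```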
+ connection2.beginTransaction(); + connection2.abortTransaction(); + connection2.close(); + } finally { + if (connection != null) { + connection.close(); + } } - String name = stat[0].getPath().getName(); - if (!name.equals("base_0000004")) { - Assert.fail("majorCompactAfterAbort name " + name + " not equals to base_0000004"); + } else { + HiveEndPoint endPt = new HiveEndPoint(null, dbName, tblName, null); + DelimitedInputWriter writer = new DelimitedInputWriter(new String[]{"a", "b"}, ",", endPt); + org.apache.hive.hcatalog.streaming.StreamingConnection connection = endPt + .newConnection(false, "UT_" + Thread.currentThread().getName()); + try { + // Write a couple of batches + for (int i = 0; i < 2; i++) { + writeBatch(connection, writer, false); + } + + // Start a third batch, abort everything, don't properly close it + TransactionBatch txnBatch = connection.fetchTransactionBatch(2, writer); + txnBatch.beginNextTransaction(); + txnBatch.abort(); + txnBatch.beginNextTransaction(); + txnBatch.abort(); + } finally { + connection.close(); } - checkExpectedTxnsPresent(stat[0].getPath(), null, columnNamesProperty, columnTypesProperty, 0, 1L, 4L, 1); - } finally { - connection.close(); } + runMajorCompaction(dbName, tblName); + + // Find the location of the table + IMetaStoreClient msClient = new HiveMetaStoreClient(conf); + Table table = msClient.getTable(dbName, tblName); + FileSystem fs = FileSystem.get(conf); + FileStatus[] stat = + fs.listStatus(new Path(table.getSd().getLocation()), AcidUtils.baseFileFilter); + if (1 != stat.length) { + Assert.fail("majorCompactAfterAbort FileStatus[] stat " + Arrays.toString(stat)); + } + if (1 != stat.length) { + Assert.fail("Expecting 1 file \"base_0000004\" and found " + stat.length + " files " + Arrays.toString(stat)); + } + String name = stat[0].getPath().getName(); + if (!name.equals("base_0000004")) { + Assert.fail("majorCompactAfterAbort name " + name + " not equals to base_0000004"); + } + checkExpectedTxnsPresent(stat[0].getPath(), null, columnNamesProperty, columnTypesProperty, 0, 1L, 4L, 1); } @@ -1073,68 +1171,79 @@ private void runMajorCompaction( public void majorCompactWhileStreamingForSplitUpdate() throws Exception { String dbName = "default"; String tblName = "cws"; - List colNames = Arrays.asList("a", "b"); String columnNamesProperty = "a,b"; String columnTypesProperty = "int:string"; executeStatementOnDriver("drop table if exists " + tblName, driver); executeStatementOnDriver("CREATE TABLE " + tblName + "(a INT, b STRING) " + - " CLUSTERED BY(a) INTO 2 BUCKETS" + //currently ACID requires table to be bucketed - " STORED AS ORC TBLPROPERTIES ('transactional'='true', " - + "'transactional_properties'='default') ", driver); // this turns on split-update U=D+I + " CLUSTERED BY(a) INTO 2 BUCKETS" + //currently ACID requires table to be bucketed + " STORED AS ORC TBLPROPERTIES ('transactional'='true', " + + "'transactional_properties'='default') ", driver); // this turns on split-update U=D+I - HiveEndPoint endPt = new HiveEndPoint(null, dbName, tblName, null); - DelimitedInputWriter writer = new DelimitedInputWriter(new String[] {"a","b"},",", endPt); - StreamingConnection connection = endPt.newConnection(false, "UT_" + Thread.currentThread().getName()); - try { + StreamingConnection connection1 = null; + org.apache.hive.hcatalog.streaming.StreamingConnection connection2 = null; + if (newStreamingAPI) { // Write a couple of batches for (int i = 0; i < 2; i++) { - writeBatch(connection, writer, false); + writeBatch(dbName, tblName, 
false); } // Start a third batch, but don't close it. - writeBatch(connection, writer, true); + connection1 = writeBatch(dbName, tblName, true); + } else { + HiveEndPoint endPt = new HiveEndPoint(null, dbName, tblName, null); + DelimitedInputWriter writer = new DelimitedInputWriter(new String[]{"a", "b"}, ",", endPt); + connection2 = endPt + .newConnection(false, "UT_" + Thread.currentThread().getName()); + // Write a couple of batches + for (int i = 0; i < 2; i++) { + writeBatch(connection2, writer, false); + } - runMajorCompaction(dbName, tblName); + // Start a third batch, but don't close it. + writeBatch(connection2, writer, true); + } + runMajorCompaction(dbName, tblName); - // Find the location of the table - IMetaStoreClient msClient = new HiveMetaStoreClient(conf); - Table table = msClient.getTable(dbName, tblName); - FileSystem fs = FileSystem.get(conf); - FileStatus[] stat = - fs.listStatus(new Path(table.getSd().getLocation()), AcidUtils.baseFileFilter); - if (1 != stat.length) { - Assert.fail("Expecting 1 file \"base_0000004\" and found " + stat.length + " files " + Arrays.toString(stat)); - } - String name = stat[0].getPath().getName(); - Assert.assertEquals(name, "base_0000004"); - checkExpectedTxnsPresent(stat[0].getPath(), null, columnNamesProperty, columnTypesProperty, 1, 1L, 4L, 2); - } finally { - connection.close(); + // Find the location of the table + IMetaStoreClient msClient = new HiveMetaStoreClient(conf); + Table table = msClient.getTable(dbName, tblName); + FileSystem fs = FileSystem.get(conf); + FileStatus[] stat = + fs.listStatus(new Path(table.getSd().getLocation()), AcidUtils.baseFileFilter); + if (1 != stat.length) { + Assert.fail("Expecting 1 file \"base_0000004\" and found " + stat.length + " files " + Arrays.toString(stat)); + } + String name = stat[0].getPath().getName(); + Assert.assertEquals(name, "base_0000004"); + checkExpectedTxnsPresent(stat[0].getPath(), null, columnNamesProperty, columnTypesProperty, 1, 1L, 4L, 2); + if (connection1 != null) { + connection1.close(); + } + if (connection2 != null) { + connection2.close(); } } @Test public void testMinorCompactionForSplitUpdateWithInsertsAndDeletes() throws Exception { - String agentInfo = "UT_" + Thread.currentThread().getName(); String dbName = "default"; String tblName = "cws"; - List colNames = Arrays.asList("a", "b"); String columnNamesProperty = "a,b"; String columnTypesProperty = "int:string"; executeStatementOnDriver("drop table if exists " + tblName, driver); executeStatementOnDriver("CREATE TABLE " + tblName + "(a INT, b STRING) " + - " CLUSTERED BY(a) INTO 1 BUCKETS" + //currently ACID requires table to be bucketed - " STORED AS ORC TBLPROPERTIES ('transactional'='true'," - + "'transactional_properties'='default')", driver); + " CLUSTERED BY(a) INTO 1 BUCKETS" + //currently ACID requires table to be bucketed + " STORED AS ORC TBLPROPERTIES ('transactional'='true'," + + "'transactional_properties'='default')", driver); // Insert some data -> this will generate only insert deltas and no delete deltas: delta_3_3 - executeStatementOnDriver("INSERT INTO " + tblName +"(a,b) VALUES(1, 'foo')", driver); + executeStatementOnDriver("INSERT INTO " + tblName + "(a,b) VALUES(1, 'foo')", driver); // Insert some data -> this will again generate only insert deltas and no delete deltas: delta_4_4 - executeStatementOnDriver("INSERT INTO " + tblName +"(a,b) VALUES(2, 'bar')", driver); + executeStatementOnDriver("INSERT INTO " + tblName + "(a,b) VALUES(2, 'bar')", driver); // Delete some data -> this will 
generate only delete deltas and no insert deltas: delete_delta_5_5 - executeStatementOnDriver("DELETE FROM " + tblName +" WHERE a = 2", driver); + executeStatementOnDriver("DELETE FROM " + tblName + " WHERE a = 2", driver); // Now, compact -> Compaction produces a single range for both delta and delete delta // That is, both delta and delete_deltas would be compacted into delta_3_5 and delete_delta_3_5 @@ -1156,7 +1265,7 @@ public void testMinorCompactionForSplitUpdateWithInsertsAndDeletes() throws Exce // Verify that we have got correct set of deltas. FileStatus[] stat = - fs.listStatus(new Path(table.getSd().getLocation()), AcidUtils.deltaFileFilter); + fs.listStatus(new Path(table.getSd().getLocation()), AcidUtils.deltaFileFilter); String[] deltas = new String[stat.length]; Path minorCompactedDelta = null; for (int i = 0; i < deltas.length; i++) { @@ -1166,16 +1275,17 @@ public void testMinorCompactionForSplitUpdateWithInsertsAndDeletes() throws Exce } } Arrays.sort(deltas); - String[] expectedDeltas = new String[]{"delta_0000001_0000001_0000", "delta_0000001_0000003", "delta_0000002_0000002_0000"}; + String[] expectedDeltas = new String[]{"delta_0000001_0000001_0000", "delta_0000001_0000003", + "delta_0000002_0000002_0000"}; if (!Arrays.deepEquals(expectedDeltas, deltas)) { Assert.fail("Expected: " + Arrays.toString(expectedDeltas) + ", found: " + Arrays.toString(deltas)); } checkExpectedTxnsPresent(null, new Path[]{minorCompactedDelta}, columnNamesProperty, columnTypesProperty, - 0, 1L, 2L, 1); + 0, 1L, 2L, 1); // Verify that we have got correct set of delete_deltas. FileStatus[] deleteDeltaStat = - fs.listStatus(new Path(table.getSd().getLocation()), AcidUtils.deleteEventDeltaDirFilter); + fs.listStatus(new Path(table.getSd().getLocation()), AcidUtils.deleteEventDeltaDirFilter); String[] deleteDeltas = new String[deleteDeltaStat.length]; Path minorCompactedDeleteDelta = null; for (int i = 0; i < deleteDeltas.length; i++) { @@ -1190,28 +1300,26 @@ public void testMinorCompactionForSplitUpdateWithInsertsAndDeletes() throws Exce Assert.fail("Expected: " + Arrays.toString(expectedDeleteDeltas) + ", found: " + Arrays.toString(deleteDeltas)); } checkExpectedTxnsPresent(null, new Path[]{minorCompactedDeleteDelta}, columnNamesProperty, columnTypesProperty, - 0, 2L, 2L, 1); + 0, 2L, 2L, 1); } @Test public void testMinorCompactionForSplitUpdateWithOnlyInserts() throws Exception { - String agentInfo = "UT_" + Thread.currentThread().getName(); String dbName = "default"; String tblName = "cws"; - List colNames = Arrays.asList("a", "b"); String columnNamesProperty = "a,b"; String columnTypesProperty = "int:string"; executeStatementOnDriver("drop table if exists " + tblName, driver); executeStatementOnDriver("CREATE TABLE " + tblName + "(a INT, b STRING) " + - " CLUSTERED BY(a) INTO 1 BUCKETS" + //currently ACID requires table to be bucketed - " STORED AS ORC TBLPROPERTIES ('transactional'='true'," - + "'transactional_properties'='default')", driver); + " CLUSTERED BY(a) INTO 1 BUCKETS" + //currently ACID requires table to be bucketed + " STORED AS ORC TBLPROPERTIES ('transactional'='true'," + + "'transactional_properties'='default')", driver); // Insert some data -> this will generate only insert deltas and no delete deltas: delta_1_1 - executeStatementOnDriver("INSERT INTO " + tblName +"(a,b) VALUES(1, 'foo')", driver); + executeStatementOnDriver("INSERT INTO " + tblName + "(a,b) VALUES(1, 'foo')", driver); // Insert some data -> this will again generate only insert deltas and no delete deltas: 
delta_2_2 - executeStatementOnDriver("INSERT INTO " + tblName +"(a,b) VALUES(2, 'bar')", driver); + executeStatementOnDriver("INSERT INTO " + tblName + "(a,b) VALUES(2, 'bar')", driver); // Now, compact // One important thing to note in this test is that minor compaction always produces @@ -1235,7 +1343,7 @@ public void testMinorCompactionForSplitUpdateWithOnlyInserts() throws Exception // Verify that we have got correct set of deltas. FileStatus[] stat = - fs.listStatus(new Path(table.getSd().getLocation()), AcidUtils.deltaFileFilter); + fs.listStatus(new Path(table.getSd().getLocation()), AcidUtils.deltaFileFilter); String[] deltas = new String[stat.length]; Path minorCompactedDelta = null; for (int i = 0; i < deltas.length; i++) { @@ -1245,16 +1353,17 @@ public void testMinorCompactionForSplitUpdateWithOnlyInserts() throws Exception } } Arrays.sort(deltas); - String[] expectedDeltas = new String[]{"delta_0000001_0000001_0000", "delta_0000001_0000002", "delta_0000002_0000002_0000"}; + String[] expectedDeltas = new String[]{"delta_0000001_0000001_0000", "delta_0000001_0000002", + "delta_0000002_0000002_0000"}; if (!Arrays.deepEquals(expectedDeltas, deltas)) { Assert.fail("Expected: " + Arrays.toString(expectedDeltas) + ", found: " + Arrays.toString(deltas)); } checkExpectedTxnsPresent(null, new Path[]{minorCompactedDelta}, columnNamesProperty, columnTypesProperty, - 0, 1L, 2L, 1); + 0, 1L, 2L, 1); // Verify that we have got correct set of delete_deltas. FileStatus[] deleteDeltaStat = - fs.listStatus(new Path(table.getSd().getLocation()), AcidUtils.deleteEventDeltaDirFilter); + fs.listStatus(new Path(table.getSd().getLocation()), AcidUtils.deleteEventDeltaDirFilter); String[] deleteDeltas = new String[deleteDeltaStat.length]; Path minorCompactedDeleteDelta = null; for (int i = 0; i < deleteDeltas.length; i++) { @@ -1269,89 +1378,105 @@ public void testMinorCompactionForSplitUpdateWithOnlyInserts() throws Exception Assert.fail("Expected: " + Arrays.toString(expectedDeleteDeltas) + ", found: " + Arrays.toString(deleteDeltas)); } // There should be no rows in the delete_delta because there have been no delete events. 
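The directory names asserted in these tests follow the ACID layout: delta_<minWriteId>_<maxWriteId>[_<statementId>] for insert deltas, delete_delta_... for delete events, and base_<writeId> after a major compaction. The enumeration pattern the tests use to collect those names is condensed below, with the same AcidUtils path filters used throughout this class.

```java
import java.util.Arrays;

import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.metastore.HiveMetaStoreClient;
import org.apache.hadoop.hive.metastore.IMetaStoreClient;
import org.apache.hadoop.hive.metastore.api.Table;
import org.apache.hadoop.hive.ql.io.AcidUtils;

public class DeltaListingSketch {
  // Condensed from the verification code in these tests: list the table location
  // with AcidUtils' path filters and compare the sorted directory names.
  static String[] listDeltaDirs(HiveConf conf, String db, String table) throws Exception {
    IMetaStoreClient msClient = new HiveMetaStoreClient(conf);
    Table tbl = msClient.getTable(db, table);
    FileSystem fs = FileSystem.get(conf);
    FileStatus[] stat =
        fs.listStatus(new Path(tbl.getSd().getLocation()), AcidUtils.deltaFileFilter);
    // AcidUtils.deleteEventDeltaDirFilter and AcidUtils.baseFileFilter select
    // delete_delta_* and base_* directories in the same way.
    String[] names = new String[stat.length];
    for (int i = 0; i < names.length; i++) {
      names[i] = stat[i].getPath().getName();
    }
    Arrays.sort(names);
    return names;
  }
}
```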
- checkExpectedTxnsPresent(null, new Path[]{minorCompactedDeleteDelta}, columnNamesProperty, columnTypesProperty, 0, 0L, 0L, 1); + checkExpectedTxnsPresent(null, new Path[]{minorCompactedDeleteDelta}, columnNamesProperty, columnTypesProperty, 0, + 0L, 0L, 1); } @Test public void minorCompactWhileStreamingWithSplitUpdate() throws Exception { String dbName = "default"; String tblName = "cws"; - List colNames = Arrays.asList("a", "b"); String columnNamesProperty = "a,b"; String columnTypesProperty = "int:string"; executeStatementOnDriver("drop table if exists " + tblName, driver); executeStatementOnDriver("CREATE TABLE " + tblName + "(a INT, b STRING) " + - " CLUSTERED BY(a) INTO 1 BUCKETS" + //currently ACID requires table to be bucketed - " STORED AS ORC TBLPROPERTIES ('transactional'='true'," - + "'transactional_properties'='default')", driver); + " CLUSTERED BY(a) INTO 1 BUCKETS" + //currently ACID requires table to be bucketed + " STORED AS ORC TBLPROPERTIES ('transactional'='true'," + + "'transactional_properties'='default')", driver); + + StreamingConnection connection1 = null; + org.apache.hive.hcatalog.streaming.StreamingConnection connection2 = null; + if (newStreamingAPI) { - HiveEndPoint endPt = new HiveEndPoint(null, dbName, tblName, null); - DelimitedInputWriter writer = new DelimitedInputWriter(new String[] {"a","b"},",", endPt); - StreamingConnection connection = endPt.newConnection(false, "UT_" + Thread.currentThread().getName()); - try { // Write a couple of batches for (int i = 0; i < 2; i++) { - writeBatch(connection, writer, false); + writeBatch(dbName, tblName, false); } // Start a third batch, but don't close it. - writeBatch(connection, writer, true); + connection1 = writeBatch(dbName, tblName, true); + } else { + HiveEndPoint endPt = new HiveEndPoint(null, dbName, tblName, null); + DelimitedInputWriter writer = new DelimitedInputWriter(new String[]{"a", "b"}, ",", endPt); + connection2 = endPt + .newConnection(false, "UT_" + Thread.currentThread().getName()); + // Write a couple of batches + for (int i = 0; i < 2; i++) { + writeBatch(connection2, writer, false); + } - // Now, compact - TxnStore txnHandler = TxnUtils.getTxnStore(conf); - txnHandler.compact(new CompactionRequest(dbName, tblName, CompactionType.MINOR)); - Worker t = new Worker(); - t.setThreadId((int) t.getId()); - t.setConf(conf); - AtomicBoolean stop = new AtomicBoolean(true); - AtomicBoolean looped = new AtomicBoolean(); - t.init(stop, looped); - t.run(); + // Start a third batch, but don't close it. 
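Throughout this class, compaction is requested through the TxnStore and then executed synchronously by driving a Worker by hand, as in the "Now, compact" block just below; the stop flag initialized to true appears to make the thread perform a single pass and exit. Condensed:

```java
import java.util.concurrent.atomic.AtomicBoolean;

import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.metastore.api.CompactionRequest;
import org.apache.hadoop.hive.metastore.api.CompactionType;
import org.apache.hadoop.hive.metastore.txn.TxnStore;
import org.apache.hadoop.hive.metastore.txn.TxnUtils;
import org.apache.hadoop.hive.ql.txn.compactor.Worker;

public class CompactionTriggerSketch {
  // Condensed from the "Now, compact" blocks in this test class.
  static void runMinorCompaction(HiveConf conf, String db, String table) throws Exception {
    TxnStore txnHandler = TxnUtils.getTxnStore(conf);
    txnHandler.compact(new CompactionRequest(db, table, CompactionType.MINOR));

    Worker t = new Worker();
    t.setThreadId((int) t.getId());
    t.setConf(conf);
    AtomicBoolean stop = new AtomicBoolean(true);   // true: run one pass, then exit
    AtomicBoolean looped = new AtomicBoolean();
    t.init(stop, looped);
    t.run();
  }
}
```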
+ writeBatch(connection2, writer, true); + } + // Now, compact + TxnStore txnHandler = TxnUtils.getTxnStore(conf); + txnHandler.compact(new CompactionRequest(dbName, tblName, CompactionType.MINOR)); + Worker t = new Worker(); + t.setThreadId((int) t.getId()); + t.setConf(conf); + AtomicBoolean stop = new AtomicBoolean(true); + AtomicBoolean looped = new AtomicBoolean(); + t.init(stop, looped); + t.run(); - // Find the location of the table - IMetaStoreClient msClient = new HiveMetaStoreClient(conf); - Table table = msClient.getTable(dbName, tblName); - FileSystem fs = FileSystem.get(conf); - FileStatus[] stat = - fs.listStatus(new Path(table.getSd().getLocation()), AcidUtils.deltaFileFilter); - String[] names = new String[stat.length]; - Path resultFile = null; - for (int i = 0; i < names.length; i++) { - names[i] = stat[i].getPath().getName(); - if (names[i].equals("delta_0000001_0000004")) { - resultFile = stat[i].getPath(); - } - } - Arrays.sort(names); - String[] expected = new String[]{"delta_0000001_0000002", - "delta_0000001_0000004", "delta_0000003_0000004", "delta_0000005_0000006"}; - if (!Arrays.deepEquals(expected, names)) { - Assert.fail("Expected: " + Arrays.toString(expected) + ", found: " + Arrays.toString(names)); - } - checkExpectedTxnsPresent(null, new Path[]{resultFile}, columnNamesProperty, columnTypesProperty, - 0, 1L, 4L, 1); - - // Verify that we have got correct set of delete_deltas also - FileStatus[] deleteDeltaStat = - fs.listStatus(new Path(table.getSd().getLocation()), AcidUtils.deleteEventDeltaDirFilter); - String[] deleteDeltas = new String[deleteDeltaStat.length]; - Path minorCompactedDeleteDelta = null; - for (int i = 0; i < deleteDeltas.length; i++) { - deleteDeltas[i] = deleteDeltaStat[i].getPath().getName(); - if (deleteDeltas[i].equals("delete_delta_0000001_0000004")) { - minorCompactedDeleteDelta = deleteDeltaStat[i].getPath(); - } + // Find the location of the table + IMetaStoreClient msClient = new HiveMetaStoreClient(conf); + Table table = msClient.getTable(dbName, tblName); + FileSystem fs = FileSystem.get(conf); + FileStatus[] stat = + fs.listStatus(new Path(table.getSd().getLocation()), AcidUtils.deltaFileFilter); + String[] names = new String[stat.length]; + Path resultFile = null; + for (int i = 0; i < names.length; i++) { + names[i] = stat[i].getPath().getName(); + if (names[i].equals("delta_0000001_0000004")) { + resultFile = stat[i].getPath(); } - Arrays.sort(deleteDeltas); - String[] expectedDeleteDeltas = new String[]{"delete_delta_0000001_0000004"}; - if (!Arrays.deepEquals(expectedDeleteDeltas, deleteDeltas)) { - Assert.fail("Expected: " + Arrays.toString(expectedDeleteDeltas) + ", found: " + Arrays.toString(deleteDeltas)); + } + Arrays.sort(names); + String[] expected = new String[]{"delta_0000001_0000002", + "delta_0000001_0000004", "delta_0000003_0000004", "delta_0000005_0000006"}; + if (!Arrays.deepEquals(expected, names)) { + Assert.fail("Expected: " + Arrays.toString(expected) + ", found: " + Arrays.toString(names)); + } + checkExpectedTxnsPresent(null, new Path[]{resultFile}, columnNamesProperty, columnTypesProperty, + 0, 1L, 4L, 1); + + // Verify that we have got correct set of delete_deltas also + FileStatus[] deleteDeltaStat = + fs.listStatus(new Path(table.getSd().getLocation()), AcidUtils.deleteEventDeltaDirFilter); + String[] deleteDeltas = new String[deleteDeltaStat.length]; + Path minorCompactedDeleteDelta = null; + for (int i = 0; i < deleteDeltas.length; i++) { + deleteDeltas[i] = deleteDeltaStat[i].getPath().getName(); 
+ if (deleteDeltas[i].equals("delete_delta_0000001_0000004")) { + minorCompactedDeleteDelta = deleteDeltaStat[i].getPath(); } - // There should be no rows in the delete_delta because there have been no delete events. - checkExpectedTxnsPresent(null, new Path[]{minorCompactedDeleteDelta}, columnNamesProperty, columnTypesProperty, 0, 0L, 0L, 1); + } + Arrays.sort(deleteDeltas); + String[] expectedDeleteDeltas = new String[]{"delete_delta_0000001_0000004"}; + if (!Arrays.deepEquals(expectedDeleteDeltas, deleteDeltas)) { + Assert.fail("Expected: " + Arrays.toString(expectedDeleteDeltas) + ", found: " + Arrays.toString(deleteDeltas)); + } + // There should be no rows in the delete_delta because there have been no delete events. + checkExpectedTxnsPresent(null, new Path[]{minorCompactedDeleteDelta}, columnNamesProperty, columnTypesProperty, 0, + 0L, 0L, 1); - } finally { - connection.close(); + if (connection1 != null) { + connection1.close(); + } + if (connection2 != null) { + connection2.close(); } } @@ -1366,15 +1491,15 @@ public void testTableProperties() throws Exception { executeStatementOnDriver("drop table if exists " + tblName1, driver); executeStatementOnDriver("drop table if exists " + tblName2, driver); executeStatementOnDriver("CREATE TABLE " + tblName1 + "(a INT, b STRING) " + - " CLUSTERED BY(a) INTO 2 BUCKETS STORED AS ORC" + - " TBLPROPERTIES ('transactional'='true', 'orc.compress.size'='2700')", driver); + " CLUSTERED BY(a) INTO 2 BUCKETS STORED AS ORC" + + " TBLPROPERTIES ('transactional'='true', 'orc.compress.size'='2700')", driver); executeStatementOnDriver("CREATE TABLE " + tblName2 + "(a INT, b STRING) " + - " CLUSTERED BY(a) INTO 2 BUCKETS STORED AS ORC TBLPROPERTIES (" + - "'transactional'='true'," + - "'compactor.mapreduce.map.memory.mb'='2048'," + // 2048 MB memory for compaction map job - "'compactorthreshold.hive.compactor.delta.num.threshold'='4'," + // minor compaction if more than 4 delta dirs - "'compactorthreshold.hive.compactor.delta.pct.threshold'='0.49'" + // major compaction if more than 49% - ")", driver); + " CLUSTERED BY(a) INTO 2 BUCKETS STORED AS ORC TBLPROPERTIES (" + + "'transactional'='true'," + + "'compactor.mapreduce.map.memory.mb'='2048'," + // 2048 MB memory for compaction map job + "'compactorthreshold.hive.compactor.delta.num.threshold'='4'," + // minor compaction if more than 4 delta dirs + "'compactorthreshold.hive.compactor.delta.pct.threshold'='0.49'" + // major compaction if more than 49% + ")", driver); // Insert 5 rows to both tables executeStatementOnDriver("insert into " + tblName1 + " values (1, 'a')", driver); @@ -1397,7 +1522,8 @@ public void testTableProperties() throws Exception { Assert.assertEquals(1, rsp.getCompacts().size()); Assert.assertEquals(TxnStore.INITIATED_RESPONSE, rsp.getCompacts().get(0).getState()); Assert.assertEquals("ttp2", rsp.getCompacts().get(0).getTablename()); - Assert.assertEquals(CompactionType.MAJOR, rsp.getCompacts().get(0).getType()); // type is MAJOR since there's no base yet + Assert.assertEquals(CompactionType.MAJOR, + rsp.getCompacts().get(0).getType()); // type is MAJOR since there's no base yet // Finish the scheduled compaction for ttp2, and manually compact ttp1, to make them comparable again executeStatementOnDriver("alter table " + tblName1 + " compact 'major'", driver); @@ -1441,15 +1567,15 @@ public void testTableProperties() throws Exception { * It should be the default. 
*/ List rs = execSelectAndDumpData("select distinct INPUT__FILE__NAME from " - + tblName1, driver, "Find Orc File bufer default"); + + tblName1, driver, "Find Orc File bufer default"); Assert.assertTrue("empty rs?", rs != null && rs.size() > 0); Path p = new Path(rs.get(0)); Reader orcReader = OrcFile.createReader(p.getFileSystem(conf), p); Assert.assertEquals("Expected default compression size", - 2700, orcReader.getCompressionSize()); + 2700, orcReader.getCompressionSize()); //make sure 2700 is not the default so that we are testing if tblproperties indeed propagate Assert.assertNotEquals("Unexpected default compression size", 2700, - OrcConf.BUFFER_SIZE.getDefaultValue()); + OrcConf.BUFFER_SIZE.getDefaultValue()); // Insert one more row - this should trigger hive.compactor.delta.pct.threshold to be reached for ttp2 @@ -1476,9 +1602,9 @@ public void testTableProperties() throws Exception { // Now test tblproperties specified on ALTER TABLE .. COMPACT .. statement executeStatementOnDriver("insert into " + tblName2 + " values (7, 'g')", driver); executeStatementOnDriver("alter table " + tblName2 + " compact 'major'" + - " with overwrite tblproperties (" + - "'compactor.mapreduce.map.memory.mb'='3072'," + - "'tblprops.orc.compress.size'='3141')", driver); + " with overwrite tblproperties (" + + "'compactor.mapreduce.map.memory.mb'='3072'," + + "'tblprops.orc.compress.size'='3141')", driver); rsp = txnHandler.showCompact(new ShowCompactRequest()); Assert.assertEquals(4, rsp.getCompacts().size()); @@ -1499,16 +1625,16 @@ public void testTableProperties() throws Exception { Assert.assertEquals(3072, job.getMemoryForMapTask()); Assert.assertTrue(job.get("hive.compactor.table.props").contains("orc.compress.size4:3141")); /*createReader(FileSystem fs, Path path) throws IOException { - */ + */ //we just ran Major compaction so we should have a base_x in tblName2 that has the new files // Get the name of a file and look at its properties to see if orc.compress.size was respected. rs = execSelectAndDumpData("select distinct INPUT__FILE__NAME from " + tblName2, - driver, "Find Compacted Orc File"); + driver, "Find Compacted Orc File"); Assert.assertTrue("empty rs?", rs != null && rs.size() > 0); p = new Path(rs.get(0)); orcReader = OrcFile.createReader(p.getFileSystem(conf), p); Assert.assertEquals("File written with wrong buffer size", - 3141, orcReader.getCompressionSize()); + 3141, orcReader.getCompressionSize()); } @Test @@ -1529,30 +1655,58 @@ public void testCompactionInfoHashCode() { Assert.assertEquals("The hash codes must be equal", compactionInfo.hashCode(), compactionInfo1.hashCode()); } - private void writeBatch(StreamingConnection connection, DelimitedInputWriter writer, - boolean closeEarly) - throws InterruptedException, StreamingException { + private void writeBatch(org.apache.hive.hcatalog.streaming.StreamingConnection connection, + DelimitedInputWriter writer, + boolean closeEarly) throws InterruptedException, org.apache.hive.hcatalog.streaming.StreamingException { TransactionBatch txnBatch = connection.fetchTransactionBatch(2, writer); txnBatch.beginNextTransaction(); txnBatch.write("50,Kiev".getBytes()); txnBatch.write("51,St. 
Petersburg".getBytes()); txnBatch.write("44,Boston".getBytes()); txnBatch.commit(); - if (!closeEarly) { txnBatch.beginNextTransaction(); txnBatch.write("52,Tel Aviv".getBytes()); txnBatch.write("53,Atlantis".getBytes()); txnBatch.write("53,Boston".getBytes()); txnBatch.commit(); - txnBatch.close(); } } + private StreamingConnection writeBatch(String dbName, String tblName, boolean closeEarly) throws StreamingException { + StrictDelimitedInputWriter writer = StrictDelimitedInputWriter.newBuilder() + .withFieldDelimiter(',') + .build(); + StreamingConnection connection = HiveStreamingConnection.newBuilder() + .withDatabase(dbName) + .withTable(tblName) + .withAgentInfo("UT_" + Thread.currentThread().getName()) + .withHiveConf(conf) + .withRecordWriter(writer) + .withTransactionBatchSize(2) + .connect(); + connection.beginTransaction(); + connection.write("50,Kiev".getBytes()); + connection.write("51,St. Petersburg".getBytes()); + connection.write("44,Boston".getBytes()); + connection.commitTransaction(); + + if (!closeEarly) { + connection.beginTransaction(); + connection.write("52,Tel Aviv".getBytes()); + connection.write("53,Atlantis".getBytes()); + connection.write("53,Boston".getBytes()); + connection.commitTransaction(); + connection.close(); + return null; + } + return connection; + } + private void checkExpectedTxnsPresent(Path base, Path[] deltas, String columnNamesProperty, - String columnTypesProperty, int bucket, long min, long max, int numBuckets) - throws IOException { + String columnTypesProperty, int bucket, long min, long max, int numBuckets) + throws IOException { ValidWriteIdList writeIdList = new ValidWriteIdList() { @Override public String getTableName() { @@ -1586,13 +1740,14 @@ public Long getMinOpenWriteId() { @Override public long getHighWatermark() { - return Long.MAX_VALUE; + return Long.MAX_VALUE; } @Override public long[] getInvalidWriteIds() { return new long[0]; } + @Override public boolean isValidBase(long writeid) { return true; @@ -1617,7 +1772,7 @@ public RangeResponse isWriteIdRangeAborted(long minWriteId, long maxWriteId) { conf.set(hive_metastoreConstants.BUCKET_COUNT, Integer.toString(numBuckets)); HiveConf.setBoolVar(conf, HiveConf.ConfVars.HIVE_TRANSACTIONAL_TABLE_SCAN, true); AcidInputFormat.RawReader reader = - aif.getRawReader(conf, true, bucket, writeIdList, base, deltas); + aif.getRawReader(conf, true, bucket, writeIdList, base, deltas); RecordIdentifier identifier = reader.createKey(); OrcStruct value = reader.createValue(); long currentTxn = min; @@ -1639,25 +1794,27 @@ public RangeResponse isWriteIdRangeAborted(long minWriteId, long maxWriteId) { * convenience method to execute a select stmt and dump results to log file */ private static List execSelectAndDumpData(String selectStmt, IDriver driver, String msg) - throws Exception { + throws Exception { executeStatementOnDriver(selectStmt, driver); ArrayList valuesReadFromHiveDriver = new ArrayList(); driver.getResults(valuesReadFromHiveDriver); int rowIdx = 0; LOG.debug(msg); - for(String row : valuesReadFromHiveDriver) { + for (String row : valuesReadFromHiveDriver) { LOG.debug(" rowIdx=" + rowIdx++ + ":" + row); } return valuesReadFromHiveDriver; } + /** * Execute Hive CLI statement + * * @param cmd arbitrary statement to execute */ static void executeStatementOnDriver(String cmd, IDriver driver) throws Exception { LOG.debug("Executing: " + cmd); CommandProcessorResponse cpr = driver.run(cmd); - if(cpr.getResponseCode() != 0) { + if (cpr.getResponseCode() != 0) { throw new IOException("Failed 
to execute \"" + cmd + "\". Driver returned: " + cpr); } } diff --git a/metastore/src/java/org/apache/hadoop/hive/metastore/HiveClientCache.java b/metastore/src/java/org/apache/hadoop/hive/metastore/HiveClientCache.java new file mode 100644 index 0000000..6c33f63 --- /dev/null +++ b/metastore/src/java/org/apache/hadoop/hive/metastore/HiveClientCache.java @@ -0,0 +1,536 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.hadoop.hive.metastore; + +import java.io.IOException; +import java.util.concurrent.Callable; +import java.util.concurrent.ConcurrentMap; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.Executors; +import java.util.concurrent.ScheduledFuture; +import java.util.concurrent.ThreadFactory; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicInteger; + +import javax.security.auth.login.LoginException; + +import org.apache.commons.lang.builder.EqualsBuilder; +import org.apache.commons.lang.builder.HashCodeBuilder; +import org.apache.hadoop.hive.common.classification.InterfaceAudience; +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.metastore.annotation.NoReconnect; +import org.apache.hadoop.hive.metastore.api.MetaException; +import org.apache.hadoop.hive.shims.Utils; +import org.apache.hadoop.security.UserGroupInformation; +import org.apache.hive.common.util.ShutdownHookManager; +import org.apache.thrift.TException; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.google.common.cache.Cache; +import com.google.common.cache.CacheBuilder; +import com.google.common.cache.RemovalListener; +import com.google.common.cache.RemovalNotification; +import com.google.common.util.concurrent.ThreadFactoryBuilder; + +/** + * A thread safe time expired cache for HiveMetaStoreClient + */ +class HiveClientCache { + public final static int DEFAULT_HIVE_CACHE_EXPIRY_TIME_SECONDS = 2 * 60; + public final static int DEFAULT_HIVE_CACHE_INITIAL_CAPACITY = 50; + public final static int DEFAULT_HIVE_CACHE_MAX_CAPACITY = 50; + public final static boolean DEFAULT_HIVE_CLIENT_CACHE_STATS_ENABLED = false; + + private final Cache hiveCache; + private static final Logger LOG = LoggerFactory.getLogger(HiveClientCache.class); + private final int timeout; + // This lock is used to make sure removalListener won't close a client that is being contemplated for returning by get() + private final Object CACHE_TEARDOWN_LOCK = new Object(); + + private static final AtomicInteger nextId = new AtomicInteger(0); + + private final ScheduledFuture cleanupHandle; // used to cleanup cache + + private boolean enableStats; + + // Since HiveMetaStoreClient is not threadsafe, hive clients are not shared across threads. 
+ // Thread local variable containing each thread's unique ID, is used as one of the keys for the cache + // causing each thread to get a different client even if the conf is same. + private static final ThreadLocal threadId = + new ThreadLocal() { + @Override + protected Integer initialValue() { + return nextId.getAndIncrement(); + } + }; + + private int getThreadId() { + return threadId.get(); + } + + public static IMetaStoreClient getNonCachedHiveMetastoreClient(HiveConf hiveConf) throws MetaException { + return RetryingMetaStoreClient.getProxy(hiveConf, true); + } + + public HiveClientCache(HiveConf hiveConf) { + this((int) HiveConf.getTimeVar(hiveConf, HiveConf.ConfVars.METASTORE_CLIENT_CACHE_EXPIRY_TIME, TimeUnit.SECONDS), + HiveConf.getIntVar(hiveConf, HiveConf.ConfVars.METASTORE_CLIENT_CACHE_INITIAL_CAPACITY), + HiveConf.getIntVar(hiveConf, HiveConf.ConfVars.METASTORE_CLIENT_CACHE_MAX_CAPACITY), + HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.METASTORE_CLIENT_CACHE_STATS_ENABLED)); + } + + /** + * @deprecated This constructor will be made private or removed as more configuration properties are required. + */ + @Deprecated + public HiveClientCache(final int timeout) { + this(timeout, DEFAULT_HIVE_CACHE_INITIAL_CAPACITY, DEFAULT_HIVE_CACHE_MAX_CAPACITY, DEFAULT_HIVE_CLIENT_CACHE_STATS_ENABLED); + } + + /** + * @param timeout the length of time in seconds after a client is created that it should be automatically removed + */ + private HiveClientCache(final int timeout, final int initialCapacity, final int maxCapacity, final boolean enableStats) { + this.timeout = timeout; + this.enableStats = enableStats; + + LOG.info("Initializing cache: eviction-timeout=" + timeout + " initial-capacity=" + initialCapacity + " maximum-capacity=" + maxCapacity); + + CacheBuilder builder = CacheBuilder.newBuilder() + .initialCapacity(initialCapacity) + .maximumSize(maxCapacity) + .expireAfterAccess(timeout, TimeUnit.SECONDS) + .removalListener(createRemovalListener()); + + /* + * Guava versions <12.0 have stats collection enabled by default and do not expose a recordStats method. + * Check for newer versions of the library and ensure that stats collection is enabled by default. + */ + try { + java.lang.reflect.Method m = builder.getClass().getMethod("recordStats", null); + m.invoke(builder, null); + } catch (NoSuchMethodException e) { + LOG.debug("Using a version of guava <12.0. Stats collection is enabled by default."); + } catch (Exception e) { + LOG.warn("Unable to invoke recordStats method.", e); + } + + this.hiveCache = builder.build(); + + /* + * We need to use a cleanup interval, which is how often the cleanup thread will kick in + * and go do a check to see if any of the connections can be expired. We don't want to + * do this too often, because it'd be like having a mini-GC going off every so often, + * so we limit it to a minimum of DEFAULT_HIVE_CACHE_EXPIRY_TIME_SECONDS. If the client + * has explicitly set a larger timeout on the cache, though, we respect that, and use that + */ + long cleanupInterval = timeout > DEFAULT_HIVE_CACHE_EXPIRY_TIME_SECONDS ? 
timeout : DEFAULT_HIVE_CACHE_EXPIRY_TIME_SECONDS; + + this.cleanupHandle = createCleanupThread(cleanupInterval); + + createShutdownHook(); + } + + private RemovalListener createRemovalListener() { + RemovalListener listener = + new RemovalListener() { + @Override + public void onRemoval(RemovalNotification notification) { + ICacheableMetaStoreClient hiveMetaStoreClient = notification.getValue(); + if (hiveMetaStoreClient != null) { + if (LOG.isDebugEnabled()) { + LOG.debug("Evicting client: " + Integer.toHexString(System.identityHashCode(hiveMetaStoreClient))); + } + + // TODO: This global lock may not be necessary as all concurrent methods in ICacheableMetaStoreClient + // are synchronized. + synchronized (CACHE_TEARDOWN_LOCK) { + hiveMetaStoreClient.setExpiredFromCache(); + hiveMetaStoreClient.tearDownIfUnused(); + } + } + } + }; + + return listener; + } + + private ScheduledFuture createCleanupThread(long interval) { + // Add a maintenance thread that will attempt to trigger a cache clean continuously + Runnable cleanupThread = new Runnable() { + @Override + public void run() { + cleanup(); + } + }; + + /** + * Create the cleanup handle. In addition to cleaning up every cleanupInterval, we add + * a slight offset, so that the very first time it runs, it runs with a slight delay, so + * as to catch any other connections that were closed when the first timeout happened. + * As a result, the time we can expect an unused connection to be reaped is + * 5 seconds after the first timeout, and then after that, it'll check for whether or not + * it can be cleaned every max(DEFAULT_HIVE_CACHE_EXPIRY_TIME_SECONDS,timeout) seconds + */ + ThreadFactory daemonThreadFactory = (new ThreadFactoryBuilder()).setDaemon(true) + .setNameFormat("HiveClientCache-cleaner-%d") + .build(); + + return Executors.newScheduledThreadPool(1, daemonThreadFactory) + .scheduleWithFixedDelay(cleanupThread, timeout + 5, interval, TimeUnit.SECONDS); + } + + private void createShutdownHook() { + // Add a shutdown hook for cleanup, if there are elements remaining in the cache which were not cleaned up. + // This is the best effort approach. Ignore any error while doing so. Notice that most of the clients + // would get cleaned up via either the removalListener or the close() call, only the active clients + // that are in the cache or expired but being used in other threads wont get cleaned. The following code will only + // clean the active cache ones. The ones expired from cache but being hold by other threads are in the mercy + // of finalize() being called. + Thread cleanupHiveClientShutdownThread = new Thread() { + @Override + public void run() { + LOG.debug("Cleaning up hive client cache in ShutDown hook"); + cleanupHandle.cancel(false); // Cancel the maintenance thread. + closeAllClientsQuietly(); + } + }; + + ShutdownHookManager.addShutdownHook(cleanupHiveClientShutdownThread); + } + + /** + * Note: This doesn't check if they are being used or not, meant only to be called during shutdown etc. + */ + void closeAllClientsQuietly() { + try { + ConcurrentMap elements = hiveCache.asMap(); + for (ICacheableMetaStoreClient cacheableHiveMetaStoreClient : elements.values()) { + cacheableHiveMetaStoreClient.tearDown(); + } + } catch (Exception e) { + LOG.warn("Clean up of hive clients in the cache failed. 
Ignored", e); + } + + if (this.enableStats) { + LOG.info("Cache statistics after shutdown: size=" + hiveCache.size() + " " + hiveCache.stats()); + } + } + + public void cleanup() { + // TODO: periodically reload a new HiveConf to check if stats reporting is enabled. + hiveCache.cleanUp(); + + if (enableStats) { + LOG.info("Cache statistics after cleanup: size=" + hiveCache.size() + " " + hiveCache.stats()); + } + } + + /** + * Returns a cached client if exists or else creates one, caches and returns it. It also checks that the client is + * healthy and can be reused + * @param hiveConf + * @return the hive client + * @throws MetaException + * @throws IOException + * @throws LoginException + */ + public IMetaStoreClient get(final HiveConf hiveConf) throws MetaException, IOException, LoginException { + final HiveClientCacheKey cacheKey = HiveClientCacheKey.fromHiveConf(hiveConf, getThreadId()); + ICacheableMetaStoreClient cacheableHiveMetaStoreClient = null; + + // the hmsc is not shared across threads. So the only way it could get closed while we are doing healthcheck + // is if removalListener closes it. The synchronization takes care that removalListener won't do it + synchronized (CACHE_TEARDOWN_LOCK) { + cacheableHiveMetaStoreClient = getOrCreate(cacheKey); + cacheableHiveMetaStoreClient.acquire(); + } + if (!cacheableHiveMetaStoreClient.isOpen()) { + synchronized (CACHE_TEARDOWN_LOCK) { + hiveCache.invalidate(cacheKey); + cacheableHiveMetaStoreClient.close(); + cacheableHiveMetaStoreClient = getOrCreate(cacheKey); + cacheableHiveMetaStoreClient.acquire(); + } + } + return cacheableHiveMetaStoreClient; + } + + /** + * Return from cache if exists else create/cache and return + * @param cacheKey + * @return + * @throws IOException + * @throws MetaException + * @throws LoginException + */ + private ICacheableMetaStoreClient getOrCreate(final HiveClientCacheKey cacheKey) + throws IOException, MetaException, LoginException { + try { + return hiveCache.get(cacheKey, new Callable() { + @Override + public ICacheableMetaStoreClient call() throws MetaException { + // This is called from HCat, so always allow embedded metastore (as was the default). + return + (ICacheableMetaStoreClient) RetryingMetaStoreClient.getProxy(cacheKey.getHiveConf(), + new Class[]{HiveConf.class, Integer.class, Boolean.class}, + new Object[]{cacheKey.getHiveConf(), timeout, true}, + CacheableHiveMetaStoreClient.class.getName()); + } + }); + } catch (ExecutionException e) { + Throwable t = e.getCause(); + if (t instanceof IOException) { + throw (IOException) t; + } else if (t instanceof MetaException) { + throw (MetaException) t; + } else if (t instanceof LoginException) { + throw (LoginException) t; + } else { + throw new IOException("Error creating hiveMetaStoreClient", t); + } + } + } + + /** + * A class to wrap HiveConf and expose equality based only on UserGroupInformation and the metaStoreURIs. + * This becomes the key for the cache and this way the same HiveMetaStoreClient would be returned if + * UserGroupInformation and metaStoreURIs are same. 
This function can evolve to express + * the cases when HiveConf is different but the same hiveMetaStoreClient can be used + */ + static class HiveClientCacheKey { + final private String metaStoreURIs; + final private UserGroupInformation ugi; + final private HiveConf hiveConf; + final private int threadId; + + private HiveClientCacheKey(HiveConf hiveConf, final int threadId) throws IOException, LoginException { + this.metaStoreURIs = hiveConf.getVar(HiveConf.ConfVars.METASTOREURIS); + ugi = Utils.getUGI(); + this.hiveConf = hiveConf; + this.threadId = threadId; + } + + public static HiveClientCacheKey fromHiveConf(HiveConf hiveConf, final int threadId) throws IOException, LoginException { + return new HiveClientCacheKey(hiveConf, threadId); + } + + public HiveConf getHiveConf() { + return hiveConf; + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + HiveClientCacheKey that = (HiveClientCacheKey) o; + return new EqualsBuilder(). + append(this.metaStoreURIs, + that.metaStoreURIs). + append(this.ugi, that.ugi). + append(this.threadId, that.threadId).isEquals(); + } + + @Override + public int hashCode() { + return new HashCodeBuilder(). + append(metaStoreURIs). + append(ugi). + append(threadId).toHashCode(); + } + + @Override + public String toString() { + return "HiveClientCacheKey: uri=" + this.metaStoreURIs + " ugi=" + this.ugi + " thread=" + this.threadId; + } + } + + @InterfaceAudience.Private + public interface ICacheableMetaStoreClient extends IMetaStoreClient { + @NoReconnect + void acquire(); + + @NoReconnect + void setExpiredFromCache(); + + @NoReconnect + AtomicInteger getUsers(); + + @NoReconnect + boolean isClosed(); + + /** + * @deprecated This method is not used internally and should not be visible through HCatClient.create. + */ + @Deprecated + @NoReconnect + boolean isOpen(); + + @NoReconnect + void tearDownIfUnused(); + + @NoReconnect + void tearDown(); + } + + /** + * Add # of current users on HiveMetaStoreClient, so that the client can be cleaned when no one is using it. + */ + static class CacheableHiveMetaStoreClient extends HiveMetaStoreClient implements ICacheableMetaStoreClient { + + private final AtomicInteger users = new AtomicInteger(0); + private volatile boolean expiredFromCache = false; + private boolean isClosed = false; + + CacheableHiveMetaStoreClient(final HiveConf conf, final Integer timeout, Boolean allowEmbedded) + throws MetaException { + super(conf, null, allowEmbedded); + } + + /** + * Increments the user count and optionally renews the expiration time. + * renew should correspond with the expiration policy of the cache. + * When the policy is expireAfterAccess, the expiration time should be extended. + * When the policy is expireAfterWrite, the expiration time should not be extended. + * A mismatch with the policy will lead to closing the connection unnecessarily after the initial + * expiration time is generated. + */ + public synchronized void acquire() { + users.incrementAndGet(); + if (users.get() > 1) { + LOG.warn("Unexpected increment of user count beyond one: " + users.get() + " " + this); + } + } + + /** + * Decrements the user count. + */ + private void release() { + if (users.get() > 0) { + users.decrementAndGet(); + } else { + LOG.warn("Unexpected attempt to decrement user count of zero: " + users.get() + " " + this); + } + } + + /** + * Communicate to the client that it is no longer in the cache. 
+ * The expiration time should be voided to allow the connection to be closed at the first opportunity. + */ + public synchronized void setExpiredFromCache() { + if (users.get() != 0) { + LOG.warn("Evicted client has non-zero user count: " + users.get()); + } + + expiredFromCache = true; + } + + public boolean isClosed() { + return isClosed; + } + + /* + * Used only for Debugging or testing purposes + */ + public AtomicInteger getUsers() { + return users; + } + + /** + * Make a call to hive meta store and see if the client is still usable. Some calls where the user provides + * invalid data renders the client unusable for future use (example: create a table with very long table name) + * @return + */ + @Deprecated + public boolean isOpen() { + try { + // Look for an unlikely database name and see if either MetaException or TException is thrown + super.getDatabases("NonExistentDatabaseUsedForHealthCheck"); + } catch (TException e) { + return false; + } + return true; + } + + /** + * Decrement the user count and piggyback this to set expiry flag as well, then teardown(), if conditions are met. + * This *MUST* be called by anyone who uses this client. + */ + @Override + public synchronized void close() { + release(); + tearDownIfUnused(); + } + + /** + * Attempt to tear down the client connection. + * The connection will be closed if the following conditions hold: + * 1. There are no active user holding the client. + * 2. The client has been evicted from the cache. + */ + public synchronized void tearDownIfUnused() { + if (users.get() != 0) { + LOG.warn("Non-zero user count preventing client tear down: users=" + users.get() + " expired=" + expiredFromCache); + } + + if (users.get() == 0 && expiredFromCache) { + this.tearDown(); + } + } + + /** + * Close the underlying objects irrespective of whether they are in use or not. + */ + public void tearDown() { + try { + if (!isClosed) { + super.close(); + } + isClosed = true; + } catch (Exception e) { + LOG.warn("Error closing hive metastore client. Ignored.", e); + } + } + + @Override + public String toString() { + return "HCatClient: thread: " + Thread.currentThread().getId() + " users=" + users.get() + + " expired=" + expiredFromCache + " closed=" + isClosed; + } + + /** + * GC is attempting to destroy the object. + * No one references this client anymore, so it can be torn down without worrying about user counts. 
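+ * Normal users release the client through {@link #close()}; finalization is only a best-effort, last-resort cleanup.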
+ * @throws Throwable + */ + @Override + protected void finalize() throws Throwable { + if (users.get() != 0) { + LOG.warn("Closing client with non-zero user count: users=" + users.get() + " expired=" + expiredFromCache); + } + + try { + this.tearDown(); + } finally { + super.finalize(); + } + } + } +} diff --git a/metastore/src/java/org/apache/hadoop/hive/metastore/HiveMetaStoreUtils.java b/metastore/src/java/org/apache/hadoop/hive/metastore/HiveMetaStoreUtils.java index a66c135..75a7201 100644 --- a/metastore/src/java/org/apache/hadoop/hive/metastore/HiveMetaStoreUtils.java +++ b/metastore/src/java/org/apache/hadoop/hive/metastore/HiveMetaStoreUtils.java @@ -18,9 +18,13 @@ package org.apache.hadoop.hive.metastore; +import java.io.IOException; import java.util.ArrayList; import java.util.List; +import javax.security.auth.login.LoginException; + +import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.metastore.utils.MetaStoreUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -42,6 +46,7 @@ public class HiveMetaStoreUtils { protected static final Logger LOG = LoggerFactory.getLogger("hive.log"); + private static volatile HiveClientCache hiveClientCache; /** * getDeserializer @@ -210,4 +215,34 @@ public static FieldSchema getFieldSchemaFromTypeInfo(String fieldName, "generated by TypeInfoUtils.getFieldSchemaFromTypeInfo"); } + /** + * Get or create a hive client depending on whether it exists in cache or not + * @param hiveConf The hive configuration + * @return the client + * @throws MetaException When HiveMetaStoreClient couldn't be created + * @throws IOException + */ + public static IMetaStoreClient getHiveMetastoreClient(HiveConf hiveConf) + throws MetaException, IOException { + + if (!HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.METASTORE_CLIENT_CACHE_ENABLED)){ + // If cache is disabled, don't use it. + return HiveClientCache.getNonCachedHiveMetastoreClient(hiveConf); + } + + // Singleton behaviour: create the cache instance if required. + if (hiveClientCache == null) { + synchronized (IMetaStoreClient.class) { + if (hiveClientCache == null) { + hiveClientCache = new HiveClientCache(hiveConf); + } + } + } + try { + return hiveClientCache.get(hiveConf); + } catch (LoginException e) { + throw new IOException("Couldn't create hiveMetaStoreClient, Error getting UGI for user", e); + } + } + } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcRecordUpdater.java b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcRecordUpdater.java index 09f8802..5f9ac5f 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcRecordUpdater.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcRecordUpdater.java @@ -231,8 +231,7 @@ static StructObjectInspector createEventSchema(ObjectInspector rowInspector) { fs = partitionRoot.getFileSystem(options.getConfiguration()); } this.fs = fs; - if (options.getMinimumWriteId() != options.getMaximumWriteId() - && !options.isWritingBase()) { + if (options.getMinimumWriteId() != options.getMaximumWriteId() && !options.isWritingBase()) { //throw if file already exists as that should never happen flushLengths = fs.create(OrcAcidUtils.getSideFile(this.path), false, 8, options.getReporter()); @@ -468,17 +467,19 @@ public void delete(long currentWriteId, Object row) throws IOException { @Override public void flush() throws IOException { - // We only support flushes on files with multiple transactions, because - // flushes create significant overhead in HDFS. 
Record updaters with a - // single transaction should be closed rather than flushed. + initWriter(); + // streaming ingest writer with single transaction batch size, in which case the transaction is + // either committed or aborted. In either cases we don't need flush length file but we need to + // flush intermediate footer to reduce memory pressure. if (flushLengths == null) { - throw new IllegalStateException("Attempting to flush a RecordUpdater on " - + path + " with a single transaction."); + // transaction batch size = 1 case + writer.writeIntermediateFooter(); + } else { + // transaction batch size > 1 case + long len = writer.writeIntermediateFooter(); + flushLengths.writeLong(len); + OrcInputFormat.SHIMS.hflush(flushLengths); } - initWriter(); - long len = writer.writeIntermediateFooter(); - flushLengths.writeLong(len); - OrcInputFormat.SHIMS.hflush(flushLengths); //multiple transactions only happen for streaming ingest which only allows inserts assert deleteEventWriter == null : "unexpected delete writer for " + path; } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/lockmgr/DbTxnManager.java b/ql/src/java/org/apache/hadoop/hive/ql/lockmgr/DbTxnManager.java index 76569d5..da378e9 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/lockmgr/DbTxnManager.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/lockmgr/DbTxnManager.java @@ -1028,13 +1028,13 @@ public void replAllocateTableWriteIdsBatch(String dbName, String tableName, Stri } } - private static long getHeartbeatInterval(Configuration conf) throws LockException { + public static long getHeartbeatInterval(Configuration conf) throws LockException { // Retrieve HIVE_TXN_TIMEOUT in MILLISECONDS (it's defined as SECONDS), // then divide it by 2 to give us a safety factor. long interval = HiveConf.getTimeVar(conf, HiveConf.ConfVars.HIVE_TXN_TIMEOUT, TimeUnit.MILLISECONDS) / 2; if (interval == 0) { - throw new LockException(HiveConf.ConfVars.HIVE_TXN_MANAGER.toString() + " not set," + + throw new LockException(HiveConf.ConfVars.HIVE_TXN_TIMEOUT.toString() + " not set," + " heartbeats won't be sent"); } return interval; diff --git a/ql/src/java/org/apache/hadoop/hive/ql/metadata/Hive.java b/ql/src/java/org/apache/hadoop/hive/ql/metadata/Hive.java index 4661881..a123be1 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/metadata/Hive.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/metadata/Hive.java @@ -2381,7 +2381,7 @@ public Partition createPartition(Table tbl, Map partSpec) throws List in = new ArrayList(size); for (int i = 0; i < size; ++i) { - in.add(convertAddSpecToMetaPartition(tbl, addPartitionDesc.getPartition(i))); + in.add(convertAddSpecToMetaPartition(tbl, addPartitionDesc.getPartition(i), conf)); } List out = new ArrayList(); try { @@ -2431,8 +2431,8 @@ public Partition createPartition(Table tbl, Map partSpec) throws return out; } - private org.apache.hadoop.hive.metastore.api.Partition convertAddSpecToMetaPartition( - Table tbl, AddPartitionDesc.OnePartitionDesc addSpec) throws HiveException { + public static org.apache.hadoop.hive.metastore.api.Partition convertAddSpecToMetaPartition( + Table tbl, AddPartitionDesc.OnePartitionDesc addSpec, final HiveConf conf) throws HiveException { Path location = addSpec.getLocation() != null ? 
new Path(tbl.getPath(), addSpec.getLocation()) : null; if (location != null) { diff --git a/serde/src/java/org/apache/hadoop/hive/serde2/JsonSerDe.java b/serde/src/java/org/apache/hadoop/hive/serde2/JsonSerDe.java new file mode 100644 index 0000000..40b2e8e --- /dev/null +++ b/serde/src/java/org/apache/hadoop/hive/serde2/JsonSerDe.java @@ -0,0 +1,669 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.hadoop.hive.serde2; + +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.nio.charset.CharacterCodingException; +import java.sql.Date; +import java.sql.Timestamp; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.Properties; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hive.common.type.HiveChar; +import org.apache.hadoop.hive.common.type.HiveDecimal; +import org.apache.hadoop.hive.common.type.HiveVarchar; +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.serde.serdeConstants; +import org.apache.hadoop.hive.serde2.lazy.LazyFactory; +import org.apache.hadoop.hive.serde2.lazy.LazyObjectBase; +import org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe; +import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.MapObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.StandardStructObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.StructField; +import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.UnionObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.BinaryObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.BooleanObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.ByteObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.DateObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.DoubleObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.FloatObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.HiveCharObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.HiveDecimalObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.HiveVarcharObjectInspector; +import 
org.apache.hadoop.hive.serde2.objectinspector.primitive.IntObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.LongObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.ShortObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.TimestampObjectInspector; +import org.apache.hadoop.hive.serde2.typeinfo.BaseCharTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.MapTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.Writable; +import org.apache.hive.common.util.HiveStringUtils; +import org.apache.hive.common.util.TimestampParser; +import org.codehaus.jackson.JsonFactory; +import org.codehaus.jackson.JsonParseException; +import org.codehaus.jackson.JsonParser; +import org.codehaus.jackson.JsonToken; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +@SerDeSpec(schemaProps = {serdeConstants.LIST_COLUMNS, + serdeConstants.LIST_COLUMN_TYPES, + serdeConstants.TIMESTAMP_FORMATS}) + +// FIXME: move TestJsonSerDe from hcat to serde2 +public class JsonSerDe extends AbstractSerDe { + + private static final Logger LOG = LoggerFactory.getLogger(JsonSerDe.class); + private List columnNames; + private StructTypeInfo schema; + + private JsonFactory jsonFactory = null; + + private StandardStructObjectInspector cachedObjectInspector; + private TimestampParser tsParser; + + @Override + public void initialize(Configuration conf, Properties tbl) + throws SerDeException { + List columnTypes; + StructTypeInfo rowTypeInfo; + + LOG.debug("Initializing JsonSerDe: {}", tbl.entrySet()); + + // Get column names and types + String columnNameProperty = tbl.getProperty(serdeConstants.LIST_COLUMNS); + String columnTypeProperty = tbl.getProperty(serdeConstants.LIST_COLUMN_TYPES); + final String columnNameDelimiter = tbl.containsKey(serdeConstants.COLUMN_NAME_DELIMITER) ? 
tbl + .getProperty(serdeConstants.COLUMN_NAME_DELIMITER) : String.valueOf(SerDeUtils.COMMA); + // all table column names + if (columnNameProperty.isEmpty()) { + columnNames = Collections.emptyList(); + } else { + columnNames = Arrays.asList(columnNameProperty.split(columnNameDelimiter)); + } + + // all column types + if (columnTypeProperty.isEmpty()) { + columnTypes = Collections.emptyList(); + } else { + columnTypes = TypeInfoUtils.getTypeInfosFromTypeString(columnTypeProperty); + } + + LOG.debug("columns: {}, {}", columnNameProperty, columnNames); + LOG.debug("types: {}, {} ", columnTypeProperty, columnTypes); + + assert (columnNames.size() == columnTypes.size()); + + rowTypeInfo = (StructTypeInfo) TypeInfoFactory.getStructTypeInfo(columnNames, columnTypes); + schema = rowTypeInfo; + LOG.debug("schema : {}", schema); + cachedObjectInspector = (StandardStructObjectInspector) TypeInfoUtils.getStandardJavaObjectInspectorFromTypeInfo(rowTypeInfo); + + jsonFactory = new JsonFactory(); + tsParser = new TimestampParser( + HiveStringUtils.splitAndUnEscape(tbl.getProperty(serdeConstants.TIMESTAMP_FORMATS))); + } + + /** + * Takes JSON string in Text form, and has to return an object representation above + * it that's readable by the corresponding object inspector. + * For this implementation, since we're using the jackson parser, we can construct + * our own object implementation, and we use HCatRecord for it + */ + @Override + public Object deserialize(Writable blob) throws SerDeException { + + Text t = (Text) blob; + JsonParser p; + List r = new ArrayList<>(Collections.nCopies(columnNames.size(), null)); + try { + p = jsonFactory.createJsonParser(new ByteArrayInputStream((t.getBytes()))); + if (p.nextToken() != JsonToken.START_OBJECT) { + throw new IOException("Start token not found where expected"); + } + JsonToken token; + while (((token = p.nextToken()) != JsonToken.END_OBJECT) && (token != null)) { + // iterate through each token, and create appropriate object here. + populateRecord(r, token, p, schema); + } + } catch (JsonParseException e) { + LOG.warn("Error [{}] parsing json text [{}].", e, t); + throw new SerDeException(e); + } catch (IOException e) { + LOG.warn("Error [{}] parsing json text [{}].", e, t); + throw new SerDeException(e); + } + + return r; + } + + private void populateRecord(List r, JsonToken token, JsonParser p, StructTypeInfo s) throws IOException { + if (token != JsonToken.FIELD_NAME) { + throw new IOException("Field name expected"); + } + String fieldName = p.getText(); + int fpos = s.getAllStructFieldNames().indexOf(fieldName); + if (fpos == -1) { + fpos = getPositionFromHiveInternalColumnName(fieldName); + LOG.debug("NPE finding position for field [{}] in schema [{}]," + + " attempting to check if it is an internal column name like _col0", fieldName, s); + if (fpos == -1) { + skipValue(p); + return; // unknown field, we return. We'll continue from the next field onwards. + } + // If we get past this, then the column name did match the hive pattern for an internal + // column name, such as _col0, etc, so it *MUST* match the schema for the appropriate column. + // This means people can't use arbitrary column names such as _col0, and expect us to ignore it + // if we find it. 
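+ // e.g. a field named "_col7" resolves to position 7 via the "_col([0-9]+)" pattern in
+ // getPositionFromHiveInternalColumnName(); names matching neither the schema nor that pattern
+ // have already been skipped above.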
+ if (!fieldName.equalsIgnoreCase(getHiveInternalColumnName(fpos))) { + LOG.error("Hive internal column name {} and position " + + "encoding {} for the column name are at odds", fieldName, fpos); + throw new IOException("Hive internal column name (" + fieldName + + ") and position encoding (" + fpos + + ") for the column name are at odds"); + } + // If we reached here, then we were successful at finding an alternate internal + // column mapping, and we're about to proceed. + } + Object currField = extractCurrentField(p, s.getStructFieldTypeInfo(fieldName), false); + r.set(fpos, currField); + } + + public String getHiveInternalColumnName(int fpos) { + return HiveConf.getColumnInternalName(fpos); + } + + public int getPositionFromHiveInternalColumnName(String internalName) { + // return HiveConf.getPositionFromInternalName(fieldName); + // The above line should have been all the implementation that + // we need, but due to a bug in that impl which recognizes + // only single-digit columns, we need another impl here. + Pattern internalPattern = Pattern.compile("_col([0-9]+)"); + Matcher m = internalPattern.matcher(internalName); + if (!m.matches()) { + return -1; + } else { + return Integer.parseInt(m.group(1)); + } + } + + /** + * Utility method to extract (and forget) the next value token from the JsonParser, + * as a whole. The reason this function gets called is to yank out the next value altogether, + * because it corresponds to a field name that we do not recognize, and thus, do not have + * a schema/type for. Thus, this field is to be ignored. + * + * @throws IOException + * @throws JsonParseException + */ + private void skipValue(JsonParser p) throws JsonParseException, IOException { + JsonToken valueToken = p.nextToken(); + + if ((valueToken == JsonToken.START_ARRAY) || (valueToken == JsonToken.START_OBJECT)) { + // if the currently read token is a beginning of an array or object, move stream forward + // skipping any child tokens till we're at the corresponding END_ARRAY or END_OBJECT token + p.skipChildren(); + } + // At the end of this function, the stream should be pointing to the last token that + // corresponds to the value being skipped. This way, the next call to nextToken + // will advance it to the next field name. + } + + /** + * Utility method to extract current expected field from given JsonParser + * isTokenCurrent is a boolean variable also passed in, which determines + * if the JsonParser is already at the token we expect to read next, or + * needs advancing to the next before we read. + */ + private Object extractCurrentField(JsonParser p, TypeInfo fieldTypeInfo, + boolean isTokenCurrent) throws IOException { + Object val = null; + JsonToken valueToken; + if (isTokenCurrent) { + valueToken = p.getCurrentToken(); + } else { + valueToken = p.nextToken(); + } + + switch (fieldTypeInfo.getCategory()) { + case PRIMITIVE: + PrimitiveObjectInspector.PrimitiveCategory primitiveCategory = PrimitiveObjectInspector.PrimitiveCategory.UNKNOWN; + if (fieldTypeInfo instanceof PrimitiveTypeInfo) { + primitiveCategory = ((PrimitiveTypeInfo) fieldTypeInfo).getPrimitiveCategory(); + } + switch (primitiveCategory) { + case INT: + val = (valueToken == JsonToken.VALUE_NULL) ? null : p.getIntValue(); + break; + case BYTE: + val = (valueToken == JsonToken.VALUE_NULL) ? null : p.getByteValue(); + break; + case SHORT: + val = (valueToken == JsonToken.VALUE_NULL) ? null : p.getShortValue(); + break; + case LONG: + val = (valueToken == JsonToken.VALUE_NULL) ? 
null : p.getLongValue(); + break; + case BOOLEAN: + String bval = (valueToken == JsonToken.VALUE_NULL) ? null : p.getText(); + if (bval != null) { + val = Boolean.valueOf(bval); + } else { + val = null; + } + break; + case FLOAT: + val = (valueToken == JsonToken.VALUE_NULL) ? null : p.getFloatValue(); + break; + case DOUBLE: + val = (valueToken == JsonToken.VALUE_NULL) ? null : p.getDoubleValue(); + break; + case STRING: + val = (valueToken == JsonToken.VALUE_NULL) ? null : p.getText(); + break; + case BINARY: + String b = (valueToken == JsonToken.VALUE_NULL) ? null : p.getText(); + if (b != null) { + try { + String t = Text.decode(b.getBytes(), 0, b.getBytes().length); + return t.getBytes(); + } catch (CharacterCodingException e) { + LOG.warn("Error generating json binary type from object.", e); + return null; + } + } else { + val = null; + } + break; + case DATE: + val = (valueToken == JsonToken.VALUE_NULL) ? null : Date.valueOf(p.getText()); + break; + case TIMESTAMP: + val = (valueToken == JsonToken.VALUE_NULL) ? null : tsParser.parseTimestamp(p.getText()); + break; + case DECIMAL: + val = (valueToken == JsonToken.VALUE_NULL) ? null : HiveDecimal.create(p.getText()); + break; + case VARCHAR: + int vLen = ((BaseCharTypeInfo) fieldTypeInfo).getLength(); + val = (valueToken == JsonToken.VALUE_NULL) ? null : new HiveVarchar(p.getText(), vLen); + break; + case CHAR: + int cLen = ((BaseCharTypeInfo) fieldTypeInfo).getLength(); + val = (valueToken == JsonToken.VALUE_NULL) ? null : new HiveChar(p.getText(), cLen); + break; + } + break; + case LIST: + if (valueToken == JsonToken.VALUE_NULL) { + val = null; + break; + } + if (valueToken != JsonToken.START_ARRAY) { + throw new IOException("Start of Array expected"); + } + List arr = new ArrayList(); + while ((valueToken = p.nextToken()) != JsonToken.END_ARRAY) { + arr.add(extractCurrentField(p, ((ListTypeInfo)fieldTypeInfo).getListElementTypeInfo(), true)); + } + val = arr; + break; + case MAP: + if (valueToken == JsonToken.VALUE_NULL) { + val = null; + break; + } + if (valueToken != JsonToken.START_OBJECT) { + throw new IOException("Start of Object expected"); + } + Map map = new LinkedHashMap(); + while ((valueToken = p.nextToken()) != JsonToken.END_OBJECT) { + Object k = getObjectOfCorrespondingPrimitiveType(p.getCurrentName(), + (PrimitiveTypeInfo) ((MapTypeInfo)fieldTypeInfo).getMapKeyTypeInfo()); + Object v = extractCurrentField(p, ((MapTypeInfo) fieldTypeInfo).getMapValueTypeInfo(), false); + map.put(k, v); + } + val = map; + break; + case STRUCT: + if (valueToken == JsonToken.VALUE_NULL) { + val = null; + break; + } + if (valueToken != JsonToken.START_OBJECT) { + throw new IOException("Start of Object expected"); + } + ArrayList subSchema = ((StructTypeInfo)fieldTypeInfo).getAllStructFieldTypeInfos(); + int sz = subSchema.size(); + List struct = new ArrayList(Collections.nCopies(sz, null)); + while ((valueToken = p.nextToken()) != JsonToken.END_OBJECT) { + populateRecord(struct, valueToken, p, ((StructTypeInfo) fieldTypeInfo)); + } + val = struct; + break; + default: + LOG.error("Unknown type found: " + fieldTypeInfo); + return null; + } + return val; + } + + private Object getObjectOfCorrespondingPrimitiveType(String s, PrimitiveTypeInfo mapKeyType) + throws IOException { + switch (mapKeyType.getPrimitiveCategory()) { + case INT: + return Integer.valueOf(s); + case BYTE: + return Byte.valueOf(s); + case SHORT: + return Short.valueOf(s); + case LONG: + return Long.valueOf(s); + case BOOLEAN: + return (s.equalsIgnoreCase("true")); + case 
FLOAT: + return Float.valueOf(s); + case DOUBLE: + return Double.valueOf(s); + case STRING: + return s; + case BINARY: + try { + String t = Text.decode(s.getBytes(), 0, s.getBytes().length); + return t.getBytes(); + } catch (CharacterCodingException e) { + LOG.warn("Error generating json binary type from object.", e); + return null; + } + case DATE: + return Date.valueOf(s); + case TIMESTAMP: + return Timestamp.valueOf(s); + case DECIMAL: + return HiveDecimal.create(s); + case VARCHAR: + return new HiveVarchar(s, ((BaseCharTypeInfo) mapKeyType).getLength()); + case CHAR: + return new HiveChar(s, ((BaseCharTypeInfo) mapKeyType).getLength()); + } + throw new IOException("Could not convert from string to map type " + mapKeyType.getTypeName()); + } + + /** + * Given an object and object inspector pair, traverse the object + * and generate a Text representation of the object. + */ + @Override + public Writable serialize(Object obj, ObjectInspector objInspector) + throws SerDeException { + StringBuilder sb = new StringBuilder(); + try { + + StructObjectInspector soi = (StructObjectInspector) objInspector; + List structFields = soi.getAllStructFieldRefs(); + assert (columnNames.size() == structFields.size()); + if (obj == null) { + sb.append("null"); + } else { + sb.append(SerDeUtils.LBRACE); + for (int i = 0; i < structFields.size(); i++) { + if (i > 0) { + sb.append(SerDeUtils.COMMA); + } + appendWithQuotes(sb, columnNames.get(i)); + sb.append(SerDeUtils.COLON); + buildJSONString(sb, soi.getStructFieldData(obj, structFields.get(i)), + structFields.get(i).getFieldObjectInspector()); + } + sb.append(SerDeUtils.RBRACE); + } + + } catch (IOException e) { + LOG.warn("Error generating json text from object.", e); + throw new SerDeException(e); + } + return new Text(sb.toString()); + } + + private static StringBuilder appendWithQuotes(StringBuilder sb, String value) { + return sb == null ? null : sb.append(SerDeUtils.QUOTE).append(value).append(SerDeUtils.QUOTE); + } + + // TODO : code section copied over from SerDeUtils because of non-standard json production there + // should use quotes for all field names. We should fix this there, and then remove this copy. + // See http://jackson.codehaus.org/1.7.3/javadoc/org/codehaus/jackson/JsonParser.Feature.html#ALLOW_UNQUOTED_FIELD_NAMES + // for details - trying to enable Jackson to ignore that doesn't seem to work(compilation failure + // when attempting to use that feature, so having to change the production itself. + // Also, throws IOException when Binary is detected. + private static void buildJSONString(StringBuilder sb, Object o, ObjectInspector oi) throws IOException { + + switch (oi.getCategory()) { + case PRIMITIVE: { + PrimitiveObjectInspector poi = (PrimitiveObjectInspector) oi; + if (o == null) { + sb.append("null"); + } else { + switch (poi.getPrimitiveCategory()) { + case BOOLEAN: { + boolean b = ((BooleanObjectInspector) poi).get(o); + sb.append(b ? 
"true" : "false"); + break; + } + case BYTE: { + sb.append(((ByteObjectInspector) poi).get(o)); + break; + } + case SHORT: { + sb.append(((ShortObjectInspector) poi).get(o)); + break; + } + case INT: { + sb.append(((IntObjectInspector) poi).get(o)); + break; + } + case LONG: { + sb.append(((LongObjectInspector) poi).get(o)); + break; + } + case FLOAT: { + sb.append(((FloatObjectInspector) poi).get(o)); + break; + } + case DOUBLE: { + sb.append(((DoubleObjectInspector) poi).get(o)); + break; + } + case STRING: { + String s = + SerDeUtils.escapeString(((StringObjectInspector) poi).getPrimitiveJavaObject(o)); + appendWithQuotes(sb, s); + break; + } + case BINARY: + byte[] b = ((BinaryObjectInspector) oi).getPrimitiveJavaObject(o); + Text txt = new Text(); + txt.set(b, 0, b.length); + appendWithQuotes(sb, SerDeUtils.escapeString(txt.toString())); + break; + case DATE: + Date d = ((DateObjectInspector) poi).getPrimitiveJavaObject(o); + appendWithQuotes(sb, d.toString()); + break; + case TIMESTAMP: { + Timestamp t = ((TimestampObjectInspector) poi).getPrimitiveJavaObject(o); + appendWithQuotes(sb, t.toString()); + break; + } + case DECIMAL: + sb.append(((HiveDecimalObjectInspector) poi).getPrimitiveJavaObject(o)); + break; + case VARCHAR: { + String s = SerDeUtils.escapeString( + ((HiveVarcharObjectInspector) poi).getPrimitiveJavaObject(o).toString()); + appendWithQuotes(sb, s); + break; + } + case CHAR: { + //this should use HiveChar.getPaddedValue() but it's protected; currently (v0.13) + // HiveChar.toString() returns getPaddedValue() + String s = SerDeUtils.escapeString( + ((HiveCharObjectInspector) poi).getPrimitiveJavaObject(o).toString()); + appendWithQuotes(sb, s); + break; + } + default: + throw new RuntimeException("Unknown primitive type: " + poi.getPrimitiveCategory()); + } + } + break; + } + case LIST: { + ListObjectInspector loi = (ListObjectInspector) oi; + ObjectInspector listElementObjectInspector = loi + .getListElementObjectInspector(); + List olist = loi.getList(o); + if (olist == null) { + sb.append("null"); + } else { + sb.append(SerDeUtils.LBRACKET); + for (int i = 0; i < olist.size(); i++) { + if (i > 0) { + sb.append(SerDeUtils.COMMA); + } + buildJSONString(sb, olist.get(i), listElementObjectInspector); + } + sb.append(SerDeUtils.RBRACKET); + } + break; + } + case MAP: { + MapObjectInspector moi = (MapObjectInspector) oi; + ObjectInspector mapKeyObjectInspector = moi.getMapKeyObjectInspector(); + ObjectInspector mapValueObjectInspector = moi + .getMapValueObjectInspector(); + Map omap = moi.getMap(o); + if (omap == null) { + sb.append("null"); + } else { + sb.append(SerDeUtils.LBRACE); + boolean first = true; + for (Object entry : omap.entrySet()) { + if (first) { + first = false; + } else { + sb.append(SerDeUtils.COMMA); + } + Map.Entry e = (Map.Entry) entry; + StringBuilder keyBuilder = new StringBuilder(); + buildJSONString(keyBuilder, e.getKey(), mapKeyObjectInspector); + String keyString = keyBuilder.toString().trim(); + if ((!keyString.isEmpty()) && (keyString.charAt(0) != SerDeUtils.QUOTE)) { + appendWithQuotes(sb, keyString); + } else { + sb.append(keyString); + } + sb.append(SerDeUtils.COLON); + buildJSONString(sb, e.getValue(), mapValueObjectInspector); + } + sb.append(SerDeUtils.RBRACE); + } + break; + } + case STRUCT: { + StructObjectInspector soi = (StructObjectInspector) oi; + List structFields = soi.getAllStructFieldRefs(); + if (o == null) { + sb.append("null"); + } else { + sb.append(SerDeUtils.LBRACE); + for (int i = 0; i < structFields.size(); i++) { + 
if (i > 0) { + sb.append(SerDeUtils.COMMA); + } + appendWithQuotes(sb, structFields.get(i).getFieldName()); + sb.append(SerDeUtils.COLON); + buildJSONString(sb, soi.getStructFieldData(o, structFields.get(i)), + structFields.get(i).getFieldObjectInspector()); + } + sb.append(SerDeUtils.RBRACE); + } + break; + } + case UNION: { + UnionObjectInspector uoi = (UnionObjectInspector) oi; + if (o == null) { + sb.append("null"); + } else { + sb.append(SerDeUtils.LBRACE); + sb.append(uoi.getTag(o)); + sb.append(SerDeUtils.COLON); + buildJSONString(sb, uoi.getField(o), + uoi.getObjectInspectors().get(uoi.getTag(o))); + sb.append(SerDeUtils.RBRACE); + } + break; + } + default: + throw new RuntimeException("Unknown type in ObjectInspector!"); + } + } + + + /** + * Returns an object inspector for the specified schema that + * is capable of reading in the object representation of the JSON string + */ + @Override + public ObjectInspector getObjectInspector() throws SerDeException { + return cachedObjectInspector; + } + + @Override + public Class getSerializedClass() { + return Text.class; + } + + @Override + public SerDeStats getSerDeStats() { + // no support for statistics yet + return null; + } + +} diff --git a/standalone-metastore/src/main/java/org/apache/hadoop/hive/metastore/utils/MetaStoreUtils.java b/standalone-metastore/src/main/java/org/apache/hadoop/hive/metastore/utils/MetaStoreUtils.java index 8c159e9..742b6bf 100644 --- a/standalone-metastore/src/main/java/org/apache/hadoop/hive/metastore/utils/MetaStoreUtils.java +++ b/standalone-metastore/src/main/java/org/apache/hadoop/hive/metastore/utils/MetaStoreUtils.java @@ -19,6 +19,7 @@ import org.apache.hadoop.hive.metastore.api.WMPoolSchedulingPolicy; +import com.google.common.base.Joiner; import com.google.common.base.Predicates; import com.google.common.collect.Lists; import com.google.common.collect.Maps; @@ -90,6 +91,7 @@ import java.util.Map.Entry; import java.util.SortedMap; import java.util.SortedSet; +import java.util.StringJoiner; import java.util.TreeMap; import java.util.TreeSet; import java.util.concurrent.Callable; @@ -1264,7 +1266,7 @@ public static Properties getPartSchemaFromTableSchema( if (sd.getBucketCols() != null && sd.getBucketCols().size() > 0) { schema.setProperty(org.apache.hadoop.hive.metastore.api.hive_metastoreConstants.BUCKET_FIELD_NAME, - sd.getBucketCols().get(0)); + Joiner.on(",").join(sd.getBucketCols())); } // SerdeInfo @@ -1371,8 +1373,8 @@ public static Properties getSchemaWithoutCols(StorageDescriptor sd, .toString(sd.getNumBuckets())); if (sd.getBucketCols() != null && sd.getBucketCols().size() > 0) { schema.setProperty( - org.apache.hadoop.hive.metastore.api.hive_metastoreConstants.BUCKET_FIELD_NAME, sd - .getBucketCols().get(0)); + org.apache.hadoop.hive.metastore.api.hive_metastoreConstants.BUCKET_FIELD_NAME, + Joiner.on(",").join(sd.getBucketCols())); } if (sd.getSerdeInfo() != null) { for (Map.Entry param : sd.getSerdeInfo().getParameters().entrySet()) { diff --git a/streaming/pom.xml b/streaming/pom.xml index b58ec01..7b02e8b 100644 --- a/streaming/pom.xml +++ b/streaming/pom.xml @@ -56,12 +56,6 @@ ${project.version} - org.apache.hive.hcatalog - hive-hcatalog-core - true - ${project.version} - - org.apache.commons commons-lang3 true diff --git a/streaming/src/java/org/apache/hive/streaming/AbstractRecordWriter.java b/streaming/src/java/org/apache/hive/streaming/AbstractRecordWriter.java index 25998ae..643bcc4 100644 --- a/streaming/src/java/org/apache/hive/streaming/AbstractRecordWriter.java +++ 
b/streaming/src/java/org/apache/hive/streaming/AbstractRecordWriter.java @@ -19,17 +19,28 @@ package org.apache.hive.streaming; -import org.apache.hadoop.security.UserGroupInformation; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Properties; +import java.util.Set; +import java.util.stream.Collectors; + import org.apache.hadoop.fs.Path; import org.apache.hadoop.hive.common.JavaUtils; import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.metastore.HiveMetaStoreUtils; import org.apache.hadoop.hive.metastore.IMetaStoreClient; +import org.apache.hadoop.hive.metastore.Warehouse; import org.apache.hadoop.hive.metastore.api.FieldSchema; import org.apache.hadoop.hive.metastore.api.MetaException; import org.apache.hadoop.hive.metastore.api.NoSuchObjectException; import org.apache.hadoop.hive.metastore.api.Table; +import org.apache.hadoop.hive.metastore.api.hive_metastoreConstants; +import org.apache.hadoop.hive.ql.exec.Utilities; import org.apache.hadoop.hive.ql.io.AcidOutputFormat; import org.apache.hadoop.hive.ql.io.RecordUpdater; import org.apache.hadoop.hive.serde2.AbstractSerDe; @@ -38,162 +49,202 @@ import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils; import org.apache.hadoop.hive.serde2.objectinspector.StructField; import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.SubStructObjectInspector; import org.apache.hadoop.util.ReflectionUtils; -import org.apache.hive.hcatalog.common.HCatUtil; import org.apache.thrift.TException; - -import java.io.IOException; - -import java.security.PrivilegedExceptionAction; -import java.util.ArrayList; -import java.util.Collections; -import java.util.HashSet; -import java.util.List; -import java.util.Properties; - +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; public abstract class AbstractRecordWriter implements RecordWriter { - static final private Logger LOG = LoggerFactory.getLogger(AbstractRecordWriter.class.getName()); - - private final HiveConf conf; - private final HiveEndPoint endPoint; - final Table tbl; - - private final IMetaStoreClient msClient; - final List bucketIds; - private ArrayList updaters = null; - - private final int totalBuckets; - /** - * Indicates whether target table is bucketed - */ - private final boolean isBucketed; + private static final Logger LOG = LoggerFactory.getLogger(AbstractRecordWriter.class.getName()); - private final Path partitionPath; - - private final AcidOutputFormat outf; - private Object[] bucketFieldData; // Pre-allocated in constructor. Updated on each write. 
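// [Editor's note: illustrative sketch, not part of this patch.] The rewritten AbstractRecordWriter below
// implements the new RecordWriter lifecycle (init/write/flush/getPartitions/close) in place of the old
// newBatch/closeBatch model. Assuming java.util.Set, a StreamingConnection `conn`, valid write ids and a
// concrete writer subclass (none of these names are defined by this patch, and the Set element type is
// assumed to be the partition name String), a caller would drive it roughly like this:
static void lifecycleSketch(StreamingConnection conn, AbstractRecordWriter writer,
    long minWriteId, long maxWriteId, long writeId) throws StreamingException {
  writer.init(conn, minWriteId, maxWriteId);   // resolves table metadata, serde, bucket and partition info
  writer.write(writeId, "a,1,x".getBytes());   // encode() the record and route it to a per-partition, per-bucket RecordUpdater
  writer.flush();                              // flush every open RecordUpdater
  Set<String> added = writer.getPartitions();  // partitions created on the fly via dynamic partitioning
  writer.close();                              // close all RecordUpdaters
}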
+ protected HiveConf conf; + private StreamingConnection conn; + protected Table tbl; + List inputColumns; + List inputTypes; + private String fullyQualifiedTableName; + private Map> updaters = new HashMap<>(); + private Map partitionPaths = new HashMap<>(); + private Set addedPartitions = new HashSet<>(); + // input OI includes table columns + partition columns + private StructObjectInspector inputRowObjectInspector; + // output OI strips off the partition columns and retains other columns + private ObjectInspector outputRowObjectInspector; + private List partitionColumns = new ArrayList<>(); + private ObjectInspector[] partitionObjInspectors = null; + private StructField[] partitionStructFields = null; + private Object[] partitionFieldData; + private ObjectInspector[] bucketObjInspectors = null; + private StructField[] bucketStructFields = null; + private Object[] bucketFieldData; + private List bucketIds = new ArrayList<>(); + private int totalBuckets; + private String defaultPartitionName; + private boolean isBucketed; + private AcidOutputFormat acidOutputFormat; private Long curBatchMinWriteId; private Long curBatchMaxWriteId; - private static final class TableWriterPair { - private final Table tbl; - private final Path partitionPath; - TableWriterPair(Table t, Path p) { - tbl = t; - partitionPath = p; + @Override + public void init(StreamingConnection conn, long minWriteId, long maxWriteId) throws StreamingException { + if (conn == null) { + throw new StreamingException("Streaming connection cannot be null during record writer initialization"); } - } - /** - * @deprecated As of release 1.3/2.1. Replaced by {@link #AbstractRecordWriter(HiveEndPoint, HiveConf, StreamingConnection)} - */ - protected AbstractRecordWriter(HiveEndPoint endPoint, HiveConf conf) - throws ConnectionError, StreamingException { - this(endPoint, conf, null); - } - protected AbstractRecordWriter(HiveEndPoint endPoint2, HiveConf conf, StreamingConnection conn) - throws StreamingException { - this.endPoint = endPoint2; - this.conf = conf!=null ? conf - : HiveEndPoint.createHiveConf(DelimitedInputWriter.class, endPoint.metaStoreUri); try { - msClient = HCatUtil.getHiveMetastoreClient(this.conf); - UserGroupInformation ugi = conn != null ? conn.getUserGroupInformation() : null; - if (ugi == null) { - this.tbl = msClient.getTable(endPoint.database, endPoint.table); - this.partitionPath = getPathForEndPoint(msClient, endPoint); - } else { - TableWriterPair twp = ugi.doAs( - new PrivilegedExceptionAction() { - @Override - public TableWriterPair run() throws Exception { - return new TableWriterPair(msClient.getTable(endPoint.database, endPoint.table), - getPathForEndPoint(msClient, endPoint)); - } - }); - this.tbl = twp.tbl; - this.partitionPath = twp.partitionPath; - } - this.isBucketed = tbl.getSd().getNumBuckets() > 0; - /** - * For unbucketed tables we have exactly 1 RecrodUpdater for each AbstractRecordWriter which - * ends up writing to a file bucket_000000 - * See also {@link #getBucket(Object)} - */ - this.totalBuckets = isBucketed ? 
tbl.getSd().getNumBuckets() : 1; - if(isBucketed) { - this.bucketIds = getBucketColIDs(tbl.getSd().getBucketCols(), tbl.getSd().getCols()); - this.bucketFieldData = new Object[bucketIds.size()]; - } - else { - bucketIds = Collections.emptyList(); + this.conn = conn; + this.curBatchMinWriteId = minWriteId; + this.curBatchMaxWriteId = maxWriteId; + this.conf = conn.getHiveConf(); + this.defaultPartitionName = conf.getVar(HiveConf.ConfVars.DEFAULTPARTITIONNAME); + final IMetaStoreClient msClient = HiveMetaStoreUtils.getHiveMetastoreClient(this.conf); + this.tbl = msClient.getTable(conn.getDatabase(), conn.getTable()); + this.inputColumns = tbl.getSd().getCols().stream().map(FieldSchema::getName).collect(Collectors.toList()); + this.inputTypes = tbl.getSd().getCols().stream().map(FieldSchema::getType).collect(Collectors.toList()); + if (conn.isPartitionedTable() && conn.isDynamicPartitioning()) { + this.partitionColumns = tbl.getPartitionKeys().stream().map(FieldSchema::getName).collect(Collectors.toList()); + this.inputColumns.addAll(partitionColumns); + this.inputTypes.addAll(tbl.getPartitionKeys().stream().map(FieldSchema::getType).collect(Collectors.toList())); } + this.fullyQualifiedTableName = Warehouse.getQualifiedName(conn.getDatabase(), conn.getTable()); String outFormatName = this.tbl.getSd().getOutputFormat(); - outf = (AcidOutputFormat) ReflectionUtils.newInstance(JavaUtils.loadClass(outFormatName), conf); - } catch(InterruptedException e) { - throw new StreamingException(endPoint2.toString(), e); + this.acidOutputFormat = (AcidOutputFormat) ReflectionUtils.newInstance(JavaUtils.loadClass(outFormatName), conf); } catch (MetaException | NoSuchObjectException e) { - throw new ConnectionError(endPoint2, e); + throw new ConnectionError(conn, e); } catch (TException | ClassNotFoundException | IOException e) { throw new StreamingException(e.getMessage(), e); } + + try { + final AbstractSerDe serDe = createSerde(); + this.inputRowObjectInspector = (StructObjectInspector) serDe.getObjectInspector(); + if (conn.isPartitionedTable() && conn.isDynamicPartitioning()) { + preparePartitioningFields(); + int dpStartCol = inputRowObjectInspector.getAllStructFieldRefs().size() - tbl.getPartitionKeys().size(); + this.outputRowObjectInspector = new SubStructObjectInspector(inputRowObjectInspector, 0, dpStartCol); + } else { + this.outputRowObjectInspector = inputRowObjectInspector; + } + prepareBucketingFields(); + } catch (SerDeException e) { + throw new StreamingException("Unable to create SerDe", e); + } + } + + private void prepareBucketingFields() { + this.isBucketed = tbl.getSd().getNumBuckets() > 0; + // For unbucketed tables we have exactly 1 RecordUpdater (until HIVE-19208) for each AbstractRecordWriter which + // ends up writing to a file bucket_000000. + // See also {@link #getBucket(Object)} + this.totalBuckets = isBucketed ? 
tbl.getSd().getNumBuckets() : 1; + if (isBucketed) { + this.bucketIds = getBucketColIDs(tbl.getSd().getBucketCols(), tbl.getSd().getCols()); + this.bucketFieldData = new Object[bucketIds.size()]; + this.bucketObjInspectors = getObjectInspectorsForBucketedCols(bucketIds, inputRowObjectInspector); + this.bucketStructFields = new StructField[bucketIds.size()]; + List allFields = inputRowObjectInspector.getAllStructFieldRefs(); + for (int i = 0; i < bucketIds.size(); i++) { + bucketStructFields[i] = allFields.get(bucketIds.get(i)); + } + } + } + + private void preparePartitioningFields() { + final int numPartitions = tbl.getPartitionKeys().size(); + this.partitionFieldData = new Object[numPartitions]; + this.partitionObjInspectors = new ObjectInspector[numPartitions]; + int startIdx = inputRowObjectInspector.getAllStructFieldRefs().size() - numPartitions; + int endIdx = inputRowObjectInspector.getAllStructFieldRefs().size(); + int j = 0; + for (int i = startIdx; i < endIdx; i++) { + StructField structField = inputRowObjectInspector.getAllStructFieldRefs().get(i); + partitionObjInspectors[j++] = structField.getFieldObjectInspector(); + } + this.partitionStructFields = new StructField[partitionColumns.size()]; + for (int i = 0; i < partitionColumns.size(); i++) { + String partCol = partitionColumns.get(i); + partitionStructFields[i] = inputRowObjectInspector.getStructFieldRef(partCol); + } } /** - * used to tag error msgs to provied some breadcrumbs + * used to tag error msgs to provide some breadcrumbs */ - String getWatermark() { - return partitionPath + " writeIds[" + curBatchMinWriteId + "," + curBatchMaxWriteId + "]"; + private String getWatermark(String partition) { + return partition + " writeIds[" + curBatchMinWriteId + "," + curBatchMaxWriteId + "]"; } + // return the column numbers of the bucketed columns private List getBucketColIDs(List bucketCols, List cols) { - ArrayList result = new ArrayList(bucketCols.size()); - HashSet bucketSet = new HashSet(bucketCols); + ArrayList result = new ArrayList<>(bucketCols.size()); + HashSet bucketSet = new HashSet<>(bucketCols); for (int i = 0; i < cols.size(); i++) { - if( bucketSet.contains(cols.get(i).getName()) ) { + if (bucketSet.contains(cols.get(i).getName())) { result.add(i); } } return result; } - /** - * Get the SerDe for the Objects created by {@link #encode}. This is public so that test - * frameworks can use it. - * @return serde - * @throws SerializationError - */ - public abstract AbstractSerDe getSerde() throws SerializationError; + public abstract AbstractSerDe createSerde() throws SerializationError; /** * Encode a record as an Object that Hive can read with the ObjectInspector associated with the - * serde returned by {@link #getSerde}. This is public so that test frameworks can use it. + * serde returned by {@link #createSerde}. This is public so that test frameworks can use it. 
+ * * @param record record to be deserialized * @return deserialized record as an Object - * @throws SerializationError + * @throws SerializationError - any error during serialization or deserialization of record */ public abstract Object encode(byte[] record) throws SerializationError; - protected abstract ObjectInspector[] getBucketObjectInspectors(); - protected abstract StructObjectInspector getRecordObjectInspector(); - protected abstract StructField[] getBucketStructFields(); - // returns the bucket number to which the record belongs to - protected int getBucket(Object row) throws SerializationError { - if(!isBucketed) { + private int getBucket(Object row) { + if (!isBucketed) { return 0; } - ObjectInspector[] inspectors = getBucketObjectInspectors(); Object[] bucketFields = getBucketFields(row); - return ObjectInspectorUtils.getBucketNumber(bucketFields, inspectors, totalBuckets); + int bucketingVersion = Utilities.getBucketingVersion( + tbl.getParameters().get(hive_metastoreConstants.TABLE_BUCKETING_VERSION)); + + return bucketingVersion == 2 ? + ObjectInspectorUtils.getBucketNumber(bucketFields, bucketObjInspectors, totalBuckets) : + ObjectInspectorUtils.getBucketNumberOld(bucketFields, bucketObjInspectors, totalBuckets); + } + + private List getPartitionValues(final Object row) { + if (!conn.isPartitionedTable()) { + return null; + } + List partitionValues = new ArrayList<>(); + if (conn.isPartitionedTable() && conn.isDynamicPartitioning()) { + Object[] partitionFields = getPartitionFields(row); + for (int i = 0; i < partitionObjInspectors.length; i++) { + ObjectInspector oi = partitionObjInspectors[i]; + Object field = partitionFields[i]; + Object partitionValue = ObjectInspectorUtils.copyToStandardObject(field, oi, ObjectInspectorUtils + .ObjectInspectorCopyOption.WRITABLE); + if (partitionValue == null || partitionValue.toString().length() == 0) { + partitionValues.add(defaultPartitionName); + } else { + partitionValues.add(partitionValue.toString()); + } + } + } else { + partitionValues = conn.getStaticPartitionValues(); + } + return partitionValues; } @Override public void flush() throws StreamingIOFailure { try { - for (RecordUpdater updater : updaters) { - if (updater != null) { - updater.flush(); + for (Map.Entry> entry : updaters.entrySet()) { + LOG.info("Flushing record updater for partitions: {}", entry.getKey()); + for (RecordUpdater updater : entry.getValue()) { + if (updater != null) { + updater.flush(); + } } } } catch (IOException e) { @@ -202,123 +253,146 @@ public void flush() throws StreamingIOFailure { } @Override - public void clear() throws StreamingIOFailure { - } - - /** - * Creates a new record updater for the new batch - * @param minWriteId smallest writeid in the batch - * @param maxWriteID largest writeid in the batch - * @throws StreamingIOFailure if failed to create record updater - */ - @Override - public void newBatch(Long minWriteId, Long maxWriteID) - throws StreamingIOFailure, SerializationError { - curBatchMinWriteId = minWriteId; - curBatchMaxWriteId = maxWriteID; - updaters = new ArrayList(totalBuckets); - for (int bucket = 0; bucket < totalBuckets; bucket++) { - updaters.add(bucket, null);//so that get(i) returns null rather than ArrayOutOfBounds - } - } - - @Override - public void closeBatch() throws StreamingIOFailure { + public void close() throws StreamingIOFailure { boolean haveError = false; - for (RecordUpdater updater : updaters) { - if (updater != null) { - try { - //try not to leave any files open - updater.close(false); - } catch 
(Exception ex) { - haveError = true; - LOG.error("Unable to close " + updater + " due to: " + ex.getMessage(), ex); + String partition = null; + for (Map.Entry> entry : updaters.entrySet()) { + partition = entry.getKey(); + LOG.info("Closing updater for partitions: {}", partition); + for (RecordUpdater updater : entry.getValue()) { + if (updater != null) { + try { + //try not to leave any files open + updater.close(false); + } catch (Exception ex) { + haveError = true; + LOG.error("Unable to close " + updater + " due to: " + ex.getMessage(), ex); + } } } + entry.getValue().clear(); } updaters.clear(); - if(haveError) { - throw new StreamingIOFailure("Encountered errors while closing (see logs) " + getWatermark()); + if (haveError) { + throw new StreamingIOFailure("Encountered errors while closing (see logs) " + getWatermark(partition)); } } - protected static ObjectInspector[] getObjectInspectorsForBucketedCols(List bucketIds - , StructObjectInspector recordObjInspector) - throws SerializationError { + private static ObjectInspector[] getObjectInspectorsForBucketedCols(List bucketIds + , StructObjectInspector recordObjInspector) { ObjectInspector[] result = new ObjectInspector[bucketIds.size()]; for (int i = 0; i < bucketIds.size(); i++) { int bucketId = bucketIds.get(i); result[i] = - recordObjInspector.getAllStructFieldRefs().get( bucketId ).getFieldObjectInspector(); + recordObjInspector.getAllStructFieldRefs().get(bucketId).getFieldObjectInspector(); } return result; } - - private Object[] getBucketFields(Object row) throws SerializationError { - StructObjectInspector recordObjInspector = getRecordObjectInspector(); - StructField[] bucketStructFields = getBucketStructFields(); + private Object[] getBucketFields(Object row) { for (int i = 0; i < bucketIds.size(); i++) { - bucketFieldData[i] = recordObjInspector.getStructFieldData(row, bucketStructFields[i]); + bucketFieldData[i] = inputRowObjectInspector.getStructFieldData(row, bucketStructFields[i]); } return bucketFieldData; } - private RecordUpdater createRecordUpdater(int bucketId, Long minWriteId, Long maxWriteID) - throws IOException, SerializationError { + private Object[] getPartitionFields(Object row) { + for (int i = 0; i < partitionFieldData.length; i++) { + partitionFieldData[i] = inputRowObjectInspector.getStructFieldData(row, partitionStructFields[i]); + } + return partitionFieldData; + } + + @Override + public void write(final long writeId, final byte[] record) throws StreamingException { try { - // Initialize table properties from the table parameters. This is required because the table - // may define certain table parameters that may be required while writing. The table parameter - // 'transactional_properties' is one such example. 
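// [Editor's note: illustrative sketch, not part of this patch.] A concrete writer plugs into the new
// API by implementing the createSerde() and encode() methods declared above; this sketch assumes a
// delimited (CSV-like) input handled with LazySimpleSerDe, mirroring the removed DelimitedInputWriter.
// The class name is hypothetical; assumed imports are java.util.Properties, org.apache.hadoop.io.BytesWritable,
// org.apache.hadoop.hive.metastore.utils.MetaStoreUtils and the org.apache.hadoop.hive.serde2 classes.
public class CsvRecordWriterSketch extends AbstractRecordWriter {
  private LazySimpleSerDe serde;

  @Override
  public AbstractSerDe createSerde() throws SerializationError {
    try {
      // `tbl` and `conf` are the protected fields populated by init() before createSerde() is called
      Properties tableProps = MetaStoreUtils.getTableMetadata(tbl);
      tableProps.setProperty("field.delim", ",");
      serde = new LazySimpleSerDe();
      SerDeUtils.initializeSerDe(serde, conf, tableProps, null);
      return serde;
    } catch (SerDeException e) {
      throw new SerializationError("Error initializing serde", e);
    }
  }

  @Override
  public Object encode(byte[] record) throws SerializationError {
    try {
      // hand the raw bytes to the serde; the returned Object is what write() passes to the RecordUpdater
      BytesWritable blob = new BytesWritable();
      blob.set(record, 0, record.length);
      return serde.deserialize(blob);
    } catch (SerDeException e) {
      throw new SerializationError("Unable to convert byte[] record into Object", e);
    }
  }
}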
- Properties tblProperties = new Properties(); - tblProperties.putAll(tbl.getParameters()); - return outf.getRecordUpdater(partitionPath, - new AcidOutputFormat.Options(conf) - .inspector(getSerde().getObjectInspector()) - .bucket(bucketId) - .tableProperties(tblProperties) - .minimumWriteId(minWriteId) - .maximumWriteId(maxWriteID) - .statementId(-1) - .finalDestination(partitionPath)); - } catch (SerDeException e) { - throw new SerializationError("Failed to get object inspector from Serde " - + getSerde().getClass().getName(), e); + Object encodedRow = encode(record); + int bucket = getBucket(encodedRow); + List partitionValues = getPartitionValues(encodedRow); + getRecordUpdater(partitionValues, bucket).insert(writeId, encodedRow); + } catch (IOException e) { + throw new StreamingIOFailure("Error writing record in transaction write id (" + + writeId + ")", e); } } - RecordUpdater getRecordUpdater(int bucketId) throws StreamingIOFailure, SerializationError { - RecordUpdater recordUpdater = updaters.get(bucketId); + @Override + public Set getPartitions() { + return addedPartitions; + } + + private RecordUpdater createRecordUpdater(final Path partitionPath, int bucketId, Long minWriteId, + Long maxWriteID) + throws IOException { + // Initialize table properties from the table parameters. This is required because the table + // may define certain table parameters that may be required while writing. The table parameter + // 'transactional_properties' is one such example. + Properties tblProperties = new Properties(); + tblProperties.putAll(tbl.getParameters()); + return acidOutputFormat.getRecordUpdater(partitionPath, + new AcidOutputFormat.Options(conf) + .inspector(outputRowObjectInspector) + .bucket(bucketId) + .tableProperties(tblProperties) + .minimumWriteId(minWriteId) + .maximumWriteId(maxWriteID) + .statementId(-1) + .finalDestination(partitionPath)); + } + + private RecordUpdater getRecordUpdater(List partitionValues, int bucketId) throws StreamingIOFailure { + RecordUpdater recordUpdater; + String key; + Path destLocation; + try { + key = partitionValues == null ? fullyQualifiedTableName : partitionValues.toString(); + // add partition in metastore for dynamic partition. We make a metastore call for every new partition value that + // we encounter even if the partition already exists (the exists check requires a metastore call anyway). + if (partitionPaths.containsKey(key)) { + destLocation = partitionPaths.get(key); + } else { + // un-partitioned table + if (partitionValues == null) { + destLocation = new Path(tbl.getSd().getLocation()); + } else { + PartitionInfo partitionInfo = conn.createPartitionIfNotExists(partitionValues); + // collect the newly added partitions. connection.commitTransaction() will report the dynamically added + // partitions to TxnHandler + if (!partitionInfo.isExists()) { + addedPartitions.add(partitionInfo.getName()); + LOG.info("Created partition {} for table {}", partitionInfo.getName(), fullyQualifiedTableName); + } else { + LOG.info("Partition {} already exists for table {}", partitionInfo.getName(), fullyQualifiedTableName); + } + destLocation = new Path(partitionInfo.getPartitionLocation()); + } + partitionPaths.put(key, destLocation); + } + updaters.computeIfAbsent(key, k -> initializeBuckets()); + recordUpdater = updaters.get(key).get(bucketId); + } catch (StreamingException e) { + throw new StreamingIOFailure("Unable to create partition: " + partitionValues + " for " + conn, e); + } if (recordUpdater == null) { try { - recordUpdater = createRecordUpdater(bucketId, curBatchMinWriteId, curBatchMaxWriteId); + recordUpdater = createRecordUpdater(destLocation, bucketId, curBatchMinWriteId, curBatchMaxWriteId); } catch (IOException e) { - String errMsg = "Failed creating RecordUpdater for " + getWatermark(); + String errMsg = "Failed creating RecordUpdater for " + getWatermark(destLocation.toString()); LOG.error(errMsg, e); throw new StreamingIOFailure(errMsg, e); } - updaters.set(bucketId, recordUpdater); + List partitionUpdaters = updaters.get(key); + partitionUpdaters.set(bucketId, recordUpdater); } return recordUpdater; } - private Path getPathForEndPoint(IMetaStoreClient msClient, HiveEndPoint endPoint) - throws StreamingException { - try { - String location; - if(endPoint.partitionVals==null || endPoint.partitionVals.isEmpty() ) { - location = msClient.getTable(endPoint.database,endPoint.table) - .getSd().getLocation(); - } else { - location = msClient.getPartition(endPoint.database, endPoint.table, - endPoint.partitionVals).getSd().getLocation(); - } - return new Path(location); - } catch (TException e) { - throw new StreamingException(e.getMessage() - + ". Unable to get path for end point: " - + endPoint.partitionVals, e); + private List initializeBuckets() { + List result = new ArrayList<>(totalBuckets); + for (int bucket = 0; bucket < totalBuckets; bucket++) { + result.add(bucket, null); //so that get(i) returns null rather than ArrayOutOfBounds } + return result; } } diff --git a/streaming/src/java/org/apache/hive/streaming/ConnectionError.java b/streaming/src/java/org/apache/hive/streaming/ConnectionError.java index 668bffb..9b6a6aa 100644 --- a/streaming/src/java/org/apache/hive/streaming/ConnectionError.java +++ b/streaming/src/java/org/apache/hive/streaming/ConnectionError.java @@ -20,15 +20,15 @@ public class ConnectionError extends StreamingException { - public ConnectionError(String msg) { + ConnectionError(String msg) { super(msg); } - public ConnectionError(String msg, Exception innerEx) { + ConnectionError(String msg, Exception innerEx) { super(msg, innerEx); } - public ConnectionError(HiveEndPoint endPoint, Exception innerEx) { + ConnectionError(StreamingConnection endPoint, Exception innerEx) { super("Error connecting to " + endPoint + (innerEx == null ? "" : ": " + innerEx.getMessage()), innerEx); } diff --git a/streaming/src/java/org/apache/hive/streaming/ConnectionInfo.java b/streaming/src/java/org/apache/hive/streaming/ConnectionInfo.java new file mode 100644 index 0000000..ca8babf --- /dev/null +++ b/streaming/src/java/org/apache/hive/streaming/ConnectionInfo.java @@ -0,0 +1,76 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hive.streaming; + +import java.util.List; + +/** + * Helper interface to get connection-related information. + */ +public interface ConnectionInfo { + + /** + * Get the metastore URI that the metastore client uses. + * + * @return - metastore URI used by client + */ + String getMetastoreUri(); + + /** + * Get the database used by the streaming connection. + * + * @return - database + */ + String getDatabase(); + + /** + * Get the table used by the streaming connection. + * + * @return - table + */ + String getTable(); + + /** + * Get any static partitions specified during streaming connection creation. + * + * @return - static partition values + */ + List getStaticPartitionValues(); + + /** + * Get whether the specified table is partitioned. + * + * @return - true if the table is partitioned, false otherwise + */ + boolean isPartitionedTable(); + + /** + * Get whether dynamic partitioning is used. + * + * @return - true if dynamic partitioning is used, false otherwise + */ + boolean isDynamicPartitioning(); + + /** + * Get the agent info that is set during streaming connection creation. + * + * @return - agent info + */ + String getAgentInfo(); +} diff --git a/streaming/src/java/org/apache/hive/streaming/DelimitedInputWriter.java b/streaming/src/java/org/apache/hive/streaming/DelimitedInputWriter.java deleted file mode 100644 index 898b3f9..0000000 --- a/streaming/src/java/org/apache/hive/streaming/DelimitedInputWriter.java +++ /dev/null @@ -1,331 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
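// [Editor's note: illustrative sketch, not part of this patch.] A consumer of the new ConnectionInfo
// interface above could surface the connection metadata for diagnostics; the logger and the `info`
// instance are assumed to exist in the caller and are not defined by this patch.
static void logConnectionInfo(org.slf4j.Logger log, ConnectionInfo info) {
  log.info("Streaming to {}.{} via {} (partitioned={}, dynamicPartitioning={}, staticPartitionValues={}, agent={})",
      info.getDatabase(), info.getTable(), info.getMetastoreUri(), info.isPartitionedTable(),
      info.isDynamicPartitioning(), info.getStaticPartitionValues(), info.getAgentInfo());
}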
- */ - -package org.apache.hive.streaming; - - -import com.google.common.annotations.VisibleForTesting; -import org.apache.hadoop.hive.metastore.utils.MetaStoreUtils; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import org.apache.hadoop.hive.conf.HiveConf; -import org.apache.hadoop.hive.metastore.api.FieldSchema; -import org.apache.hadoop.hive.metastore.api.Table; -import org.apache.hadoop.hive.serde2.AbstractSerDe; -import org.apache.hadoop.hive.serde2.SerDeException; -import org.apache.hadoop.hive.serde2.SerDeUtils; -import org.apache.hadoop.hive.serde2.lazy.LazySerDeParameters; -import org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe; -import org.apache.hadoop.hive.serde2.lazy.objectinspector.LazySimpleStructObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.StructField; -import org.apache.hadoop.io.BytesWritable; - -import java.io.IOException; -import java.io.UnsupportedEncodingException; -import java.util.ArrayList; -import java.util.List; -import java.util.Properties; - -/** - * Streaming Writer handles delimited input (eg. CSV). - * Delimited input is parsed & reordered to match column order in table - * Uses Lazy Simple Serde to process delimited input - */ -public class DelimitedInputWriter extends AbstractRecordWriter { - private final boolean reorderingNeeded; - private String delimiter; - private char serdeSeparator; - private int[] fieldToColMapping; - private final ArrayList tableColumns; - private LazySimpleSerDe serde = null; - - private final LazySimpleStructObjectInspector recordObjInspector; - private final ObjectInspector[] bucketObjInspectors; - private final StructField[] bucketStructFields; - - static final private Logger LOG = LoggerFactory.getLogger(DelimitedInputWriter.class.getName()); - - /** Constructor. Uses default separator of the LazySimpleSerde - * @param colNamesForFields Column name assignment for input fields. nulls or empty - * strings in the array indicates the fields to be skipped - * @param delimiter input field delimiter - * @param endPoint Hive endpoint - * @throws ConnectionError Problem talking to Hive - * @throws ClassNotFoundException Serde class not found - * @throws SerializationError Serde initialization/interaction failed - * @throws StreamingException Problem acquiring file system path for partition - * @throws InvalidColumn any element in colNamesForFields refers to a non existing column - */ - public DelimitedInputWriter(String[] colNamesForFields, String delimiter, - HiveEndPoint endPoint, StreamingConnection conn) - throws ClassNotFoundException, ConnectionError, SerializationError, - InvalidColumn, StreamingException { - this(colNamesForFields, delimiter, endPoint, null, conn); - } - /** Constructor. Uses default separator of the LazySimpleSerde - * @param colNamesForFields Column name assignment for input fields. nulls or empty - * strings in the array indicates the fields to be skipped - * @param delimiter input field delimiter - * @param endPoint Hive endpoint - * @param conf a Hive conf object. Can be null if not using advanced hive settings. 
- * @throws ConnectionError Problem talking to Hive - * @throws ClassNotFoundException Serde class not found - * @throws SerializationError Serde initialization/interaction failed - * @throws StreamingException Problem acquiring file system path for partition - * @throws InvalidColumn any element in colNamesForFields refers to a non existing column - */ - public DelimitedInputWriter(String[] colNamesForFields, String delimiter, - HiveEndPoint endPoint, HiveConf conf, StreamingConnection conn) - throws ClassNotFoundException, ConnectionError, SerializationError, - InvalidColumn, StreamingException { - this(colNamesForFields, delimiter, endPoint, conf, - (char) LazySerDeParameters.DefaultSeparators[0], conn); - } - /** - * Constructor. Allows overriding separator of the LazySimpleSerde - * @param colNamesForFields Column name assignment for input fields - * @param delimiter input field delimiter - * @param endPoint Hive endpoint - * @param conf a Hive conf object. Set to null if not using advanced hive settings. - * @param serdeSeparator separator used when encoding data that is fed into the - * LazySimpleSerde. Ensure this separator does not occur - * in the field data - * @param conn connection this Writer is to be used with - * @throws ConnectionError Problem talking to Hive - * @throws ClassNotFoundException Serde class not found - * @throws SerializationError Serde initialization/interaction failed - * @throws StreamingException Problem acquiring file system path for partition - * @throws InvalidColumn any element in colNamesForFields refers to a non existing column - */ - public DelimitedInputWriter(String[] colNamesForFields, String delimiter, - HiveEndPoint endPoint, HiveConf conf, char serdeSeparator, StreamingConnection conn) - throws ClassNotFoundException, ConnectionError, SerializationError, - InvalidColumn, StreamingException { - super(endPoint, conf, conn); - this.tableColumns = getCols(tbl); - this.serdeSeparator = serdeSeparator; - this.delimiter = delimiter; - this.fieldToColMapping = getFieldReordering(colNamesForFields, getTableColumns()); - this.reorderingNeeded = isReorderingNeeded(delimiter, getTableColumns()); - LOG.debug("Field reordering needed = " + this.reorderingNeeded + ", for endpoint " + endPoint); - this.serdeSeparator = serdeSeparator; - this.serde = createSerde(tbl, conf, serdeSeparator); - - // get ObjInspectors for entire record and bucketed cols - try { - this.recordObjInspector = (LazySimpleStructObjectInspector) serde.getObjectInspector(); - this.bucketObjInspectors = getObjectInspectorsForBucketedCols(bucketIds, recordObjInspector); - } catch (SerDeException e) { - throw new SerializationError("Unable to get ObjectInspector for bucket columns", e); - } - - // get StructFields for bucketed cols - bucketStructFields = new StructField[bucketIds.size()]; - List allFields = recordObjInspector.getAllStructFieldRefs(); - for (int i = 0; i < bucketIds.size(); i++) { - bucketStructFields[i] = allFields.get(bucketIds.get(i)); - } - } - /** - * @deprecated As of release 1.3/2.1. Replaced by {@link #DelimitedInputWriter(String[], String, HiveEndPoint, StreamingConnection)} - */ - public DelimitedInputWriter(String[] colNamesForFields, String delimiter, - HiveEndPoint endPoint) - throws ClassNotFoundException, ConnectionError, SerializationError, - InvalidColumn, StreamingException { - this(colNamesForFields, delimiter, endPoint, null, null); - } - /** - * @deprecated As of release 1.3/2.1. 
Replaced by {@link #DelimitedInputWriter(String[], String, HiveEndPoint, HiveConf, StreamingConnection)} - */ - public DelimitedInputWriter(String[] colNamesForFields, String delimiter, - HiveEndPoint endPoint, HiveConf conf) - throws ClassNotFoundException, ConnectionError, SerializationError, - InvalidColumn, StreamingException { - this(colNamesForFields, delimiter, endPoint, conf, - (char) LazySerDeParameters.DefaultSeparators[0], null); - } - /** - * @deprecated As of release 1.3/2.1. Replaced by {@link #DelimitedInputWriter(String[], String, HiveEndPoint, HiveConf, char, StreamingConnection)} - */ - public DelimitedInputWriter(String[] colNamesForFields, String delimiter, - HiveEndPoint endPoint, HiveConf conf, char serdeSeparator) - throws ClassNotFoundException, StreamingException { - this(colNamesForFields, delimiter, endPoint, conf, serdeSeparator, null); - } - - private boolean isReorderingNeeded(String delimiter, ArrayList tableColumns) { - return !( delimiter.equals(String.valueOf(getSerdeSeparator())) - && areFieldsInColOrder(fieldToColMapping) - && tableColumns.size()>=fieldToColMapping.length ); - } - - private static boolean areFieldsInColOrder(int[] fieldToColMapping) { - for(int i=0; i tableColNames) - throws InvalidColumn { - int[] result = new int[ colNamesForFields.length ]; - for(int i=0; itableColNames.size()) { - throw new InvalidColumn("Number of field names exceeds the number of columns in table"); - } - return result; - } - - // Reorder fields in record based on the order of columns in the table - protected byte[] reorderFields(byte[] record) throws UnsupportedEncodingException { - if(!reorderingNeeded) { - return record; - } - String[] reorderedFields = new String[getTableColumns().size()]; - String decoded = new String(record); - String[] fields = decoded.split(delimiter,-1); - for (int i=0; i getTableColumns() { - return tableColumns; - } - - @Override - public void write(long writeId, byte[] record) - throws SerializationError, StreamingIOFailure { - try { - byte[] orderedFields = reorderFields(record); - Object encodedRow = encode(orderedFields); - int bucket = getBucket(encodedRow); - getRecordUpdater(bucket).insert(writeId, encodedRow); - } catch (IOException e) { - throw new StreamingIOFailure("Error writing record in transaction write id (" - + writeId + ")", e); - } - } - - @Override - public AbstractSerDe getSerde() { - return serde; - } - - protected LazySimpleStructObjectInspector getRecordObjectInspector() { - return recordObjInspector; - } - - @Override - protected StructField[] getBucketStructFields() { - return bucketStructFields; - } - - protected ObjectInspector[] getBucketObjectInspectors() { - return bucketObjInspectors; - } - - @Override - public Object encode(byte[] record) throws SerializationError { - try { - BytesWritable blob = new BytesWritable(); - blob.set(record, 0, record.length); - return serde.deserialize(blob); - } catch (SerDeException e) { - throw new SerializationError("Unable to convert byte[] record into Object", e); - } - } - - /** - * Creates LazySimpleSerde - * @return - * @throws SerializationError if serde could not be initialized - * @param tbl - */ - protected static LazySimpleSerDe createSerde(Table tbl, HiveConf conf, char serdeSeparator) - throws SerializationError { - try { - Properties tableProps = MetaStoreUtils.getTableMetadata(tbl); - tableProps.setProperty("field.delim", String.valueOf(serdeSeparator)); - LazySimpleSerDe serde = new LazySimpleSerDe(); - SerDeUtils.initializeSerDe(serde, conf, tableProps, 
null); - return serde; - } catch (SerDeException e) { - throw new SerializationError("Error initializing serde", e); - } - } - - private ArrayList getCols(Table table) { - List cols = table.getSd().getCols(); - ArrayList colNames = new ArrayList(cols.size()); - for (FieldSchema col : cols) { - colNames.add(col.getName().toLowerCase()); - } - return colNames; - } - - public char getSerdeSeparator() { - return serdeSeparator; - } -} diff --git a/streaming/src/java/org/apache/hive/streaming/HeartBeatFailure.java b/streaming/src/java/org/apache/hive/streaming/HeartBeatFailure.java deleted file mode 100644 index b1f9520..0000000 --- a/streaming/src/java/org/apache/hive/streaming/HeartBeatFailure.java +++ /dev/null @@ -1,33 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hive.streaming; - -import java.util.Collection; -import java.util.Set; - -public class HeartBeatFailure extends StreamingException { - private Collection abortedTxns; - private Collection nosuchTxns; - - public HeartBeatFailure(Collection abortedTxns, Set nosuchTxns) { - super("Heart beat error. InvalidTxns: " + nosuchTxns + ". AbortedTxns: " + abortedTxns); - this.abortedTxns = abortedTxns; - this.nosuchTxns = nosuchTxns; - } -} diff --git a/streaming/src/java/org/apache/hive/streaming/HiveEndPoint.java b/streaming/src/java/org/apache/hive/streaming/HiveEndPoint.java deleted file mode 100644 index b04e137..0000000 --- a/streaming/src/java/org/apache/hive/streaming/HiveEndPoint.java +++ /dev/null @@ -1,1117 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.hive.streaming; - -import java.io.IOException; -import java.security.PrivilegedExceptionAction; -import java.util.ArrayList; -import java.util.Collection; -import java.util.Collections; -import java.util.List; - -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.hive.cli.CliSessionState; -import org.apache.hadoop.hive.common.JavaUtils; -import org.apache.hadoop.hive.conf.HiveConf; -import org.apache.hadoop.hive.metastore.IMetaStoreClient; -import org.apache.hadoop.hive.metastore.LockComponentBuilder; -import org.apache.hadoop.hive.metastore.LockRequestBuilder; -import org.apache.hadoop.hive.metastore.Warehouse; -import org.apache.hadoop.hive.metastore.api.DataOperationType; -import org.apache.hadoop.hive.metastore.api.FieldSchema; -import org.apache.hadoop.hive.metastore.api.HeartbeatTxnRangeResponse; -import org.apache.hadoop.hive.metastore.api.LockRequest; -import org.apache.hadoop.hive.metastore.api.LockResponse; -import org.apache.hadoop.hive.metastore.api.LockState; -import org.apache.hadoop.hive.metastore.api.MetaException; -import org.apache.hadoop.hive.metastore.api.NoSuchObjectException; -import org.apache.hadoop.hive.metastore.api.NoSuchTxnException; -import org.apache.hadoop.hive.metastore.api.Table; -import org.apache.hadoop.hive.metastore.api.TxnAbortedException; -import org.apache.hadoop.hive.metastore.api.TxnToWriteId; -import org.apache.hadoop.hive.ql.DriverFactory; -import org.apache.hadoop.hive.ql.IDriver; -import org.apache.hadoop.hive.ql.io.AcidUtils; -import org.apache.hadoop.hive.ql.session.SessionState; -import org.apache.hadoop.security.UserGroupInformation; -import org.apache.hive.hcatalog.common.HCatUtil; -import org.apache.thrift.TException; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * Information about the hive end point (i.e. table or partition) to write to. - * A light weight object that does NOT internally hold on to resources such as - * network connections. It can be stored in Hashed containers such as sets and hash tables. - */ -public class HiveEndPoint { - public final String metaStoreUri; - public final String database; - public final String table; - public final ArrayList partitionVals; - - - static final private Logger LOG = LoggerFactory.getLogger(HiveEndPoint.class.getName()); - - /** - * - * @param metaStoreUri URI of the metastore to connect to eg: thrift://localhost:9083 - * @param database Name of the Hive database - * @param table Name of table to stream to - * @param partitionVals Indicates the specific partition to stream to. Can be null or empty List - * if streaming to a table without partitions. The order of values in this - * list must correspond exactly to the order of partition columns specified - * during the table creation. E.g. For a table partitioned by - * (continent string, country string), partitionVals could be the list - * ("Asia", "India"). - */ - public HiveEndPoint(String metaStoreUri - , String database, String table, List partitionVals) { - this.metaStoreUri = metaStoreUri; - if (database==null) { - throw new IllegalArgumentException("Database cannot be null for HiveEndPoint"); - } - this.database = database; - this.table = table; - if (table==null) { - throw new IllegalArgumentException("Table cannot be null for HiveEndPoint"); - } - this.partitionVals = partitionVals==null ? new ArrayList() - : new ArrayList( partitionVals ); - } - - - /** - * @deprecated As of release 1.3/2.1. 
Replaced by {@link #newConnection(boolean, String)} - */ - @Deprecated - public StreamingConnection newConnection(final boolean createPartIfNotExists) - throws ConnectionError, InvalidPartition, InvalidTable, PartitionCreationFailed - , ImpersonationFailed , InterruptedException { - return newConnection(createPartIfNotExists, null, null, null); - } - /** - * @deprecated As of release 1.3/2.1. Replaced by {@link #newConnection(boolean, HiveConf, String)} - */ - @Deprecated - public StreamingConnection newConnection(final boolean createPartIfNotExists, HiveConf conf) - throws ConnectionError, InvalidPartition, InvalidTable, PartitionCreationFailed - , ImpersonationFailed , InterruptedException { - return newConnection(createPartIfNotExists, conf, null, null); - } - /** - * @deprecated As of release 1.3/2.1. Replaced by {@link #newConnection(boolean, HiveConf, UserGroupInformation, String)} - */ - @Deprecated - public StreamingConnection newConnection(final boolean createPartIfNotExists, final HiveConf conf, - final UserGroupInformation authenticatedUser) - throws ConnectionError, InvalidPartition, - InvalidTable, PartitionCreationFailed, ImpersonationFailed , InterruptedException { - return newConnection(createPartIfNotExists, conf, authenticatedUser, null); - } - /** - * Acquire a new connection to MetaStore for streaming - * @param createPartIfNotExists If true, the partition specified in the endpoint - * will be auto created if it does not exist - * @param agentInfo should uniquely identify the process/entity that is using this batch. This - * should be something that can be correlated with calling application log files - * and/or monitoring consoles. - * @return - * @throws ConnectionError if problem connecting - * @throws InvalidPartition if specified partition is not valid (createPartIfNotExists = false) - * @throws ImpersonationFailed if not able to impersonate 'proxyUser' - * @throws PartitionCreationFailed if failed to create partition - * @throws InterruptedException - */ - public StreamingConnection newConnection(final boolean createPartIfNotExists, String agentInfo) - throws ConnectionError, InvalidPartition, InvalidTable, PartitionCreationFailed - , ImpersonationFailed , InterruptedException { - return newConnection(createPartIfNotExists, null, null, agentInfo); - } - - /** - * Acquire a new connection to MetaStore for streaming - * @param createPartIfNotExists If true, the partition specified in the endpoint - * will be auto created if it does not exist - * @param conf HiveConf object, set it to null if not using advanced hive settings. - * @param agentInfo should uniquely identify the process/entity that is using this batch. This - * should be something that can be correlated with calling application log files - * and/or monitoring consoles. - * @return - * @throws ConnectionError if problem connecting - * @throws InvalidPartition if specified partition is not valid (createPartIfNotExists = false) - * @throws ImpersonationFailed if not able to impersonate 'proxyUser' - * @throws PartitionCreationFailed if failed to create partition - * @throws InterruptedException - */ - public StreamingConnection newConnection(final boolean createPartIfNotExists, HiveConf conf, String agentInfo) - throws ConnectionError, InvalidPartition, InvalidTable, PartitionCreationFailed - , ImpersonationFailed , InterruptedException { - return newConnection(createPartIfNotExists, conf, null, agentInfo); - } - - /** - * Acquire a new connection to MetaStore for streaming. 
To connect using Kerberos, - * 'authenticatedUser' argument should have been used to do a kerberos login. Additionally the - * 'hive.metastore.kerberos.principal' setting should be set correctly either in hive-site.xml or - * in the 'conf' argument (if not null). If using hive-site.xml, it should be in classpath. - * - * @param createPartIfNotExists If true, the partition specified in the endpoint - * will be auto created if it does not exist - * @param conf HiveConf object to be used for the connection. Can be null. - * @param authenticatedUser UserGroupInformation object obtained from successful authentication. - * Uses non-secure mode if this argument is null. - * @param agentInfo should uniquely identify the process/entity that is using this batch. This - * should be something that can be correlated with calling application log files - * and/or monitoring consoles. - * @return - * @throws ConnectionError if there is a connection problem - * @throws InvalidPartition if specified partition is not valid (createPartIfNotExists = false) - * @throws ImpersonationFailed if not able to impersonate 'username' - * @throws PartitionCreationFailed if failed to create partition - * @throws InterruptedException - */ - public StreamingConnection newConnection(final boolean createPartIfNotExists, final HiveConf conf, - final UserGroupInformation authenticatedUser, final String agentInfo) - throws ConnectionError, InvalidPartition, - InvalidTable, PartitionCreationFailed, ImpersonationFailed , InterruptedException { - - if( authenticatedUser==null ) { - return newConnectionImpl(authenticatedUser, createPartIfNotExists, conf, agentInfo); - } - - try { - return authenticatedUser.doAs ( - new PrivilegedExceptionAction() { - @Override - public StreamingConnection run() - throws ConnectionError, InvalidPartition, InvalidTable - , PartitionCreationFailed { - return newConnectionImpl(authenticatedUser, createPartIfNotExists, conf, agentInfo); - } - } - ); - } catch (IOException e) { - throw new ConnectionError("Failed to connect as : " + authenticatedUser.getShortUserName(), e); - } - } - - private StreamingConnection newConnectionImpl(UserGroupInformation ugi, - boolean createPartIfNotExists, HiveConf conf, String agentInfo) - throws ConnectionError, InvalidPartition, InvalidTable - , PartitionCreationFailed { - return new ConnectionImpl(this, ugi, conf, createPartIfNotExists, agentInfo); - } - - private static UserGroupInformation getUserGroupInfo(String user) - throws ImpersonationFailed { - try { - return UserGroupInformation.createProxyUser( - user, UserGroupInformation.getLoginUser()); - } catch (IOException e) { - LOG.error("Unable to get UserGroupInfo for user : " + user, e); - throw new ImpersonationFailed(user,e); - } - } - - - @Override - public boolean equals(Object o) { - if (this == o) { - return true; - } - if (o == null || getClass() != o.getClass()) { - return false; - } - - HiveEndPoint endPoint = (HiveEndPoint) o; - - if (database != null - ? !database.equals(endPoint.database) - : endPoint.database != null ) { - return false; - } - if (metaStoreUri != null - ? !metaStoreUri.equals(endPoint.metaStoreUri) - : endPoint.metaStoreUri != null ) { - return false; - } - if (!partitionVals.equals(endPoint.partitionVals)) { - return false; - } - if (table != null ? !table.equals(endPoint.table) : endPoint.table != null) { - return false; - } - - return true; - } - - @Override - public int hashCode() { - int result = metaStoreUri != null ? 
metaStoreUri.hashCode() : 0; - result = 31 * result + (database != null ? database.hashCode() : 0); - result = 31 * result + (table != null ? table.hashCode() : 0); - result = 31 * result + partitionVals.hashCode(); - return result; - } - - @Override - public String toString() { - return "{" + - "metaStoreUri='" + metaStoreUri + '\'' + - ", database='" + database + '\'' + - ", table='" + table + '\'' + - ", partitionVals=" + partitionVals + " }"; - } - - - private static class ConnectionImpl implements StreamingConnection { - private final IMetaStoreClient msClient; - private final IMetaStoreClient heartbeaterMSClient; - private final HiveEndPoint endPt; - private final UserGroupInformation ugi; - private final String username; - private final boolean secureMode; - private final String agentInfo; - - /** - * @param endPoint end point to connect to - * @param ugi on behalf of whom streaming is done. cannot be null - * @param conf HiveConf object - * @param createPart create the partition if it does not exist - * @throws ConnectionError if there is trouble connecting - * @throws InvalidPartition if specified partition does not exist (and createPart=false) - * @throws InvalidTable if specified table does not exist - * @throws PartitionCreationFailed if createPart=true and not able to create partition - */ - private ConnectionImpl(HiveEndPoint endPoint, UserGroupInformation ugi, - HiveConf conf, boolean createPart, String agentInfo) - throws ConnectionError, InvalidPartition, InvalidTable - , PartitionCreationFailed { - this.endPt = endPoint; - this.ugi = ugi; - this.agentInfo = agentInfo; - this.username = ugi==null ? System.getProperty("user.name") : ugi.getShortUserName(); - if (conf==null) { - conf = HiveEndPoint.createHiveConf(this.getClass(), endPoint.metaStoreUri); - } - else { - overrideConfSettings(conf); - } - this.secureMode = ugi==null ? false : ugi.hasKerberosCredentials(); - this.msClient = getMetaStoreClient(endPoint, conf, secureMode); - // We use a separate metastore client for heartbeat calls to ensure heartbeat RPC calls are - // isolated from the other transaction related RPC calls. 
- this.heartbeaterMSClient = getMetaStoreClient(endPoint, conf, secureMode); - checkEndPoint(endPoint, msClient); - if (createPart && !endPoint.partitionVals.isEmpty()) { - createPartitionIfNotExists(endPoint, msClient, conf); - } - } - - /** - * Checks the validity of endpoint - * @param endPoint the HiveEndPoint to be checked - * @param msClient the metastore client - * @throws InvalidTable - */ - private void checkEndPoint(HiveEndPoint endPoint, IMetaStoreClient msClient) - throws InvalidTable, ConnectionError { - Table t; - try { - t = msClient.getTable(endPoint.database, endPoint.table); - } catch (Exception e) { - LOG.warn("Unable to check the endPoint: " + endPoint, e); - throw new InvalidTable(endPoint.database, endPoint.table, e); - } - // 1 - check that the table is Acid - if (!AcidUtils.isFullAcidTable(t)) { - LOG.error("HiveEndPoint " + endPoint + " must use an acid table"); - throw new InvalidTable(endPoint.database, endPoint.table, "is not an Acid table"); - } - - // 2 - check if partitionvals are legitimate - if (t.getPartitionKeys() != null && !t.getPartitionKeys().isEmpty() - && endPoint.partitionVals.isEmpty()) { - // Invalid if table is partitioned, but endPoint's partitionVals is empty - String errMsg = "HiveEndPoint " + endPoint + " doesn't specify any partitions for " + - "partitioned table"; - LOG.error(errMsg); - throw new ConnectionError(errMsg); - } - if ((t.getPartitionKeys() == null || t.getPartitionKeys().isEmpty()) - && !endPoint.partitionVals.isEmpty()) { - // Invalid if table is not partitioned, but endPoint's partitionVals is not empty - String errMsg = "HiveEndPoint" + endPoint + " specifies partitions for unpartitioned table"; - LOG.error(errMsg); - throw new ConnectionError(errMsg); - } - } - - /** - * Close connection - */ - @Override - public void close() { - if (ugi==null) { - msClient.close(); - heartbeaterMSClient.close(); - return; - } - try { - ugi.doAs ( - new PrivilegedExceptionAction() { - @Override - public Void run() throws Exception { - msClient.close(); - heartbeaterMSClient.close(); - return null; - } - } ); - try { - FileSystem.closeAllForUGI(ugi); - } catch (IOException exception) { - LOG.error("Could not clean up file-system handles for UGI: " + ugi, exception); - } - } catch (IOException e) { - LOG.error("Error closing connection to " + endPt, e); - } catch (InterruptedException e) { - LOG.error("Interrupted when closing connection to " + endPt, e); - } - } - - @Override - public UserGroupInformation getUserGroupInformation() { - return ugi; - } - - /** - * Acquires a new batch of transactions from Hive. - * - * @param numTransactions is a hint from client indicating how many transactions client needs. - * @param recordWriter Used to write record. The same writer instance can - * be shared with another TransactionBatch (to the same endpoint) - * only after the first TransactionBatch has been closed. - * Writer will be closed when the TransactionBatch is closed. 
- * @return - * @throws StreamingIOFailure if failed to create new RecordUpdater for batch - * @throws TransactionBatchUnAvailable if failed to acquire a new Transaction batch - * @throws ImpersonationFailed failed to run command as proxyUser - * @throws InterruptedException - */ - @Override - public TransactionBatch fetchTransactionBatch(final int numTransactions, - final RecordWriter recordWriter) - throws StreamingException, TransactionBatchUnAvailable, ImpersonationFailed - , InterruptedException { - if (ugi==null) { - return fetchTransactionBatchImpl(numTransactions, recordWriter); - } - try { - return ugi.doAs ( - new PrivilegedExceptionAction() { - @Override - public TransactionBatch run() throws StreamingException, InterruptedException { - return fetchTransactionBatchImpl(numTransactions, recordWriter); - } - } - ); - } catch (IOException e) { - throw new ImpersonationFailed("Failed to fetch Txn Batch as user '" + ugi.getShortUserName() - + "' when acquiring Transaction Batch on endPoint " + endPt, e); - } - } - - private TransactionBatch fetchTransactionBatchImpl(int numTransactions, - RecordWriter recordWriter) - throws StreamingException, TransactionBatchUnAvailable, InterruptedException { - return new TransactionBatchImpl(username, ugi, endPt, numTransactions, msClient, - heartbeaterMSClient, recordWriter, agentInfo); - } - - - private static void createPartitionIfNotExists(HiveEndPoint ep, - IMetaStoreClient msClient, HiveConf conf) - throws InvalidTable, PartitionCreationFailed { - if (ep.partitionVals.isEmpty()) { - return; - } - SessionState localSession = null; - if(SessionState.get() == null) { - localSession = SessionState.start(new CliSessionState(conf)); - } - IDriver driver = DriverFactory.newDriver(conf); - - try { - if (LOG.isDebugEnabled()) { - LOG.debug("Attempting to create partition (if not existent) " + ep); - } - - List partKeys = msClient.getTable(ep.database, ep.table) - .getPartitionKeys(); - runDDL(driver, "use " + ep.database); - String query = "alter table " + ep.table + " add if not exists partition " - + partSpecStr(partKeys, ep.partitionVals); - runDDL(driver, query); - } catch (MetaException e) { - LOG.error("Failed to create partition : " + ep, e); - throw new PartitionCreationFailed(ep, e); - } catch (NoSuchObjectException e) { - LOG.error("Failed to create partition : " + ep, e); - throw new InvalidTable(ep.database, ep.table); - } catch (TException e) { - LOG.error("Failed to create partition : " + ep, e); - throw new PartitionCreationFailed(ep, e); - } catch (QueryFailedException e) { - LOG.error("Failed to create partition : " + ep, e); - throw new PartitionCreationFailed(ep, e); - } finally { - driver.close(); - try { - if(localSession != null) { - localSession.close(); - } - } catch (IOException e) { - LOG.warn("Error closing SessionState used to run Hive DDL."); - } - } - } - - private static boolean runDDL(IDriver driver, String sql) throws QueryFailedException { - if (LOG.isDebugEnabled()) { - LOG.debug("Running Hive Query: " + sql); - } - driver.run(sql); - return true; - } - - private static String partSpecStr(List partKeys, ArrayList partVals) { - if (partKeys.size()!=partVals.size()) { - throw new IllegalArgumentException("Partition values:" + partVals + - ", does not match the partition Keys in table :" + partKeys ); - } - StringBuilder buff = new StringBuilder(partKeys.size()*20); - buff.append(" ( "); - int i=0; - for (FieldSchema schema : partKeys) { - buff.append(schema.getName()); - buff.append("='"); - 
buff.append(partVals.get(i)); - buff.append("'"); - if (i!=partKeys.size()-1) { - buff.append(","); - } - ++i; - } - buff.append(" )"); - return buff.toString(); - } - - private static IMetaStoreClient getMetaStoreClient(HiveEndPoint endPoint, HiveConf conf, boolean secureMode) - throws ConnectionError { - - if (endPoint.metaStoreUri!= null) { - conf.setVar(HiveConf.ConfVars.METASTOREURIS, endPoint.metaStoreUri); - } - if(secureMode) { - conf.setBoolVar(HiveConf.ConfVars.METASTORE_USE_THRIFT_SASL,true); - } - try { - return HCatUtil.getHiveMetastoreClient(conf); - } catch (MetaException e) { - throw new ConnectionError("Error connecting to Hive Metastore URI: " - + endPoint.metaStoreUri + ". " + e.getMessage(), e); - } catch (IOException e) { - throw new ConnectionError("Error connecting to Hive Metastore URI: " - + endPoint.metaStoreUri + ". " + e.getMessage(), e); - } - } - } // class ConnectionImpl - - private static class TransactionBatchImpl implements TransactionBatch { - private final String username; - private final UserGroupInformation ugi; - private final HiveEndPoint endPt; - private final IMetaStoreClient msClient; - private final IMetaStoreClient heartbeaterMSClient; - private final RecordWriter recordWriter; - private final List txnToWriteIds; - - //volatile because heartbeat() may be in a "different" thread; updates of this are "piggybacking" - private volatile int currentTxnIndex = -1; - private final String partNameForLock; - //volatile because heartbeat() may be in a "different" thread - private volatile TxnState state; - private LockRequest lockRequest = null; - /** - * once any operation on this batch encounters a system exception - * (e.g. IOException on write) it's safest to assume that we can't write to the - * file backing this batch any more. 
This guards important public methods - */ - private volatile boolean isClosed = false; - private final String agentInfo; - /** - * Tracks the state of each transaction - */ - private final TxnState[] txnStatus; - /** - * ID of the last txn used by {@link #beginNextTransactionImpl()} - */ - private long lastTxnUsed; - - /** - * Represents a batch of transactions acquired from MetaStore - * - * @throws StreamingException if failed to create new RecordUpdater for batch - * @throws TransactionBatchUnAvailable if failed to acquire a new Transaction batch - */ - private TransactionBatchImpl(final String user, UserGroupInformation ugi, HiveEndPoint endPt, - final int numTxns, final IMetaStoreClient msClient, - final IMetaStoreClient heartbeaterMSClient, RecordWriter recordWriter, String agentInfo) - throws StreamingException, TransactionBatchUnAvailable, InterruptedException { - boolean success = false; - try { - if ( endPt.partitionVals!=null && !endPt.partitionVals.isEmpty() ) { - Table tableObj = msClient.getTable(endPt.database, endPt.table); - List partKeys = tableObj.getPartitionKeys(); - partNameForLock = Warehouse.makePartName(partKeys, endPt.partitionVals); - } else { - partNameForLock = null; - } - this.username = user; - this.ugi = ugi; - this.endPt = endPt; - this.msClient = msClient; - this.heartbeaterMSClient = heartbeaterMSClient; - this.recordWriter = recordWriter; - this.agentInfo = agentInfo; - - List txnIds = openTxnImpl(msClient, user, numTxns, ugi); - txnToWriteIds = allocateWriteIdsImpl(msClient, txnIds, ugi); - assert(txnToWriteIds.size() == numTxns); - - txnStatus = new TxnState[numTxns]; - for(int i = 0; i < txnStatus.length; i++) { - assert(txnToWriteIds.get(i).getTxnId() == txnIds.get(i)); - txnStatus[i] = TxnState.OPEN;//Open matches Metastore state - } - this.state = TxnState.INACTIVE; - - // The Write Ids returned for the transaction batch is also sequential - recordWriter.newBatch(txnToWriteIds.get(0).getWriteId(), txnToWriteIds.get(numTxns-1).getWriteId()); - success = true; - } catch (TException e) { - throw new TransactionBatchUnAvailable(endPt, e); - } catch (IOException e) { - throw new TransactionBatchUnAvailable(endPt, e); - } - finally { - //clean up if above throws - markDead(success); - } - } - - private List openTxnImpl(final IMetaStoreClient msClient, final String user, final int numTxns, UserGroupInformation ugi) - throws IOException, TException, InterruptedException { - if(ugi==null) { - return msClient.openTxns(user, numTxns).getTxn_ids(); - } - return (List) ugi.doAs(new PrivilegedExceptionAction() { - @Override - public Object run() throws Exception { - return msClient.openTxns(user, numTxns).getTxn_ids(); - } - }); - } - - private List allocateWriteIdsImpl(final IMetaStoreClient msClient, - final List txnIds, UserGroupInformation ugi) - throws IOException, TException, InterruptedException { - if(ugi==null) { - return msClient.allocateTableWriteIdsBatch(txnIds, endPt.database, endPt.table); - } - return (List) ugi.doAs(new PrivilegedExceptionAction() { - @Override - public Object run() throws Exception { - return msClient.allocateTableWriteIdsBatch(txnIds, endPt.database, endPt.table); - } - }); - } - - @Override - public String toString() { - if (txnToWriteIds==null || txnToWriteIds.isEmpty()) { - return "{}"; - } - StringBuilder sb = new StringBuilder(" TxnStatus["); - for(TxnState state : txnStatus) { - //'state' should not be null - future proofing - sb.append(state == null ? 
"N" : state); - } - sb.append("] LastUsed ").append(JavaUtils.txnIdToString(lastTxnUsed)); - return "TxnId/WriteIds=[" + txnToWriteIds.get(0).getTxnId() - + "/" + txnToWriteIds.get(0).getWriteId() - + "..." - + txnToWriteIds.get(txnToWriteIds.size()-1).getTxnId() - + "/" + txnToWriteIds.get(txnToWriteIds.size()-1).getWriteId() - + "] on endPoint = " + endPt + "; " + sb; - } - - /** - * Activate the next available transaction in the current transaction batch - * @throws TransactionError failed to switch to next transaction - */ - @Override - public void beginNextTransaction() throws TransactionError, ImpersonationFailed, - InterruptedException { - checkIsClosed(); - if (ugi==null) { - beginNextTransactionImpl(); - return; - } - try { - ugi.doAs ( - new PrivilegedExceptionAction() { - @Override - public Void run() throws TransactionError { - beginNextTransactionImpl(); - return null; - } - } - ); - } catch (IOException e) { - throw new ImpersonationFailed("Failed switching to next Txn as user '" + username + - "' in Txn batch :" + this, e); - } - } - - private void beginNextTransactionImpl() throws TransactionError { - state = TxnState.INACTIVE;//clear state from previous txn - - if ((currentTxnIndex + 1) >= txnToWriteIds.size()) { - throw new InvalidTrasactionState("No more transactions available in" + - " current batch for end point : " + endPt); - } - ++currentTxnIndex; - state = TxnState.OPEN; - lastTxnUsed = getCurrentTxnId(); - lockRequest = createLockRequest(endPt, partNameForLock, username, getCurrentTxnId(), agentInfo); - try { - LockResponse res = msClient.lock(lockRequest); - if (res.getState() != LockState.ACQUIRED) { - throw new TransactionError("Unable to acquire lock on " + endPt); - } - } catch (TException e) { - throw new TransactionError("Unable to acquire lock on " + endPt, e); - } - } - - /** - * Get Id of currently open transaction. - * @return -1 if there is no open TX - */ - @Override - public Long getCurrentTxnId() { - if (currentTxnIndex >= 0) { - return txnToWriteIds.get(currentTxnIndex).getTxnId(); - } - return -1L; - } - - /** - * Get Id of currently open transaction. - * @return -1 if there is no open TX - */ - @Override - public Long getCurrentWriteId() { - if (currentTxnIndex >= 0) { - return txnToWriteIds.get(currentTxnIndex).getWriteId(); - } - return -1L; - } - - /** - * get state of current transaction - * @return - */ - @Override - public TxnState getCurrentTransactionState() { - return state; - } - - /** - * Remaining transactions are the ones that are not committed or aborted or active. - * Active transaction is not considered part of remaining txns. - * @return number of transactions remaining this batch. 
- */ - @Override - public int remainingTransactions() { - if (currentTxnIndex>=0) { - return txnToWriteIds.size() - currentTxnIndex -1; - } - return txnToWriteIds.size(); - } - - - /** - * Write record using RecordWriter - * @param record the data to be written - * @throws StreamingIOFailure I/O failure - * @throws SerializationError serialization error - * @throws ImpersonationFailed error writing on behalf of proxyUser - * @throws InterruptedException - */ - @Override - public void write(final byte[] record) - throws StreamingException, InterruptedException { - write(Collections.singletonList(record)); - } - private void checkIsClosed() throws IllegalStateException { - if(isClosed) { - throw new IllegalStateException("TransactionBatch " + toString() + " has been closed()"); - } - } - /** - * A transaction batch opens a single HDFS file and writes multiple transaction to it. If there is any issue - * with the write, we can't continue to write to the same file any as it may be corrupted now (at the tail). - * This ensures that a client can't ignore these failures and continue to write. - */ - private void markDead(boolean success) { - if(success) { - return; - } - isClosed = true;//also ensures that heartbeat() is no-op since client is likely doing it async - try { - abort(true);//abort all remaining txns - } - catch(Exception ex) { - LOG.error("Fatal error on " + toString() + "; cause " + ex.getMessage(), ex); - } - try { - closeImpl(); - } - catch (Exception ex) { - LOG.error("Fatal error on " + toString() + "; cause " + ex.getMessage(), ex); - } - } - - - /** - * Write records using RecordWriter - * @param records collection of rows to be written - * @throws StreamingException serialization error - * @throws ImpersonationFailed error writing on behalf of proxyUser - * @throws InterruptedException - */ - @Override - public void write(final Collection records) - throws StreamingException, InterruptedException, - ImpersonationFailed { - checkIsClosed(); - boolean success = false; - try { - if (ugi == null) { - writeImpl(records); - } else { - ugi.doAs( - new PrivilegedExceptionAction() { - @Override - public Void run() throws StreamingException { - writeImpl(records); - return null; - } - } - ); - } - success = true; - } catch(SerializationError ex) { - //this exception indicates that a {@code record} could not be parsed and the - //caller can decide whether to drop it or send it to dead letter queue. - //rolling back the txn and retrying won't help since the tuple will be exactly the same - //when it's replayed. - success = true; - throw ex; - } catch(IOException e){ - throw new ImpersonationFailed("Failed writing as user '" + username + - "' to endPoint :" + endPt + ". 
Transaction Id: " - + getCurrentTxnId(), e); - } - finally { - markDead(success); - } - } - - private void writeImpl(Collection records) - throws StreamingException { - for (byte[] record : records) { - recordWriter.write(getCurrentWriteId(), record); - } - } - - - /** - * Commit the currently open transaction - * @throws TransactionError - * @throws StreamingIOFailure if flushing records failed - * @throws ImpersonationFailed if - * @throws InterruptedException - */ - @Override - public void commit() throws TransactionError, StreamingException, - ImpersonationFailed, InterruptedException { - checkIsClosed(); - boolean success = false; - try { - if (ugi == null) { - commitImpl(); - } - else { - ugi.doAs( - new PrivilegedExceptionAction() { - @Override - public Void run() throws StreamingException { - commitImpl(); - return null; - } - } - ); - } - success = true; - } catch (IOException e) { - throw new ImpersonationFailed("Failed committing Txn ID " + getCurrentTxnId() + " as user '" - + username + "'on endPoint :" + endPt + ". Transaction Id: ", e); - } - finally { - markDead(success); - } - } - - private void commitImpl() throws TransactionError, StreamingException { - try { - recordWriter.flush(); - msClient.commitTxn(txnToWriteIds.get(currentTxnIndex).getTxnId()); - state = TxnState.COMMITTED; - txnStatus[currentTxnIndex] = TxnState.COMMITTED; - } catch (NoSuchTxnException e) { - throw new TransactionError("Invalid transaction id : " - + getCurrentTxnId(), e); - } catch (TxnAbortedException e) { - throw new TransactionError("Aborted transaction cannot be committed" - , e); - } catch (TException e) { - throw new TransactionError("Unable to commit transaction" - + getCurrentTxnId(), e); - } - } - - /** - * Abort the currently open transaction - * @throws TransactionError - */ - @Override - public void abort() throws TransactionError, StreamingException - , ImpersonationFailed, InterruptedException { - if(isClosed) { - /** - * isDead is only set internally by this class. {@link #markDead(boolean)} will abort all - * remaining txns, so make this no-op to make sure that a well-behaved client that calls abort() - * error doesn't get misleading errors - */ - return; - } - abort(false); - } - private void abort(final boolean abortAllRemaining) throws TransactionError, StreamingException - , ImpersonationFailed, InterruptedException { - if (ugi==null) { - abortImpl(abortAllRemaining); - return; - } - try { - ugi.doAs ( - new PrivilegedExceptionAction() { - @Override - public Void run() throws StreamingException { - abortImpl(abortAllRemaining); - return null; - } - } - ); - } catch (IOException e) { - throw new ImpersonationFailed("Failed aborting Txn " + getCurrentTxnId() + " as user '" - + username + "' on endPoint :" + endPt, e); - } - } - - private void abortImpl(boolean abortAllRemaining) throws TransactionError, StreamingException { - try { - if(abortAllRemaining) { - //when last txn finished (abort/commit) the currentTxnIndex is pointing at that txn - //so we need to start from next one, if any. Also if batch was created but - //fetchTransactionBatch() was never called, we want to start with first txn - int minOpenTxnIndex = Math.max(currentTxnIndex + - (state == TxnState.ABORTED || state == TxnState.COMMITTED ? 
1 : 0), 0); - for(currentTxnIndex = minOpenTxnIndex; - currentTxnIndex < txnToWriteIds.size(); currentTxnIndex++) { - msClient.rollbackTxn(txnToWriteIds.get(currentTxnIndex).getTxnId()); - txnStatus[currentTxnIndex] = TxnState.ABORTED; - } - currentTxnIndex--;//since the loop left it == txnToWriteIds.size() - } - else { - if (getCurrentTxnId() > 0) { - msClient.rollbackTxn(getCurrentTxnId()); - txnStatus[currentTxnIndex] = TxnState.ABORTED; - } - } - state = TxnState.ABORTED; - recordWriter.clear(); - } catch (NoSuchTxnException e) { - throw new TransactionError("Unable to abort invalid transaction id : " - + getCurrentTxnId(), e); - } catch (TException e) { - throw new TransactionError("Unable to abort transaction id : " - + getCurrentTxnId(), e); - } - } - - @Override - public void heartbeat() throws StreamingException, HeartBeatFailure { - if(isClosed) { - return; - } - if(state != TxnState.OPEN && currentTxnIndex >= txnToWriteIds.size() - 1) { - //here means last txn in the batch is resolved but the close() hasn't been called yet so - //there is nothing to heartbeat - return; - } - //if here after commit()/abort() but before next beginNextTransaction(), currentTxnIndex still - //points at the last txn which we don't want to heartbeat - Long first = txnToWriteIds.get(state == TxnState.OPEN ? currentTxnIndex : currentTxnIndex + 1).getTxnId(); - Long last = txnToWriteIds.get(txnToWriteIds.size()-1).getTxnId(); - try { - HeartbeatTxnRangeResponse resp = heartbeaterMSClient.heartbeatTxnRange(first, last); - if (!resp.getAborted().isEmpty() || !resp.getNosuch().isEmpty()) { - throw new HeartBeatFailure(resp.getAborted(), resp.getNosuch()); - } - } catch (TException e) { - throw new StreamingException("Failure to heartbeat on ids (" + first + "src/gen/thrift" - + last + ") on end point : " + endPt ); - } - } - - @Override - public boolean isClosed() { - return isClosed; - } - /** - * Close the TransactionBatch. This will abort any still open txns in this batch. - * @throws StreamingIOFailure I/O failure when closing transaction batch - */ - @Override - public void close() throws StreamingException, ImpersonationFailed, InterruptedException { - if(isClosed) { - return; - } - isClosed = true; - abortImpl(true);//abort proactively so that we don't wait for timeout - closeImpl();//perhaps we should add a version of RecordWriter.closeBatch(boolean abort) which - //will call RecordUpdater.close(boolean abort) - } - private void closeImpl() throws StreamingException, InterruptedException{ - state = TxnState.INACTIVE; - if(ugi == null) { - recordWriter.closeBatch(); - return; - } - try { - ugi.doAs ( - new PrivilegedExceptionAction() { - @Override - public Void run() throws StreamingException { - recordWriter.closeBatch(); - return null; - } - } - ); - try { - FileSystem.closeAllForUGI(ugi); - } catch (IOException exception) { - LOG.error("Could not clean up file-system handles for UGI: " + ugi, exception); - } - } catch (IOException e) { - throw new ImpersonationFailed("Failed closing Txn Batch as user '" + username + - "' on endPoint :" + endPt, e); - } - } - - private static LockRequest createLockRequest(final HiveEndPoint hiveEndPoint, - String partNameForLock, String user, long txnId, String agentInfo) { - LockRequestBuilder rqstBuilder = agentInfo == null ? 
- new LockRequestBuilder() : new LockRequestBuilder(agentInfo); - rqstBuilder.setUser(user); - rqstBuilder.setTransactionId(txnId); - - LockComponentBuilder lockCompBuilder = new LockComponentBuilder() - .setDbName(hiveEndPoint.database) - .setTableName(hiveEndPoint.table) - .setShared() - .setOperationType(DataOperationType.INSERT); - if (partNameForLock!=null && !partNameForLock.isEmpty() ) { - lockCompBuilder.setPartitionName(partNameForLock); - } - rqstBuilder.addLockComponent(lockCompBuilder.build()); - - return rqstBuilder.build(); - } - } // class TransactionBatchImpl - - static HiveConf createHiveConf(Class clazz, String metaStoreUri) { - HiveConf conf = new HiveConf(clazz); - if (metaStoreUri!= null) { - setHiveConf(conf, HiveConf.ConfVars.METASTOREURIS, metaStoreUri); - } - HiveEndPoint.overrideConfSettings(conf); - return conf; - } - - private static void overrideConfSettings(HiveConf conf) { - setHiveConf(conf, HiveConf.ConfVars.HIVE_TXN_MANAGER, - "org.apache.hadoop.hive.ql.lockmgr.DbTxnManager"); - setHiveConf(conf, HiveConf.ConfVars.HIVE_SUPPORT_CONCURRENCY, true); - setHiveConf(conf, HiveConf.ConfVars.METASTORE_EXECUTE_SET_UGI, true); - // Avoids creating Tez Client sessions internally as it takes much longer currently - setHiveConf(conf, HiveConf.ConfVars.HIVE_EXECUTION_ENGINE, "mr"); - } - - private static void setHiveConf(HiveConf conf, HiveConf.ConfVars var, String value) { - if( LOG.isDebugEnabled() ) { - LOG.debug("Overriding HiveConf setting : " + var + " = " + value); - } - conf.setVar(var, value); - } - - private static void setHiveConf(HiveConf conf, HiveConf.ConfVars var, boolean value) { - if( LOG.isDebugEnabled() ) { - LOG.debug("Overriding HiveConf setting : " + var + " = " + value); - } - conf.setBoolVar(var, value); - } - -} // class HiveEndPoint diff --git a/streaming/src/java/org/apache/hive/streaming/HiveStreamingConnection.java b/streaming/src/java/org/apache/hive/streaming/HiveStreamingConnection.java new file mode 100644 index 0000000..7bdcba4 --- /dev/null +++ b/streaming/src/java/org/apache/hive/streaming/HiveStreamingConnection.java @@ -0,0 +1,1070 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hive.streaming; + +import java.io.IOException; +import java.net.InetAddress; +import java.net.UnknownHostException; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.UUID; +import java.util.concurrent.Executors; +import java.util.concurrent.ScheduledExecutorService; +import java.util.concurrent.ThreadFactory; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicBoolean; +import java.util.concurrent.atomic.AtomicReference; + +import org.apache.hadoop.classification.InterfaceStability; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.common.JavaUtils; +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.metastore.HiveMetaStoreUtils; +import org.apache.hadoop.hive.metastore.IMetaStoreClient; +import org.apache.hadoop.hive.metastore.LockComponentBuilder; +import org.apache.hadoop.hive.metastore.LockRequestBuilder; +import org.apache.hadoop.hive.metastore.Warehouse; +import org.apache.hadoop.hive.metastore.api.AlreadyExistsException; +import org.apache.hadoop.hive.metastore.api.DataOperationType; +import org.apache.hadoop.hive.metastore.api.FieldSchema; +import org.apache.hadoop.hive.metastore.api.HeartbeatTxnRangeResponse; +import org.apache.hadoop.hive.metastore.api.LockRequest; +import org.apache.hadoop.hive.metastore.api.LockResponse; +import org.apache.hadoop.hive.metastore.api.LockState; +import org.apache.hadoop.hive.metastore.api.MetaException; +import org.apache.hadoop.hive.metastore.api.NoSuchTxnException; +import org.apache.hadoop.hive.metastore.api.Partition; +import org.apache.hadoop.hive.metastore.api.TxnAbortedException; +import org.apache.hadoop.hive.metastore.api.TxnToWriteId; +import org.apache.hadoop.hive.metastore.conf.MetastoreConf; +import org.apache.hadoop.hive.ql.io.AcidUtils; +import org.apache.hadoop.hive.ql.lockmgr.DbTxnManager; +import org.apache.hadoop.hive.ql.lockmgr.LockException; +import org.apache.hadoop.hive.ql.metadata.Hive; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.metadata.Table; +import org.apache.hadoop.hive.ql.plan.AddPartitionDesc; +import org.apache.hadoop.security.UserGroupInformation; +import org.apache.thrift.TException; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.google.common.annotations.VisibleForTesting; +import com.google.common.util.concurrent.ThreadFactoryBuilder; + +/** + * Streaming connection implementation for hive. To create a streaming connection, use the builder API + * to create record writer first followed by the connection itself. Once connection is created, clients can + * begin a transaction, keep writing using the connection, commit the transaction and close connection when done. + * To bind to the correct metastore, HiveConf object has to be created from hive-site.xml or HIVE_CONF_DIR. + * If hive conf is manually created, metastore uri has to be set correctly. If hive conf object is not specified, + * "thrift://localhost:9083" will be used as default. + *
+ * <p>
+ * NOTE: The streaming connection APIs and record writer APIs are not thread-safe. Streaming connection creation,
+ * begin/commit/abort transactions, write and close have to be called in the same thread. If close() or
+ * abortTransaction() has to be triggered from a separate thread, it has to be coordinated via external variables or
+ * a synchronization mechanism.
+ * <p>
+ * Example usage:
+ * <pre>{@code
+ * // create delimited record writer whose schema exactly matches table schema
+ * StrictDelimitedInputWriter writer = StrictDelimitedInputWriter.newBuilder()
+ *                                      .withFieldDelimiter(',')
+ *                                      .build();
+ *
+ * // create and open streaming connection (default.src table has to exist already)
+ * StreamingConnection connection = HiveStreamingConnection.newBuilder()
+ *                                    .withDatabase("default")
+ *                                    .withTable("src")
+ *                                    .withAgentInfo("nifi-agent")
+ *                                    .withRecordWriter(writer)
+ *                                    .withHiveConf(hiveConf)
+ *                                    .connect();
+ *
+ * // begin a transaction, write records and commit 1st transaction
+ * connection.beginTransaction();
+ * connection.write("key1,val1".getBytes());
+ * connection.write("key2,val2".getBytes());
+ * connection.commitTransaction();
+ *
+ * // begin another transaction, write more records and commit 2nd transaction
+ * connection.beginTransaction();
+ * connection.write("key3,val3".getBytes());
+ * connection.write("key4,val4".getBytes());
+ * connection.commitTransaction();
+ *
+ * // close the streaming connection
+ * connection.close();
+ * }
+ * </pre>
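+ * <p>
+ * If the application needs to discard the records written in the currently open transaction, it can abort the
+ * transaction instead of committing it. A minimal sketch, assuming the same connection as above:
+ * <pre>{@code
+ * // begin a transaction, write a record, then abort instead of committing
+ * connection.beginTransaction();
+ * connection.write("key5,val5".getBytes());
+ * connection.abortTransaction();
+ *
+ * // close the streaming connection
+ * connection.close();
+ * }
+ * </pre>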
+ */ +public class HiveStreamingConnection implements StreamingConnection { + private static final Logger LOG = LoggerFactory.getLogger(HiveStreamingConnection.class.getName()); + + private static final String DEFAULT_METASTORE_URI = "thrift://localhost:9083"; + private static final int DEFAULT_TRANSACTION_BATCH_SIZE = 1; + private static final int DEFAULT_HEARTBEAT_INTERVAL = 60 * 1000; + private static final boolean DEFAULT_STREAMING_OPTIMIZATIONS_ENABLED = true; + + public enum TxnState { + INACTIVE("I"), OPEN("O"), COMMITTED("C"), ABORTED("A"); + + private final String code; + + TxnState(String code) { + this.code = code; + } + + public String toString() { + return code; + } + } + + // fields populated from builder + private String database; + private String table; + private List staticPartitionValues; + private String agentInfo; + private int transactionBatchSize; + private RecordWriter recordWriter; + private TransactionBatch currentTransactionBatch; + private HiveConf conf; + private boolean streamingOptimizations; + private AtomicBoolean isConnectionClosed = new AtomicBoolean(false); + + // internal fields + private boolean isPartitionedTable; + private IMetaStoreClient msClient; + private IMetaStoreClient heartbeatMSClient; + private final String username; + private final boolean secureMode; + private Table tableObject = null; + private String metastoreUri; + + private HiveStreamingConnection(Builder builder) throws StreamingException { + this.database = builder.database.toLowerCase(); + this.table = builder.table.toLowerCase(); + this.staticPartitionValues = builder.staticPartitionValues; + this.conf = builder.hiveConf; + this.agentInfo = builder.agentInfo; + this.streamingOptimizations = builder.streamingOptimizations; + UserGroupInformation loggedInUser = null; + try { + loggedInUser = UserGroupInformation.getLoginUser(); + } catch (IOException e) { + LOG.warn("Unable to get logged in user via UGI. err: {}", e.getMessage()); + } + if (loggedInUser == null) { + this.username = System.getProperty("user.name"); + this.secureMode = false; + } else { + this.username = loggedInUser.getShortUserName(); + this.secureMode = loggedInUser.hasKerberosCredentials(); + } + this.transactionBatchSize = builder.transactionBatchSize; + this.recordWriter = builder.recordWriter; + if (agentInfo == null) { + try { + agentInfo = username + ":" + InetAddress.getLocalHost().getHostName() + ":" + Thread.currentThread().getName(); + } catch (UnknownHostException e) { + // ignore and use UUID instead + this.agentInfo = UUID.randomUUID().toString(); + } + } + if (conf == null) { + conf = createHiveConf(this.getClass(), DEFAULT_METASTORE_URI); + } + overrideConfSettings(conf); + this.metastoreUri = conf.get(MetastoreConf.ConfVars.THRIFT_URIS.getHiveName()); + this.msClient = getMetaStoreClient(conf, metastoreUri, secureMode); + // We use a separate metastore client for heartbeat calls to ensure heartbeat RPC calls are + // isolated from the other transaction related RPC calls. 
+ this.heartbeatMSClient = getMetaStoreClient(conf, metastoreUri, secureMode); + validateTable(); + LOG.info("STREAMING CONNECTION INFO: {}", toConnectionInfoString()); + } + + public static Builder newBuilder() { + return new Builder(); + } + + public static class Builder { + private String database; + private String table; + private List staticPartitionValues; + private String agentInfo; + private HiveConf hiveConf; + private int transactionBatchSize = DEFAULT_TRANSACTION_BATCH_SIZE; + private boolean streamingOptimizations = DEFAULT_STREAMING_OPTIMIZATIONS_ENABLED; + private RecordWriter recordWriter; + + /** + * Specify database to use for streaming connection. + * + * @param database - db name + * @return - builder + */ + public Builder withDatabase(final String database) { + this.database = database; + return this; + } + + /** + * Specify table to use for streaming connection. + * + * @param table - table name + * @return - builder + */ + public Builder withTable(final String table) { + this.table = table; + return this; + } + + /** + * Specify the name of partition to use for streaming connection. + * + * @param staticPartitionValues - static partition values + * @return - builder + */ + public Builder withStaticPartitionValues(final List staticPartitionValues) { + this.staticPartitionValues = staticPartitionValues == null ? null : new ArrayList<>(staticPartitionValues); + return this; + } + + /** + * Specify agent info to use for streaming connection. + * + * @param agentInfo - agent info + * @return - builder + */ + public Builder withAgentInfo(final String agentInfo) { + this.agentInfo = agentInfo; + return this; + } + + /** + * Specify hive configuration object to use for streaming connection. + * Generate this object by point to already existing hive-site.xml or HIVE_CONF_DIR. + * Make sure if metastore URI has been set correctly else thrift://localhost:9083 will be + * used as default. + * + * @param hiveConf - hive conf object + * @return - builder + */ + public Builder withHiveConf(final HiveConf hiveConf) { + this.hiveConf = hiveConf; + return this; + } + + /** + * Transaction batch size to use (default value is 10). This is expert level configuration. + * For every transaction batch a delta directory will be created which will impact + * when compaction will trigger. + * NOTE: This is evolving API and is subject to change/might not be honored in future releases. + * + * @param transactionBatchSize - transaction batch size + * @return - builder + */ + @InterfaceStability.Evolving + public Builder withTransactionBatchSize(final int transactionBatchSize) { + this.transactionBatchSize = transactionBatchSize; + return this; + } + + /** + * Whether to enable streaming optimizations. This is expert level configurations. + * Disabling streaming optimizations will have significant impact to performance and memory consumption. + * + * @param enable - flag to enable or not + * @return - builder + */ + public Builder withStreamingOptimizations(final boolean enable) { + this.streamingOptimizations = enable; + return this; + } + + /** + * Record writer to use for writing records to destination table. + * + * @param recordWriter - record writer + * @return - builder + */ + public Builder withRecordWriter(final RecordWriter recordWriter) { + this.recordWriter = recordWriter; + return this; + } + + /** + * Returning a streaming connection to hive. 
+ * + * @return - hive streaming connection + */ + public HiveStreamingConnection connect() throws StreamingException { + if (database == null) { + throw new StreamingException("Database cannot be null for streaming connection"); + } + if (table == null) { + throw new StreamingException("Table cannot be null for streaming connection"); + } + if (recordWriter == null) { + throw new StreamingException("Record writer cannot be null for streaming connection"); + } + return new HiveStreamingConnection(this); + } + } + + private void setPartitionedTable(boolean isPartitionedTable) { + this.isPartitionedTable = isPartitionedTable; + } + + @Override + public String toString() { + return "{ metaStoreUri: " + metastoreUri + ", database: " + database + ", table: " + table + " }"; + } + + private String toConnectionInfoString() { + return "{ metastore-uri: " + metastoreUri + ", " + + "database: " + database + ", " + + "table: " + table + ", " + + "partitioned-table: " + isPartitionedTable() + ", " + + "dynamic-partitioning: " + isDynamicPartitioning() + ", " + + "username: " + username + ", " + + "secure-mode: " + secureMode + ", " + + "record-writer: " + recordWriter.getClass().getSimpleName() + ", " + + "agent-info: " + agentInfo + " }"; + } + + @VisibleForTesting + String toTransactionString() { + return currentTransactionBatch == null ? "" : currentTransactionBatch.toString(); + } + + @Override + public PartitionInfo createPartitionIfNotExists(final List partitionValues) throws StreamingException { + String partLocation = null; + String partName = null; + boolean exists = false; + try { + Map partSpec = Warehouse.makeSpecFromValues(tableObject.getPartitionKeys(), partitionValues); + AddPartitionDesc addPartitionDesc = new AddPartitionDesc(database, table, true); + partName = Warehouse.makePartName(tableObject.getPartitionKeys(), partitionValues); + partLocation = new Path(tableObject.getDataLocation(), Warehouse.makePartPath(partSpec)).toString(); + addPartitionDesc.addPartition(partSpec, partLocation); + Partition partition = Hive.convertAddSpecToMetaPartition(tableObject, addPartitionDesc.getPartition(0), conf); + msClient.add_partition(partition); + } catch (AlreadyExistsException e) { + exists = true; + } catch (HiveException | TException e) { + throw new StreamingException("Unable to creation partition for values: " + partitionValues + " connection: " + + toConnectionInfoString()); + } + return new PartitionInfo(partName, partLocation, exists); + } + + private void validateTable() throws InvalidTable, ConnectionError { + try { + tableObject = new Table(msClient.getTable(database, table)); + } catch (Exception e) { + LOG.warn("Unable to validate the table for connection: " + toConnectionInfoString(), e); + throw new InvalidTable(database, table, e); + } + // 1 - check that the table is Acid + if (!AcidUtils.isFullAcidTable(tableObject)) { + LOG.error("HiveEndPoint " + this + " must use an acid table"); + throw new InvalidTable(database, table, "is not an Acid table"); + } + + if (tableObject.getPartitionKeys() != null && !tableObject.getPartitionKeys().isEmpty()) { + setPartitionedTable(true); + } else { + setPartitionedTable(false); + } + + // partition values are specified on non-partitioned table + if (!isPartitionedTable() && (staticPartitionValues != null && !staticPartitionValues.isEmpty())) { + // Invalid if table is not partitioned, but endPoint's partitionVals is not empty + String errMsg = this.toString() + " specifies partitions for un-partitioned table"; + LOG.error(errMsg); + 
throw new ConnectionError(errMsg); + } + } + + /** + * Class to atomically set min and max value of a transaction range. + */ + private static class AtomicTransactionRange { + private static class Range { + private long min; + private long max; + + Range(long min, long max) { + this.min = min; + this.max = max; + } + + @Override + public String toString() { + return "[" + min + ":" + max + "]"; + } + } + + private final AtomicReference atomicRange; + + AtomicTransactionRange(long initialMin, long initialMax) { + this.atomicRange = new AtomicReference<>(new Range(initialMin, initialMax)); + } + + public long getMin() { + return atomicRange.get().min; + } + + public long getMax() { + return atomicRange.get().max; + } + + public void setMin(long newMin) { + while (true) { + Range oldRange = atomicRange.get(); + Range newRange = new Range(newMin, oldRange.max); + if (atomicRange.compareAndSet(oldRange, newRange)) { + // CAS successful, return + return; + } + } + } + + public void setMax(long newMax) { + while (true) { + Range oldRange = atomicRange.get(); + Range newRange = new Range(oldRange.min, newMax); + if (atomicRange.compareAndSet(oldRange, newRange)) { + // CAS successful, return + return; + } + } + } + + @Override + public String toString() { + return atomicRange.get().toString(); + } + } + + private static class HeartbeatRunnable implements Runnable { + private final AtomicTransactionRange transactionRange; + private final IMetaStoreClient heartbeatMSClient; + + HeartbeatRunnable(final IMetaStoreClient heartbeatMSClient, + final AtomicTransactionRange transactionRange) { + this.heartbeatMSClient = heartbeatMSClient; + this.transactionRange = transactionRange; + } + + @Override + public void run() { + try { + if (transactionRange.getMin() > 0) { + HeartbeatTxnRangeResponse resp = heartbeatMSClient.heartbeatTxnRange(transactionRange.getMin(), + transactionRange.getMax()); + if (!resp.getAborted().isEmpty() || !resp.getNosuch().isEmpty()) { + // ignoring the errors here as heart beating is best effort + LOG.warn("Heartbeat failure: {}. Ignoring..", resp.toString()); + } else { + LOG.info("Heartbeat sent for range: {}", transactionRange); + } + } + } catch (TException e) { + LOG.warn("Failure to heartbeat for transaction range: " + transactionRange, e); + } + } + } + + private void beginNextTransaction() throws StreamingException { + if (currentTransactionBatch == null) { + currentTransactionBatch = createNewTransactionBatch(); + LOG.info("Opened new transaction batch {}", currentTransactionBatch); + } + + if (currentTransactionBatch.isClosed()) { + throw new IllegalStateException("Cannot begin next transaction on a closed streaming connection"); + } + + if (currentTransactionBatch.remainingTransactions() == 0) { + LOG.info("Transaction batch {} is done. 
Rolling over to next transaction batch.", + currentTransactionBatch); + currentTransactionBatch.close(); + currentTransactionBatch = createNewTransactionBatch(); + LOG.info("Rolled over to new transaction batch {}", currentTransactionBatch); + } + currentTransactionBatch.beginNextTransaction(); + } + + private TransactionBatch createNewTransactionBatch() throws StreamingException { + return new TransactionBatch(this); + } + + private void checkClosedState() throws StreamingException { + if (isConnectionClosed.get()) { + throw new StreamingException("Streaming connection is closed already."); + } + } + + private void checkState() throws StreamingException { + checkClosedState(); + if (currentTransactionBatch == null) { + throw new StreamingException("Transaction batch is null. Missing beginTransaction?"); + } + if (currentTransactionBatch.state != TxnState.OPEN) { + throw new StreamingException("Transaction state is not OPEN. Missing beginTransaction?"); + } + } + + @Override + public void write(final byte[] record) throws StreamingException { + checkState(); + currentTransactionBatch.write(record); + } + + @Override + public void beginTransaction() throws StreamingException { + checkClosedState(); + beginNextTransaction(); + } + + @Override + public void commitTransaction() throws StreamingException { + checkState(); + currentTransactionBatch.commit(); + } + + @Override + public void abortTransaction() throws StreamingException { + checkState(); + currentTransactionBatch.abort(); + } + + @Override + public void close() { + if (isConnectionClosed.get()) { + return; + } + isConnectionClosed.set(true); + try { + if (currentTransactionBatch != null) { + currentTransactionBatch.close(); + } + } catch (StreamingException e) { + LOG.error("Unable to close current transaction batch: " + currentTransactionBatch, e); + } finally { + msClient.close(); + heartbeatMSClient.close(); + } + } + + private static IMetaStoreClient getMetaStoreClient(HiveConf conf, String metastoreUri, boolean secureMode) + throws ConnectionError { + if (metastoreUri != null) { + conf.set(MetastoreConf.ConfVars.THRIFT_URIS.getHiveName(), metastoreUri); + } + if (secureMode) { + conf.setBoolean(MetastoreConf.ConfVars.USE_THRIFT_SASL.getHiveName(), true); + } + try { + return HiveMetaStoreUtils.getHiveMetastoreClient(conf); + } catch (MetaException | IOException e) { + throw new ConnectionError("Error connecting to Hive Metastore URI: " + + metastoreUri + ". " + e.getMessage(), e); + } + } + + @VisibleForTesting + TxnState getCurrentTransactionState() { + return currentTransactionBatch.getCurrentTransactionState(); + } + + @VisibleForTesting + int remainingTransactions() { + return currentTransactionBatch.remainingTransactions(); + } + + @VisibleForTesting + Long getCurrentTxnId() { + return currentTransactionBatch.getCurrentTxnId(); + } + + private static class TransactionBatch { + private String username; + private HiveStreamingConnection conn; + private IMetaStoreClient msClient; + private ScheduledExecutorService scheduledExecutorService; + private RecordWriter recordWriter; + private String partNameForLock = null; + private List txnToWriteIds; + private int currentTxnIndex = -1; + private TxnState state; + private LockRequest lockRequest = null; + // this is the only requirement for heartbeat thread + private final AtomicTransactionRange atomicTxnRange; + private IMetaStoreClient heartbeatMSClient; + /** + * once any operation on this batch encounters a system exception + * (e.g. 
IOException on write) it's safest to assume that we can't write to the + * file backing this batch any more. This guards important public methods + */ + private final AtomicBoolean isTxnClosed = new AtomicBoolean(false); + private String agentInfo; + private int numTxns; + /** + * Tracks the state of each transaction + */ + private TxnState[] txnStatus; + /** + * ID of the last txn used by {@link #beginNextTransactionImpl()} + */ + private long lastTxnUsed; + + /** + * Represents a batch of transactions acquired from MetaStore + * + * @param conn - hive streaming connection + * @throws StreamingException if failed to create new RecordUpdater for batch + */ + private TransactionBatch(HiveStreamingConnection conn) throws StreamingException { + boolean success = false; + try { + if (conn.isPartitionedTable() && !conn.isDynamicPartitioning()) { + List partKeys = conn.tableObject.getPartitionKeys(); + partNameForLock = Warehouse.makePartName(partKeys, conn.staticPartitionValues); + } + this.conn = conn; + this.username = conn.username; + this.msClient = conn.msClient; + this.heartbeatMSClient = conn.heartbeatMSClient; + this.recordWriter = conn.recordWriter; + this.agentInfo = conn.agentInfo; + this.numTxns = conn.transactionBatchSize; + + setupHeartBeatThread(); + + List txnIds = openTxnImpl(msClient, username, numTxns); + txnToWriteIds = allocateWriteIdsImpl(msClient, txnIds); + assert (txnToWriteIds.size() == numTxns); + + txnStatus = new TxnState[numTxns]; + for (int i = 0; i < txnStatus.length; i++) { + assert (txnToWriteIds.get(i).getTxnId() == txnIds.get(i)); + txnStatus[i] = TxnState.OPEN; //Open matches Metastore state + } + this.state = TxnState.INACTIVE; + + // initialize record writer with connection and write id info + recordWriter.init(conn, txnToWriteIds.get(0).getWriteId(), txnToWriteIds.get(numTxns - 1).getWriteId()); + atomicTxnRange = new AtomicTransactionRange(txnIds.get(0), txnIds.get(txnIds.size() - 1)); + success = true; + } catch (TException e) { + throw new StreamingException(conn.toString(), e); + } finally { + //clean up if above throws + markDead(success); + } + } + + private void setupHeartBeatThread() { + // start heartbeat thread + ThreadFactory threadFactory = new ThreadFactoryBuilder().setDaemon(true) + .setNameFormat("HiveStreamingConnection-Heartbeat-Thread") + .build(); + this.scheduledExecutorService = Executors.newSingleThreadScheduledExecutor(threadFactory); + long heartBeatInterval; + long initialDelay; + try { + // if HIVE_TXN_TIMEOUT is defined, heartbeat interval will be HIVE_TXN_TIMEOUT/2 + heartBeatInterval = DbTxnManager.getHeartbeatInterval(conn.conf); + } catch (LockException e) { + heartBeatInterval = DEFAULT_HEARTBEAT_INTERVAL; + } + // to introduce some randomness and to avoid hammering the metastore at the same time (same logic as DbTxnManager) + initialDelay = (long) (heartBeatInterval * 0.75 * Math.random()); + LOG.info("Starting heartbeat thread with interval: {} ms initialDelay: {} ms for agentInfo: {}", + heartBeatInterval, initialDelay, conn.agentInfo); + Runnable runnable = new HeartbeatRunnable(heartbeatMSClient, atomicTxnRange); + this.scheduledExecutorService.scheduleWithFixedDelay(runnable, initialDelay, heartBeatInterval, TimeUnit + .MILLISECONDS); + } + + private List openTxnImpl(final IMetaStoreClient msClient, final String user, final int numTxns) + throws TException { + return msClient.openTxns(user, numTxns).getTxn_ids(); + } + + private List allocateWriteIdsImpl(final IMetaStoreClient msClient, + final List txnIds) throws 
TException { + return msClient.allocateTableWriteIdsBatch(txnIds, conn.database, conn.table); + } + + @Override + public String toString() { + if (txnToWriteIds == null || txnToWriteIds.isEmpty()) { + return "{}"; + } + StringBuilder sb = new StringBuilder(" TxnStatus["); + for (TxnState state : txnStatus) { + //'state' should not be null - future proofing + sb.append(state == null ? "N" : state); + } + sb.append("] LastUsed ").append(JavaUtils.txnIdToString(lastTxnUsed)); + return "TxnId/WriteIds=[" + txnToWriteIds.get(0).getTxnId() + + "/" + txnToWriteIds.get(0).getWriteId() + + "..." + + txnToWriteIds.get(txnToWriteIds.size() - 1).getTxnId() + + "/" + txnToWriteIds.get(txnToWriteIds.size() - 1).getWriteId() + + "] on connection = " + conn + "; " + sb; + } + + private void beginNextTransaction() throws StreamingException { + checkIsClosed(); + beginNextTransactionImpl(); + } + + private void beginNextTransactionImpl() throws TransactionError { + state = TxnState.INACTIVE;//clear state from previous txn + if ((currentTxnIndex + 1) >= txnToWriteIds.size()) { + throw new InvalidTransactionState("No more transactions available in" + + " next batch for connection: " + conn + " user: " + username); + } + currentTxnIndex++; + state = TxnState.OPEN; + lastTxnUsed = getCurrentTxnId(); + lockRequest = createLockRequest(conn, partNameForLock, username, getCurrentTxnId(), agentInfo); + try { + LockResponse res = msClient.lock(lockRequest); + if (res.getState() != LockState.ACQUIRED) { + throw new TransactionError("Unable to acquire lock on " + conn); + } + } catch (TException e) { + throw new TransactionError("Unable to acquire lock on " + conn, e); + } + } + + long getCurrentTxnId() { + if (currentTxnIndex >= 0) { + return txnToWriteIds.get(currentTxnIndex).getTxnId(); + } + return -1L; + } + + long getCurrentWriteId() { + if (currentTxnIndex >= 0) { + return txnToWriteIds.get(currentTxnIndex).getWriteId(); + } + return -1L; + } + + TxnState getCurrentTransactionState() { + return state; + } + + int remainingTransactions() { + if (currentTxnIndex >= 0) { + return txnToWriteIds.size() - currentTxnIndex - 1; + } + return txnToWriteIds.size(); + } + + + public void write(final byte[] record) throws StreamingException { + checkIsClosed(); + boolean success = false; + try { + recordWriter.write(getCurrentWriteId(), record); + success = true; + } catch (SerializationError ex) { + //this exception indicates that a {@code record} could not be parsed and the + //caller can decide whether to drop it or send it to dead letter queue. + //rolling back the txn and retrying won't help since the tuple will be exactly the same + //when it's replayed. + success = true; + throw ex; + } finally { + markDead(success); + } + } + + private void checkIsClosed() throws StreamingException { + if (isTxnClosed.get()) { + throw new StreamingException("Transaction" + toString() + " is closed()"); + } + } + + /** + * A transaction batch opens a single HDFS file and writes multiple transaction to it. If there is any issue + * with the write, we can't continue to write to the same file any as it may be corrupted now (at the tail). + * This ensures that a client can't ignore these failures and continue to write. 
+ */ + private void markDead(boolean success) { + if (success) { + return; + } + isTxnClosed.set(true); //also ensures that heartbeat() is no-op since client is likely doing it async + try { + abort(true);//abort all remaining txns + } catch (Exception ex) { + LOG.error("Fatal error on " + toString() + "; cause " + ex.getMessage(), ex); + } + try { + closeImpl(); + } catch (Exception ex) { + LOG.error("Fatal error on " + toString() + "; cause " + ex.getMessage(), ex); + } + } + + + void commit() throws StreamingException { + checkIsClosed(); + boolean success = false; + try { + commitImpl(); + success = true; + } finally { + markDead(success); + } + } + + private void commitImpl() throws StreamingException { + try { + recordWriter.flush(); + TxnToWriteId txnToWriteId = txnToWriteIds.get(currentTxnIndex); + if (conn.isDynamicPartitioning()) { + List partNames = new ArrayList<>(recordWriter.getPartitions()); + msClient.addDynamicPartitions(txnToWriteId.getTxnId(), txnToWriteId.getWriteId(), conn.database, conn.table, + partNames, DataOperationType.INSERT); + } + msClient.commitTxn(txnToWriteId.getTxnId()); + // increment the min txn id so that heartbeat thread will heartbeat only from the next open transaction. + if (currentTxnIndex + 1 < txnToWriteIds.size()) { + atomicTxnRange.setMin(txnToWriteIds.get(currentTxnIndex + 1).getTxnId()); + } else { + // exhausted the batch, no longer have to heartbeat for current txn batch + atomicTxnRange.setMin(-1); + } + state = TxnState.COMMITTED; + txnStatus[currentTxnIndex] = TxnState.COMMITTED; + } catch (NoSuchTxnException e) { + throw new TransactionError("Invalid transaction id : " + + getCurrentTxnId(), e); + } catch (TxnAbortedException e) { + throw new TransactionError("Aborted transaction cannot be committed" + , e); + } catch (TException e) { + throw new TransactionError("Unable to commitTransaction transaction" + + getCurrentTxnId(), e); + } + } + + void abort() throws StreamingException { + if (isTxnClosed.get()) { + /* + * isDead is only set internally by this class. {@link #markDead(boolean)} will abort all + * remaining txns, so make this no-op to make sure that a well-behaved client that calls abortTransaction() + * error doesn't get misleading errors + */ + return; + } + abort(false); + } + + private void abort(final boolean abortAllRemaining) throws StreamingException { + abortImpl(abortAllRemaining); + } + + private void abortImpl(boolean abortAllRemaining) throws StreamingException { + try { + if (abortAllRemaining) { + // we are aborting all txns in the current batch, so no need to heartbeat + atomicTxnRange.setMin(-1); + //when last txn finished (abortTransaction/commitTransaction) the currentTxnIndex is pointing at that txn + //so we need to start from next one, if any. Also if batch was created but + //fetchTransactionBatch() was never called, we want to start with first txn + int minOpenTxnIndex = Math.max(currentTxnIndex + + (state == TxnState.ABORTED || state == TxnState.COMMITTED ? 1 : 0), 0); + for (currentTxnIndex = minOpenTxnIndex; + currentTxnIndex < txnToWriteIds.size(); currentTxnIndex++) { + msClient.rollbackTxn(txnToWriteIds.get(currentTxnIndex).getTxnId()); + txnStatus[currentTxnIndex] = TxnState.ABORTED; + } + currentTxnIndex--;//since the loop left it == txnToWriteIds.size() + } else { + // we are aborting only the current transaction, so move the min range for heartbeat or disable heartbeat + // if the current txn is last in the batch. 
+ if (currentTxnIndex + 1 < txnToWriteIds.size()) { + atomicTxnRange.setMin(txnToWriteIds.get(currentTxnIndex + 1).getTxnId()); + } else { + // exhausted the batch, no longer have to heartbeat + atomicTxnRange.setMin(-1); + } + long currTxnId = getCurrentTxnId(); + if (currTxnId > 0) { + msClient.rollbackTxn(currTxnId); + txnStatus[currentTxnIndex] = TxnState.ABORTED; + } + } + state = TxnState.ABORTED; + } catch (NoSuchTxnException e) { + throw new TransactionError("Unable to abort invalid transaction id : " + + getCurrentTxnId(), e); + } catch (TException e) { + throw new TransactionError("Unable to abort transaction id : " + + getCurrentTxnId(), e); + } + } + + public boolean isClosed() { + return isTxnClosed.get(); + } + + /** + * Close the TransactionBatch. This will abort any still open txns in this batch. + * + * @throws StreamingException - failure when closing transaction batch + */ + public void close() throws StreamingException { + if (isTxnClosed.get()) { + return; + } + isTxnClosed.set(true); + abortImpl(true); + closeImpl(); + } + + private void closeImpl() throws StreamingException { + state = TxnState.INACTIVE; + recordWriter.close(); + if (scheduledExecutorService != null) { + scheduledExecutorService.shutdownNow(); + } + } + + static LockRequest createLockRequest(final HiveStreamingConnection connection, + String partNameForLock, String user, long txnId, String agentInfo) { + LockRequestBuilder requestBuilder = new LockRequestBuilder(agentInfo); + requestBuilder.setUser(user); + requestBuilder.setTransactionId(txnId); + + LockComponentBuilder lockCompBuilder = new LockComponentBuilder() + .setDbName(connection.database) + .setTableName(connection.table) + .setShared() + .setOperationType(DataOperationType.INSERT); + if (connection.isDynamicPartitioning()) { + lockCompBuilder.setIsDynamicPartitionWrite(true); + } + if (partNameForLock != null && !partNameForLock.isEmpty()) { + lockCompBuilder.setPartitionName(partNameForLock); + } + requestBuilder.addLockComponent(lockCompBuilder.build()); + + return requestBuilder.build(); + } + } + + private HiveConf createHiveConf(Class clazz, String metaStoreUri) { + HiveConf conf = new HiveConf(clazz); + if (metaStoreUri != null) { + conf.set(MetastoreConf.ConfVars.THRIFT_URIS.getHiveName(), metaStoreUri); + } + return conf; + } + + private void overrideConfSettings(HiveConf conf) { + setHiveConf(conf, HiveConf.ConfVars.HIVE_TXN_MANAGER, DbTxnManager.class.getName()); + setHiveConf(conf, HiveConf.ConfVars.HIVE_SUPPORT_CONCURRENCY, true); + setHiveConf(conf, MetastoreConf.ConfVars.EXECUTE_SET_UGI.getHiveName()); + // Avoids creating Tez Client sessions internally as it takes much longer currently + setHiveConf(conf, HiveConf.ConfVars.HIVE_EXECUTION_ENGINE, "mr"); + setHiveConf(conf, HiveConf.ConfVars.DYNAMICPARTITIONINGMODE, "nonstrict"); + if (streamingOptimizations) { + setHiveConf(conf, HiveConf.ConfVars.HIVE_ORC_DELTA_STREAMING_OPTIMIZATIONS_ENABLED, true); + } + // since same thread creates metastore client for streaming connection thread and heartbeat thread we explicitly + // disable metastore client cache + setHiveConf(conf, HiveConf.ConfVars.METASTORE_CLIENT_CACHE_ENABLED, false); + } + + private static void setHiveConf(HiveConf conf, HiveConf.ConfVars var, String value) { + if (LOG.isDebugEnabled()) { + LOG.debug("Overriding HiveConf setting : " + var + " = " + value); + } + conf.setVar(var, value); + } + + private static void setHiveConf(HiveConf conf, HiveConf.ConfVars var, boolean value) { + if (LOG.isDebugEnabled()) { + 
LOG.debug("Overriding HiveConf setting : " + var + " = " + value); + } + conf.setBoolVar(var, true); + } + + private static void setHiveConf(HiveConf conf, String var) { + if (LOG.isDebugEnabled()) { + LOG.debug("Overriding HiveConf setting : " + var + " = " + true); + } + conf.setBoolean(var, true); + } + + @Override + public HiveConf getHiveConf() { + return conf; + } + + @Override + public String getMetastoreUri() { + return metastoreUri; + } + + @Override + public String getDatabase() { + return database; + } + + @Override + public String getTable() { + return table; + } + + @Override + public List getStaticPartitionValues() { + return staticPartitionValues; + } + + @Override + public String getAgentInfo() { + return agentInfo; + } + + @Override + public boolean isPartitionedTable() { + return isPartitionedTable; + } + + @Override + public boolean isDynamicPartitioning() { + return isPartitionedTable() && (staticPartitionValues == null || staticPartitionValues.isEmpty()); + } +} diff --git a/streaming/src/java/org/apache/hive/streaming/ImpersonationFailed.java b/streaming/src/java/org/apache/hive/streaming/ImpersonationFailed.java deleted file mode 100644 index 23e17e7..0000000 --- a/streaming/src/java/org/apache/hive/streaming/ImpersonationFailed.java +++ /dev/null @@ -1,25 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hive.streaming; - -public class ImpersonationFailed extends StreamingException { - public ImpersonationFailed(String username, Exception e) { - super("Failed to impersonate user " + username, e); - } -} diff --git a/streaming/src/java/org/apache/hive/streaming/InvalidColumn.java b/streaming/src/java/org/apache/hive/streaming/InvalidColumn.java deleted file mode 100644 index 0011b14..0000000 --- a/streaming/src/java/org/apache/hive/streaming/InvalidColumn.java +++ /dev/null @@ -1,26 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.hive.streaming; - -public class InvalidColumn extends StreamingException { - - public InvalidColumn(String msg) { - super(msg); - } -} diff --git a/streaming/src/java/org/apache/hive/streaming/InvalidPartition.java b/streaming/src/java/org/apache/hive/streaming/InvalidPartition.java deleted file mode 100644 index f1f9804..0000000 --- a/streaming/src/java/org/apache/hive/streaming/InvalidPartition.java +++ /dev/null @@ -1,28 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hive.streaming; - -public class InvalidPartition extends StreamingException { - - public InvalidPartition(String partitionName, String partitionValue) { - super("Invalid partition: Name=" + partitionName + - ", Value=" + partitionValue); - } - -} diff --git a/streaming/src/java/org/apache/hive/streaming/InvalidTable.java b/streaming/src/java/org/apache/hive/streaming/InvalidTable.java index ef1c91d..5c60160 100644 --- a/streaming/src/java/org/apache/hive/streaming/InvalidTable.java +++ b/streaming/src/java/org/apache/hive/streaming/InvalidTable.java @@ -24,15 +24,11 @@ private static String makeMsg(String db, String table) { return "Invalid table db:" + db + ", table:" + table; } - public InvalidTable(String db, String table) { - super(makeMsg(db,table), null); - } - - public InvalidTable(String db, String table, String msg) { + InvalidTable(String db, String table, String msg) { super(makeMsg(db, table) + ": " + msg, null); } - public InvalidTable(String db, String table, Exception inner) { + InvalidTable(String db, String table, Exception inner) { super(makeMsg(db, table) + ": " + inner.getMessage(), inner); } } diff --git a/streaming/src/java/org/apache/hive/streaming/InvalidTransactionState.java b/streaming/src/java/org/apache/hive/streaming/InvalidTransactionState.java new file mode 100644 index 0000000..9d92dfa --- /dev/null +++ b/streaming/src/java/org/apache/hive/streaming/InvalidTransactionState.java @@ -0,0 +1,25 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hive.streaming; + +public class InvalidTransactionState extends TransactionError { + InvalidTransactionState(String msg) { + super(msg); + } +} diff --git a/streaming/src/java/org/apache/hive/streaming/InvalidTrasactionState.java b/streaming/src/java/org/apache/hive/streaming/InvalidTrasactionState.java deleted file mode 100644 index 762f5f8..0000000 --- a/streaming/src/java/org/apache/hive/streaming/InvalidTrasactionState.java +++ /dev/null @@ -1,26 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hive.streaming; - -public class InvalidTrasactionState extends TransactionError { - public InvalidTrasactionState(String msg) { - super(msg); - } - -} diff --git a/streaming/src/java/org/apache/hive/streaming/PartitionCreationFailed.java b/streaming/src/java/org/apache/hive/streaming/PartitionCreationFailed.java index 5f9aca6..e464399 100644 --- a/streaming/src/java/org/apache/hive/streaming/PartitionCreationFailed.java +++ b/streaming/src/java/org/apache/hive/streaming/PartitionCreationFailed.java @@ -19,7 +19,7 @@ package org.apache.hive.streaming; public class PartitionCreationFailed extends StreamingException { - public PartitionCreationFailed(HiveEndPoint endPoint, Exception cause) { - super("Failed to create partition " + endPoint, cause); + PartitionCreationFailed(StreamingConnection connection, Throwable cause) { + super("Failed to create partition " + connection, cause); } } diff --git a/streaming/src/java/org/apache/hive/streaming/PartitionHandler.java b/streaming/src/java/org/apache/hive/streaming/PartitionHandler.java new file mode 100644 index 0000000..1b0e7de --- /dev/null +++ b/streaming/src/java/org/apache/hive/streaming/PartitionHandler.java @@ -0,0 +1,33 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hive.streaming; + +import java.util.List; + +public interface PartitionHandler { + + /** + * Creates a partition if it does not exist. + * + * @param partitionValues - partition values + * @return partition location + * @throws StreamingException - any metastore related exceptions + */ + PartitionInfo createPartitionIfNotExists(List partitionValues) throws StreamingException; +} diff --git a/streaming/src/java/org/apache/hive/streaming/PartitionInfo.java b/streaming/src/java/org/apache/hive/streaming/PartitionInfo.java new file mode 100644 index 0000000..ce9f76a --- /dev/null +++ b/streaming/src/java/org/apache/hive/streaming/PartitionInfo.java @@ -0,0 +1,58 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hive.streaming; + +/** + * Simple wrapper class for minimal partition related information used by streaming ingest. + */ +public class PartitionInfo { + private String name; + private String partitionLocation; + private boolean exists; + + public PartitionInfo(final String name, final String partitionLocation, final boolean exists) { + this.name = name; + this.partitionLocation = partitionLocation; + this.exists = exists; + } + + public String getName() { + return name; + } + + public void setName(final String name) { + this.name = name; + } + + public String getPartitionLocation() { + return partitionLocation; + } + + public void setPartitionLocation(final String partitionLocation) { + this.partitionLocation = partitionLocation; + } + + public boolean isExists() { + return exists; + } + + public void setExists(final boolean exists) { + this.exists = exists; + } +} diff --git a/streaming/src/java/org/apache/hive/streaming/QueryFailedException.java b/streaming/src/java/org/apache/hive/streaming/QueryFailedException.java deleted file mode 100644 index ccd3ae0..0000000 --- a/streaming/src/java/org/apache/hive/streaming/QueryFailedException.java +++ /dev/null @@ -1,28 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
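Because the new StreamingConnection interface defined later in this patch extends PartitionHandler, callers can pre-create partitions through the connection itself. The following is only a hedged sketch of that call shape; the partition values and the continent/country layout are illustrative placeholders, not something this patch prescribes.

import java.util.Arrays;
import java.util.List;

import org.apache.hive.streaming.PartitionInfo;
import org.apache.hive.streaming.StreamingConnection;
import org.apache.hive.streaming.StreamingException;

public class PartitionHandlerSketch {
  // Ensures the target partition exists before records are streamed into it.
  static void ensurePartition(StreamingConnection connection) throws StreamingException {
    // Illustrative values for a table partitioned by (continent, country).
    List<String> partitionValues = Arrays.asList("Asia", "India");
    PartitionInfo info = connection.createPartitionIfNotExists(partitionValues);
    // PartitionInfo exposes the partition name, its location and an existence flag.
    System.out.println("Partition " + info.getName() + " at " + info.getPartitionLocation()
        + " (exists flag: " + info.isExists() + ")");
  }
}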
- */ - -package org.apache.hive.streaming; - -public class QueryFailedException extends StreamingException { - String query; - - public QueryFailedException(String query, Exception e) { - super("Query failed: " + query + ". Due to :" + e.getMessage(), e); - this.query = query; - } -} diff --git a/streaming/src/java/org/apache/hive/streaming/RecordWriter.java b/streaming/src/java/org/apache/hive/streaming/RecordWriter.java index dc6d70e..4d25924 100644 --- a/streaming/src/java/org/apache/hive/streaming/RecordWriter.java +++ b/streaming/src/java/org/apache/hive/streaming/RecordWriter.java @@ -19,25 +19,44 @@ package org.apache.hive.streaming; +import java.util.Set; + public interface RecordWriter { - /** Writes using a hive RecordUpdater + /** + * Initialize record writer. + * + * @param connection - streaming connection + * @param minWriteId - min write id + * @param maxWriteID - max write id + * @throws StreamingException - thrown when initialization failed + */ + void init(StreamingConnection connection, long minWriteId, long maxWriteID) throws StreamingException; + + /** + * Writes using a hive RecordUpdater * * @param writeId the write ID of the table mapping to Txn in which the write occurs - * @param record the record to be written + * @param record the record to be written */ void write(long writeId, byte[] record) throws StreamingException; - /** Flush records from buffer. Invoked by TransactionBatch.commit() */ + /** + * Flush records from buffer. Invoked by TransactionBatch.commitTransaction() + */ void flush() throws StreamingException; - /** Clear bufferred writes. Invoked by TransactionBatch.abort() */ - void clear() throws StreamingException; - - /** Acquire a new RecordUpdater. Invoked when - * StreamingConnection.fetchTransactionBatch() is called */ - void newBatch(Long minWriteId, Long maxWriteID) throws StreamingException; + /** + * Close the RecordUpdater. Invoked by TransactionBatch.close() + * + * @throws StreamingException - thrown when record writer cannot be closed. + */ + void close() throws StreamingException; - /** Close the RecordUpdater. Invoked by TransactionBatch.close() */ - void closeBatch() throws StreamingException; + /** + * Get the set of partitions that were added by the record writer. + * + * @return - set of partitions + */ + Set getPartitions(); } diff --git a/streaming/src/java/org/apache/hive/streaming/SerializationError.java b/streaming/src/java/org/apache/hive/streaming/SerializationError.java index a57ba00..1473ff8 100644 --- a/streaming/src/java/org/apache/hive/streaming/SerializationError.java +++ b/streaming/src/java/org/apache/hive/streaming/SerializationError.java @@ -20,7 +20,7 @@ public class SerializationError extends StreamingException { - public SerializationError(String msg, Exception e) { + SerializationError(String msg, Exception e) { super(msg,e); } } diff --git a/streaming/src/java/org/apache/hive/streaming/StreamingConnection.java b/streaming/src/java/org/apache/hive/streaming/StreamingConnection.java index 2f760ea..cd7f3d8 100644 --- a/streaming/src/java/org/apache/hive/streaming/StreamingConnection.java +++ b/streaming/src/java/org/apache/hive/streaming/StreamingConnection.java @@ -18,40 +18,47 @@ package org.apache.hive.streaming; -import org.apache.hadoop.security.UserGroupInformation; +import org.apache.hadoop.hive.conf.HiveConf; -/** - * Represents a connection to a HiveEndPoint. Used to acquire transaction batches. 
- * Note: the expectation is that there is at most 1 TransactionBatch outstanding for any given - * StreamingConnection. Violating this may result in "out of sequence response". - */ -public interface StreamingConnection { +public interface StreamingConnection extends ConnectionInfo, PartitionHandler { + /** + * Returns hive configuration object used during connection creation. + * + * @return - hive conf + */ + HiveConf getHiveConf(); + + /** + * Begin a transaction for writing. + * + * @throws StreamingException - if there are errors when beginning transaction + */ + void beginTransaction() throws StreamingException; + + /** + * Write record using RecordWriter. + * + * @param record - the data to be written + * @throws StreamingException - if there are errors when writing + */ + void write(byte[] record) throws StreamingException; /** - * Acquires a new batch of transactions from Hive. - - * @param numTransactionsHint is a hint from client indicating how many transactions client needs. - * @param writer Used to write record. The same writer instance can - * be shared with another TransactionBatch (to the same endpoint) - * only after the first TransactionBatch has been closed. - * Writer will be closed when the TransactionBatch is closed. - * @return - * @throws ConnectionError - * @throws InvalidPartition - * @throws StreamingException - * @return a batch of transactions + * Commit a transaction to make the writes visible for readers. + * + * @throws StreamingException - if there are errors when committing the open transaction */ - public TransactionBatch fetchTransactionBatch(int numTransactionsHint, - RecordWriter writer) - throws ConnectionError, StreamingException, InterruptedException; + void commitTransaction() throws StreamingException; /** - * Close connection + * Manually abort the opened transaction. + * + * @throws StreamingException - if there are errors when aborting the transaction */ - public void close(); + void abortTransaction() throws StreamingException; /** - * @return UserGroupInformation associated with this connection or {@code null} if there is none + * Closes streaming connection. 
*/ - UserGroupInformation getUserGroupInformation(); + void close(); } diff --git a/streaming/src/java/org/apache/hive/streaming/StreamingException.java b/streaming/src/java/org/apache/hive/streaming/StreamingException.java index a7f84c1..1af5c6a 100644 --- a/streaming/src/java/org/apache/hive/streaming/StreamingException.java +++ b/streaming/src/java/org/apache/hive/streaming/StreamingException.java @@ -19,7 +19,7 @@ package org.apache.hive.streaming; public class StreamingException extends Exception { - public StreamingException(String msg, Exception cause) { + public StreamingException(String msg, Throwable cause) { super(msg, cause); } public StreamingException(String msg) { diff --git a/streaming/src/java/org/apache/hive/streaming/StreamingIOFailure.java b/streaming/src/java/org/apache/hive/streaming/StreamingIOFailure.java index 0dfbfa7..090167d 100644 --- a/streaming/src/java/org/apache/hive/streaming/StreamingIOFailure.java +++ b/streaming/src/java/org/apache/hive/streaming/StreamingIOFailure.java @@ -21,11 +21,11 @@ public class StreamingIOFailure extends StreamingException { - public StreamingIOFailure(String msg, Exception cause) { + StreamingIOFailure(String msg, Exception cause) { super(msg, cause); } - public StreamingIOFailure(String msg) { + StreamingIOFailure(String msg) { super(msg); } } diff --git a/streaming/src/java/org/apache/hive/streaming/StrictDelimitedInputWriter.java b/streaming/src/java/org/apache/hive/streaming/StrictDelimitedInputWriter.java new file mode 100644 index 0000000..4a07435 --- /dev/null +++ b/streaming/src/java/org/apache/hive/streaming/StrictDelimitedInputWriter.java @@ -0,0 +1,110 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hive.streaming; + + +import java.util.Properties; + +import org.apache.hadoop.hive.metastore.utils.MetaStoreUtils; +import org.apache.hadoop.hive.serde.serdeConstants; +import org.apache.hadoop.hive.serde2.SerDeException; +import org.apache.hadoop.hive.serde2.SerDeUtils; +import org.apache.hadoop.hive.serde2.lazy.LazySerDeParameters; +import org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe; +import org.apache.hadoop.io.BytesWritable; + +import com.google.common.base.Joiner; + +/** + * Streaming Writer handles delimited input (eg. CSV). + * Delimited input is parsed to extract partition values, bucketing info and is forwarded to record updater. + * Uses Lazy Simple SerDe to process delimited input. + * + * NOTE: This record writer is NOT thread-safe. Use one record writer per streaming connection. 
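To make the reworked surface easier to follow, here is a minimal end-to-end sketch of the new connection-level API together with the builder-style StrictDelimitedInputWriter introduced above. It mirrors the calls exercised by the updated tests later in this patch; the HiveConf contents, database and table names, delimiter and records are placeholders only, not values this patch requires.

import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hive.streaming.HiveStreamingConnection;
import org.apache.hive.streaming.StreamingException;
import org.apache.hive.streaming.StrictDelimitedInputWriter;

public class StreamingV2UsageSketch {
  public static void main(String[] args) throws StreamingException {
    HiveConf conf = new HiveConf();                        // point at a real metastore in practice
    StrictDelimitedInputWriter writer = StrictDelimitedInputWriter.newBuilder()
        .withFieldDelimiter(',')                           // delimiter choice is illustrative
        .build();
    HiveStreamingConnection connection = HiveStreamingConnection.newBuilder()
        .withDatabase("default")                           // placeholder database
        .withTable("alerts")                               // placeholder transactional table
        .withAgentInfo("usage-sketch")
        .withRecordWriter(writer)
        .withHiveConf(conf)
        .connect();
    try {
      connection.beginTransaction();                       // transactions are now opened on the connection itself
      connection.write("1,hello".getBytes());              // records go through the configured RecordWriter
      connection.write("2,world".getBytes());
      connection.commitTransaction();                      // makes the writes visible to readers
    } catch (StreamingException e) {
      connection.abortTransaction();                       // roll back the open transaction on failure
      throw e;
    } finally {
      connection.close();                                  // aborts any still-open transaction and releases resources
    }
  }
}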
+ */ +public class StrictDelimitedInputWriter extends AbstractRecordWriter { + private char fieldDelimiter; + private char collectionDelimiter; + private char mapKeyDelimiter; + private LazySimpleSerDe serde; + + private StrictDelimitedInputWriter(Builder builder) { + this.fieldDelimiter = builder.fieldDelimiter; + this.collectionDelimiter = builder.collectionDelimiter; + this.mapKeyDelimiter = builder.mapKeyDelimiter; + } + + public static Builder newBuilder() { + return new Builder(); + } + + public static class Builder { + private char fieldDelimiter = (char) LazySerDeParameters.DefaultSeparators[0]; + private char collectionDelimiter = (char) LazySerDeParameters.DefaultSeparators[1]; + private char mapKeyDelimiter = (char) LazySerDeParameters.DefaultSeparators[2]; + + public Builder withFieldDelimiter(final char fieldDelimiter) { + this.fieldDelimiter = fieldDelimiter; + return this; + } + + public Builder withCollectionDelimiter(final char collectionDelimiter) { + this.collectionDelimiter = collectionDelimiter; + return this; + } + + public Builder withMapKeyDelimiter(final char mapKeyDelimiter) { + this.mapKeyDelimiter = mapKeyDelimiter; + return this; + } + + public StrictDelimitedInputWriter build() { + return new StrictDelimitedInputWriter(this); + } + } + + @Override + public Object encode(byte[] record) throws SerializationError { + try { + BytesWritable blob = new BytesWritable(); + blob.set(record, 0, record.length); + return serde.deserialize(blob); + } catch (SerDeException e) { + throw new SerializationError("Unable to convert byte[] record into Object", e); + } + } + + @Override + public LazySimpleSerDe createSerde() throws SerializationError { + try { + Properties tableProps = MetaStoreUtils.getTableMetadata(tbl); + tableProps.setProperty(serdeConstants.LIST_COLUMNS, Joiner.on(",").join(inputColumns)); + tableProps.setProperty(serdeConstants.LIST_COLUMN_TYPES, Joiner.on(":").join(inputTypes)); + tableProps.setProperty(serdeConstants.FIELD_DELIM, String.valueOf(fieldDelimiter)); + tableProps.setProperty(serdeConstants.COLLECTION_DELIM, String.valueOf(collectionDelimiter)); + tableProps.setProperty(serdeConstants.MAPKEY_DELIM, String.valueOf(mapKeyDelimiter)); + LazySimpleSerDe serde = new LazySimpleSerDe(); + SerDeUtils.initializeSerDe(serde, conf, tableProps, null); + this.serde = serde; + return serde; + } catch (SerDeException e) { + throw new SerializationError("Error initializing serde", e); + } + } +} diff --git a/streaming/src/java/org/apache/hive/streaming/StrictJsonWriter.java b/streaming/src/java/org/apache/hive/streaming/StrictJsonWriter.java index 0077913..1600e7c 100644 --- a/streaming/src/java/org/apache/hive/streaming/StrictJsonWriter.java +++ b/streaming/src/java/org/apache/hive/streaming/StrictJsonWriter.java @@ -18,131 +18,45 @@ package org.apache.hive.streaming; -import org.apache.hadoop.hive.conf.HiveConf; -import org.apache.hadoop.hive.metastore.api.Table; +import java.util.Properties; + import org.apache.hadoop.hive.metastore.utils.MetaStoreUtils; -import org.apache.hadoop.hive.serde2.AbstractSerDe; +import org.apache.hadoop.hive.serde2.JsonSerDe; import org.apache.hadoop.hive.serde2.SerDeException; import org.apache.hadoop.hive.serde2.SerDeUtils; -import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.StructField; import org.apache.hadoop.io.Text; -import org.apache.hive.hcatalog.data.HCatRecordObjectInspector; -import org.apache.hive.hcatalog.data.JsonSerDe; - -import 
java.io.IOException; -import java.util.List; -import java.util.Properties; /** * Streaming Writer handles utf8 encoded Json (Strict syntax). - * Uses org.apache.hive.hcatalog.data.JsonSerDe to process Json input + * Uses org.apache.hadoop.hive.serde2.JsonSerDe to process Json input + * + * NOTE: This record writer is NOT thread-safe. Use one record writer per streaming connection. */ public class StrictJsonWriter extends AbstractRecordWriter { private JsonSerDe serde; - private final HCatRecordObjectInspector recordObjInspector; - private final ObjectInspector[] bucketObjInspectors; - private final StructField[] bucketStructFields; - - /** - * @deprecated As of release 1.3/2.1. Replaced by {@link #StrictJsonWriter(HiveEndPoint, HiveConf, StreamingConnection)} - */ - public StrictJsonWriter(HiveEndPoint endPoint) - throws ConnectionError, SerializationError, StreamingException { - this(endPoint, null, null); - } - - /** - * @deprecated As of release 1.3/2.1. Replaced by {@link #StrictJsonWriter(HiveEndPoint, HiveConf, StreamingConnection)} - */ - public StrictJsonWriter(HiveEndPoint endPoint, HiveConf conf) throws StreamingException { - this(endPoint, conf, null); - } - /** - * @param endPoint the end point to write to - * @throws ConnectionError - * @throws SerializationError - * @throws StreamingException - */ - public StrictJsonWriter(HiveEndPoint endPoint, StreamingConnection conn) - throws ConnectionError, SerializationError, StreamingException { - this(endPoint, null, conn); + public static Builder newBuilder() { + return new Builder(); } - /** - * @param endPoint the end point to write to - * @param conf a Hive conf object. Should be null if not using advanced Hive settings. - * @param conn connection this Writer is to be used with - * @throws ConnectionError - * @throws SerializationError - * @throws StreamingException - */ - public StrictJsonWriter(HiveEndPoint endPoint, HiveConf conf, StreamingConnection conn) - throws ConnectionError, SerializationError, StreamingException { - super(endPoint, conf, conn); - this.serde = createSerde(tbl, conf); - // get ObjInspectors for entire record and bucketed cols - try { - recordObjInspector = ( HCatRecordObjectInspector ) serde.getObjectInspector(); - this.bucketObjInspectors = getObjectInspectorsForBucketedCols(bucketIds, recordObjInspector); - } catch (SerDeException e) { - throw new SerializationError("Unable to get ObjectInspector for bucket columns", e); - } - // get StructFields for bucketed cols - bucketStructFields = new StructField[bucketIds.size()]; - List allFields = recordObjInspector.getAllStructFieldRefs(); - for (int i = 0; i < bucketIds.size(); i++) { - bucketStructFields[i] = allFields.get(bucketIds.get(i)); + public static class Builder { + public StrictJsonWriter build() { + return new StrictJsonWriter(); } } - @Override - public AbstractSerDe getSerde() { - return serde; - } - - protected HCatRecordObjectInspector getRecordObjectInspector() { - return recordObjInspector; - } - - @Override - protected StructField[] getBucketStructFields() { - return bucketStructFields; - } - - protected ObjectInspector[] getBucketObjectInspectors() { - return bucketObjInspectors; - } - - - @Override - public void write(long writeId, byte[] record) - throws StreamingIOFailure, SerializationError { - try { - Object encodedRow = encode(record); - int bucket = getBucket(encodedRow); - getRecordUpdater(bucket).insert(writeId, encodedRow); - } catch (IOException e) { - throw new StreamingIOFailure("Error writing record in transaction write id(" 
- + writeId + ")", e); - } - - } - /** * Creates JsonSerDe - * @param tbl used to create serde - * @param conf used to create serde - * @return + * * @throws SerializationError if serde could not be initialized */ - private static JsonSerDe createSerde(Table tbl, HiveConf conf) - throws SerializationError { + @Override + public JsonSerDe createSerde() throws SerializationError { try { Properties tableProps = MetaStoreUtils.getTableMetadata(tbl); JsonSerDe serde = new JsonSerDe(); SerDeUtils.initializeSerDe(serde, conf, tableProps, null); + this.serde = serde; return serde; } catch (SerDeException e) { throw new SerializationError("Error initializing serde " + JsonSerDe.class.getName(), e); @@ -158,5 +72,4 @@ public Object encode(byte[] utf8StrRecord) throws SerializationError { throw new SerializationError("Unable to convert byte[] record into Object", e); } } - } diff --git a/streaming/src/java/org/apache/hive/streaming/StrictRegexWriter.java b/streaming/src/java/org/apache/hive/streaming/StrictRegexWriter.java index c0b7324..563cf66 100644 --- a/streaming/src/java/org/apache/hive/streaming/StrictRegexWriter.java +++ b/streaming/src/java/org/apache/hive/streaming/StrictRegexWriter.java @@ -18,163 +18,73 @@ package org.apache.hive.streaming; -import java.io.IOException; -import java.util.ArrayList; -import java.util.List; import java.util.Properties; import org.apache.commons.lang.StringUtils; -import org.apache.hadoop.hive.conf.HiveConf; -import org.apache.hadoop.hive.metastore.api.FieldSchema; -import org.apache.hadoop.hive.metastore.api.Table; import org.apache.hadoop.hive.metastore.utils.MetaStoreUtils; import org.apache.hadoop.hive.serde.serdeConstants; -import org.apache.hadoop.hive.serde2.AbstractSerDe; import org.apache.hadoop.hive.serde2.RegexSerDe; import org.apache.hadoop.hive.serde2.SerDeException; import org.apache.hadoop.hive.serde2.SerDeUtils; -import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.StructField; -import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; import org.apache.hadoop.io.Text; /** * Streaming Writer handles text input data with regex. Uses * org.apache.hadoop.hive.serde2.RegexSerDe + * + * NOTE: This record writer is NOT thread-safe. Use one record writer per streaming connection. */ public class StrictRegexWriter extends AbstractRecordWriter { + private String regex; private RegexSerDe serde; - private final StructObjectInspector recordObjInspector; - private final ObjectInspector[] bucketObjInspectors; - private final StructField[] bucketStructFields; - /** - * @param endPoint the end point to write to - * @param conn connection this Writer is to be used with - * @throws ConnectionError - * @throws SerializationError - * @throws StreamingException - */ - public StrictRegexWriter(HiveEndPoint endPoint, StreamingConnection conn) - throws ConnectionError, SerializationError, StreamingException { - this(null, endPoint, null, conn); + private StrictRegexWriter(final Builder builder) { + this.regex = builder.regex; } - /** - * @param endPoint the end point to write to - * @param conf a Hive conf object. Should be null if not using advanced Hive settings. 
- * @param conn connection this Writer is to be used with - * @throws ConnectionError - * @throws SerializationError - * @throws StreamingException - */ - public StrictRegexWriter(HiveEndPoint endPoint, HiveConf conf, StreamingConnection conn) - throws ConnectionError, SerializationError, StreamingException { - this(null, endPoint, conf, conn); + public static Builder newBuilder() { + return new Builder(); } - /** - * @param regex to parse the data - * @param endPoint the end point to write to - * @param conf a Hive conf object. Should be null if not using advanced Hive settings. - * @param conn connection this Writer is to be used with - * @throws ConnectionError - * @throws SerializationError - * @throws StreamingException - */ - public StrictRegexWriter(String regex, HiveEndPoint endPoint, HiveConf conf, StreamingConnection conn) - throws ConnectionError, SerializationError, StreamingException { - super(endPoint, conf, conn); - this.serde = createSerde(tbl, conf, regex); - // get ObjInspectors for entire record and bucketed cols - try { - recordObjInspector = (StructObjectInspector) serde.getObjectInspector(); - this.bucketObjInspectors = getObjectInspectorsForBucketedCols(bucketIds, recordObjInspector); - } catch (SerDeException e) { - throw new SerializationError("Unable to get ObjectInspector for bucket columns", e); - } + public static class Builder { + private String regex; - // get StructFields for bucketed cols - bucketStructFields = new StructField[bucketIds.size()]; - List allFields = recordObjInspector.getAllStructFieldRefs(); - for (int i = 0; i < bucketIds.size(); i++) { - bucketStructFields[i] = allFields.get(bucketIds.get(i)); + public Builder withRegex(final String regex) { + this.regex = regex; + return this; } - } - - @Override - public AbstractSerDe getSerde() { - return serde; - } - - @Override - protected StructObjectInspector getRecordObjectInspector() { - return recordObjInspector; - } - - @Override - protected StructField[] getBucketStructFields() { - return bucketStructFields; - } - - @Override - protected ObjectInspector[] getBucketObjectInspectors() { - return bucketObjInspectors; - } - - @Override - public void write(long writeId, byte[] record) - throws StreamingIOFailure, SerializationError { - try { - Object encodedRow = encode(record); - int bucket = getBucket(encodedRow); - getRecordUpdater(bucket).insert(writeId, encodedRow); - } catch (IOException e) { - throw new StreamingIOFailure("Error writing record in transaction write id(" - + writeId + ")", e); + public StrictRegexWriter build() { + return new StrictRegexWriter(this); } } /** * Creates RegexSerDe * - * @param tbl used to create serde - * @param conf used to create serde - * @param regex used to create serde - * @return * @throws SerializationError if serde could not be initialized */ - private static RegexSerDe createSerde(Table tbl, HiveConf conf, String regex) - throws SerializationError { + @Override + public RegexSerDe createSerde() throws SerializationError { try { Properties tableProps = MetaStoreUtils.getTableMetadata(tbl); tableProps.setProperty(RegexSerDe.INPUT_REGEX, regex); - ArrayList tableColumns = getCols(tbl); - tableProps.setProperty(serdeConstants.LIST_COLUMNS, StringUtils.join(tableColumns, ",")); + tableProps.setProperty(serdeConstants.LIST_COLUMNS, StringUtils.join(inputColumns, ",")); RegexSerDe serde = new RegexSerDe(); SerDeUtils.initializeSerDe(serde, conf, tableProps, null); + this.serde = serde; return serde; } catch (SerDeException e) { throw new 
SerializationError("Error initializing serde " + RegexSerDe.class.getName(), e); } } - private static ArrayList getCols(Table table) { - List cols = table.getSd().getCols(); - ArrayList colNames = new ArrayList(cols.size()); - for (FieldSchema col : cols) { - colNames.add(col.getName().toLowerCase()); - } - return colNames; - } - /** * Encode Utf8 encoded string bytes using RegexSerDe * - * @param utf8StrRecord + * @param utf8StrRecord - serialized record * @return The encoded object - * @throws SerializationError + * @throws SerializationError - in case of any deserialization error */ @Override public Object encode(byte[] utf8StrRecord) throws SerializationError { diff --git a/streaming/src/java/org/apache/hive/streaming/TransactionBatch.java b/streaming/src/java/org/apache/hive/streaming/TransactionBatch.java deleted file mode 100644 index 2b05771..0000000 --- a/streaming/src/java/org/apache/hive/streaming/TransactionBatch.java +++ /dev/null @@ -1,125 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hive.streaming; - - -import java.util.Collection; - -/** - * Represents a set of Transactions returned by Hive. Supports opening, writing to - * and commiting/aborting each transaction. The interface is designed to ensure - * transactions in a batch are used up sequentially. To stream to the same HiveEndPoint - * concurrently, create separate StreamingConnections. - * - * Note on thread safety: At most 2 threads can run through a given TransactionBatch at the same - * time. One thread may call {@link #heartbeat()} and the other all other methods. - * Violating this may result in "out of sequence response". - * - */ -public interface TransactionBatch { - enum TxnState { - INACTIVE("I"), OPEN("O"), COMMITTED("C"), ABORTED("A"); - - private final String code; - TxnState(String code) { - this.code = code; - }; - public String toString() { - return code; - } - } - - /** - * Activate the next available transaction in the current transaction batch. - * @throws StreamingException if not able to switch to next Txn - * @throws InterruptedException if call in interrupted - */ - void beginNextTransaction() throws StreamingException, InterruptedException; - - /** - * Get Id of currently open transaction. - * @return transaction id - */ - Long getCurrentTxnId(); - - - /** - * Get write Id mapping to currently open transaction. - * @return write id - */ - Long getCurrentWriteId(); - - /** - * get state of current transaction. - */ - TxnState getCurrentTransactionState(); - - /** - * Commit the currently open transaction. 
- * @throws StreamingException if there are errors committing - * @throws InterruptedException if call in interrupted - */ - void commit() throws StreamingException, InterruptedException; - - /** - * Abort the currently open transaction. - * @throws StreamingException if there are errors - * @throws InterruptedException if call in interrupted - */ - void abort() throws StreamingException, InterruptedException; - - /** - * Remaining transactions are the ones that are not committed or aborted or open. - * Current open transaction is not considered part of remaining txns. - * @return number of transactions remaining this batch. - */ - int remainingTransactions(); - - - /** - * Write record using RecordWriter. - * @param record the data to be written - * @throws StreamingException if there are errors when writing - * @throws InterruptedException if call in interrupted - */ - void write(byte[] record) throws StreamingException, InterruptedException; - - /** - * Write records using RecordWriter. - * @throws StreamingException if there are errors when writing - * @throws InterruptedException if call in interrupted - */ - void write(Collection records) throws StreamingException, InterruptedException; - - - /** - * Issues a heartbeat to hive metastore on the current and remaining txn ids - * to keep them from expiring. - * @throws StreamingException if there are errors - */ - void heartbeat() throws StreamingException; - - /** - * Close the TransactionBatch. - * @throws StreamingException if there are errors closing batch - * @throws InterruptedException if call in interrupted - */ - void close() throws StreamingException, InterruptedException; - boolean isClosed(); -} diff --git a/streaming/src/java/org/apache/hive/streaming/TransactionBatchUnAvailable.java b/streaming/src/java/org/apache/hive/streaming/TransactionBatchUnAvailable.java deleted file mode 100644 index a8c8cd4..0000000 --- a/streaming/src/java/org/apache/hive/streaming/TransactionBatchUnAvailable.java +++ /dev/null @@ -1,25 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
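With TransactionBatch deleted above, code written against the old API needs a mechanical translation to the connection-level methods. A hedged sketch of that mapping follows; the old calls are quoted from the deleted interface and tests, and batch sizing now happens through withTransactionBatchSize(...) on the connection builder rather than through an explicit batch object.

import org.apache.hive.streaming.HiveStreamingConnection;
import org.apache.hive.streaming.StreamingException;

public class BatchMigrationSketch {
  // Old API (removed in this patch):
  //   TransactionBatch txnBatch = connection.fetchTransactionBatch(10, writer);
  //   txnBatch.beginNextTransaction();
  //   txnBatch.write(record);
  //   txnBatch.commit();   // or txnBatch.abort()
  //   txnBatch.close();
  //
  // New API: the connection manages transaction batches internally and rolls over
  // to the next batch automatically (see testAutoRollTransactionBatch below).
  static void writeOneTransaction(HiveStreamingConnection connection, byte[] record) throws StreamingException {
    connection.beginTransaction();
    connection.write(record);
    connection.commitTransaction();   // or connection.abortTransaction() on error
  }
}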
- */ - -package org.apache.hive.streaming; - -public class TransactionBatchUnAvailable extends StreamingException { - public TransactionBatchUnAvailable(HiveEndPoint ep, Exception e) { - super("Unable to acquire transaction batch on end point: " + ep, e); - } -} diff --git a/streaming/src/java/org/apache/hive/streaming/TransactionError.java b/streaming/src/java/org/apache/hive/streaming/TransactionError.java index a331b20..ae56e7c 100644 --- a/streaming/src/java/org/apache/hive/streaming/TransactionError.java +++ b/streaming/src/java/org/apache/hive/streaming/TransactionError.java @@ -19,11 +19,11 @@ package org.apache.hive.streaming; public class TransactionError extends StreamingException { - public TransactionError(String msg, Exception e) { + TransactionError(String msg, Exception e) { super(msg + (e == null ? "" : ": " + e.getMessage()), e); } - public TransactionError(String msg) { + TransactionError(String msg) { super(msg); } } diff --git a/streaming/src/test/org/apache/hive/streaming/TestDelimitedInputWriter.java b/streaming/src/test/org/apache/hive/streaming/TestDelimitedInputWriter.java deleted file mode 100644 index f0843a1..0000000 --- a/streaming/src/test/org/apache/hive/streaming/TestDelimitedInputWriter.java +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.hive.streaming; - -import java.util.ArrayList; -import java.util.Arrays; - -import org.junit.Test; - -import com.google.common.collect.Lists; - -import junit.framework.Assert; - -public class TestDelimitedInputWriter { - @Test - public void testFieldReordering() throws Exception { - - ArrayList colNames = Lists.newArrayList(new String[]{"col1", "col2", "col3", "col4", "col5"}); - {//1) test dropping fields - first middle & last - String[] fieldNames = {null, "col2", null, "col4", null}; - int[] mapping = DelimitedInputWriter.getFieldReordering(fieldNames, colNames); - Assert.assertTrue(Arrays.equals(mapping, new int[]{-1, 1, -1, 3, -1})); - } - - {//2) test reordering - String[] fieldNames = {"col5", "col4", "col3", "col2", "col1"}; - int[] mapping = DelimitedInputWriter.getFieldReordering(fieldNames, colNames); - Assert.assertTrue( Arrays.equals(mapping, new int[]{4,3,2,1,0}) ); - } - - {//3) test bad field names - String[] fieldNames = {"xyz", "abc", "col3", "col4", "as"}; - try { - DelimitedInputWriter.getFieldReordering(fieldNames, colNames); - Assert.fail(); - } catch (InvalidColumn e) { - // should throw - } - } - - {//4) test few field names - String[] fieldNames = {"col3", "col4"}; - int[] mapping = DelimitedInputWriter.getFieldReordering(fieldNames, colNames); - Assert.assertTrue( Arrays.equals(mapping, new int[]{2,3}) ); - } - - {//5) test extra field names - String[] fieldNames = {"col5", "col4", "col3", "col2", "col1", "col1"}; - try { - DelimitedInputWriter.getFieldReordering(fieldNames, colNames); - Assert.fail(); - } catch (InvalidColumn e) { - //show throw - } - } - } -} diff --git a/streaming/src/test/org/apache/hive/streaming/TestStreaming.java b/streaming/src/test/org/apache/hive/streaming/TestStreaming.java index 0ec3048..a6fdd66 100644 --- a/streaming/src/test/org/apache/hive/streaming/TestStreaming.java +++ b/streaming/src/test/org/apache/hive/streaming/TestStreaming.java @@ -35,6 +35,7 @@ import java.util.HashMap; import java.util.List; import java.util.Map; +import java.util.Set; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicBoolean; @@ -53,12 +54,10 @@ import org.apache.hadoop.hive.conf.Validator; import org.apache.hadoop.hive.metastore.HiveMetaStoreClient; import org.apache.hadoop.hive.metastore.IMetaStoreClient; -import org.apache.hadoop.hive.metastore.api.FieldSchema; import org.apache.hadoop.hive.metastore.api.GetOpenTxnsInfoResponse; import org.apache.hadoop.hive.metastore.api.LockState; import org.apache.hadoop.hive.metastore.api.LockType; import org.apache.hadoop.hive.metastore.api.MetaException; -import org.apache.hadoop.hive.metastore.api.NoSuchObjectException; import org.apache.hadoop.hive.metastore.api.Partition; import org.apache.hadoop.hive.metastore.api.ShowLocksRequest; import org.apache.hadoop.hive.metastore.api.ShowLocksResponse; @@ -90,13 +89,11 @@ import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableIntObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableLongObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableStringObjectInspector; -import org.apache.hadoop.hive.shims.Utils; import org.apache.hadoop.io.NullWritable; import org.apache.hadoop.mapred.InputFormat; import org.apache.hadoop.mapred.InputSplit; import org.apache.hadoop.mapred.JobConf; import org.apache.hadoop.mapred.Reporter; -import org.apache.hadoop.security.UserGroupInformation; import org.apache.orc.impl.OrcAcidUtils; import 
org.apache.orc.tools.FileDump; import org.apache.thrift.TException; @@ -115,6 +112,7 @@ public static class RawFileSystem extends RawLocalFileSystem { private static final URI NAME; + static { try { NAME = new URI("raw:///"); @@ -128,12 +126,16 @@ public URI getUri() { return NAME; } + @Override + public String getScheme() { + return "raw"; + } @Override public FileStatus getFileStatus(Path path) throws IOException { File file = pathToFile(path); if (!file.exists()) { - throw new FileNotFoundException("Can't find " + path); + throw new FileNotFoundException("Can't find " + path); } // get close enough short mod = 0; @@ -147,32 +149,30 @@ public FileStatus getFileStatus(Path path) throws IOException { mod |= 0111; } return new FileStatus(file.length(), file.isDirectory(), 1, 1024, - file.lastModified(), file.lastModified(), - FsPermission.createImmutable(mod), "owen", "users", path); + file.lastModified(), file.lastModified(), + FsPermission.createImmutable(mod), "owen", "users", path); } } private static final String COL1 = "id"; private static final String COL2 = "msg"; - private final HiveConf conf; + private static HiveConf conf = null; private IDriver driver; private final IMetaStoreClient msClient; - final String metaStoreURI = null; - // partitioned table private final static String dbName = "testing"; private final static String tblName = "alerts"; - private final static String[] fieldNames = new String[]{COL1,COL2}; - List partitionVals; + private final static String[] fieldNames = new String[]{COL1, COL2}; + static List partitionVals; private static Path partLoc; private static Path partLoc2; // unpartitioned table private final static String dbName2 = "testing2"; private final static String tblName2 = "alerts"; - private final static String[] fieldNames2 = new String[]{COL1,COL2}; + private final static String[] fieldNames2 = new String[]{COL1, COL2}; // for bucket join testing @@ -201,13 +201,9 @@ public TestStreaming() throws Exception { conf = new HiveConf(this.getClass()); conf.set("fs.raw.impl", RawFileSystem.class.getName()); - conf - .setVar(HiveConf.ConfVars.HIVE_AUTHORIZATION_MANAGER, - "org.apache.hadoop.hive.ql.security.authorization.plugin.sqlstd.SQLStdHiveAuthorizerFactory"); + conf.setVar(HiveConf.ConfVars.HIVE_AUTHORIZATION_MANAGER, + "org.apache.hadoop.hive.ql.security.authorization.plugin.sqlstd.SQLStdHiveAuthorizerFactory"); TxnDbUtil.setConfValues(conf); - if (metaStoreURI!=null) { - conf.setVar(HiveConf.ConfVars.METASTOREURIS, metaStoreURI); - } conf.setBoolVar(HiveConf.ConfVars.METASTORE_EXECUTE_SET_UGI, true); conf.setBoolVar(HiveConf.ConfVars.HIVE_SUPPORT_CONCURRENCY, true); dbFolder.create(); @@ -229,12 +225,13 @@ public void setup() throws Exception { // drop and recreate the necessary databases and tables dropDB(msClient, dbName); - String[] colNames = new String[] {COL1, COL2}; - String[] colTypes = new String[] {serdeConstants.INT_TYPE_NAME, serdeConstants.STRING_TYPE_NAME}; - String[] bucketCols = new String[] {COL1}; + String[] colNames = new String[]{COL1, COL2}; + String[] colTypes = new String[]{serdeConstants.INT_TYPE_NAME, serdeConstants.STRING_TYPE_NAME}; + String[] bucketCols = new String[]{COL1}; String loc1 = dbFolder.newFolder(dbName + ".db").toString(); String[] partNames = new String[]{"Continent", "Country"}; - partLoc = createDbAndTable(driver, dbName, tblName, partitionVals, colNames, colTypes, bucketCols, partNames, loc1, 1); + partLoc = createDbAndTable(driver, dbName, tblName, partitionVals, colNames, colTypes, bucketCols,
partNames, loc1, + 1); dropDB(msClient, dbName2); String loc2 = dbFolder.newFolder(dbName2 + ".db").toString(); @@ -249,19 +246,11 @@ public void setup() throws Exception { } @After - public void cleanup() throws Exception { + public void cleanup() { msClient.close(); driver.close(); } - private static List getPartitionKeys() { - List fields = new ArrayList(); - // Defining partition names in unsorted order - fields.add(new FieldSchema("continent", serdeConstants.STRING_TYPE_NAME, "")); - fields.add(new FieldSchema("country", serdeConstants.STRING_TYPE_NAME, "")); - return fields; - } - private void createStoreSales(String dbName, String loc) throws Exception { String dbUri = "raw://" + new Path(loc).toUri().toString(); String tableLoc = dbUri + Path.SEPARATOR + "store_sales"; @@ -301,43 +290,48 @@ private void createStoreSales(String dbName, String loc) throws Exception { ")\n" + " partitioned by (dt string)\n" + "clustered by (ss_store_sk, ss_promo_sk)\n" + - "INTO 4 BUCKETS stored as orc " + " location '" + tableLoc + "'" + " TBLPROPERTIES ('orc.compress'='NONE', 'transactional'='true')"); + "INTO 4 BUCKETS stored as orc " + " location '" + tableLoc + "'" + + " TBLPROPERTIES ('orc.compress'='NONE', 'transactional'='true')"); Assert.assertTrue(success); success = runDDL(driver, "alter table store_sales add partition(dt='2015')"); Assert.assertTrue(success); } + /** * make sure it works with table where bucket col is not 1st col + * * @throws Exception */ @Test public void testBucketingWhereBucketColIsNotFirstCol() throws Exception { List partitionVals = new ArrayList(); partitionVals.add("2015"); - HiveEndPoint endPt = new HiveEndPoint(metaStoreURI, "testing5", "store_sales", partitionVals); - StreamingConnection connection = endPt.newConnection(false, "UT_" + Thread.currentThread().getName()); - DelimitedInputWriter writer = new DelimitedInputWriter(new String[] {"ss_sold_date_sk","ss_sold_time_sk", "ss_item_sk", - "ss_customer_sk", "ss_cdemo_sk", "ss_hdemo_sk", "ss_addr_sk", "ss_store_sk", "ss_promo_sk", "ss_ticket_number", "ss_quantity", - "ss_wholesale_cost", "ss_list_price", "ss_sales_price", "ss_ext_discount_amt", "ss_ext_sales_price", "ss_ext_wholesale_cost", - "ss_ext_list_price", "ss_ext_tax", "ss_coupon_amt", "ss_net_paid", "ss_net_paid_inc_tax", "ss_net_profit"},",", endPt, connection); - - TransactionBatch txnBatch = connection.fetchTransactionBatch(2, writer); - txnBatch.beginNextTransaction(); - + StrictDelimitedInputWriter writer = StrictDelimitedInputWriter.newBuilder() + .withFieldDelimiter(',') + .build(); + HiveStreamingConnection connection = HiveStreamingConnection.newBuilder() + .withDatabase("testing5") + .withTable("store_sales") + .withStaticPartitionValues(partitionVals) + .withAgentInfo("UT_" + Thread.currentThread().getName()) + .withRecordWriter(writer) + .withHiveConf(conf) + .connect(); + + connection.beginTransaction(); StringBuilder row = new StringBuilder(); - for(int i = 0; i < 10; i++) { - for(int ints = 0; ints < 11; ints++) { + for (int i = 0; i < 10; i++) { + for (int ints = 0; ints < 11; ints++) { row.append(ints).append(','); } - for(int decs = 0; decs < 12; decs++) { + for (int decs = 0; decs < 12; decs++) { row.append(i + 0.1).append(','); } row.setLength(row.length() - 1); - txnBatch.write(row.toString().getBytes()); + connection.write(row.toString().getBytes()); } - txnBatch.commit(); - txnBatch.close(); + connection.commitTransaction(); connection.close(); ArrayList res = queryTable(driver, "select row__id.bucketid, * from 
testing5.store_sales"); @@ -352,35 +346,41 @@ public void testBucketingWhereBucketColIsNotFirstCol() throws Exception { @Test public void testNoBuckets() throws Exception { queryTable(driver, "drop table if exists default.streamingnobuckets"); - //todo: why does it need transactional_properties? - queryTable(driver, "create table default.streamingnobuckets (a string, b string) stored as orc TBLPROPERTIES('transactional'='true', 'transactional_properties'='default')"); + queryTable(driver, "create table default.streamingnobuckets (a string, b string) stored as orc " + + "TBLPROPERTIES('transactional'='true')"); queryTable(driver, "insert into default.streamingnobuckets values('foo','bar')"); List rs = queryTable(driver, "select * from default.streamingnobuckets"); Assert.assertEquals(1, rs.size()); Assert.assertEquals("foo\tbar", rs.get(0)); - HiveEndPoint endPt = new HiveEndPoint(metaStoreURI, "Default", "streamingNoBuckets", null); - String[] colNames1 = new String[] { "a", "b" }; - StreamingConnection connection = endPt.newConnection(false, "UT_" + Thread.currentThread().getName()); - DelimitedInputWriter wr = new DelimitedInputWriter(colNames1,",", endPt, connection); - - TransactionBatch txnBatch = connection.fetchTransactionBatch(2, wr); - txnBatch.beginNextTransaction(); - txnBatch.write("a1,b2".getBytes()); - txnBatch.write("a3,b4".getBytes()); + StrictDelimitedInputWriter wr = StrictDelimitedInputWriter.newBuilder() + .withFieldDelimiter(',') + .build(); + HiveStreamingConnection connection = HiveStreamingConnection.newBuilder() + .withDatabase("Default") + .withTable("streamingNoBuckets") + .withAgentInfo("UT_" + Thread.currentThread().getName()) + .withTransactionBatchSize(2) + .withRecordWriter(wr) + .withHiveConf(conf) + .connect(); + + connection.beginTransaction(); + connection.write("a1,b2".getBytes()); + connection.write("a3,b4".getBytes()); TxnStore txnHandler = TxnUtils.getTxnStore(conf); ShowLocksResponse resp = txnHandler.showLocks(new ShowLocksRequest()); Assert.assertEquals(resp.getLocksSize(), 1); Assert.assertEquals("streamingnobuckets", resp.getLocks().get(0).getTablename()); Assert.assertEquals("default", resp.getLocks().get(0).getDbname()); - txnBatch.commit(); - txnBatch.beginNextTransaction(); - txnBatch.write("a5,b6".getBytes()); - txnBatch.write("a7,b8".getBytes()); - txnBatch.commit(); - txnBatch.close(); + connection.commitTransaction(); + connection.beginTransaction(); + connection.write("a5,b6".getBytes()); + connection.write("a7,b8".getBytes()); + connection.commitTransaction(); + connection.close(); Assert.assertEquals("", 0, BucketCodec.determineVersion(536870912).decodeWriterId(536870912)); - rs = queryTable(driver,"select ROW__ID, a, b, INPUT__FILE__NAME from default.streamingnobuckets order by ROW__ID"); + rs = queryTable(driver, "select ROW__ID, a, b, INPUT__FILE__NAME from default.streamingnobuckets order by ROW__ID"); Assert.assertTrue(rs.get(0), rs.get(0).startsWith("{\"writeid\":1,\"bucketid\":536870912,\"rowid\":0}\tfoo\tbar")); Assert.assertTrue(rs.get(0), rs.get(0).endsWith("streamingnobuckets/delta_0000001_0000001_0000/bucket_00000")); @@ -404,7 +404,7 @@ public void testNoBuckets() throws Exception { queryTable(driver, "alter table default.streamingnobuckets compact 'major'"); runWorker(conf); - rs = queryTable(driver,"select ROW__ID, a, b, INPUT__FILE__NAME from default.streamingnobuckets order by ROW__ID"); + rs = queryTable(driver, "select ROW__ID, a, b, INPUT__FILE__NAME from default.streamingnobuckets order by ROW__ID"); 
Assert.assertTrue(rs.get(0), rs.get(0).startsWith("{\"writeid\":1,\"bucketid\":536870912,\"rowid\":0}\tfoo\tbar")); Assert.assertTrue(rs.get(0), rs.get(0).endsWith("streamingnobuckets/base_0000005/bucket_00000")); @@ -416,6 +416,152 @@ public void testNoBuckets() throws Exception { Assert.assertTrue(rs.get(3), rs.get(3).endsWith("streamingnobuckets/base_0000005/bucket_00000")); } + @Test + public void testAllTypesDelimitedWriter() throws Exception { + queryTable(driver, "drop table if exists default.alltypes"); + queryTable(driver, + "create table if not exists default.alltypes ( bo boolean, ti tinyint, si smallint, i int, bi bigint, " + + "f float, d double, de decimal(10,3), ts timestamp, da date, s string, c char(5), vc varchar(5), " + + "m map, l array, st struct ) " + + "stored as orc TBLPROPERTIES('transactional'='true')"); + StrictDelimitedInputWriter wr = StrictDelimitedInputWriter.newBuilder() + .withFieldDelimiter('|') + .withCollectionDelimiter(',') + .withMapKeyDelimiter(':') + .build(); + HiveStreamingConnection connection = HiveStreamingConnection.newBuilder() + .withDatabase("default") + .withTable("alltypes") + .withAgentInfo("UT_" + Thread.currentThread().getName()) + .withTransactionBatchSize(2) + .withRecordWriter(wr) + .withHiveConf(conf) + .connect(); + + String row1 = "true|10|100|1000|10000|4.0|20.0|4.2222|1969-12-31 " + + "15:59:58.174|1970-01-01|string|hello|hello|k1:v1|100,200|10,foo"; + String row2 = "false|20|200|2000|20000|8.0|40.0|2.2222|1970-12-31 15:59:58.174|1971-01-01|abcd|world|world|" + + "k4:v4|200,300|20,bar"; + connection.beginTransaction(); + connection.write(row1.getBytes()); + connection.write(row2.getBytes()); + connection.commitTransaction(); + connection.close(); + + List rs = queryTable(driver, "select ROW__ID, bo, ti, si, i, bi, f, d, de, ts, da, s, c, vc, m, l, st," + + " INPUT__FILE__NAME from default.alltypes order by ROW__ID"); + Assert.assertEquals(2, rs.size()); + String gotRow1 = rs.get(0); + String expectedPrefixRow1 = "{\"writeid\":1,\"bucketid\":536870912," + + "\"rowid\":0}\ttrue\t10\t100\t1000\t10000\t4.0\t20.0\t4.222\t1969-12-31 15:59:58.174\t1970-01-01\tstring" + + "\thello\thello\t{\"k1\":\"v1\"}\t[100,200]\t{\"c1\":10,\"c2\":\"foo\"}"; + String expectedSuffixRow1 = "alltypes/delta_0000001_0000002/bucket_00000"; + String gotRow2 = rs.get(1); + String expectedPrefixRow2 = "{\"writeid\":1,\"bucketid\":536870912," + + "\"rowid\":1}\tfalse\t20\t200\t2000\t20000\t8.0\t40.0\t2.222\t1970-12-31 15:59:58.174\t1971-01-01\tabcd" + + "\tworld\tworld\t{\"k4\":\"v4\"}\t[200,300]\t{\"c1\":20,\"c2\":\"bar\"}"; + String expectedSuffixRow2 = "alltypes/delta_0000001_0000002/bucket_00000"; + Assert.assertTrue(gotRow1, gotRow1.startsWith(expectedPrefixRow1)); + Assert.assertTrue(gotRow1, gotRow1.endsWith(expectedSuffixRow1)); + Assert.assertTrue(gotRow2, gotRow2.startsWith(expectedPrefixRow2)); + Assert.assertTrue(gotRow2, gotRow2.endsWith(expectedSuffixRow2)); + } + + @Test + public void testAutoRollTransactionBatch() throws Exception { + queryTable(driver, "drop table if exists default.streamingnobuckets"); + queryTable(driver, "create table default.streamingnobuckets (a string, b string) stored as orc " + + "TBLPROPERTIES('transactional'='true')"); + queryTable(driver, "insert into default.streamingnobuckets values('foo','bar')"); + List rs = queryTable(driver, "select * from default.streamingnobuckets"); + Assert.assertEquals(1, rs.size()); + Assert.assertEquals("foo\tbar", rs.get(0)); + StrictDelimitedInputWriter wr = 
StrictDelimitedInputWriter.newBuilder() + .withFieldDelimiter(',') + .build(); + HiveStreamingConnection connection = HiveStreamingConnection.newBuilder() + .withDatabase("default") + .withTable("streamingnobuckets") + .withAgentInfo("UT_" + Thread.currentThread().getName()) + .withRecordWriter(wr) + .withHiveConf(conf) + .withTransactionBatchSize(2) + .connect(); + + connection.beginTransaction(); + connection.write("a1,b2".getBytes()); + connection.write("a3,b4".getBytes()); + connection.commitTransaction(); + connection.beginTransaction(); + connection.write("a5,b6".getBytes()); + connection.write("a7,b8".getBytes()); + connection.commitTransaction(); + // should have rolled over to next transaction batch + connection.beginTransaction(); + connection.write("a9,b10".getBytes()); + connection.write("a11,b12".getBytes()); + connection.commitTransaction(); + connection.beginTransaction(); + connection.write("a13,b14".getBytes()); + connection.write("a15,b16".getBytes()); + connection.commitTransaction(); + connection.close(); + + Assert.assertEquals("", 0, BucketCodec.determineVersion(536870912).decodeWriterId(536870912)); + rs = queryTable(driver, "select ROW__ID, a, b, INPUT__FILE__NAME from default.streamingnobuckets order by ROW__ID"); + + Assert.assertTrue(rs.get(0), rs.get(0).startsWith("{\"writeid\":1,\"bucketid\":536870912,\"rowid\":0}\tfoo\tbar")); + Assert.assertTrue(rs.get(0), rs.get(0).endsWith("streamingnobuckets/delta_0000001_0000001_0000/bucket_00000")); + Assert.assertTrue(rs.get(1), rs.get(1).startsWith("{\"writeid\":2,\"bucketid\":536870912,\"rowid\":0}\ta1\tb2")); + Assert.assertTrue(rs.get(1), rs.get(1).endsWith("streamingnobuckets/delta_0000002_0000003/bucket_00000")); + Assert.assertTrue(rs.get(2), rs.get(2).startsWith("{\"writeid\":2,\"bucketid\":536870912,\"rowid\":1}\ta3\tb4")); + Assert.assertTrue(rs.get(2), rs.get(2).endsWith("streamingnobuckets/delta_0000002_0000003/bucket_00000")); + Assert.assertTrue(rs.get(3), rs.get(3).startsWith("{\"writeid\":3,\"bucketid\":536870912,\"rowid\":0}\ta5\tb6")); + Assert.assertTrue(rs.get(3), rs.get(3).endsWith("streamingnobuckets/delta_0000002_0000003/bucket_00000")); + Assert.assertTrue(rs.get(4), rs.get(4).startsWith("{\"writeid\":3,\"bucketid\":536870912,\"rowid\":1}\ta7\tb8")); + Assert.assertTrue(rs.get(4), rs.get(4).endsWith("streamingnobuckets/delta_0000002_0000003/bucket_00000")); + + Assert.assertTrue(rs.get(5), rs.get(5).startsWith("{\"writeid\":4,\"bucketid\":536870912,\"rowid\":0}\ta9\tb10")); + Assert.assertTrue(rs.get(5), rs.get(5).endsWith("streamingnobuckets/delta_0000004_0000005/bucket_00000")); + Assert.assertTrue(rs.get(6), rs.get(6).startsWith("{\"writeid\":4,\"bucketid\":536870912,\"rowid\":1}\ta11\tb12")); + Assert.assertTrue(rs.get(6), rs.get(6).endsWith("streamingnobuckets/delta_0000004_0000005/bucket_00000")); + Assert.assertTrue(rs.get(7), rs.get(7).startsWith("{\"writeid\":5,\"bucketid\":536870912,\"rowid\":0}\ta13\tb14")); + Assert.assertTrue(rs.get(7), rs.get(7).endsWith("streamingnobuckets/delta_0000004_0000005/bucket_00000")); + Assert.assertTrue(rs.get(8), rs.get(8).startsWith("{\"writeid\":5,\"bucketid\":536870912,\"rowid\":1}\ta15\tb16")); + Assert.assertTrue(rs.get(8), rs.get(8).endsWith("streamingnobuckets/delta_0000004_0000005/bucket_00000")); + + queryTable(driver, "update default.streamingnobuckets set a=0, b=0 where a='a7'"); + queryTable(driver, "delete from default.streamingnobuckets where a='a1'"); + queryTable(driver, "update default.streamingnobuckets set a=0, b=0 where a='a15'"); + 
queryTable(driver, "delete from default.streamingnobuckets where a='a9'"); + rs = queryTable(driver, "select a, b from default.streamingnobuckets order by a, b"); + int row = 0; + Assert.assertEquals("at row=" + row, "0\t0", rs.get(row++)); + Assert.assertEquals("at row=" + row, "0\t0", rs.get(row++)); + Assert.assertEquals("at row=" + row, "a11\tb12", rs.get(row++)); + Assert.assertEquals("at row=" + row, "a13\tb14", rs.get(row++)); + Assert.assertEquals("at row=" + row, "a3\tb4", rs.get(row++)); + Assert.assertEquals("at row=" + row, "a5\tb6", rs.get(row++)); + Assert.assertEquals("at row=" + row, "foo\tbar", rs.get(row++)); + + queryTable(driver, "alter table default.streamingnobuckets compact 'major'"); + runWorker(conf); + rs = queryTable(driver, "select ROW__ID, a, b, INPUT__FILE__NAME from default.streamingnobuckets order by ROW__ID"); + + Assert.assertTrue(rs.get(0), rs.get(0).startsWith("{\"writeid\":1,\"bucketid\":536870912,\"rowid\":0}\tfoo\tbar")); + Assert.assertTrue(rs.get(0), rs.get(0).endsWith("streamingnobuckets/base_0000009/bucket_00000")); + Assert.assertTrue(rs.get(1), rs.get(1).startsWith("{\"writeid\":2,\"bucketid\":536870912,\"rowid\":1}\ta3\tb4")); + Assert.assertTrue(rs.get(1), rs.get(1).endsWith("streamingnobuckets/base_0000009/bucket_00000")); + Assert.assertTrue(rs.get(2), rs.get(2).startsWith("{\"writeid\":3,\"bucketid\":536870912,\"rowid\":0}\ta5\tb6")); + Assert.assertTrue(rs.get(2), rs.get(2).endsWith("streamingnobuckets/base_0000009/bucket_00000")); + Assert.assertTrue(rs.get(3), rs.get(3).startsWith("{\"writeid\":4,\"bucketid\":536870912,\"rowid\":1}\ta11\tb12")); + Assert.assertTrue(rs.get(3), rs.get(3).endsWith("streamingnobuckets/base_0000009/bucket_00000")); + Assert.assertTrue(rs.get(4), rs.get(4).startsWith("{\"writeid\":5,\"bucketid\":536870912,\"rowid\":0}\ta13\tb14")); + Assert.assertTrue(rs.get(4), rs.get(4).endsWith("streamingnobuckets/base_0000009/bucket_00000")); + Assert.assertTrue(rs.get(5), rs.get(5).startsWith("{\"writeid\":6,\"bucketid\":536870912,\"rowid\":0}\t0\t0")); + Assert.assertTrue(rs.get(5), rs.get(5).endsWith("streamingnobuckets/base_0000009/bucket_00000")); + } + /** * this is a clone from TestTxnStatement2.... 
*/ @@ -436,71 +582,83 @@ public void testStreamBucketingMatchesRegularBucketing() throws Exception { int bucketCount = 100; String dbUri = "raw://" + new Path(dbFolder.newFolder().toString()).toUri().toString(); - String tableLoc = "'" + dbUri + Path.SEPARATOR + "streamedtable" + "'"; + String tableLoc = "'" + dbUri + Path.SEPARATOR + "streamedtable" + "'"; String tableLoc2 = "'" + dbUri + Path.SEPARATOR + "finaltable" + "'"; String tableLoc3 = "'" + dbUri + Path.SEPARATOR + "nobucket" + "'"; - runDDL(driver, "create database testBucketing3"); - runDDL(driver, "use testBucketing3"); - runDDL(driver, "create table streamedtable ( key1 string,key2 int,data string ) clustered by ( key1,key2 ) into " - + bucketCount + " buckets stored as orc location " + tableLoc + " TBLPROPERTIES ('transactional'='true')") ; -// In 'nobucket' table we capture bucketid from streamedtable to workaround a hive bug that prevents joins two identically bucketed tables - runDDL(driver, "create table nobucket ( bucketid int, key1 string,key2 int,data string ) location " + tableLoc3) ; - runDDL(driver, "create table finaltable ( bucketid int, key1 string,key2 int,data string ) clustered by ( key1,key2 ) into " - + bucketCount + " buckets stored as orc location " + tableLoc2 + " TBLPROPERTIES ('transactional'='true')"); - - - String[] records = new String[] { - "PSFAHYLZVC,29,EPNMA", - "PPPRKWAYAU,96,VUTEE", - "MIAOFERCHI,3,WBDSI", - "CEGQAZOWVN,0,WCUZL", - "XWAKMNSVQF,28,YJVHU", - "XBWTSAJWME,2,KDQFO", - "FUVLQTAXAY,5,LDSDG", - "QTQMDJMGJH,6,QBOMA", - "EFLOTLWJWN,71,GHWPS", - "PEQNAOJHCM,82,CAAFI", - "MOEKQLGZCP,41,RUACR", - "QZXMCOPTID,37,LFLWE", - "EYALVWICRD,13,JEZLC", - "VYWLZAYTXX,16,DMVZX", - "OSALYSQIXR,47,HNZVE", - "JGKVHKCEGQ,25,KSCJB", - "WQFMMYDHET,12,DTRWA", - "AJOVAYZKZQ,15,YBKFO", - "YAQONWCUAU,31,QJNHZ", - "DJBXUEUOEB,35,IYCBL" - }; - - HiveEndPoint endPt = new HiveEndPoint(metaStoreURI, "testBucketing3", "streamedtable", null); - String[] colNames1 = new String[] { "key1", "key2", "data" }; - DelimitedInputWriter wr = new DelimitedInputWriter(colNames1,",", endPt); - StreamingConnection connection = endPt.newConnection(false, "UT_" + Thread.currentThread().getName()); - - TransactionBatch txnBatch = connection.fetchTransactionBatch(2, wr); - txnBatch.beginNextTransaction(); - - for (String record : records) { - txnBatch.write(record.toString().getBytes()); - } + // disabling vectorization as this test yields incorrect results with vectorization + conf.setBoolVar(HiveConf.ConfVars.HIVE_VECTORIZATION_ENABLED, false); + try (IDriver driver = DriverFactory.newDriver(conf)) { + runDDL(driver, "create database testBucketing3"); + runDDL(driver, "use testBucketing3"); + runDDL(driver, "create table streamedtable ( key1 string,key2 int,data string ) clustered by ( key1,key2 ) into " + + bucketCount + " buckets stored as orc location " + tableLoc + " TBLPROPERTIES ('transactional'='true')"); + // In 'nobucket' table we capture bucketid from streamedtable to workaround a hive bug that prevents joins two identically bucketed tables + runDDL(driver, "create table nobucket ( bucketid int, key1 string,key2 int,data string ) location " + tableLoc3); + runDDL(driver, + "create table finaltable ( bucketid int, key1 string,key2 int,data string ) clustered by ( key1,key2 ) into " + + bucketCount + " buckets stored as orc location " + tableLoc2 + " TBLPROPERTIES ('transactional'='true')"); + + + String[] records = new String[]{ + "PSFAHYLZVC,29,EPNMA", + "PPPRKWAYAU,96,VUTEE", + "MIAOFERCHI,3,WBDSI", + 
"CEGQAZOWVN,0,WCUZL", + "XWAKMNSVQF,28,YJVHU", + "XBWTSAJWME,2,KDQFO", + "FUVLQTAXAY,5,LDSDG", + "QTQMDJMGJH,6,QBOMA", + "EFLOTLWJWN,71,GHWPS", + "PEQNAOJHCM,82,CAAFI", + "MOEKQLGZCP,41,RUACR", + "QZXMCOPTID,37,LFLWE", + "EYALVWICRD,13,JEZLC", + "VYWLZAYTXX,16,DMVZX", + "OSALYSQIXR,47,HNZVE", + "JGKVHKCEGQ,25,KSCJB", + "WQFMMYDHET,12,DTRWA", + "AJOVAYZKZQ,15,YBKFO", + "YAQONWCUAU,31,QJNHZ", + "DJBXUEUOEB,35,IYCBL" + }; + + StrictDelimitedInputWriter wr = StrictDelimitedInputWriter.newBuilder() + .withFieldDelimiter(',') + .build(); + HiveStreamingConnection connection = HiveStreamingConnection.newBuilder() + .withDatabase("testBucketing3") + .withTable("streamedtable") + .withAgentInfo("UT_" + Thread.currentThread().getName()) + .withRecordWriter(wr) + .withHiveConf(conf) + .connect(); + + connection.beginTransaction(); + + for (String record : records) { + connection.write(record.getBytes()); + } - txnBatch.commit(); - txnBatch.close(); - connection.close(); + connection.commitTransaction(); + connection.close(); - ArrayList res1 = queryTable(driver, "select row__id.bucketid, * from streamedtable order by key2"); - for (String re : res1) { - System.out.println(re); - } + ArrayList res1 = queryTable(driver, "select row__id.bucketid, * from streamedtable order by key2"); + for (String re : res1) { + LOG.error(re); + } - driver.run("insert into nobucket select row__id.bucketid,* from streamedtable"); - runDDL(driver, " insert into finaltable select * from nobucket"); - ArrayList res2 = queryTable(driver, "select row__id.bucketid,* from finaltable where row__id.bucketid<>bucketid"); - for (String s : res2) { - LOG.error(s); + driver.run("insert into nobucket select row__id.bucketid,* from streamedtable"); + runDDL(driver, "insert into finaltable select * from nobucket"); + ArrayList res2 = queryTable(driver, + "select row__id.bucketid,* from finaltable where row__id.bucketid<>bucketid"); + for (String s : res2) { + LOG.error(s); + } + Assert.assertTrue(res2.isEmpty()); + } finally { + conf.unset(HiveConf.ConfVars.HIVE_VECTORIZATION_ENABLED.varname); } - Assert.assertTrue(res2.isEmpty()); } @@ -512,32 +670,53 @@ public void testTableValidation() throws Exception { String tbl1 = "validation1"; String tbl2 = "validation2"; - String tableLoc = "'" + dbUri + Path.SEPARATOR + tbl1 + "'"; + String tableLoc = "'" + dbUri + Path.SEPARATOR + tbl1 + "'"; String tableLoc2 = "'" + dbUri + Path.SEPARATOR + tbl2 + "'"; runDDL(driver, "create database testBucketing3"); runDDL(driver, "use testBucketing3"); runDDL(driver, "create table " + tbl1 + " ( key1 string, data string ) clustered by ( key1 ) into " - + bucketCount + " buckets stored as orc location " + tableLoc + " TBLPROPERTIES ('transactional'='false')") ; + + bucketCount + " buckets stored as orc location " + tableLoc + " TBLPROPERTIES ('transactional'='false')"); runDDL(driver, "create table " + tbl2 + " ( key1 string, data string ) clustered by ( key1 ) into " - + bucketCount + " buckets stored as orc location " + tableLoc2 + " TBLPROPERTIES ('transactional'='false')") ; - + + bucketCount + " buckets stored as orc location " + tableLoc2 + " TBLPROPERTIES ('transactional'='false')"); + StrictDelimitedInputWriter writer = StrictDelimitedInputWriter.newBuilder() + .withFieldDelimiter(',') + .build(); + HiveStreamingConnection connection = null; try { - HiveEndPoint endPt = new HiveEndPoint(metaStoreURI, "testBucketing3", "validation1", null); - endPt.newConnection(false, "UT_" + Thread.currentThread().getName()); + connection = 
HiveStreamingConnection.newBuilder() + .withDatabase("testBucketing3") + .withTable("validation1") + .withAgentInfo("UT_" + Thread.currentThread().getName()) + .withRecordWriter(writer) + .withHiveConf(conf) + .connect(); Assert.assertTrue("InvalidTable exception was not thrown", false); } catch (InvalidTable e) { // expecting this exception + } finally { + if (connection != null) { + connection.close(); + } } try { - HiveEndPoint endPt = new HiveEndPoint(metaStoreURI, "testBucketing3", "validation2", null); - endPt.newConnection(false, "UT_" + Thread.currentThread().getName()); + connection = HiveStreamingConnection.newBuilder() + .withDatabase("testBucketing3") + .withTable("validation2") + .withAgentInfo("UT_" + Thread.currentThread().getName()) + .withRecordWriter(writer) + .withHiveConf(conf) + .connect(); Assert.assertTrue("InvalidTable exception was not thrown", false); } catch (InvalidTable e) { // expecting this exception + } finally { + if (connection != null) { + connection.close(); + } } } @@ -547,7 +726,7 @@ public void testTableValidation() throws Exception { */ @Deprecated private void checkDataWritten(Path partitionPath, long minTxn, long maxTxn, int buckets, int numExpectedFiles, - String... records) throws Exception { + String... records) throws Exception { ValidWriteIdList writeIds = msClient.getValidWriteIds(AcidUtils.getFullTableName(dbName, tblName)); AcidUtils.Directory dir = AcidUtils.getAcidState(partitionPath, conf, writeIds); Assert.assertEquals(0, dir.getObsolete().size()); @@ -585,7 +764,7 @@ private void checkDataWritten(Path partitionPath, long minTxn, long maxTxn, int InputSplit[] splits = inf.getSplits(job, buckets); Assert.assertEquals(numExpectedFiles, splits.length); org.apache.hadoop.mapred.RecordReader rr = - inf.getRecordReader(splits[0], job, Reporter.NULL); + inf.getRecordReader(splits[0], job, Reporter.NULL); NullWritable key = rr.createKey(); OrcStruct value = rr.createValue(); @@ -595,12 +774,13 @@ private void checkDataWritten(Path partitionPath, long minTxn, long maxTxn, int } Assert.assertEquals(false, rr.next(key, value)); } + /** * @param validationQuery query to read from table to compare data against {@code records} - * @param records expected data. each row is CVS list of values + * @param records expected data. each row is CSV list of values */ private void checkDataWritten2(Path partitionPath, long minTxn, long maxTxn, int numExpectedFiles, - String validationQuery, boolean vectorize, String... 
records) throws Exception { ValidWriteIdList txns = msClient.getValidWriteIds(AcidUtils.getFullTableName(dbName, tblName)); AcidUtils.Directory dir = AcidUtils.getAcidState(partitionPath, conf, txns); Assert.assertEquals(0, dir.getObsolete().size()); @@ -626,12 +806,13 @@ private void checkDataWritten2(Path partitionPath, long minTxn, long maxTxn, int Assert.assertEquals(minTxn, min); Assert.assertEquals(maxTxn, max); boolean isVectorizationEnabled = conf.getBoolVar(HiveConf.ConfVars.HIVE_VECTORIZATION_ENABLED); - if(vectorize) { + if (vectorize) { conf.setBoolVar(HiveConf.ConfVars.HIVE_VECTORIZATION_ENABLED, true); } String currStrategy = conf.getVar(HiveConf.ConfVars.HIVE_ORC_SPLIT_STRATEGY); - for(String strategy : ((Validator.StringSet)HiveConf.ConfVars.HIVE_ORC_SPLIT_STRATEGY.getValidator()).getExpected()) { + for (String strategy : ((Validator.StringSet) HiveConf.ConfVars.HIVE_ORC_SPLIT_STRATEGY.getValidator()) + .getExpected()) { //run it with each split strategy - make sure there are differences conf.setVar(HiveConf.ConfVars.HIVE_ORC_SPLIT_STRATEGY, strategy.toUpperCase()); List actualResult = queryTable(driver, validationQuery); @@ -656,35 +837,44 @@ private void checkNothingWritten(Path partitionPath) throws Exception { @Test public void testEndpointConnection() throws Exception { // For partitioned table, partitionVals are specified - HiveEndPoint endPt = new HiveEndPoint(metaStoreURI, dbName, tblName, partitionVals); - StreamingConnection connection = endPt.newConnection(false, "UT_" + Thread.currentThread().getName()); //shouldn't throw + StrictDelimitedInputWriter writer = StrictDelimitedInputWriter.newBuilder() + .withFieldDelimiter(',') + .build(); + HiveStreamingConnection connection = HiveStreamingConnection.newBuilder() + .withDatabase(dbName) + .withTable(tblName) + .withStaticPartitionValues(partitionVals) + .withAgentInfo("UT_" + Thread.currentThread().getName()) + .withRecordWriter(writer) + .withHiveConf(conf) + .connect(); connection.close(); // For unpartitioned table, partitionVals are not specified - endPt = new HiveEndPoint(metaStoreURI, dbName2, tblName2, null); - endPt.newConnection(false, "UT_" + Thread.currentThread().getName()).close(); // should not throw - - // For partitioned table, partitionVals are not specified - try { - endPt = new HiveEndPoint(metaStoreURI, dbName, tblName, null); - connection = endPt.newConnection(true, "UT_" + Thread.currentThread().getName()); - Assert.assertTrue("ConnectionError was not thrown", false); - connection.close(); - } catch (ConnectionError e) { - // expecting this exception - String errMsg = "doesn't specify any partitions for partitioned table"; - Assert.assertTrue(e.toString().endsWith(errMsg)); - } + connection = HiveStreamingConnection.newBuilder() + .withDatabase(dbName2) + .withTable(tblName2) + .withAgentInfo("UT_" + Thread.currentThread().getName()) + .withRecordWriter(writer) + .withHiveConf(conf) + .connect(); + connection.close(); // For unpartitioned table, partition values are specified try { - endPt = new HiveEndPoint(metaStoreURI, dbName2, tblName2, partitionVals); - connection = endPt.newConnection(false, "UT_" + Thread.currentThread().getName()); + connection = HiveStreamingConnection.newBuilder() + .withDatabase(dbName2) + .withTable(tblName2) + .withStaticPartitionValues(partitionVals) + .withAgentInfo("UT_" + Thread.currentThread().getName()) + .withRecordWriter(writer) + .withHiveConf(conf) + .connect(); Assert.assertTrue("ConnectionError was not thrown", false); connection.close(); } catch 
(ConnectionError e) { // expecting this exception - String errMsg = "specifies partitions for unpartitioned table"; + String errMsg = "specifies partitions for un-partitioned table"; Assert.assertTrue(e.toString().endsWith(errMsg)); } } @@ -695,417 +885,474 @@ public void testAddPartition() throws Exception { newPartVals.add(PART1_CONTINENT); newPartVals.add("Nepal"); - HiveEndPoint endPt = new HiveEndPoint(metaStoreURI, dbName, tblName - , newPartVals); - - // Ensure partition is absent - try { - msClient.getPartition(endPt.database, endPt.table, endPt.partitionVals); - Assert.assertTrue("Partition already exists", false); - } catch (NoSuchObjectException e) { - // expect this exception - } + StrictDelimitedInputWriter writer = StrictDelimitedInputWriter.newBuilder() + .withFieldDelimiter(',') + .build(); + HiveStreamingConnection connection = HiveStreamingConnection.newBuilder() + .withDatabase(dbName) + .withTable(tblName) + .withStaticPartitionValues(newPartVals) + .withAgentInfo("UT_" + Thread.currentThread().getName()) + .withRecordWriter(writer) + .withHiveConf(conf) + .connect(); // Create partition - Assert.assertNotNull(endPt.newConnection(true, "UT_" + Thread.currentThread().getName())); + Assert.assertNotNull(connection); // Ensure partition is present - Partition p = msClient.getPartition(endPt.database, endPt.table, endPt.partitionVals); + Partition p = msClient.getPartition(dbName, tblName, partitionVals); Assert.assertNotNull("Did not find added partition", p); } @Test public void testTransactionBatchEmptyCommit() throws Exception { // 1) to partitioned table - HiveEndPoint endPt = new HiveEndPoint(metaStoreURI, dbName, tblName, - partitionVals); - StreamingConnection connection = endPt.newConnection(false, "UT_" + Thread.currentThread().getName()); - DelimitedInputWriter writer = new DelimitedInputWriter(fieldNames,",", endPt, connection); - - TransactionBatch txnBatch = connection.fetchTransactionBatch(10, writer); - - txnBatch.beginNextTransaction(); - txnBatch.commit(); - Assert.assertEquals(TransactionBatch.TxnState.COMMITTED - , txnBatch.getCurrentTransactionState()); - txnBatch.close(); + StrictDelimitedInputWriter writer = StrictDelimitedInputWriter.newBuilder() + .withFieldDelimiter(',') + .build(); + HiveStreamingConnection connection = HiveStreamingConnection.newBuilder() + .withDatabase(dbName) + .withTable(tblName) + .withStaticPartitionValues(partitionVals) + .withAgentInfo("UT_" + Thread.currentThread().getName()) + .withRecordWriter(writer) + .withHiveConf(conf) + .connect(); + connection.beginTransaction(); + connection.commitTransaction(); + Assert.assertEquals(HiveStreamingConnection.TxnState.COMMITTED + , connection.getCurrentTransactionState()); connection.close(); // 2) To unpartitioned table - endPt = new HiveEndPoint(metaStoreURI, dbName2, tblName2, null); - writer = new DelimitedInputWriter(fieldNames2,",", endPt); - connection = endPt.newConnection(false, "UT_" + Thread.currentThread().getName()); - - txnBatch = connection.fetchTransactionBatch(10, writer); - txnBatch.beginNextTransaction(); - txnBatch.commit(); - Assert.assertEquals(TransactionBatch.TxnState.COMMITTED - , txnBatch.getCurrentTransactionState()); - txnBatch.close(); + writer = StrictDelimitedInputWriter.newBuilder() + .withFieldDelimiter(',') + .build(); + connection = HiveStreamingConnection.newBuilder() + .withDatabase(dbName2) + .withTable(tblName2) + .withAgentInfo("UT_" + Thread.currentThread().getName()) + .withRecordWriter(writer) + .withHiveConf(conf) + .connect(); + + 
connection.beginTransaction(); + connection.commitTransaction(); + Assert.assertEquals(HiveStreamingConnection.TxnState.COMMITTED + , connection.getCurrentTransactionState()); connection.close(); } /** * check that transactions that have not heartbeated and timedout get properly aborted + * * @throws Exception */ @Test public void testTimeOutReaper() throws Exception { - HiveEndPoint endPt = new HiveEndPoint(metaStoreURI, dbName2, tblName2, null); - DelimitedInputWriter writer = new DelimitedInputWriter(fieldNames2,",", endPt); - StreamingConnection connection = endPt.newConnection(false, "UT_" + Thread.currentThread().getName()); - - TransactionBatch txnBatch = connection.fetchTransactionBatch(5, writer); - txnBatch.beginNextTransaction(); + StrictDelimitedInputWriter writer = StrictDelimitedInputWriter.newBuilder() + .withFieldDelimiter(',') + .build(); + HiveStreamingConnection connection = HiveStreamingConnection.newBuilder() + .withDatabase(dbName2) + .withTable(tblName2) + .withAgentInfo("UT_" + Thread.currentThread().getName()) + .withRecordWriter(writer) + .withHiveConf(conf) + .connect(); + + connection.beginTransaction(); conf.setTimeVar(HiveConf.ConfVars.HIVE_TIMEDOUT_TXN_REAPER_START, 0, TimeUnit.SECONDS); //ensure txn timesout - conf.setTimeVar(HiveConf.ConfVars.HIVE_TXN_TIMEOUT, 1, TimeUnit.MILLISECONDS); + conf.setTimeVar(HiveConf.ConfVars.HIVE_TXN_TIMEOUT, 2, TimeUnit.MILLISECONDS); AcidHouseKeeperService houseKeeperService = new AcidHouseKeeperService(); houseKeeperService.setConf(conf); houseKeeperService.run(); try { //should fail because the TransactionBatch timed out - txnBatch.commit(); - } - catch(TransactionError e) { + connection.commitTransaction(); + } catch (TransactionError e) { Assert.assertTrue("Expected aborted transaction", e.getCause() instanceof TxnAbortedException); } - txnBatch.close(); - txnBatch = connection.fetchTransactionBatch(10, writer); - txnBatch.beginNextTransaction(); - txnBatch.commit(); - txnBatch.beginNextTransaction(); + connection.close(); + connection = HiveStreamingConnection.newBuilder() + .withDatabase(dbName2) + .withTable(tblName2) + .withAgentInfo("UT_" + Thread.currentThread().getName()) + .withRecordWriter(writer) + .withHiveConf(conf) + .connect(); + connection.beginTransaction(); + connection.commitTransaction(); + connection.beginTransaction(); houseKeeperService.run(); try { //should fail because the TransactionBatch timed out - txnBatch.commit(); - } - catch(TransactionError e) { + connection.commitTransaction(); + } catch (TransactionError e) { Assert.assertTrue("Expected aborted transaction", e.getCause() instanceof TxnAbortedException); } - txnBatch.close(); connection.close(); } @Test public void testHeartbeat() throws Exception { - HiveEndPoint endPt = new HiveEndPoint(metaStoreURI, dbName2, tblName2, null); - StreamingConnection connection = endPt.newConnection(false, "UT_" + Thread.currentThread().getName()); - DelimitedInputWriter writer = new DelimitedInputWriter(fieldNames2,",", endPt, connection); - - TransactionBatch txnBatch = connection.fetchTransactionBatch(5, writer); - txnBatch.beginNextTransaction(); - //todo: this should ideally check Transaction heartbeat as well, but heartbeat - //timestamp is not reported yet - //GetOpenTxnsInfoResponse txnresp = msClient.showTxns(); - ShowLocksRequest request = new ShowLocksRequest(); - request.setDbname(dbName2); - request.setTablename(tblName2); - ShowLocksResponse response = msClient.showLocks(request); - Assert.assertEquals("Wrong nubmer of locks: " + response, 1, 
response.getLocks().size()); - ShowLocksResponseElement lock = response.getLocks().get(0); - long acquiredAt = lock.getAcquiredat(); - long heartbeatAt = lock.getLastheartbeat(); - txnBatch.heartbeat(); - response = msClient.showLocks(request); - Assert.assertEquals("Wrong number of locks2: " + response, 1, response.getLocks().size()); - lock = response.getLocks().get(0); - Assert.assertEquals("Acquired timestamp didn't match", acquiredAt, lock.getAcquiredat()); - Assert.assertTrue("Expected new heartbeat (" + lock.getLastheartbeat() + - ") == old heartbeat(" + heartbeatAt +")", lock.getLastheartbeat() == heartbeatAt); - txnBatch.close(); - int txnBatchSize = 200; - txnBatch = connection.fetchTransactionBatch(txnBatchSize, writer); - for(int i = 0; i < txnBatchSize; i++) { - txnBatch.beginNextTransaction(); - if(i % 47 == 0) { - txnBatch.heartbeat(); - } - if(i % 10 == 0) { - txnBatch.abort(); - } - else { - txnBatch.commit(); - } - if(i % 37 == 0) { - txnBatch.heartbeat(); + int transactionBatch = 20; + conf.setTimeVar(HiveConf.ConfVars.HIVE_TXN_TIMEOUT, 200, TimeUnit.MILLISECONDS); + StrictDelimitedInputWriter writer = StrictDelimitedInputWriter.newBuilder() + .withFieldDelimiter(',') + .build(); + HiveStreamingConnection connection = HiveStreamingConnection.newBuilder() + .withDatabase(dbName2) + .withTable(tblName2) + .withAgentInfo("UT_" + Thread.currentThread().getName()) + .withTransactionBatchSize(transactionBatch) + .withRecordWriter(writer) + .withHiveConf(conf) + .connect(); + try { + connection.beginTransaction(); + ShowLocksRequest request = new ShowLocksRequest(); + request.setDbname(dbName2); + request.setTablename(tblName2); + ShowLocksResponse response = msClient.showLocks(request); + Assert.assertEquals("Wrong number of locks: " + response, 1, response.getLocks().size()); + ShowLocksResponseElement lock = response.getLocks().get(0); + long acquiredAt = lock.getAcquiredat(); + long heartbeatAt = lock.getLastheartbeat(); + response = msClient.showLocks(request); + Assert.assertEquals("Wrong number of locks2: " + response, 1, response.getLocks().size()); + lock = response.getLocks().get(0); + Assert.assertEquals("Acquired timestamp didn't match", acquiredAt, lock.getAcquiredat()); + Assert.assertTrue("Expected new heartbeat (" + lock.getLastheartbeat() + + ") == old heartbeat(" + heartbeatAt + ")", lock.getLastheartbeat() == heartbeatAt); + for (int i = 0; i < transactionBatch * 3; i++) { + connection.beginTransaction(); + if (i % 10 == 0) { + connection.abortTransaction(); + } else { + connection.commitTransaction(); + } + Thread.sleep(10); } + } finally { + conf.unset(HiveConf.ConfVars.HIVE_TXN_TIMEOUT.varname); + connection.close(); } - } + @Test public void testTransactionBatchEmptyAbort() throws Exception { // 1) to partitioned table - HiveEndPoint endPt = new HiveEndPoint(metaStoreURI, dbName, tblName, - partitionVals); - StreamingConnection connection = endPt.newConnection(true, "UT_" + Thread.currentThread().getName()); - DelimitedInputWriter writer = new DelimitedInputWriter(fieldNames,",", endPt, connection); - - TransactionBatch txnBatch = connection.fetchTransactionBatch(10, writer); - txnBatch.beginNextTransaction(); - txnBatch.abort(); - Assert.assertEquals(TransactionBatch.TxnState.ABORTED - , txnBatch.getCurrentTransactionState()); - txnBatch.close(); + StrictDelimitedInputWriter writer = StrictDelimitedInputWriter.newBuilder() + .withFieldDelimiter(',') + .build(); + HiveStreamingConnection connection = HiveStreamingConnection.newBuilder() + 
.withDatabase(dbName) + .withTable(tblName) + .withStaticPartitionValues(partitionVals) + .withAgentInfo("UT_" + Thread.currentThread().getName()) + .withRecordWriter(writer) + .withHiveConf(conf) + .connect(); + + connection.beginTransaction(); + connection.abortTransaction(); + Assert.assertEquals(HiveStreamingConnection.TxnState.ABORTED + , connection.getCurrentTransactionState()); connection.close(); // 2) to unpartitioned table - endPt = new HiveEndPoint(metaStoreURI, dbName2, tblName2, null); - writer = new DelimitedInputWriter(fieldNames,",", endPt); - connection = endPt.newConnection(true, "UT_" + Thread.currentThread().getName()); - - txnBatch = connection.fetchTransactionBatch(10, writer); - txnBatch.beginNextTransaction(); - txnBatch.abort(); - Assert.assertEquals(TransactionBatch.TxnState.ABORTED - , txnBatch.getCurrentTransactionState()); - txnBatch.close(); + writer = StrictDelimitedInputWriter.newBuilder() + .withFieldDelimiter(',') + .build(); + connection = HiveStreamingConnection.newBuilder() + .withDatabase(dbName2) + .withTable(tblName2) + .withAgentInfo("UT_" + Thread.currentThread().getName()) + .withRecordWriter(writer) + .withHiveConf(conf) + .connect(); + + connection.beginTransaction(); + connection.abortTransaction(); + Assert.assertEquals(HiveStreamingConnection.TxnState.ABORTED + , connection.getCurrentTransactionState()); connection.close(); } @Test - public void testTransactionBatchCommit_Delimited() throws Exception { - testTransactionBatchCommit_Delimited(null); - } - @Test - public void testTransactionBatchCommit_DelimitedUGI() throws Exception { - testTransactionBatchCommit_Delimited(Utils.getUGI()); - } - private void testTransactionBatchCommit_Delimited(UserGroupInformation ugi) throws Exception { - HiveEndPoint endPt = new HiveEndPoint(metaStoreURI, dbName, tblName, - partitionVals); - StreamingConnection connection = endPt.newConnection(true, conf, ugi, "UT_" + Thread.currentThread().getName()); - DelimitedInputWriter writer = new DelimitedInputWriter(fieldNames,",", endPt, conf, connection); + public void testTransactionBatchCommitDelimited() throws Exception { + StrictDelimitedInputWriter writer = StrictDelimitedInputWriter.newBuilder() + .withFieldDelimiter(',') + .build(); + + HiveStreamingConnection connection = HiveStreamingConnection.newBuilder() + .withDatabase(dbName) + .withTable(tblName) + .withStaticPartitionValues(partitionVals) + .withAgentInfo("UT_" + Thread.currentThread().getName()) + .withHiveConf(conf) + .withRecordWriter(writer) + .withTransactionBatchSize(10) + .connect(); // 1st Txn - TransactionBatch txnBatch = connection.fetchTransactionBatch(10, writer); - txnBatch.beginNextTransaction(); - Assert.assertEquals(TransactionBatch.TxnState.OPEN - , txnBatch.getCurrentTransactionState()); - txnBatch.write("1,Hello streaming".getBytes()); - txnBatch.commit(); + connection.beginTransaction(); + Assert.assertEquals(HiveStreamingConnection.TxnState.OPEN + , connection.getCurrentTransactionState()); + connection.write("1,Hello streaming".getBytes()); + connection.commitTransaction(); checkDataWritten(partLoc, 1, 10, 1, 1, "{1, Hello streaming}"); - Assert.assertEquals(TransactionBatch.TxnState.COMMITTED - , txnBatch.getCurrentTransactionState()); + Assert.assertEquals(HiveStreamingConnection.TxnState.COMMITTED + , connection.getCurrentTransactionState()); // 2nd Txn - txnBatch.beginNextTransaction(); - Assert.assertEquals(TransactionBatch.TxnState.OPEN - , txnBatch.getCurrentTransactionState()); - txnBatch.write("2,Welcome to 
streaming".getBytes()); + connection.beginTransaction(); + Assert.assertEquals(HiveStreamingConnection.TxnState.OPEN + , connection.getCurrentTransactionState()); + connection.write("2,Welcome to streaming".getBytes()); // data should not be visible checkDataWritten(partLoc, 1, 10, 1, 1, "{1, Hello streaming}"); - txnBatch.commit(); + connection.commitTransaction(); checkDataWritten(partLoc, 1, 10, 1, 1, "{1, Hello streaming}", "{2, Welcome to streaming}"); - txnBatch.close(); - Assert.assertEquals(TransactionBatch.TxnState.INACTIVE - , txnBatch.getCurrentTransactionState()); - - connection.close(); + Assert.assertEquals(HiveStreamingConnection.TxnState.INACTIVE + , connection.getCurrentTransactionState()); - // To Unpartitioned table - endPt = new HiveEndPoint(metaStoreURI, dbName2, tblName2, null); - connection = endPt.newConnection(true, conf, ugi, "UT_" + Thread.currentThread().getName()); - writer = new DelimitedInputWriter(fieldNames,",", endPt, conf, connection); + // To Unpartitioned table + writer = StrictDelimitedInputWriter.newBuilder() + .withFieldDelimiter(',') + .build(); + connection = HiveStreamingConnection.newBuilder() + .withDatabase(dbName2) + .withTable(tblName2) + .withAgentInfo("UT_" + Thread.currentThread().getName()) + .withHiveConf(conf) + .withRecordWriter(writer) + .connect(); // 1st Txn - txnBatch = connection.fetchTransactionBatch(10, writer); - txnBatch.beginNextTransaction(); - Assert.assertEquals(TransactionBatch.TxnState.OPEN - , txnBatch.getCurrentTransactionState()); - txnBatch.write("1,Hello streaming".getBytes()); - txnBatch.commit(); - - Assert.assertEquals(TransactionBatch.TxnState.COMMITTED - , txnBatch.getCurrentTransactionState()); + connection.beginTransaction(); + Assert.assertEquals(HiveStreamingConnection.TxnState.OPEN + , connection.getCurrentTransactionState()); + connection.write("1,Hello streaming".getBytes()); + connection.commitTransaction(); + + Assert.assertEquals(HiveStreamingConnection.TxnState.COMMITTED + , connection.getCurrentTransactionState()); connection.close(); } @Test - public void testTransactionBatchCommit_Regex() throws Exception { - testTransactionBatchCommit_Regex(null); - } - @Test - public void testTransactionBatchCommit_RegexUGI() throws Exception { - testTransactionBatchCommit_Regex(Utils.getUGI()); - } - private void testTransactionBatchCommit_Regex(UserGroupInformation ugi) throws Exception { - HiveEndPoint endPt = new HiveEndPoint(metaStoreURI, dbName, tblName, - partitionVals); - StreamingConnection connection = endPt.newConnection(true, conf, ugi, "UT_" + Thread.currentThread().getName()); + public void testTransactionBatchCommitRegex() throws Exception { String regex = "([^,]*),(.*)"; - StrictRegexWriter writer = new StrictRegexWriter(regex, endPt, conf, connection); + StrictRegexWriter writer = StrictRegexWriter.newBuilder() + .withRegex(regex) + .build(); + HiveStreamingConnection connection = HiveStreamingConnection.newBuilder() + .withDatabase(dbName) + .withTable(tblName) + .withStaticPartitionValues(partitionVals) + .withAgentInfo("UT_" + Thread.currentThread().getName()) + .withHiveConf(conf) + .withRecordWriter(writer) + .withTransactionBatchSize(10) + .connect(); // 1st Txn - TransactionBatch txnBatch = connection.fetchTransactionBatch(10, writer); - txnBatch.beginNextTransaction(); - Assert.assertEquals(TransactionBatch.TxnState.OPEN - , txnBatch.getCurrentTransactionState()); - txnBatch.write("1,Hello streaming".getBytes()); - txnBatch.commit(); + connection.beginTransaction(); + 
Assert.assertEquals(HiveStreamingConnection.TxnState.OPEN + , connection.getCurrentTransactionState()); + connection.write("1,Hello streaming".getBytes()); + connection.commitTransaction(); checkDataWritten(partLoc, 1, 10, 1, 1, "{1, Hello streaming}"); - Assert.assertEquals(TransactionBatch.TxnState.COMMITTED - , txnBatch.getCurrentTransactionState()); + Assert.assertEquals(HiveStreamingConnection.TxnState.COMMITTED + , connection.getCurrentTransactionState()); // 2nd Txn - txnBatch.beginNextTransaction(); - Assert.assertEquals(TransactionBatch.TxnState.OPEN - , txnBatch.getCurrentTransactionState()); - txnBatch.write("2,Welcome to streaming".getBytes()); + connection.beginTransaction(); + Assert.assertEquals(HiveStreamingConnection.TxnState.OPEN + , connection.getCurrentTransactionState()); + connection.write("2,Welcome to streaming".getBytes()); // data should not be visible checkDataWritten(partLoc, 1, 10, 1, 1, "{1, Hello streaming}"); - txnBatch.commit(); + connection.commitTransaction(); checkDataWritten(partLoc, 1, 10, 1, 1, "{1, Hello streaming}", "{2, Welcome to streaming}"); - txnBatch.close(); - Assert.assertEquals(TransactionBatch.TxnState.INACTIVE - , txnBatch.getCurrentTransactionState()); - - connection.close(); - + Assert.assertEquals(HiveStreamingConnection.TxnState.INACTIVE + , connection.getCurrentTransactionState()); // To Unpartitioned table - endPt = new HiveEndPoint(metaStoreURI, dbName2, tblName2, null); - connection = endPt.newConnection(true, conf, ugi, "UT_" + Thread.currentThread().getName()); regex = "([^:]*):(.*)"; - writer = new StrictRegexWriter(regex, endPt, conf, connection); + writer = StrictRegexWriter.newBuilder() + .withRegex(regex) + .build(); + + connection = HiveStreamingConnection.newBuilder() + .withDatabase(dbName2) + .withTable(tblName2) + .withAgentInfo("UT_" + Thread.currentThread().getName()) + .withHiveConf(conf) + .withRecordWriter(writer) + .connect(); // 1st Txn - txnBatch = connection.fetchTransactionBatch(10, writer); - txnBatch.beginNextTransaction(); - Assert.assertEquals(TransactionBatch.TxnState.OPEN - , txnBatch.getCurrentTransactionState()); - txnBatch.write("1:Hello streaming".getBytes()); - txnBatch.commit(); - - Assert.assertEquals(TransactionBatch.TxnState.COMMITTED - , txnBatch.getCurrentTransactionState()); + connection.beginTransaction(); + Assert.assertEquals(HiveStreamingConnection.TxnState.OPEN + , connection.getCurrentTransactionState()); + connection.write("1:Hello streaming".getBytes()); + connection.commitTransaction(); + + Assert.assertEquals(HiveStreamingConnection.TxnState.COMMITTED + , connection.getCurrentTransactionState()); connection.close(); } @Test - public void testTransactionBatchCommit_Json() throws Exception { - HiveEndPoint endPt = new HiveEndPoint(metaStoreURI, dbName, tblName, - partitionVals); - StreamingConnection connection = endPt.newConnection(true, "UT_" + Thread.currentThread().getName()); - StrictJsonWriter writer = new StrictJsonWriter(endPt, connection); + public void testTransactionBatchCommitJson() throws Exception { + StrictJsonWriter writer = StrictJsonWriter.newBuilder() + .build(); + HiveStreamingConnection connection = HiveStreamingConnection.newBuilder() + .withDatabase(dbName) + .withTable(tblName) + .withStaticPartitionValues(partitionVals) + .withAgentInfo("UT_" + Thread.currentThread().getName()) + .withRecordWriter(writer) + .withHiveConf(conf) + .withTransactionBatchSize(10) + .connect(); // 1st Txn - TransactionBatch txnBatch = connection.fetchTransactionBatch(10, writer); - 
txnBatch.beginNextTransaction(); - Assert.assertEquals(TransactionBatch.TxnState.OPEN - , txnBatch.getCurrentTransactionState()); + connection.beginTransaction(); + Assert.assertEquals(HiveStreamingConnection.TxnState.OPEN + , connection.getCurrentTransactionState()); String rec1 = "{\"id\" : 1, \"msg\": \"Hello streaming\"}"; - txnBatch.write(rec1.getBytes()); - txnBatch.commit(); + connection.write(rec1.getBytes()); + connection.commitTransaction(); checkDataWritten(partLoc, 1, 10, 1, 1, "{1, Hello streaming}"); - Assert.assertEquals(TransactionBatch.TxnState.COMMITTED - , txnBatch.getCurrentTransactionState()); - - txnBatch.close(); - Assert.assertEquals(TransactionBatch.TxnState.INACTIVE - , txnBatch.getCurrentTransactionState()); + Assert.assertEquals(HiveStreamingConnection.TxnState.COMMITTED + , connection.getCurrentTransactionState()); connection.close(); + Assert.assertEquals(HiveStreamingConnection.TxnState.INACTIVE + , connection.getCurrentTransactionState()); + List rs = queryTable(driver, "select * from " + dbName + "." + tblName); Assert.assertEquals(1, rs.size()); } @Test public void testRemainingTransactions() throws Exception { - HiveEndPoint endPt = new HiveEndPoint(metaStoreURI, dbName, tblName, - partitionVals); - DelimitedInputWriter writer = new DelimitedInputWriter(fieldNames,",", endPt); - StreamingConnection connection = endPt.newConnection(true, "UT_" + Thread.currentThread().getName()); - + StrictDelimitedInputWriter writer = StrictDelimitedInputWriter.newBuilder() + .withFieldDelimiter(',') + .build(); + HiveStreamingConnection connection = HiveStreamingConnection.newBuilder() + .withDatabase(dbName) + .withTable(tblName) + .withStaticPartitionValues(partitionVals) + .withAgentInfo("UT_" + Thread.currentThread().getName()) + .withRecordWriter(writer) + .withHiveConf(conf) + .connect(); + connection.beginTransaction(); // 1) test with txn.Commit() - TransactionBatch txnBatch = connection.fetchTransactionBatch(10, writer); - int batch=0; - int initialCount = txnBatch.remainingTransactions(); - while (txnBatch.remainingTransactions()>0) { - txnBatch.beginNextTransaction(); - Assert.assertEquals(--initialCount, txnBatch.remainingTransactions()); - for (int rec=0; rec<2; ++rec) { - Assert.assertEquals(TransactionBatch.TxnState.OPEN - , txnBatch.getCurrentTransactionState()); - txnBatch.write((batch * rec + ",Hello streaming").getBytes()); + int batch = 0; + int initialCount = connection.remainingTransactions(); + while (connection.remainingTransactions() > 0) { + connection.beginTransaction(); + Assert.assertEquals(--initialCount, connection.remainingTransactions()); + for (int rec = 0; rec < 2; ++rec) { + Assert.assertEquals(HiveStreamingConnection.TxnState.OPEN + , connection.getCurrentTransactionState()); + connection.write((batch * rec + ",Hello streaming").getBytes()); } - txnBatch.commit(); - Assert.assertEquals(TransactionBatch.TxnState.COMMITTED - , txnBatch.getCurrentTransactionState()); + connection.commitTransaction(); + Assert.assertEquals(HiveStreamingConnection.TxnState.COMMITTED + , connection.getCurrentTransactionState()); ++batch; } - Assert.assertEquals(0, txnBatch.remainingTransactions()); - txnBatch.close(); - - Assert.assertEquals(TransactionBatch.TxnState.INACTIVE - , txnBatch.getCurrentTransactionState()); + Assert.assertEquals(0, connection.remainingTransactions()); + connection.close(); + Assert.assertEquals(HiveStreamingConnection.TxnState.INACTIVE + , connection.getCurrentTransactionState()); + + connection = 
HiveStreamingConnection.newBuilder() + .withDatabase(dbName) + .withTable(tblName) + .withStaticPartitionValues(partitionVals) + .withAgentInfo("UT_" + Thread.currentThread().getName()) + .withRecordWriter(writer) + .withHiveConf(conf) + .connect(); // 2) test with txn.Abort() - txnBatch = connection.fetchTransactionBatch(10, writer); - batch=0; - initialCount = txnBatch.remainingTransactions(); - while (txnBatch.remainingTransactions()>0) { - txnBatch.beginNextTransaction(); - Assert.assertEquals(--initialCount,txnBatch.remainingTransactions()); - for (int rec=0; rec<2; ++rec) { - Assert.assertEquals(TransactionBatch.TxnState.OPEN - , txnBatch.getCurrentTransactionState()); - txnBatch.write((batch * rec + ",Hello streaming").getBytes()); + connection.beginTransaction(); + batch = 0; + initialCount = connection.remainingTransactions(); + while (connection.remainingTransactions() > 0) { + connection.beginTransaction(); + Assert.assertEquals(--initialCount, connection.remainingTransactions()); + for (int rec = 0; rec < 2; ++rec) { + Assert.assertEquals(HiveStreamingConnection.TxnState.OPEN + , connection.getCurrentTransactionState()); + connection.write((batch * rec + ",Hello streaming").getBytes()); } - txnBatch.abort(); - Assert.assertEquals(TransactionBatch.TxnState.ABORTED - , txnBatch.getCurrentTransactionState()); + connection.abortTransaction(); + Assert.assertEquals(HiveStreamingConnection.TxnState.ABORTED + , connection.getCurrentTransactionState()); ++batch; } - Assert.assertEquals(0, txnBatch.remainingTransactions()); - txnBatch.close(); - - Assert.assertEquals(TransactionBatch.TxnState.INACTIVE - , txnBatch.getCurrentTransactionState()); - + Assert.assertEquals(0, connection.remainingTransactions()); connection.close(); + + Assert.assertEquals(HiveStreamingConnection.TxnState.INACTIVE + , connection.getCurrentTransactionState()); } @Test public void testTransactionBatchAbort() throws Exception { - - HiveEndPoint endPt = new HiveEndPoint(metaStoreURI, dbName, tblName, - partitionVals); - StreamingConnection connection = endPt.newConnection(false, "UT_" + Thread.currentThread().getName()); - DelimitedInputWriter writer = new DelimitedInputWriter(fieldNames,",", endPt, connection); - - - TransactionBatch txnBatch = connection.fetchTransactionBatch(10, writer); - txnBatch.beginNextTransaction(); - txnBatch.write("1,Hello streaming".getBytes()); - txnBatch.write("2,Welcome to streaming".getBytes()); - txnBatch.abort(); + StrictDelimitedInputWriter writer = StrictDelimitedInputWriter.newBuilder() + .withFieldDelimiter(',') + .build(); + + HiveStreamingConnection connection = HiveStreamingConnection.newBuilder() + .withDatabase(dbName) + .withTable(tblName) + .withStaticPartitionValues(partitionVals) + .withAgentInfo("UT_" + Thread.currentThread().getName()) + .withRecordWriter(writer) + .withHiveConf(conf) + .connect(); + + connection.beginTransaction(); + connection.write("1,Hello streaming".getBytes()); + connection.write("2,Welcome to streaming".getBytes()); + connection.abortTransaction(); checkNothingWritten(partLoc); - Assert.assertEquals(TransactionBatch.TxnState.ABORTED - , txnBatch.getCurrentTransactionState()); + Assert.assertEquals(HiveStreamingConnection.TxnState.ABORTED + , connection.getCurrentTransactionState()); - txnBatch.close(); connection.close(); checkNothingWritten(partLoc); @@ -1116,123 +1363,162 @@ public void testTransactionBatchAbort() throws Exception { @Test public void testTransactionBatchAbortAndCommit() throws Exception { String agentInfo = "UT_" + 
Thread.currentThread().getName(); - HiveEndPoint endPt = new HiveEndPoint(metaStoreURI, dbName, tblName, - partitionVals); - StreamingConnection connection = endPt.newConnection(false, agentInfo); - DelimitedInputWriter writer = new DelimitedInputWriter(fieldNames,",", endPt, connection); - - TransactionBatch txnBatch = connection.fetchTransactionBatch(10, writer); - txnBatch.beginNextTransaction(); - txnBatch.write("1,Hello streaming".getBytes()); - txnBatch.write("2,Welcome to streaming".getBytes()); + StrictDelimitedInputWriter writer = StrictDelimitedInputWriter.newBuilder() + .withFieldDelimiter(',') + .build(); + + HiveStreamingConnection connection = HiveStreamingConnection.newBuilder() + .withDatabase(dbName) + .withTable(tblName) + .withStaticPartitionValues(partitionVals) + .withAgentInfo(agentInfo) + .withRecordWriter(writer) + .withHiveConf(conf) + .withTransactionBatchSize(10) + .connect(); + + connection.beginTransaction(); + connection.write("1,Hello streaming".getBytes()); + connection.write("2,Welcome to streaming".getBytes()); ShowLocksResponse resp = msClient.showLocks(new ShowLocksRequest()); Assert.assertEquals("LockCount", 1, resp.getLocksSize()); Assert.assertEquals("LockType", LockType.SHARED_READ, resp.getLocks().get(0).getType()); Assert.assertEquals("LockState", LockState.ACQUIRED, resp.getLocks().get(0).getState()); Assert.assertEquals("AgentInfo", agentInfo, resp.getLocks().get(0).getAgentInfo()); - txnBatch.abort(); + connection.abortTransaction(); checkNothingWritten(partLoc); - Assert.assertEquals(TransactionBatch.TxnState.ABORTED - , txnBatch.getCurrentTransactionState()); + Assert.assertEquals(HiveStreamingConnection.TxnState.ABORTED + , connection.getCurrentTransactionState()); - txnBatch.beginNextTransaction(); - txnBatch.write("1,Hello streaming".getBytes()); - txnBatch.write("2,Welcome to streaming".getBytes()); - txnBatch.commit(); + connection.beginTransaction(); + connection.write("1,Hello streaming".getBytes()); + connection.write("2,Welcome to streaming".getBytes()); + connection.commitTransaction(); checkDataWritten(partLoc, 1, 10, 1, 1, "{1, Hello streaming}", - "{2, Welcome to streaming}"); + "{2, Welcome to streaming}"); - txnBatch.close(); connection.close(); } @Test public void testMultipleTransactionBatchCommits() throws Exception { - HiveEndPoint endPt = new HiveEndPoint(metaStoreURI, dbName, tblName, - partitionVals); - DelimitedInputWriter writer = new DelimitedInputWriter(fieldNames,",", endPt); - StreamingConnection connection = endPt.newConnection(true, "UT_" + Thread.currentThread().getName()); - - TransactionBatch txnBatch = connection.fetchTransactionBatch(10, writer); - txnBatch.beginNextTransaction(); - txnBatch.write("1,Hello streaming".getBytes()); - txnBatch.commit(); + StrictDelimitedInputWriter writer = StrictDelimitedInputWriter.newBuilder() + .withFieldDelimiter(',') + .build(); + + HiveStreamingConnection connection = HiveStreamingConnection.newBuilder() + .withDatabase(dbName) + .withTable(tblName) + .withStaticPartitionValues(partitionVals) + .withAgentInfo("UT_" + Thread.currentThread().getName()) + .withRecordWriter(writer) + .withTransactionBatchSize(10) + .withHiveConf(conf) + .connect(); + + connection.beginTransaction(); + connection.write("1,Hello streaming".getBytes()); + connection.commitTransaction(); String validationQuery = "select id, msg from " + dbName + "." 
+ tblName + " order by id, msg"; checkDataWritten2(partLoc, 1, 10, 1, validationQuery, false, "1\tHello streaming"); - txnBatch.beginNextTransaction(); - txnBatch.write("2,Welcome to streaming".getBytes()); - txnBatch.commit(); + connection.beginTransaction(); + connection.write("2,Welcome to streaming".getBytes()); + connection.commitTransaction(); - checkDataWritten2(partLoc, 1, 10, 1, validationQuery, true, "1\tHello streaming", - "2\tWelcome to streaming"); + checkDataWritten2(partLoc, 1, 10, 1, validationQuery, true, "1\tHello streaming", + "2\tWelcome to streaming"); - txnBatch.close(); + connection.close(); + connection = HiveStreamingConnection.newBuilder() + .withDatabase(dbName) + .withTable(tblName) + .withStaticPartitionValues(partitionVals) + .withAgentInfo("UT_" + Thread.currentThread().getName()) + .withRecordWriter(writer) + .withTransactionBatchSize(10) + .withHiveConf(conf) + .connect(); // 2nd Txn Batch - txnBatch = connection.fetchTransactionBatch(10, writer); - txnBatch.beginNextTransaction(); - txnBatch.write("3,Hello streaming - once again".getBytes()); - txnBatch.commit(); + connection.beginTransaction(); + connection.write("3,Hello streaming - once again".getBytes()); + connection.commitTransaction(); - checkDataWritten2(partLoc, 1, 20, 2, validationQuery, false, "1\tHello streaming", - "2\tWelcome to streaming", "3\tHello streaming - once again"); + checkDataWritten2(partLoc, 1, 20, 2, validationQuery, false, "1\tHello streaming", + "2\tWelcome to streaming", "3\tHello streaming - once again"); - txnBatch.beginNextTransaction(); - txnBatch.write("4,Welcome to streaming - once again".getBytes()); - txnBatch.commit(); + connection.beginTransaction(); + connection.write("4,Welcome to streaming - once again".getBytes()); + connection.commitTransaction(); - checkDataWritten2(partLoc, 1, 20, 2, validationQuery, true, "1\tHello streaming", - "2\tWelcome to streaming", "3\tHello streaming - once again", - "4\tWelcome to streaming - once again"); + checkDataWritten2(partLoc, 1, 20, 2, validationQuery, true, "1\tHello streaming", + "2\tWelcome to streaming", "3\tHello streaming - once again", + "4\tWelcome to streaming - once again"); - Assert.assertEquals(TransactionBatch.TxnState.COMMITTED - , txnBatch.getCurrentTransactionState()); - - txnBatch.close(); + Assert.assertEquals(HiveStreamingConnection.TxnState.COMMITTED + , connection.getCurrentTransactionState()); connection.close(); } @Test public void testInterleavedTransactionBatchCommits() throws Exception { - HiveEndPoint endPt = new HiveEndPoint(metaStoreURI, dbName, tblName, - partitionVals); - DelimitedInputWriter writer = new DelimitedInputWriter(fieldNames, ",", endPt); - StreamingConnection connection = endPt.newConnection(false, "UT_" + Thread.currentThread().getName()); + StrictDelimitedInputWriter writer = StrictDelimitedInputWriter.newBuilder() + .withFieldDelimiter(',') + .build(); + + HiveStreamingConnection connection = HiveStreamingConnection.newBuilder() + .withDatabase(dbName) + .withTable(tblName) + .withStaticPartitionValues(partitionVals) + .withAgentInfo("UT_" + Thread.currentThread().getName()) + .withRecordWriter(writer) + .withHiveConf(conf) + .withTransactionBatchSize(10) + .connect(); // Acquire 1st Txn Batch - TransactionBatch txnBatch1 = connection.fetchTransactionBatch(10, writer); - txnBatch1.beginNextTransaction(); + connection.beginTransaction(); // Acquire 2nd Txn Batch - DelimitedInputWriter writer2 = new DelimitedInputWriter(fieldNames, ",", endPt); - TransactionBatch txnBatch2 = 
connection.fetchTransactionBatch(10, writer2); - txnBatch2.beginNextTransaction(); + StrictDelimitedInputWriter writer2 = StrictDelimitedInputWriter.newBuilder() + .withFieldDelimiter(',') + .build(); + HiveStreamingConnection connection2 = HiveStreamingConnection.newBuilder() + .withDatabase(dbName) + .withTable(tblName) + .withStaticPartitionValues(partitionVals) + .withAgentInfo("UT_" + Thread.currentThread().getName()) + .withRecordWriter(writer2) + .withHiveConf(conf) + .withTransactionBatchSize(10) + .connect(); + connection2.beginTransaction(); // Interleaved writes to both batches - txnBatch1.write("1,Hello streaming".getBytes()); - txnBatch2.write("3,Hello streaming - once again".getBytes()); + connection.write("1,Hello streaming".getBytes()); + connection2.write("3,Hello streaming - once again".getBytes()); checkNothingWritten(partLoc); - txnBatch2.commit(); + connection2.commitTransaction(); String validationQuery = "select id, msg from " + dbName + "." + tblName + " order by id, msg"; checkDataWritten2(partLoc, 11, 20, 1, validationQuery, true, "3\tHello streaming - once again"); - txnBatch1.commit(); + connection.commitTransaction(); /*now both batches have committed (but not closed) so we for each primary file we expect a side file to exist and indicate the true length of primary file*/ FileSystem fs = partLoc.getFileSystem(conf); AcidUtils.Directory dir = AcidUtils.getAcidState(partLoc, conf, - msClient.getValidWriteIds(AcidUtils.getFullTableName(dbName, tblName))); - for(AcidUtils.ParsedDelta pd : dir.getCurrentDirectories()) { - for(FileStatus stat : fs.listStatus(pd.getPath(), AcidUtils.bucketFileFilter)) { + msClient.getValidWriteIds(AcidUtils.getFullTableName(dbName, tblName))); + for (AcidUtils.ParsedDelta pd : dir.getCurrentDirectories()) { + for (FileStatus stat : fs.listStatus(pd.getPath(), AcidUtils.bucketFileFilter)) { Path lengthFile = OrcAcidUtils.getSideFile(stat.getPath()); Assert.assertTrue(lengthFile + " missing", fs.exists(lengthFile)); long lengthFileSize = fs.getFileStatus(lengthFile).getLen(); @@ -1244,20 +1530,20 @@ public void testInterleavedTransactionBatchCommits() throws Exception { } } checkDataWritten2(partLoc, 1, 20, 2, - validationQuery, false,"1\tHello streaming", "3\tHello streaming - once again"); + validationQuery, false, "1\tHello streaming", "3\tHello streaming - once again"); - txnBatch1.beginNextTransaction(); - txnBatch1.write("2,Welcome to streaming".getBytes()); + connection.beginTransaction(); + connection.write("2,Welcome to streaming".getBytes()); - txnBatch2.beginNextTransaction(); - txnBatch2.write("4,Welcome to streaming - once again".getBytes()); + connection2.beginTransaction(); + connection2.write("4,Welcome to streaming - once again".getBytes()); //here each batch has written data and committed (to bucket0 since table only has 1 bucket) //so each of 2 deltas has 1 bucket0 and 1 bucket0_flush_length. Furthermore, each bucket0 //has now received more data(logically - it's buffered) but it is not yet committed. 
//lets check that side files exist, etc dir = AcidUtils.getAcidState(partLoc, conf, msClient.getValidWriteIds(AcidUtils.getFullTableName(dbName, tblName))); - for(AcidUtils.ParsedDelta pd : dir.getCurrentDirectories()) { - for(FileStatus stat : fs.listStatus(pd.getPath(), AcidUtils.bucketFileFilter)) { + for (AcidUtils.ParsedDelta pd : dir.getCurrentDirectories()) { + for (FileStatus stat : fs.listStatus(pd.getPath(), AcidUtils.bucketFileFilter)) { Path lengthFile = OrcAcidUtils.getSideFile(stat.getPath()); Assert.assertTrue(lengthFile + " missing", fs.exists(lengthFile)); long lengthFileSize = fs.getFileStatus(lengthFile).getLen(); @@ -1269,103 +1555,97 @@ public void testInterleavedTransactionBatchCommits() throws Exception { } } checkDataWritten2(partLoc, 1, 20, 2, - validationQuery, true,"1\tHello streaming", "3\tHello streaming - once again"); + validationQuery, true, "1\tHello streaming", "3\tHello streaming - once again"); - txnBatch1.commit(); + connection.commitTransaction(); checkDataWritten2(partLoc, 1, 20, 2, validationQuery, false, "1\tHello streaming", - "2\tWelcome to streaming", - "3\tHello streaming - once again"); + "2\tWelcome to streaming", + "3\tHello streaming - once again"); - txnBatch2.commit(); + connection2.commitTransaction(); checkDataWritten2(partLoc, 1, 20, 2, validationQuery, true, "1\tHello streaming", - "2\tWelcome to streaming", - "3\tHello streaming - once again", - "4\tWelcome to streaming - once again"); - - Assert.assertEquals(TransactionBatch.TxnState.COMMITTED - , txnBatch1.getCurrentTransactionState()); - Assert.assertEquals(TransactionBatch.TxnState.COMMITTED - , txnBatch2.getCurrentTransactionState()); + "2\tWelcome to streaming", + "3\tHello streaming - once again", + "4\tWelcome to streaming - once again"); - txnBatch1.close(); - txnBatch2.close(); + Assert.assertEquals(HiveStreamingConnection.TxnState.COMMITTED + , connection.getCurrentTransactionState()); + Assert.assertEquals(HiveStreamingConnection.TxnState.COMMITTED + , connection2.getCurrentTransactionState()); connection.close(); + connection2.close(); } private static class WriterThd extends Thread { private final StreamingConnection conn; - private final DelimitedInputWriter writer; private final String data; private Throwable error; - WriterThd(HiveEndPoint ep, String data) throws Exception { + WriterThd(String data) throws Exception { super("Writer_" + data); - writer = new DelimitedInputWriter(fieldNames, ",", ep); - conn = ep.newConnection(false, "UT_" + Thread.currentThread().getName()); + RecordWriter writer = StrictDelimitedInputWriter.newBuilder() + .withFieldDelimiter(',') + .build(); + HiveStreamingConnection connection = HiveStreamingConnection.newBuilder() + .withDatabase(dbName) + .withTable(tblName) + .withStaticPartitionValues(partitionVals) + .withRecordWriter(writer) + .withHiveConf(conf) + .connect(); + this.conn = connection; this.data = data; - setUncaughtExceptionHandler(new UncaughtExceptionHandler() { - @Override - public void uncaughtException(Thread thread, Throwable throwable) { - error = throwable; - LOG.error("Thread " + thread.getName() + " died: " + throwable.getMessage(), throwable); - } + setUncaughtExceptionHandler((thread, throwable) -> { + error = throwable; + LOG.error(connection.toTransactionString()); + LOG.error("Thread " + thread.getName() + " died: " + throwable.getMessage(), throwable); }); } @Override public void run() { - TransactionBatch txnBatch = null; try { - txnBatch = conn.fetchTransactionBatch(10, writer); - while 
(txnBatch.remainingTransactions() > 0) { - txnBatch.beginNextTransaction(); - txnBatch.write(data.getBytes()); - txnBatch.write(data.getBytes()); - txnBatch.commit(); + for (int i = 0; i < 10; i++) { + conn.beginTransaction(); + conn.write(data.getBytes()); + conn.write(data.getBytes()); + conn.commitTransaction(); } // while } catch (Exception e) { throw new RuntimeException(e); } finally { - if (txnBatch != null) { + if (conn != null) { try { - txnBatch.close(); + conn.close(); } catch (Exception e) { LOG.error("txnBatch.close() failed: " + e.getMessage(), e); - conn.close(); } } - try { - conn.close(); - } catch (Exception e) { - LOG.error("conn.close() failed: " + e.getMessage(), e); - } - } } } @Test public void testConcurrentTransactionBatchCommits() throws Exception { - final HiveEndPoint ep = new HiveEndPoint(metaStoreURI, dbName, tblName, partitionVals); List writers = new ArrayList(3); - writers.add(new WriterThd(ep, "1,Matrix")); - writers.add(new WriterThd(ep, "2,Gandhi")); - writers.add(new WriterThd(ep, "3,Silence")); + writers.add(new WriterThd("1,Matrix")); + writers.add(new WriterThd("2,Gandhi")); + writers.add(new WriterThd("3,Silence")); - for(WriterThd w : writers) { + for (WriterThd w : writers) { w.start(); } - for(WriterThd w : writers) { + for (WriterThd w : writers) { w.join(); } - for(WriterThd w : writers) { - if(w.error != null) { + for (WriterThd w : writers) { + if (w.error != null) { Assert.assertFalse("Writer thread" + w.getName() + " died: " + w.error.getMessage() + " See log file for stack trace", true); } @@ -1376,11 +1656,11 @@ public void testConcurrentTransactionBatchCommits() throws Exception { private ArrayList dumpBucket(Path orcFile) throws IOException { org.apache.hadoop.fs.FileSystem fs = org.apache.hadoop.fs.FileSystem.getLocal(new Configuration()); Reader reader = OrcFile.createReader(orcFile, - OrcFile.readerOptions(conf).filesystem(fs)); + OrcFile.readerOptions(conf).filesystem(fs)); RecordReader rows = reader.rows(); StructObjectInspector inspector = (StructObjectInspector) reader - .getObjectInspector(); + .getObjectInspector(); System.out.format("Found Bucket File : %s \n", orcFile.getName()); ArrayList result = new ArrayList(); @@ -1402,7 +1682,7 @@ public void testConcurrentTransactionBatchCommits() throws Exception { WritableLongObjectInspector f1ins = (WritableLongObjectInspector) fields.get(1).getFieldObjectInspector(); WritableIntObjectInspector f2ins = (WritableIntObjectInspector) fields.get(2).getFieldObjectInspector(); WritableLongObjectInspector f3ins = (WritableLongObjectInspector) fields.get(3).getFieldObjectInspector(); - WritableLongObjectInspector f4ins = (WritableLongObjectInspector) fields.get(4).getFieldObjectInspector(); + WritableLongObjectInspector f4ins = (WritableLongObjectInspector) fields.get(4).getFieldObjectInspector(); StructObjectInspector f5ins = (StructObjectInspector) fields.get(5).getFieldObjectInspector(); int f0 = f0ins.get(inspector.getStructFieldData(row, fields.get(0))); @@ -1412,7 +1692,7 @@ public void testConcurrentTransactionBatchCommits() throws Exception { long f4 = f4ins.get(inspector.getStructFieldData(row, fields.get(4))); SampleRec f5 = deserializeInner(inspector.getStructFieldData(row, fields.get(5)), f5ins); - return new Object[] {f0, f1, f2, f3, f4, f5}; + return new Object[]{f0, f1, f2, f3, f4, f5}; } // Assumes row schema => string,int,string @@ -1437,49 +1717,67 @@ public void testBucketing() throws Exception { // 1) Create two bucketed tables String dbLocation = 
dbFolder.newFolder(dbName3).getCanonicalPath() + ".db"; - dbLocation = dbLocation.replaceAll("\\\\","/"); // for windows paths + dbLocation = dbLocation.replaceAll("\\\\", "/"); // for windows paths String[] colNames = "key1,key2,data".split(","); String[] colTypes = "string,int,string".split(","); String[] bucketNames = "key1,key2".split(","); int bucketCount = 4; createDbAndTable(driver, dbName3, tblName3, null, colNames, colTypes, bucketNames - , null, dbLocation, bucketCount); + , null, dbLocation, bucketCount); String dbLocation2 = dbFolder.newFolder(dbName4).getCanonicalPath() + ".db"; - dbLocation2 = dbLocation2.replaceAll("\\\\","/"); // for windows paths + dbLocation2 = dbLocation2.replaceAll("\\\\", "/"); // for windows paths String[] colNames2 = "key3,key4,data2".split(","); String[] colTypes2 = "string,int,string".split(","); String[] bucketNames2 = "key3,key4".split(","); createDbAndTable(driver, dbName4, tblName4, null, colNames2, colTypes2, bucketNames2 - , null, dbLocation2, bucketCount); + , null, dbLocation2, bucketCount); // 2) Insert data into both tables - HiveEndPoint endPt = new HiveEndPoint(metaStoreURI, dbName3, tblName3, null); - StreamingConnection connection = endPt.newConnection(false, agentInfo); - DelimitedInputWriter writer = new DelimitedInputWriter(colNames,",", endPt, connection); + StrictDelimitedInputWriter writer = StrictDelimitedInputWriter.newBuilder() + .withFieldDelimiter(',') + .build(); + + HiveStreamingConnection connection = HiveStreamingConnection.newBuilder() + .withDatabase(dbName3) + .withTable(tblName3) + .withAgentInfo(agentInfo) + .withRecordWriter(writer) + .withHiveConf(conf) + .connect(); + + connection.beginTransaction(); + connection.write("name0,1,Hello streaming".getBytes()); + connection.write("name2,2,Welcome to streaming".getBytes()); + connection.write("name4,2,more Streaming unlimited".getBytes()); + connection.write("name5,2,even more Streaming unlimited".getBytes()); + connection.commitTransaction(); + connection.close(); + + + StrictDelimitedInputWriter writer2 = StrictDelimitedInputWriter.newBuilder() + .withFieldDelimiter(',') + .build(); - TransactionBatch txnBatch = connection.fetchTransactionBatch(2, writer); - txnBatch.beginNextTransaction(); - txnBatch.write("name0,1,Hello streaming".getBytes()); - txnBatch.write("name2,2,Welcome to streaming".getBytes()); - txnBatch.write("name4,2,more Streaming unlimited".getBytes()); - txnBatch.write("name5,2,even more Streaming unlimited".getBytes()); - txnBatch.commit(); + HiveStreamingConnection connection2 = HiveStreamingConnection.newBuilder() + .withDatabase(dbName4) + .withTable(tblName4) + .withAgentInfo("UT_" + Thread.currentThread().getName()) + .withRecordWriter(writer2) + .withHiveConf(conf) + .connect(); - HiveEndPoint endPt2 = new HiveEndPoint(metaStoreURI, dbName4, tblName4, null); - StreamingConnection connection2 = endPt2.newConnection(false, agentInfo); - DelimitedInputWriter writer2 = new DelimitedInputWriter(colNames2,",", endPt2, connection); - TransactionBatch txnBatch2 = connection2.fetchTransactionBatch(2, writer2); - txnBatch2.beginNextTransaction(); + connection2.beginTransaction(); - txnBatch2.write("name5,2,fact3".getBytes()); // bucket 0 - txnBatch2.write("name8,2,fact3".getBytes()); // bucket 1 - txnBatch2.write("name0,1,fact1".getBytes()); // bucket 2 + connection2.write("name5,2,fact3".getBytes()); // bucket 0 + connection2.write("name8,2,fact3".getBytes()); // bucket 1 + connection2.write("name0,1,fact1".getBytes()); // bucket 2 - 
txnBatch2.commit(); + connection2.commitTransaction(); + connection2.close(); // 3 Check data distribution in buckets HashMap> actual1 = dumpAllBuckets(dbLocation, tblName3); @@ -1496,7 +1794,8 @@ public void testBucketing() throws Exception { Assert.assertEquals("records in bucket does not match expectation", actual1.get(2).size(), 2); Assert.assertEquals("records in bucket does not match expectation", actual1.get(3).size(), 1); } - private void runCmdOnDriver(String cmd) throws QueryFailedException { + + private void runCmdOnDriver(String cmd) { boolean t = runDDL(driver, cmd); Assert.assertTrue(cmd + " failed", t); } @@ -1510,35 +1809,41 @@ public void testFileDump() throws Exception { // 1) Create two bucketed tables String dbLocation = dbFolder.newFolder(dbName3).getCanonicalPath() + ".db"; - dbLocation = dbLocation.replaceAll("\\\\","/"); // for windows paths + dbLocation = dbLocation.replaceAll("\\\\", "/"); // for windows paths String[] colNames = "key1,key2,data".split(","); String[] colTypes = "string,int,string".split(","); String[] bucketNames = "key1,key2".split(","); int bucketCount = 4; createDbAndTable(driver, dbName3, tblName3, null, colNames, colTypes, bucketNames - , null, dbLocation, bucketCount); + , null, dbLocation, bucketCount); String dbLocation2 = dbFolder.newFolder(dbName4).getCanonicalPath() + ".db"; - dbLocation2 = dbLocation2.replaceAll("\\\\","/"); // for windows paths + dbLocation2 = dbLocation2.replaceAll("\\\\", "/"); // for windows paths String[] colNames2 = "key3,key4,data2".split(","); String[] colTypes2 = "string,int,string".split(","); String[] bucketNames2 = "key3,key4".split(","); createDbAndTable(driver, dbName4, tblName4, null, colNames2, colTypes2, bucketNames2 - , null, dbLocation2, bucketCount); - - + , null, dbLocation2, bucketCount); + + StrictDelimitedInputWriter writer = StrictDelimitedInputWriter.newBuilder() + .withFieldDelimiter(',') + .build(); + + HiveStreamingConnection connection = HiveStreamingConnection.newBuilder() + .withDatabase(dbName3) + .withTable(tblName3) + .withAgentInfo(agentInfo) + .withHiveConf(conf) + .withRecordWriter(writer) + .connect(); // 2) Insert data into both tables - HiveEndPoint endPt = new HiveEndPoint(metaStoreURI, dbName3, tblName3, null); - StreamingConnection connection = endPt.newConnection(false, agentInfo); - DelimitedInputWriter writer = new DelimitedInputWriter(colNames,",", endPt, connection); - - TransactionBatch txnBatch = connection.fetchTransactionBatch(2, writer); - txnBatch.beginNextTransaction(); - txnBatch.write("name0,1,Hello streaming".getBytes()); - txnBatch.write("name2,2,Welcome to streaming".getBytes()); - txnBatch.write("name4,2,more Streaming unlimited".getBytes()); - txnBatch.write("name5,2,even more Streaming unlimited".getBytes()); - txnBatch.commit(); + connection.beginTransaction(); + connection.write("name0,1,Hello streaming".getBytes()); + connection.write("name2,2,Welcome to streaming".getBytes()); + connection.write("name4,2,more Streaming unlimited".getBytes()); + connection.write("name5,2,even more Streaming unlimited".getBytes()); + connection.commitTransaction(); + connection.close(); PrintStream origErr = System.err; ByteArrayOutputStream myErr = new ByteArrayOutputStream(); @@ -1556,18 +1861,27 @@ public void testFileDump() throws Exception { // for writes (transaction batch not closed yet) Assert.assertEquals(false, errDump.contains("is still open for writes.")); - HiveEndPoint endPt2 = new HiveEndPoint(metaStoreURI, dbName4, tblName4, null); - DelimitedInputWriter 
writer2 = new DelimitedInputWriter(colNames2,",", endPt2); - StreamingConnection connection2 = endPt2.newConnection(false, agentInfo); - TransactionBatch txnBatch2 = connection2.fetchTransactionBatch(2, writer2); - txnBatch2.beginNextTransaction(); + StrictDelimitedInputWriter writer2 = StrictDelimitedInputWriter.newBuilder() + .withFieldDelimiter(',') + .build(); + + HiveStreamingConnection connection2 = HiveStreamingConnection.newBuilder() + .withDatabase(dbName4) + .withTable(tblName4) + .withAgentInfo(agentInfo) + .withRecordWriter(writer2) + .withHiveConf(conf) + .connect(); - txnBatch2.write("name5,2,fact3".getBytes()); // bucket 0 - txnBatch2.write("name8,2,fact3".getBytes()); // bucket 1 - txnBatch2.write("name0,1,fact1".getBytes()); // bucket 2 + connection2.beginTransaction(); + + connection2.write("name5,2,fact3".getBytes()); // bucket 0 + connection2.write("name8,2,fact3".getBytes()); // bucket 1 + connection2.write("name0,1,fact1".getBytes()); // bucket 2 // no data for bucket 3 -- expect 0 length bucket file - txnBatch2.commit(); + connection2.commitTransaction(); + connection2.close(); origErr = System.err; myErr = new ByteArrayOutputStream(); @@ -1586,75 +1900,135 @@ public void testFileDump() throws Exception { } @Test - public void testFileDumpDeltaFiles() throws Exception { + public void testFileDumpDeltaFilesWithStreamingOptimizations() throws Exception { String agentInfo = "UT_" + Thread.currentThread().getName(); - conf.setBoolVar(HiveConf.ConfVars.HIVE_ORC_DELTA_STREAMING_OPTIMIZATIONS_ENABLED, true); - try { - dropDB(msClient, dbName3); - dropDB(msClient, dbName4); - - // 1) Create two bucketed tables - String dbLocation = dbFolder.newFolder(dbName3).getCanonicalPath() + ".db"; - dbLocation = dbLocation.replaceAll("\\\\", "/"); // for windows paths - String[] colNames = "key1,key2,data".split(","); - String[] colTypes = "string,int,string".split(","); - String[] bucketNames = "key1,key2".split(","); - int bucketCount = 4; - createDbAndTable(driver, dbName3, tblName3, null, colNames, colTypes, bucketNames - , null, dbLocation, bucketCount); - - // 2) Insert data into both tables - HiveEndPoint endPt = new HiveEndPoint(metaStoreURI, dbName3, tblName3, null); - StreamingConnection connection = endPt.newConnection(false, agentInfo); - DelimitedInputWriter writer = new DelimitedInputWriter(colNames, ",", endPt, conf, connection); - - TransactionBatch txnBatch = connection.fetchTransactionBatch(2, writer); - txnBatch.beginNextTransaction(); - txnBatch.write("name0,1,streaming".getBytes()); - txnBatch.write("name2,2,streaming".getBytes()); - txnBatch.write("name4,2,unlimited".getBytes()); - txnBatch.write("name5,2,unlimited".getBytes()); - for (int i = 0; i < 6000; i++) { - if (i % 2 == 0) { - txnBatch.write(("name" + i + "," + i + "," + "streaming").getBytes()); - } else { - txnBatch.write(("name" + i + "," + i + "," + "unlimited").getBytes()); - } + dropDB(msClient, dbName3); + dropDB(msClient, dbName4); + + // 1) Create two bucketed tables + String dbLocation = dbFolder.newFolder(dbName3).getCanonicalPath() + ".db"; + dbLocation = dbLocation.replaceAll("\\\\", "/"); // for windows paths + String[] colNames = "key1,key2,data".split(","); + String[] colTypes = "string,int,string".split(","); + String[] bucketNames = "key1,key2".split(","); + int bucketCount = 4; + createDbAndTable(driver, dbName3, tblName3, null, colNames, colTypes, bucketNames + , null, dbLocation, bucketCount); + + // 2) Insert data into both tables + StrictDelimitedInputWriter writer = 
StrictDelimitedInputWriter.newBuilder() + .withFieldDelimiter(',') + .build(); + + HiveStreamingConnection connection = HiveStreamingConnection.newBuilder() + .withDatabase(dbName3) + .withTable(tblName3) + .withAgentInfo(agentInfo) + .withHiveConf(conf) + .withRecordWriter(writer) + .withStreamingOptimizations(true) + .connect(); + connection.beginTransaction(); + connection.write("name0,1,streaming".getBytes()); + connection.write("name2,2,streaming".getBytes()); + connection.write("name4,2,unlimited".getBytes()); + connection.write("name5,2,unlimited".getBytes()); + for (int i = 0; i < 6000; i++) { + if (i % 2 == 0) { + connection.write(("name" + i + "," + i + "," + "streaming").getBytes()); + } else { + connection.write(("name" + i + "," + i + "," + "unlimited").getBytes()); } - txnBatch.commit(); - txnBatch.close(); - connection.close(); + } + connection.commitTransaction(); + connection.close(); + connection.close(); - PrintStream origOut = System.out; - ByteArrayOutputStream myOut = new ByteArrayOutputStream(); - - // replace stderr and run command - System.setOut(new PrintStream(myOut)); - FileDump.main(new String[]{dbLocation}); - System.out.flush(); - System.setOut(origOut); - - String outDump = new String(myOut.toByteArray()); - // make sure delta files are written with no indexes, no compression and no dictionary - // no compression - Assert.assertEquals(true, outDump.contains("Compression: NONE")); - // no stats/indexes - Assert.assertEquals(true, outDump.contains("Column 0: count: 0 hasNull: false")); - Assert.assertEquals(true, outDump.contains("Column 1: count: 0 hasNull: false sum: 0")); - Assert.assertEquals(true, outDump.contains("Column 2: count: 0 hasNull: false sum: 0")); - Assert.assertEquals(true, outDump.contains("Column 3: count: 0 hasNull: false sum: 0")); - Assert.assertEquals(true, outDump.contains("Column 4: count: 0 hasNull: false sum: 0")); - Assert.assertEquals(true, outDump.contains("Column 5: count: 0 hasNull: false sum: 0")); - Assert.assertEquals(true, outDump.contains("Column 6: count: 0 hasNull: false")); - Assert.assertEquals(true, outDump.contains("Column 7: count: 0 hasNull: false")); - Assert.assertEquals(true, outDump.contains("Column 8: count: 0 hasNull: false sum: 0")); - Assert.assertEquals(true, outDump.contains("Column 9: count: 0 hasNull: false")); - // no dictionary - Assert.assertEquals(true, outDump.contains("Encoding column 7: DIRECT_V2")); - Assert.assertEquals(true, outDump.contains("Encoding column 9: DIRECT_V2")); - } finally { - conf.unset(HiveConf.ConfVars.HIVE_ORC_DELTA_STREAMING_OPTIMIZATIONS_ENABLED.varname); + PrintStream origOut = System.out; + ByteArrayOutputStream myOut = new ByteArrayOutputStream(); + + // replace stderr and run command + System.setOut(new PrintStream(myOut)); + FileDump.main(new String[]{dbLocation}); + System.out.flush(); + System.setOut(origOut); + + String outDump = new String(myOut.toByteArray()); + // make sure delta files are written with no indexes, no compression and no dictionary + // no compression + Assert.assertEquals(true, outDump.contains("Compression: NONE")); + // no stats/indexes + Assert.assertEquals(true, outDump.contains("Column 0: count: 0 hasNull: false")); + Assert.assertEquals(true, outDump.contains("Column 1: count: 0 hasNull: false sum: 0")); + Assert.assertEquals(true, outDump.contains("Column 2: count: 0 hasNull: false sum: 0")); + Assert.assertEquals(true, outDump.contains("Column 3: count: 0 hasNull: false sum: 0")); + Assert.assertEquals(true, outDump.contains("Column 4: 
count: 0 hasNull: false sum: 0")); + Assert.assertEquals(true, outDump.contains("Column 5: count: 0 hasNull: false sum: 0")); + Assert.assertEquals(true, outDump.contains("Column 6: count: 0 hasNull: false")); + Assert.assertEquals(true, outDump.contains("Column 7: count: 0 hasNull: false")); + Assert.assertEquals(true, outDump.contains("Column 8: count: 0 hasNull: false sum: 0")); + Assert.assertEquals(true, outDump.contains("Column 9: count: 0 hasNull: false")); + // no dictionary + Assert.assertEquals(true, outDump.contains("Encoding column 7: DIRECT_V2")); + Assert.assertEquals(true, outDump.contains("Encoding column 9: DIRECT_V2")); + } + + @Test + public void testFileDumpDeltaFilesWithoutStreamingOptimizations() throws Exception { + String agentInfo = "UT_" + Thread.currentThread().getName(); + dropDB(msClient, dbName3); + dropDB(msClient, dbName4); + + // 1) Create two bucketed tables + String dbLocation = dbFolder.newFolder(dbName3).getCanonicalPath() + ".db"; + dbLocation = dbLocation.replaceAll("\\\\", "/"); // for windows paths + String[] colNames = "key1,key2,data".split(","); + String[] colTypes = "string,int,string".split(","); + String[] bucketNames = "key1,key2".split(","); + int bucketCount = 4; + createDbAndTable(driver, dbName3, tblName3, null, colNames, colTypes, bucketNames + , null, dbLocation, bucketCount); + + // 2) Insert data into both tables + StrictDelimitedInputWriter writer = StrictDelimitedInputWriter.newBuilder() + .withFieldDelimiter(',') + .build(); + + HiveStreamingConnection connection = HiveStreamingConnection.newBuilder() + .withDatabase(dbName3) + .withTable(tblName3) + .withAgentInfo(agentInfo) + .withHiveConf(conf) + .withRecordWriter(writer) + .withStreamingOptimizations(false) + .connect(); + connection.beginTransaction(); + connection.write("name0,1,streaming".getBytes()); + connection.write("name2,2,streaming".getBytes()); + connection.write("name4,2,unlimited".getBytes()); + connection.write("name5,2,unlimited".getBytes()); + for (int i = 0; i < 6000; i++) { + if (i % 2 == 0) { + connection.write(("name" + i + "," + i + "," + "streaming").getBytes()); + } else { + connection.write(("name" + i + "," + i + "," + "unlimited").getBytes()); + } } + connection.commitTransaction(); + connection.close(); + + PrintStream origOut = System.out; + ByteArrayOutputStream myOut = new ByteArrayOutputStream(); + + // replace stderr and run command + System.setOut(new PrintStream(myOut)); + FileDump.main(new String[]{dbLocation}); + System.out.flush(); + System.setOut(origOut); + + String outDump = new String(myOut.toByteArray()); + Assert.assertEquals(true, outDump.contains("Compression: ZLIB")); + Assert.assertEquals(true, outDump.contains("Encoding column 9: DICTIONARY")); } @Test @@ -1663,32 +2037,38 @@ public void testFileDumpCorruptDataFiles() throws Exception { // 1) Create two bucketed tables String dbLocation = dbFolder.newFolder(dbName3).getCanonicalPath() + ".db"; - dbLocation = dbLocation.replaceAll("\\\\","/"); // for windows paths + dbLocation = dbLocation.replaceAll("\\\\", "/"); // for windows paths String[] colNames = "key1,key2,data".split(","); String[] colTypes = "string,int,string".split(","); String[] bucketNames = "key1,key2".split(","); int bucketCount = 4; createDbAndTable(driver, dbName3, tblName3, null, colNames, colTypes, bucketNames - , null, dbLocation, bucketCount); + , null, dbLocation, bucketCount); // 2) Insert data into both tables - HiveEndPoint endPt = new HiveEndPoint(metaStoreURI, dbName3, tblName3, null); - 
StreamingConnection connection = endPt.newConnection(false, "UT_" + Thread.currentThread().getName()); - DelimitedInputWriter writer = new DelimitedInputWriter(colNames,",", endPt, connection); - + StrictDelimitedInputWriter writer = StrictDelimitedInputWriter.newBuilder() + .withFieldDelimiter(',') + .build(); + + HiveStreamingConnection connection = HiveStreamingConnection.newBuilder() + .withDatabase(dbName3) + .withTable(tblName3) + .withAgentInfo("UT_" + Thread.currentThread().getName()) + .withRecordWriter(writer) + .withHiveConf(conf) + .withTransactionBatchSize(10) + .connect(); // we need side file for this test, so we create 2 txn batch and test with only one - TransactionBatch txnBatch = connection.fetchTransactionBatch(2, writer); - txnBatch.beginNextTransaction(); - txnBatch.write("name0,1,Hello streaming".getBytes()); - txnBatch.write("name2,2,Welcome to streaming".getBytes()); - txnBatch.write("name4,2,more Streaming unlimited".getBytes()); - txnBatch.write("name5,2,even more Streaming unlimited".getBytes()); - txnBatch.commit(); + connection.beginTransaction(); + connection.write("name0,1,Hello streaming".getBytes()); + connection.write("name2,2,Welcome to streaming".getBytes()); + connection.write("name4,2,more Streaming unlimited".getBytes()); + connection.write("name5,2,even more Streaming unlimited".getBytes()); + connection.commitTransaction(); // intentionally corrupt some files Path path = new Path(dbLocation); Collection files = FileDump.getAllFilesInPath(path, conf); - int readableFooter = -1; for (String file : files) { if (file.contains("bucket_00000")) { // empty out the file @@ -1748,17 +2128,17 @@ public void testFileDumpCorruptDataFiles() throws Exception { Assert.assertEquals(false, errDump.contains("file(s) are corrupted")); Assert.assertEquals(false, errDump.contains("is still open for writes.")); - // after recovery there shouldn't be any *_flush_length files + // after recovery there shouldn'table be any *_flush_length files files = FileDump.getAllFilesInPath(path, conf); for (String file : files) { Assert.assertEquals(false, file.contains("_flush_length")); } - txnBatch.close(); + connection.close(); } private void corruptDataFile(final String file, final Configuration conf, final int addRemoveBytes) - throws Exception { + throws Exception { Path bPath = new Path(file); Path cPath = new Path(bPath.getParent(), bPath.getName() + ".corrupt"); FileSystem fs = bPath.getFileSystem(conf); @@ -1781,48 +2161,56 @@ public void testFileDumpCorruptSideFiles() throws Exception { // 1) Create two bucketed tables String dbLocation = dbFolder.newFolder(dbName3).getCanonicalPath() + ".db"; - dbLocation = dbLocation.replaceAll("\\\\","/"); // for windows paths + dbLocation = dbLocation.replaceAll("\\\\", "/"); // for windows paths String[] colNames = "key1,key2,data".split(","); String[] colTypes = "string,int,string".split(","); String[] bucketNames = "key1,key2".split(","); int bucketCount = 4; createDbAndTable(driver, dbName3, tblName3, null, colNames, colTypes, bucketNames - , null, dbLocation, bucketCount); + , null, dbLocation, bucketCount); // 2) Insert data into both tables - HiveEndPoint endPt = new HiveEndPoint(metaStoreURI, dbName3, tblName3, null); - StreamingConnection connection = endPt.newConnection(false, "UT_" + Thread.currentThread().getName()); - DelimitedInputWriter writer = new DelimitedInputWriter(colNames,",", endPt, connection); - - TransactionBatch txnBatch = connection.fetchTransactionBatch(2, writer); - txnBatch.beginNextTransaction(); - 
txnBatch.write("name0,1,Hello streaming".getBytes()); - txnBatch.write("name2,2,Welcome to streaming".getBytes()); - txnBatch.write("name4,2,more Streaming unlimited".getBytes()); - txnBatch.write("name5,2,even more Streaming unlimited".getBytes()); - txnBatch.write("name6,3,aHello streaming".getBytes()); - txnBatch.commit(); - - Map> offsetMap = new HashMap>(); + StrictDelimitedInputWriter writer = StrictDelimitedInputWriter.newBuilder() + .withFieldDelimiter(',') + .build(); + + HiveStreamingConnection connection = HiveStreamingConnection.newBuilder() + .withDatabase(dbName3) + .withTable(tblName3) + .withAgentInfo("UT_" + Thread.currentThread().getName()) + .withRecordWriter(writer) + .withHiveConf(conf) + .withTransactionBatchSize(10) + .connect(); + + connection.beginTransaction(); + connection.write("name0,1,Hello streaming".getBytes()); + connection.write("name2,2,Welcome to streaming".getBytes()); + connection.write("name4,2,more Streaming unlimited".getBytes()); + connection.write("name5,2,even more Streaming unlimited".getBytes()); + connection.write("name6,3,aHello streaming".getBytes()); + connection.commitTransaction(); + + Map> offsetMap = new HashMap>(); recordOffsets(conf, dbLocation, offsetMap); - txnBatch.beginNextTransaction(); - txnBatch.write("name01,11,-Hello streaming".getBytes()); - txnBatch.write("name21,21,-Welcome to streaming".getBytes()); - txnBatch.write("name41,21,-more Streaming unlimited".getBytes()); - txnBatch.write("name51,21,-even more Streaming unlimited".getBytes()); - txnBatch.write("name02,12,--Hello streaming".getBytes()); - txnBatch.write("name22,22,--Welcome to streaming".getBytes()); - txnBatch.write("name42,22,--more Streaming unlimited".getBytes()); - txnBatch.write("name52,22,--even more Streaming unlimited".getBytes()); - txnBatch.write("name7,4,aWelcome to streaming".getBytes()); - txnBatch.write("name8,5,amore Streaming unlimited".getBytes()); - txnBatch.write("name9,6,aeven more Streaming unlimited".getBytes()); - txnBatch.write("name10,7,bHello streaming".getBytes()); - txnBatch.write("name11,8,bWelcome to streaming".getBytes()); - txnBatch.write("name12,9,bmore Streaming unlimited".getBytes()); - txnBatch.write("name13,10,beven more Streaming unlimited".getBytes()); - txnBatch.commit(); + connection.beginTransaction(); + connection.write("name01,11,-Hello streaming".getBytes()); + connection.write("name21,21,-Welcome to streaming".getBytes()); + connection.write("name41,21,-more Streaming unlimited".getBytes()); + connection.write("name51,21,-even more Streaming unlimited".getBytes()); + connection.write("name02,12,--Hello streaming".getBytes()); + connection.write("name22,22,--Welcome to streaming".getBytes()); + connection.write("name42,22,--more Streaming unlimited".getBytes()); + connection.write("name52,22,--even more Streaming unlimited".getBytes()); + connection.write("name7,4,aWelcome to streaming".getBytes()); + connection.write("name8,5,amore Streaming unlimited".getBytes()); + connection.write("name9,6,aeven more Streaming unlimited".getBytes()); + connection.write("name10,7,bHello streaming".getBytes()); + connection.write("name11,8,bWelcome to streaming".getBytes()); + connection.write("name12,9,bmore Streaming unlimited".getBytes()); + connection.write("name13,10,beven more Streaming unlimited".getBytes()); + connection.commitTransaction(); recordOffsets(conf, dbLocation, offsetMap); @@ -1899,18 +2287,18 @@ public void testFileDumpCorruptSideFiles() throws Exception { Assert.assertEquals(false, errDump.contains("file(s) 
are corrupted")); Assert.assertEquals(false, errDump.contains("is still open for writes.")); - // after recovery there shouldn't be any *_flush_length files + // after recovery there shouldn'table be any *_flush_length files files = FileDump.getAllFilesInPath(path, conf); for (String file : files) { Assert.assertEquals(false, file.contains("_flush_length")); } - txnBatch.close(); + connection.close(); } private void corruptSideFile(final String file, final HiveConf conf, - final Map> offsetMap, final String key, final int numEntries) - throws IOException { + final Map> offsetMap, final String key, final int numEntries) + throws IOException { Path dataPath = new Path(file); Path sideFilePath = OrcAcidUtils.getSideFile(dataPath); Path cPath = new Path(sideFilePath.getParent(), sideFilePath.getName() + ".corrupt"); @@ -1929,7 +2317,7 @@ private void corruptSideFile(final String file, final HiveConf conf, } else if (numEntries > 0) { int firstRun = Math.min(offsets.size(), numEntries); // add original entries - for (int i=0; i < firstRun; i++) { + for (int i = 0; i < firstRun; i++) { fdos.writeLong(offsets.get(i)); } @@ -1945,17 +2333,17 @@ private void corruptSideFile(final String file, final HiveConf conf, fs.rename(cPath, sideFilePath); } - private byte[] longToBytes(long x) { + private byte[] longToBytes(long x) { ByteBuffer buffer = ByteBuffer.allocate(8); buffer.putLong(x); return buffer.array(); } private void recordOffsets(final HiveConf conf, final String dbLocation, - final Map> offsetMap) throws IOException { + final Map> offsetMap) throws IOException { Path path = new Path(dbLocation); Collection files = FileDump.getAllFilesInPath(path, conf); - for (String file: files) { + for (String file : files) { Path bPath = new Path(file); FileSystem fs = bPath.getFileSystem(conf); FileStatus fileStatus = fs.getFileStatus(bPath); @@ -2010,89 +2398,107 @@ public void testErrorHandling() throws Exception { String agentInfo = "UT_" + Thread.currentThread().getName(); runCmdOnDriver("create database testErrors"); runCmdOnDriver("use testErrors"); - runCmdOnDriver("create table T(a int, b int) clustered by (b) into 2 buckets stored as orc TBLPROPERTIES ('transactional'='true')"); - - HiveEndPoint endPt = new HiveEndPoint(metaStoreURI, "testErrors", "T", null); - StreamingConnection connection = endPt.newConnection(false, agentInfo); - DelimitedInputWriter innerWriter = new DelimitedInputWriter("a,b".split(","),",", endPt, connection); + runCmdOnDriver( + "create table T(a int, b int) clustered by (b) into 2 buckets stored as orc TBLPROPERTIES ('transactional'='true')"); + + StrictDelimitedInputWriter innerWriter = StrictDelimitedInputWriter.newBuilder() + .withFieldDelimiter(',') + .build(); + + HiveStreamingConnection connection = HiveStreamingConnection.newBuilder() + .withDatabase("testErrors") + .withTable("T") + .withAgentInfo(agentInfo) + .withTransactionBatchSize(2) + .withRecordWriter(innerWriter) + .withHiveConf(conf) + .connect(); + connection.beginTransaction(); FaultyWriter writer = new FaultyWriter(innerWriter); - TransactionBatch txnBatch = connection.fetchTransactionBatch(2, writer); - txnBatch.close(); - txnBatch.heartbeat();//this is no-op on closed batch - txnBatch.abort();//ditto + connection.close(); + Exception expectedEx = null; GetOpenTxnsInfoResponse r = msClient.showTxns(); - Assert.assertEquals("HWM didn't match", 17, r.getTxn_high_water_mark()); + Assert.assertEquals("HWM didn'table match", 17, r.getTxn_high_water_mark()); List ti = r.getOpen_txns(); 
Assert.assertEquals("wrong status ti(0)", TxnState.ABORTED, ti.get(0).getState()); Assert.assertEquals("wrong status ti(1)", TxnState.ABORTED, ti.get(1).getState()); - Exception expectedEx = null; + try { - txnBatch.beginNextTransaction(); - } - catch(IllegalStateException ex) { + connection.beginTransaction(); + } catch (StreamingException ex) { expectedEx = ex; } - Assert.assertTrue("beginNextTransaction() should have failed", - expectedEx != null && expectedEx.getMessage().contains("has been closed()")); + Assert.assertTrue("beginTransaction() should have failed", + expectedEx != null && expectedEx.getMessage().contains("Streaming connection is closed already.")); + + connection = HiveStreamingConnection.newBuilder() + .withDatabase("testErrors") + .withTable("T") + .withAgentInfo(agentInfo) + .withTransactionBatchSize(2) + .withRecordWriter(innerWriter) + .withHiveConf(conf) + .connect(); expectedEx = null; try { - txnBatch.write("name0,1,Hello streaming".getBytes()); - } - catch(IllegalStateException ex) { + connection.write("name0,1,Hello streaming".getBytes()); + } catch (StreamingException ex) { expectedEx = ex; } - Assert.assertTrue("write() should have failed", - expectedEx != null && expectedEx.getMessage().contains("has been closed()")); + Assert.assertTrue("write() should have failed", + expectedEx != null && expectedEx.getMessage().equals("Transaction batch is null. Missing beginTransaction?")); expectedEx = null; try { - txnBatch.commit(); - } - catch(IllegalStateException ex) { + connection.commitTransaction(); + } catch (StreamingException ex) { expectedEx = ex; } - Assert.assertTrue("commit() should have failed", - expectedEx != null && expectedEx.getMessage().contains("has been closed()")); - - txnBatch = connection.fetchTransactionBatch(2, writer); - txnBatch.beginNextTransaction(); - txnBatch.write("name2,2,Welcome to streaming".getBytes()); - txnBatch.write("name4,2,more Streaming unlimited".getBytes()); - txnBatch.write("name5,2,even more Streaming unlimited".getBytes()); - txnBatch.commit(); + Assert.assertTrue("commitTransaction() should have failed", + expectedEx != null && expectedEx.getMessage().equals("Transaction batch is null. Missing beginTransaction?")); + + connection = HiveStreamingConnection.newBuilder() + .withDatabase("testErrors") + .withTable("T") + .withAgentInfo(agentInfo) + .withTransactionBatchSize(2) + .withRecordWriter(writer) + .withHiveConf(conf) + .connect(); + connection.beginTransaction(); + connection.write("name2,2,Welcome to streaming".getBytes()); + connection.write("name4,2,more Streaming unlimited".getBytes()); + connection.write("name5,2,even more Streaming unlimited".getBytes()); + connection.commitTransaction(); //test toString() - String s = txnBatch.toString(); - Assert.assertTrue("Actual: " + s, s.contains("LastUsed " + JavaUtils.txnIdToString(txnBatch.getCurrentTxnId()))); + String s = connection.toTransactionString(); + Assert.assertTrue("Actual: " + s, s.contains("LastUsed " + JavaUtils.txnIdToString(connection.getCurrentTxnId()))); Assert.assertTrue("Actual: " + s, s.contains("TxnStatus[CO]")); expectedEx = null; - txnBatch.beginNextTransaction(); + connection.beginTransaction(); writer.enableErrors(); try { - txnBatch.write("name6,2,Doh!".getBytes()); - } - catch(StreamingIOFailure ex) { + connection.write("name6,2,Doh!".getBytes()); + } catch (StreamingIOFailure ex) { expectedEx = ex; - txnBatch.getCurrentTransactionState(); - txnBatch.getCurrentTxnId();//test it doesn't throw ArrayIndexOutOfBounds... 
} Assert.assertTrue("Wrong exception: " + (expectedEx != null ? expectedEx.getMessage() : "?"), expectedEx != null && expectedEx.getMessage().contains("Simulated fault occurred")); expectedEx = null; try { - txnBatch.commit(); - } - catch(IllegalStateException ex) { + connection.commitTransaction(); + } catch (StreamingException ex) { expectedEx = ex; } - Assert.assertTrue("commit() should have failed", - expectedEx != null && expectedEx.getMessage().contains("has been closed()")); + Assert.assertTrue("commitTransaction() should have failed", + expectedEx != null && expectedEx.getMessage().equals("Transaction state is not OPEN. Missing beginTransaction?")); //test toString() - s = txnBatch.toString(); - Assert.assertTrue("Actual: " + s, s.contains("LastUsed " + JavaUtils.txnIdToString(txnBatch.getCurrentTxnId()))); + s = connection.toTransactionString(); + Assert.assertTrue("Actual: " + s, s.contains("LastUsed " + JavaUtils.txnIdToString(connection.getCurrentTxnId()))); Assert.assertTrue("Actual: " + s, s.contains("TxnStatus[CA]")); r = msClient.showTxns(); @@ -2102,39 +2508,44 @@ public void testErrorHandling() throws Exception { Assert.assertEquals("wrong status ti(1)", TxnState.ABORTED, ti.get(1).getState()); //txnid 3 was committed and thus not open Assert.assertEquals("wrong status ti(2)", TxnState.ABORTED, ti.get(2).getState()); + connection.close(); writer.disableErrors(); - txnBatch = connection.fetchTransactionBatch(2, writer); - txnBatch.beginNextTransaction(); - txnBatch.write("name2,2,Welcome to streaming".getBytes()); + connection = HiveStreamingConnection.newBuilder() + .withDatabase("testErrors") + .withTable("T") + .withAgentInfo(agentInfo) + .withTransactionBatchSize(2) + .withRecordWriter(writer) + .withHiveConf(conf) + .connect(); + connection.beginTransaction(); + connection.write("name2,2,Welcome to streaming".getBytes()); writer.enableErrors(); expectedEx = null; try { - txnBatch.commit(); - } - catch(StreamingIOFailure ex) { + connection.commitTransaction(); + } catch (StreamingIOFailure ex) { expectedEx = ex; } Assert.assertTrue("Wrong exception: " + (expectedEx != null ? 
expectedEx.getMessage() : "?"), expectedEx != null && expectedEx.getMessage().contains("Simulated fault occurred")); r = msClient.showTxns(); - Assert.assertEquals("HWM didn't match", 21, r.getTxn_high_water_mark()); + Assert.assertEquals("HWM didn'table match", 21, r.getTxn_high_water_mark()); ti = r.getOpen_txns(); Assert.assertEquals("wrong status ti(3)", TxnState.ABORTED, ti.get(3).getState()); Assert.assertEquals("wrong status ti(4)", TxnState.ABORTED, ti.get(4).getState()); - - txnBatch.abort(); } - // assumes un partitioned table + // assumes un partitioned table // returns a map > private HashMap> dumpAllBuckets(String dbLocation, String tableName) - throws IOException { + throws IOException { HashMap> result = new HashMap>(); for (File deltaDir : new File(dbLocation + "/" + tableName).listFiles()) { - if(!deltaDir.getName().startsWith("delta")) { + if (!deltaDir.getName().startsWith("delta")) { continue; } File[] bucketFiles = deltaDir.listFiles(new FileFilter() { @@ -2145,11 +2556,11 @@ public boolean accept(File pathname) { } }); for (File bucketFile : bucketFiles) { - if(bucketFile.toString().endsWith("length")) { + if (bucketFile.toString().endsWith("length")) { continue; } Integer bucketNum = getBucketNumber(bucketFile); - ArrayList recs = dumpBucket(new Path(bucketFile.toString())); + ArrayList recs = dumpBucket(new Path(bucketFile.toString())); result.put(bucketNum, recs); } } @@ -2160,14 +2571,14 @@ public boolean accept(File pathname) { private Integer getBucketNumber(File bucketFile) { String fname = bucketFile.getName(); int start = fname.indexOf('_'); - String number = fname.substring(start+1, fname.length()); + String number = fname.substring(start + 1, fname.length()); return Integer.parseInt(number); } // delete db and all tables in it public static void dropDB(IMetaStoreClient client, String databaseName) { try { - for (String table : client.listTableNamesByFilter(databaseName, "", (short)-1)) { + for (String table : client.listTableNamesByFilter(databaseName, "", (short) -1)) { client.dropTable(databaseName, table, true, true); } client.dropDatabase(databaseName); @@ -2177,15 +2588,14 @@ public static void dropDB(IMetaStoreClient client, String databaseName) { } - ///////// -------- UTILS ------- ///////// // returns Path of the partition created (if any) else Path of table - public static Path createDbAndTable(IDriver driver, String databaseName, - String tableName, List partVals, - String[] colNames, String[] colTypes, - String[] bucketCols, - String[] partNames, String dbLocation, int bucketCount) - throws Exception { + private static Path createDbAndTable(IDriver driver, String databaseName, + String tableName, List partVals, + String[] colNames, String[] colTypes, + String[] bucketCols, + String[] partNames, String dbLocation, int bucketCount) + throws Exception { String dbUri = "raw://" + new Path(dbLocation).toUri().toString(); String tableLoc = dbUri + Path.SEPARATOR + tableName; @@ -2193,24 +2603,24 @@ public static Path createDbAndTable(IDriver driver, String databaseName, runDDL(driver, "create database IF NOT EXISTS " + databaseName + " location '" + dbUri + "'"); runDDL(driver, "use " + databaseName); String crtTbl = "create table " + tableName + - " ( " + getTableColumnsStr(colNames,colTypes) + " )" + - getPartitionStmtStr(partNames) + - " clustered by ( " + join(bucketCols, ",") + " )" + - " into " + bucketCount + " buckets " + - " stored as orc " + - " location '" + tableLoc + "'" + - " TBLPROPERTIES ('transactional'='true') "; + " ( " + 
getTableColumnsStr(colNames, colTypes) + " )" + + getPartitionStmtStr(partNames) + + " clustered by ( " + join(bucketCols, ",") + " )" + + " into " + bucketCount + " buckets " + + " stored as orc " + + " location '" + tableLoc + "'" + + " TBLPROPERTIES ('transactional'='true') "; runDDL(driver, crtTbl); - if(partNames!=null && partNames.length!=0) { + if (partNames != null && partNames.length != 0) { return addPartition(driver, tableName, partVals, partNames); } return new Path(tableLoc); } private static Path addPartition(IDriver driver, String tableName, List partVals, String[] partNames) - throws Exception { + throws Exception { String partSpec = getPartsSpec(partNames, partVals); - String addPart = "alter table " + tableName + " add partition ( " + partSpec + " )"; + String addPart = "alter table " + tableName + " add partition ( " + partSpec + " )"; runDDL(driver, addPart); return getPartitionPath(driver, tableName, partSpec); } @@ -2219,15 +2629,15 @@ private static Path getPartitionPath(IDriver driver, String tableName, String pa ArrayList res = queryTable(driver, "describe extended " + tableName + " PARTITION (" + partSpec + ")"); String partInfo = res.get(res.size() - 1); int start = partInfo.indexOf("location:") + "location:".length(); - int end = partInfo.indexOf(",",start); - return new Path( partInfo.substring(start,end) ); + int end = partInfo.indexOf(",", start); + return new Path(partInfo.substring(start, end)); } private static String getTableColumnsStr(String[] colNames, String[] colTypes) { StringBuilder sb = new StringBuilder(); - for (int i=0; i < colNames.length; ++i) { - sb.append(colNames[i] + " " + colTypes[i]); - if (i partVals) { StringBuilder sb = new StringBuilder(); - for (int i=0; i < partVals.size(); ++i) { - sb.append(partNames[i] + " = '" + partVals.get(i) + "'"); - if(i < partVals.size()-1) { + for (int i = 0; i < partVals.size(); ++i) { + sb.append(partNames[i]).append(" = '").append(partVals.get(i)).append("'"); + if (i < partVals.size() - 1) { sb.append(","); } } @@ -2262,28 +2672,33 @@ private static String getPartsSpec(String[] partNames, List partVals) { } private static String join(String[] values, String delimiter) { - if(values==null) { + if (values == null) { return null; } StringBuilder strbuf = new StringBuilder(); boolean first = true; - for (Object value : values) { - if (!first) { strbuf.append(delimiter); } else { first = false; } + for (Object value : values) { + if (!first) { + strbuf.append(delimiter); + } else { + first = false; + } strbuf.append(value.toString()); } return strbuf.toString(); } + private static String getPartitionStmtStr(String[] partNames) { - if ( partNames == null || partNames.length == 0) { + if (partNames == null || partNames.length == 0) { return ""; } return " partitioned by (" + getTablePartsStr(partNames) + " )"; } - private static boolean runDDL(IDriver driver, String sql) throws QueryFailedException { + private static boolean runDDL(IDriver driver, String sql) { LOG.debug(sql); System.out.println(sql); //LOG.debug("Running Hive Query: "+ sql); @@ -2296,9 +2711,9 @@ private static boolean runDDL(IDriver driver, String sql) throws QueryFailedExce } - public static ArrayList queryTable(IDriver driver, String query) throws IOException { + private static ArrayList queryTable(IDriver driver, String query) throws IOException { CommandProcessorResponse cpr = driver.run(query); - if(cpr.getResponseCode() != 0) { + if (cpr.getResponseCode() != 0) { throw new RuntimeException(query + " failed: " + cpr); } ArrayList res 
= new ArrayList(); @@ -2349,12 +2764,13 @@ public int hashCode() { @Override public String toString() { return " { " + - "'" + field1 + '\'' + - "," + field2 + - ",'" + field3 + '\'' + - " }"; + "'" + field1 + '\'' + + "," + field2 + + ",'" + field3 + '\'' + + " }"; } } + /** * This is test-only wrapper around the real RecordWriter. * It can simulate faults from lower levels to test error handling logic. @@ -2367,41 +2783,50 @@ private FaultyWriter(RecordWriter delegate) { assert delegate != null; this.delegate = delegate; } + + @Override + public void init(final StreamingConnection connection, final long minWriteId, final long maxWriteID) + throws StreamingException { + delegate.init(connection, minWriteId, maxWriteID); + } + @Override public void write(long writeId, byte[] record) throws StreamingException { delegate.write(writeId, record); produceFault(); } + @Override public void flush() throws StreamingException { delegate.flush(); produceFault(); } + @Override - public void clear() throws StreamingException { - delegate.clear(); - } - @Override - public void newBatch(Long minTxnId, Long maxTxnID) throws StreamingException { - delegate.newBatch(minTxnId, maxTxnID); + public void close() throws StreamingException { + delegate.close(); } + @Override - public void closeBatch() throws StreamingException { - delegate.closeBatch(); + public Set getPartitions() { + return delegate.getPartitions(); } /** * allows testing of "unexpected" errors + * * @throws StreamingIOFailure */ private void produceFault() throws StreamingIOFailure { - if(shouldThrow) { + if (shouldThrow) { throw new StreamingIOFailure("Simulated fault occurred"); } } + void enableErrors() { shouldThrow = true; } + void disableErrors() { shouldThrow = false; } diff --git a/streaming/src/test/org/apache/hive/streaming/TestStreamingDynamicPartitioning.java b/streaming/src/test/org/apache/hive/streaming/TestStreamingDynamicPartitioning.java new file mode 100644 index 0000000..e513915 --- /dev/null +++ b/streaming/src/test/org/apache/hive/streaming/TestStreamingDynamicPartitioning.java @@ -0,0 +1,921 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hive.streaming; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertTrue; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.net.URI; +import java.net.URISyntaxException; +import java.util.ArrayList; +import java.util.List; + +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.RawLocalFileSystem; +import org.apache.hadoop.fs.permission.FsPermission; +import org.apache.hadoop.hive.cli.CliSessionState; +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.metastore.HiveMetaStoreClient; +import org.apache.hadoop.hive.metastore.IMetaStoreClient; +import org.apache.hadoop.hive.metastore.txn.TxnDbUtil; +import org.apache.hadoop.hive.ql.DriverFactory; +import org.apache.hadoop.hive.ql.IDriver; +import org.apache.hadoop.hive.ql.processors.CommandProcessorResponse; +import org.apache.hadoop.hive.ql.session.SessionState; +import org.apache.hadoop.hive.serde.serdeConstants; +import org.apache.thrift.TException; +import org.junit.After; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + + +public class TestStreamingDynamicPartitioning { + private static final Logger LOG = LoggerFactory.getLogger(TestStreamingDynamicPartitioning.class); + + public static class RawFileSystem extends RawLocalFileSystem { + private static final URI NAME; + + static { + try { + NAME = new URI("raw:///"); + } catch (URISyntaxException se) { + throw new IllegalArgumentException("bad uri", se); + } + } + + @Override + public URI getUri() { + return NAME; + } + + @Override + public String getScheme() { + return "raw"; + } + + @Override + public FileStatus getFileStatus(Path path) throws IOException { + File file = pathToFile(path); + if (!file.exists()) { + throw new FileNotFoundException("Cannot find " + path); + } + // get close enough + short mod = 0; + if (file.canRead()) { + mod |= 0444; + } + if (file.canWrite()) { + mod |= 0200; + } + if (file.canExecute()) { + mod |= 0111; + } + return new FileStatus(file.length(), file.isDirectory(), 1, 1024, + file.lastModified(), file.lastModified(), + FsPermission.createImmutable(mod), "owen", "users", path); + } + } + + private final HiveConf conf; + private IDriver driver; + private final IMetaStoreClient msClient; + + private static final String COL1 = "id"; + private static final String COL2 = "msg"; + @Rule + public TemporaryFolder dbFolder = new TemporaryFolder(); + + // partitioned table + private final static String dbName = "testing"; + private final static String tblName = "alerts"; + private final static String[] fieldNames = new String[]{COL1, COL2}; + private final static String[] colTypes = new String[]{serdeConstants.INT_TYPE_NAME, serdeConstants.STRING_TYPE_NAME}; + private final static String[] partNames = new String[]{"Continent", "Country"}; + private final static String[] bucketCols = new String[]{COL1}; + private final String loc1; + + // unpartitioned table + private final static String dbName2 = "testing2"; + + public TestStreamingDynamicPartitioning() throws Exception { + conf = new HiveConf(this.getClass()); + conf.set("fs.raw.impl", RawFileSystem.class.getName()); + conf + .setVar(HiveConf.ConfVars.HIVE_AUTHORIZATION_MANAGER, + 
"org.apache.hadoop.hive.ql.security.authorization.plugin.sqlstd.SQLStdHiveAuthorizerFactory"); + TxnDbUtil.setConfValues(conf); + conf.setBoolVar(HiveConf.ConfVars.METASTORE_EXECUTE_SET_UGI, true); + conf.setBoolVar(HiveConf.ConfVars.HIVE_SUPPORT_CONCURRENCY, true); + conf.setVar(HiveConf.ConfVars.DYNAMICPARTITIONINGMODE, "nonstrict"); + dbFolder.create(); + loc1 = dbFolder.newFolder(dbName + ".db").toString(); + + //1) Start from a clean slate (metastore) + TxnDbUtil.cleanDb(conf); + TxnDbUtil.prepDb(conf); + + //2) obtain metastore clients + msClient = new HiveMetaStoreClient(conf); + } + + @Before + public void setup() throws Exception { + SessionState.start(new CliSessionState(conf)); + driver = DriverFactory.newDriver(conf); + driver.setMaxRows(200002);//make sure Driver returns all results + // drop and recreate the necessary databases and tables + dropDB(msClient, dbName); + + createDbAndTable(driver, dbName, tblName, null, fieldNames, colTypes, bucketCols, partNames, loc1, 1); + + dropDB(msClient, dbName2); + String loc2 = dbFolder.newFolder(dbName2 + ".db").toString(); + String loc3 = dbFolder.newFolder("testing5.db").toString(); + createStoreSales("testing5", loc3); + + runDDL(driver, "drop table testBucketing3.streamedtable"); + runDDL(driver, "drop table testBucketing3.finaltable"); + runDDL(driver, "drop table testBucketing3.nobucket"); + } + + @After + public void cleanup() throws Exception { + msClient.close(); + driver.close(); + } + + private void createStoreSales(String dbName, String loc) throws Exception { + String dbUri = "raw://" + new Path(loc).toUri().toString(); + String tableLoc = dbUri + Path.SEPARATOR + "store_sales"; + + boolean success = runDDL(driver, "create database IF NOT EXISTS " + dbName + " location '" + dbUri + "'"); + Assert.assertTrue(success); + success = runDDL(driver, "use " + dbName); + Assert.assertTrue(success); + + success = runDDL(driver, "drop table if exists store_sales"); + Assert.assertTrue(success); + success = runDDL(driver, "create table store_sales\n" + + "(\n" + + " ss_sold_date_sk int,\n" + + " ss_sold_time_sk int,\n" + + " ss_item_sk int,\n" + + " ss_customer_sk int,\n" + + " ss_cdemo_sk int,\n" + + " ss_hdemo_sk int,\n" + + " ss_addr_sk int,\n" + + " ss_store_sk int,\n" + + " ss_promo_sk int,\n" + + " ss_ticket_number int,\n" + + " ss_quantity int,\n" + + " ss_wholesale_cost decimal(7,2),\n" + + " ss_list_price decimal(7,2),\n" + + " ss_sales_price decimal(7,2),\n" + + " ss_ext_discount_amt decimal(7,2),\n" + + " ss_ext_sales_price decimal(7,2),\n" + + " ss_ext_wholesale_cost decimal(7,2),\n" + + " ss_ext_list_price decimal(7,2),\n" + + " ss_ext_tax decimal(7,2),\n" + + " ss_coupon_amt decimal(7,2),\n" + + " ss_net_paid decimal(7,2),\n" + + " ss_net_paid_inc_tax decimal(7,2),\n" + + " ss_net_profit decimal(7,2)\n" + + ")\n" + + " partitioned by (dt string)\n" + + "clustered by (ss_store_sk, ss_promo_sk)\n" + + "INTO 4 BUCKETS stored as orc " + " location '" + tableLoc + "'" + + " TBLPROPERTIES ('orc.compress'='NONE', 'transactional'='true')"); + Assert.assertTrue(success); + + success = runDDL(driver, "alter table store_sales add partition(dt='2015')"); + Assert.assertTrue(success); + } + + @Test + public void testDynamicPartitioning() throws Exception { + StrictDelimitedInputWriter writer = StrictDelimitedInputWriter.newBuilder() + .withFieldDelimiter(',') + .build(); + HiveStreamingConnection connection = HiveStreamingConnection.newBuilder() + .withDatabase("testing5") + .withTable("store_sales") + .withAgentInfo("UT_" + 
Thread.currentThread().getName()) + .withRecordWriter(writer) + .withHiveConf(conf) + .connect(); + connection.beginTransaction(); + + for (int i = 0; i < 10; i++) { + StringBuilder row = new StringBuilder(); + for (int ints = 0; ints < 11; ints++) { + row.append(ints).append(','); + } + for (int decs = 0; decs < 12; decs++) { + row.append(i + 0.1).append(','); + } + row.append("2018-04-").append(i); + connection.write(row.toString().getBytes()); + } + connection.commitTransaction(); + connection.close(); + + List partitions = queryTable(driver, "show partitions testing5.store_sales"); + // 1 static partition created during setup + 10 dynamic partitions + assertEquals(11, partitions.size()); + // ignore the first static partition + for (int i = 1; i < partitions.size(); i++) { + assertEquals("dt=2018-04-" + (i - 1), partitions.get(i)); + } + + ArrayList res = queryTable(driver, "select * from testing5.store_sales"); + for (String re : res) { + System.out.println(re); + assertEquals(true, re.contains("2018-04-")); + } + } + + // stream data into streaming table with N buckets, then copy the data into another bucketed table + // check if bucketing in both was done in the same way + @Test + public void testDPStreamBucketingMatchesRegularBucketing() throws Exception { + int bucketCount = 100; + + String dbUri = "raw://" + new Path(dbFolder.newFolder().toString()).toUri().toString(); + String tableLoc = "'" + dbUri + Path.SEPARATOR + "streamedtable" + "'"; + String tableLoc2 = "'" + dbUri + Path.SEPARATOR + "finaltable" + "'"; + String tableLoc3 = "'" + dbUri + Path.SEPARATOR + "nobucket" + "'"; + + // disabling vectorization as this test yields incorrect results with vectorization + conf.setBoolVar(HiveConf.ConfVars.HIVE_VECTORIZATION_ENABLED, false); + try (IDriver driver = DriverFactory.newDriver(conf)) { + runDDL(driver, "create database testBucketing3"); + runDDL(driver, "use testBucketing3"); + runDDL(driver, "create table streamedtable ( key1 string,key2 int,data string ) partitioned by (year " + + "int) clustered by " + "( " + "key1,key2 ) into " + + bucketCount + " buckets stored as orc location " + tableLoc + " TBLPROPERTIES ('transactional'='true')"); + // In 'nobucket' table we capture bucketid from streamedtable to workaround a hive bug that prevents joins two identically bucketed tables + runDDL(driver, "create table nobucket ( bucketid int, key1 string,key2 int,data string ) partitioned by " + + "(year int) location " + tableLoc3); + runDDL(driver, "create table finaltable ( bucketid int, key1 string,key2 int,data string ) partitioned " + + "by (year int) clustered by ( key1,key2 ) into " + + bucketCount + " buckets stored as orc location " + tableLoc2 + " TBLPROPERTIES ('transactional'='true')"); + + + String[] records = new String[]{ + "PSFAHYLZVC,29,EPNMA,2017", + "PPPRKWAYAU,96,VUTEE,2017", + "MIAOFERCHI,3,WBDSI,2017", + "CEGQAZOWVN,0,WCUZL,2017", + "XWAKMNSVQF,28,YJVHU,2017", + "XBWTSAJWME,2,KDQFO,2017", + "FUVLQTAXAY,5,LDSDG,2017", + "QTQMDJMGJH,6,QBOMA,2018", + "EFLOTLWJWN,71,GHWPS,2018", + "PEQNAOJHCM,82,CAAFI,2018", + "MOEKQLGZCP,41,RUACR,2018", + "QZXMCOPTID,37,LFLWE,2018", + "EYALVWICRD,13,JEZLC,2018", + "VYWLZAYTXX,16,DMVZX,2018", + "OSALYSQIXR,47,HNZVE,2018", + "JGKVHKCEGQ,25,KSCJB,2018", + "WQFMMYDHET,12,DTRWA,2018", + "AJOVAYZKZQ,15,YBKFO,2018", + "YAQONWCUAU,31,QJNHZ,2018", + "DJBXUEUOEB,35,IYCBL,2018" + }; + + + StrictDelimitedInputWriter wr = StrictDelimitedInputWriter.newBuilder() + .withFieldDelimiter(',') + .build(); + + HiveStreamingConnection connection = 
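+      // note: 'streamedtable' is partitioned by (year int), so the trailing field of each record
+      // above ("2017"/"2018") supplies the dynamic partition value when the rows are streamed in below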
HiveStreamingConnection.newBuilder() + .withDatabase("testBucketing3") + .withTable("streamedtable") + .withAgentInfo("UT_" + Thread.currentThread().getName()) + .withRecordWriter(wr) + .withHiveConf(conf) + .connect(); + + connection.beginTransaction(); + + for (String record : records) { + connection.write(record.getBytes()); + } + + connection.commitTransaction(); + connection.close(); + + ArrayList res1 = queryTable(driver, "select row__id.bucketid, * from streamedtable order by key2"); + for (String re : res1) { + System.out.println(re); + assertTrue(re.endsWith("2017") || re.endsWith("2018")); + } + + driver.run("insert into nobucket partition(year) select row__id.bucketid,* from streamedtable"); + ArrayList res = queryTable(driver, "select * from nobucket"); + assertEquals(records.length, res.size()); + runDDL(driver, " insert into finaltable partition(year) select * from nobucket"); + res = queryTable(driver, "select * from finaltable"); + assertEquals(records.length, res.size()); + ArrayList res2 = queryTable(driver, + "select row__id.bucketid,* from finaltable where row__id.bucketid<>bucketid"); + for (String s : res2) { + LOG.error(s); + } + Assert.assertTrue(res2.isEmpty()); + + res2 = queryTable(driver, "select * from finaltable where year=2018"); + assertEquals(13, res2.size()); + for (String s : res2) { + assertTrue(s.endsWith("2018")); + } + + res2 = queryTable(driver, "show partitions finaltable"); + assertEquals(2, res2.size()); + assertEquals("year=2017", res2.get(0)); + assertEquals("year=2018", res2.get(1)); + } finally { + conf.unset(HiveConf.ConfVars.HIVE_VECTORIZATION_ENABLED.varname); + } + } + + @Test + public void testDPTwoLevel() throws Exception { + StrictDelimitedInputWriter writer = StrictDelimitedInputWriter.newBuilder() + .withFieldDelimiter(',') + .build(); + HiveStreamingConnection connection = HiveStreamingConnection.newBuilder() + .withDatabase(dbName) + .withTable(tblName) + .withAgentInfo("UT_" + Thread.currentThread().getName()) + .withRecordWriter(writer) + .withHiveConf(conf) + .connect(); + connection.beginTransaction(); + connection.write("1,foo,Asia,India".getBytes()); + connection.write("2,bar,Europe,Germany".getBytes()); + connection.commitTransaction(); + connection.beginTransaction(); + connection.write("3,foo,Asia,India".getBytes()); + connection.write("4,bar,Europe,Germany".getBytes()); + connection.commitTransaction(); + connection.beginTransaction(); + connection.write("5,foo,Asia,China".getBytes()); + connection.write("6,bar,Europe,France".getBytes()); + connection.commitTransaction(); + connection.beginTransaction(); + connection.write("7,foo,Asia,China".getBytes()); + connection.write("8,bar,Europe,France".getBytes()); + connection.commitTransaction(); + connection.close(); + + List res = queryTable(driver, "select * from " + (dbName + "." + tblName) + " order by id"); + assertEquals(8, res.size()); + assertEquals("1\tfoo\tAsia\tIndia", res.get(0)); + assertEquals("2\tbar\tEurope\tGermany", res.get(1)); + assertEquals("3\tfoo\tAsia\tIndia", res.get(2)); + assertEquals("4\tbar\tEurope\tGermany", res.get(3)); + assertEquals("5\tfoo\tAsia\tChina", res.get(4)); + assertEquals("6\tbar\tEurope\tFrance", res.get(5)); + assertEquals("7\tfoo\tAsia\tChina", res.get(6)); + assertEquals("8\tbar\tEurope\tFrance", res.get(7)); + + res = queryTable(driver, "show partitions " + (dbName + "." 
+ tblName)); + assertEquals(4, res.size()); + assertTrue(res.contains("continent=Asia/country=India")); + assertTrue(res.contains("continent=Asia/country=China")); + assertTrue(res.contains("continent=Europe/country=Germany")); + assertTrue(res.contains("continent=Europe/country=France")); + } + + @Test + public void testDPTwoLevelMissingPartitionValues() throws Exception { + StrictDelimitedInputWriter writer = StrictDelimitedInputWriter.newBuilder() + .withFieldDelimiter(',') + .build(); + HiveStreamingConnection connection = HiveStreamingConnection.newBuilder() + .withDatabase(dbName) + .withTable(tblName) + .withAgentInfo("UT_" + Thread.currentThread().getName()) + .withRecordWriter(writer) + .withHiveConf(conf) + .connect(); + connection.beginTransaction(); + connection.write("1,foo,Asia,India".getBytes()); + connection.write("2,bar,Europe,Germany".getBytes()); + connection.commitTransaction(); + connection.beginTransaction(); + connection.write("3,foo,Asia,India".getBytes()); + connection.write("4,bar,Europe,Germany".getBytes()); + connection.commitTransaction(); + connection.beginTransaction(); + connection.write("5,foo,Asia,China".getBytes()); + connection.write("6,bar,Europe,France".getBytes()); + connection.commitTransaction(); + connection.beginTransaction(); + connection.write("7,foo,Asia,China".getBytes()); + connection.write("8,bar,Europe,France".getBytes()); + connection.commitTransaction(); + connection.close(); + + List res = queryTable(driver, "select * from " + (dbName + "." + tblName) + " order by id"); + assertEquals(8, res.size()); + assertEquals("1\tfoo\tAsia\tIndia", res.get(0)); + assertEquals("2\tbar\tEurope\tGermany", res.get(1)); + assertEquals("3\tfoo\tAsia\tIndia", res.get(2)); + assertEquals("4\tbar\tEurope\tGermany", res.get(3)); + assertEquals("5\tfoo\tAsia\tChina", res.get(4)); + assertEquals("6\tbar\tEurope\tFrance", res.get(5)); + assertEquals("7\tfoo\tAsia\tChina", res.get(6)); + assertEquals("8\tbar\tEurope\tFrance", res.get(7)); + + res = queryTable(driver, "show partitions " + (dbName + "." 
+ tblName)); + assertEquals(4, res.size()); + assertTrue(res.contains("continent=Asia/country=India")); + assertTrue(res.contains("continent=Asia/country=China")); + assertTrue(res.contains("continent=Europe/country=Germany")); + assertTrue(res.contains("continent=Europe/country=France")); + } + + @Test + public void testDPTwoLevelNonStringPartitionColumns() throws Exception { + String tblName = "alerts2"; + String[] partNames = new String[] {"year", "month"}; + createDbAndTable(driver, dbName, tblName, null, fieldNames, colTypes, bucketCols, partNames, loc1, 2, + "partitioned by (year int, month int)"); + StrictDelimitedInputWriter writer = StrictDelimitedInputWriter.newBuilder() + .withFieldDelimiter(',') + .build(); + HiveStreamingConnection connection = HiveStreamingConnection.newBuilder() + .withDatabase(dbName) + .withTable(tblName) + .withAgentInfo("UT_" + Thread.currentThread().getName()) + .withRecordWriter(writer) + .withHiveConf(conf) + .connect(); + connection.beginTransaction(); + connection.write("1,foo,2018,2".getBytes()); + connection.write("2,bar,2019".getBytes()); + connection.commitTransaction(); + connection.beginTransaction(); + connection.write("3,foo,2018".getBytes()); + connection.write("4,bar,2019".getBytes()); + connection.commitTransaction(); + connection.beginTransaction(); + connection.write("5,foo,2018".getBytes()); + connection.write("6,bar,2019".getBytes()); + connection.commitTransaction(); + connection.beginTransaction(); + connection.write("7,foo,,".getBytes()); + connection.write("8,bar,,12".getBytes()); + connection.commitTransaction(); + connection.close(); + + // when partition column type is not string, the values from __HIVE_DEFAULT_PARTITION__ will be NULL + String defaultPartitionName = "NULL"; + List res = queryTable(driver, "select * from " + (dbName + "." + tblName) + " order by id"); + assertEquals(8, res.size()); + assertEquals("1\tfoo\t2018\t2", res.get(0)); + assertEquals("2\tbar\t2019\t" + defaultPartitionName, res.get(1)); + assertEquals("3\tfoo\t2018\t" + defaultPartitionName, res.get(2)); + assertEquals("4\tbar\t2019\t" + defaultPartitionName, res.get(3)); + assertEquals("5\tfoo\t2018\t" + defaultPartitionName, res.get(4)); + assertEquals("6\tbar\t2019\t" + defaultPartitionName, res.get(5)); + assertEquals("7\tfoo\t" + defaultPartitionName + "\t" + defaultPartitionName, res.get(6)); + assertEquals("8\tbar\t" + defaultPartitionName + "\t12", res.get(7)); + + defaultPartitionName = conf.getVar(HiveConf.ConfVars.DEFAULTPARTITIONNAME); + res = queryTable(driver, "show partitions " + (dbName + "." 
+ tblName)); + assertEquals(5, res.size()); + assertTrue(res.contains("year=2018/month=2")); + assertTrue(res.contains("year=2018/month=" + defaultPartitionName)); + assertTrue(res.contains("year=2019/month=" + defaultPartitionName)); + assertTrue(res.contains("year=" + defaultPartitionName + "/month=" + defaultPartitionName)); + assertTrue(res.contains("year=" + defaultPartitionName + "/month=12")); + } + + @Test + public void testWriteBeforeBegin() throws Exception { + StrictDelimitedInputWriter writer = StrictDelimitedInputWriter.newBuilder() + .withFieldDelimiter(',') + .build(); + HiveStreamingConnection connection = HiveStreamingConnection.newBuilder() + .withDatabase(dbName) + .withTable(tblName) + .withAgentInfo("UT_" + Thread.currentThread().getName()) + .withRecordWriter(writer) + .withHiveConf(conf) + .connect(); + + // begin + write + commit + connection.beginTransaction(); + connection.write("1,foo,Asia".getBytes()); + connection.write("2,bar,Europe".getBytes()); + connection.commitTransaction(); + + // no begin + write + Exception exception = null; + try { + connection.write("3,SHOULD FAIL!".getBytes()); + } catch (Exception e) { + exception = e; + } + assertNotNull(exception); + assertTrue(exception.getMessage().equals("Transaction state is not OPEN. Missing beginTransaction?")); + + // no begin + commit + exception = null; + try { + connection.commitTransaction(); + } catch (Exception e) { + exception = e; + } + assertNotNull(exception); + assertTrue(exception.getMessage().equals("Transaction state is not OPEN. Missing beginTransaction?")); + + // no begin + abort + exception = null; + try { + connection.abortTransaction(); + } catch (Exception e) { + exception = e; + } + assertNotNull(exception); + assertTrue(exception.getMessage().equals("Transaction state is not OPEN. Missing beginTransaction?")); + + connection.close(); + String defaultPartitionName = conf.getVar(HiveConf.ConfVars.DEFAULTPARTITIONNAME); + List res = queryTable(driver, "select * from " + (dbName + "." 
+ tblName) + " order by id"); + assertEquals(2, res.size()); + assertEquals("1\tfoo\tAsia\t" + defaultPartitionName, res.get(0)); + assertEquals("2\tbar\tEurope\t" + defaultPartitionName, res.get(1)); + } + + @Test + public void testWriteAfterClose() throws Exception { + StrictDelimitedInputWriter writer = StrictDelimitedInputWriter.newBuilder() + .withFieldDelimiter(',') + .build(); + HiveStreamingConnection connection = HiveStreamingConnection.newBuilder() + .withDatabase(dbName) + .withTable(tblName) + .withAgentInfo("UT_" + Thread.currentThread().getName()) + .withRecordWriter(writer) + .withHiveConf(conf) + .connect(); + + // begin + write + commit + connection.beginTransaction(); + connection.write("1,foo,Asia".getBytes()); + connection.write("2,bar,Europe".getBytes()); + connection.commitTransaction(); + + // close + write + connection.close(); + + Exception exception = null; + try { + connection.write("3,SHOULD FAIL!".getBytes()); + } catch (Exception e) { + exception = e; + } + assertNotNull(exception); + assertTrue(exception.getMessage().endsWith("Streaming connection is closed already.")); + + // close + commit + exception = null; + try { + connection.commitTransaction(); + } catch (Exception e) { + exception = e; + } + assertNotNull(exception); + assertTrue(exception.getMessage().endsWith("Streaming connection is closed already.")); + + // close + abort + exception = null; + try { + connection.abortTransaction(); + } catch (Exception e) { + exception = e; + } + assertNotNull(exception); + assertTrue(exception.getMessage().endsWith("Streaming connection is closed already.")); + + String defaultPartitionName = conf.getVar(HiveConf.ConfVars.DEFAULTPARTITIONNAME); + List res = queryTable(driver, "select * from " + (dbName + "." + tblName) + " order by id"); + assertEquals(2, res.size()); + assertEquals("1\tfoo\tAsia\t" + defaultPartitionName, res.get(0)); + assertEquals("2\tbar\tEurope\t" + defaultPartitionName, res.get(1)); + } + + @Test + public void testWriteAfterAbort() throws Exception { + StrictDelimitedInputWriter writer = StrictDelimitedInputWriter.newBuilder() + .withFieldDelimiter(',') + .build(); + HiveStreamingConnection connection = HiveStreamingConnection.newBuilder() + .withDatabase(dbName) + .withTable(tblName) + .withAgentInfo("UT_" + Thread.currentThread().getName()) + .withRecordWriter(writer) + .withHiveConf(conf) + .connect(); + + // begin + write + commit + connection.beginTransaction(); + connection.write("1,foo,Asia".getBytes()); + connection.write("2,bar,Europe".getBytes()); + connection.commitTransaction(); + + // begin + write + abort + connection.beginTransaction(); + connection.write("3,oops!".getBytes()); + connection.abortTransaction(); + + // begin + write + abort + connection.beginTransaction(); + connection.write("4,I did it again!".getBytes()); + connection.abortTransaction(); + + // begin + write + commit + connection.beginTransaction(); + connection.write("5,Not now!,Europe".getBytes()); + connection.commitTransaction(); + + // close + write + connection.close(); + Exception exception = null; + try { + connection.write("6,SHOULD FAIL!".getBytes()); + } catch (Exception e) { + exception = e; + } + assertNotNull(exception); + assertTrue(exception.getMessage().equals("Streaming connection is closed already.")); + String defaultPartitionName = conf.getVar(HiveConf.ConfVars.DEFAULTPARTITIONNAME); + List res = queryTable(driver, "select * from " + (dbName + "." 
+ tblName) + " order by id"); + assertEquals(3, res.size()); + assertEquals("1\tfoo\tAsia\t" + defaultPartitionName, res.get(0)); + assertEquals("2\tbar\tEurope\t" + defaultPartitionName, res.get(1)); + assertEquals("5\tNot now!\tEurope\t" + defaultPartitionName, res.get(2)); + } + + @Test + public void testTableValidation() throws Exception { + int bucketCount = 100; + + String dbUri = "raw://" + new Path(dbFolder.newFolder().toString()).toUri().toString(); + String tbl1 = "validation1"; + String tbl2 = "validation2"; + + String tableLoc = "'" + dbUri + Path.SEPARATOR + tbl1 + "'"; + String tableLoc2 = "'" + dbUri + Path.SEPARATOR + tbl2 + "'"; + + runDDL(driver, "create database testBucketing3"); + runDDL(driver, "use testBucketing3"); + + runDDL(driver, "create table " + tbl1 + " ( key1 string, data string ) clustered by ( key1 ) into " + + bucketCount + " buckets stored as orc location " + tableLoc + " TBLPROPERTIES ('transactional'='false')"); + + runDDL(driver, "create table " + tbl2 + " ( key1 string, data string ) clustered by ( key1 ) into " + + bucketCount + " buckets stored as orc location " + tableLoc2 + " TBLPROPERTIES ('transactional'='false')"); + + StrictDelimitedInputWriter wr = StrictDelimitedInputWriter.newBuilder() + .withFieldDelimiter(',') + .build(); + HiveStreamingConnection connection = null; + try { + connection = HiveStreamingConnection.newBuilder() + .withDatabase("testBucketing3") + .withTable("validation2") + .withAgentInfo("UT_" + Thread.currentThread().getName()) + .withRecordWriter(wr) + .withHiveConf(conf) + .connect(); + Assert.assertTrue("InvalidTable exception was not thrown", false); + } catch (InvalidTable e) { + // expecting this exception + } finally { + if (connection != null) { + connection.close(); + } + } + try { + connection = HiveStreamingConnection.newBuilder() + .withDatabase("testBucketing3") + .withTable("validation2") + .withAgentInfo("UT_" + Thread.currentThread().getName()) + .withRecordWriter(wr) + .withHiveConf(conf) + .connect(); + Assert.assertTrue("InvalidTable exception was not thrown", false); + } catch (InvalidTable e) { + // expecting this exception + } finally { + if (connection != null) { + connection.close(); + } + } + } + + private static boolean runDDL(IDriver driver, String sql) { + LOG.debug(sql); + System.out.println(sql); + CommandProcessorResponse cpr = driver.run(sql); + if (cpr.getResponseCode() == 0) { + return true; + } + LOG.error("Statement: " + sql + " failed: " + cpr); + return false; + } + + + private static ArrayList queryTable(IDriver driver, String query) throws IOException { + CommandProcessorResponse cpr = driver.run(query); + if (cpr.getResponseCode() != 0) { + throw new RuntimeException(query + " failed: " + cpr); + } + ArrayList res = new ArrayList(); + driver.getResults(res); + return res; + } + + + // delete db and all tables in it + public static void dropDB(IMetaStoreClient client, String databaseName) { + try { + for (String table : client.listTableNamesByFilter(databaseName, "", (short) -1)) { + client.dropTable(databaseName, table, true, true); + } + client.dropDatabase(databaseName); + } catch (TException e) { + } + + } + + + ///////// -------- UTILS ------- ///////// + // returns Path of the partition created (if any) else Path of table + private static Path createDbAndTable(IDriver driver, String databaseName, + String tableName, List partVals, + String[] colNames, String[] colTypes, + String[] bucketCols, + String[] partNames, String dbLocation, int bucketCount) + throws Exception { + + 
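+    // runs "create database if not exists ..." and then a bucketed, transactional ORC table DDL built from
+    // the supplied column/partition/bucket metadata; a static partition is added only when explicit
+    // partition values are passed, otherwise the table path is returned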
String dbUri = "raw://" + new Path(dbLocation).toUri().toString(); + String tableLoc = dbUri + Path.SEPARATOR + tableName; + + runDDL(driver, "create database IF NOT EXISTS " + databaseName + " location '" + dbUri + "'"); + runDDL(driver, "use " + databaseName); + String crtTbl = "create table " + tableName + + " ( " + getTableColumnsStr(colNames, colTypes) + " )" + + getPartitionStmtStr(partNames) + + " clustered by ( " + join(bucketCols, ",") + " )" + + " into " + bucketCount + " buckets " + + " stored as orc " + + " location '" + tableLoc + "'" + + " TBLPROPERTIES ('transactional'='true') "; + runDDL(driver, crtTbl); + if (partNames != null && partNames.length != 0 && partVals != null) { + return addPartition(driver, tableName, partVals, partNames); + } + return new Path(tableLoc); + } + + private static Path createDbAndTable(IDriver driver, String databaseName, + String tableName, List partVals, + String[] colNames, String[] colTypes, + String[] bucketCols, + String[] partNames, String dbLocation, int bucketCount, String partLine) + throws Exception { + + String dbUri = "raw://" + new Path(dbLocation).toUri().toString(); + String tableLoc = dbUri + Path.SEPARATOR + tableName; + + runDDL(driver, "create database IF NOT EXISTS " + databaseName + " location '" + dbUri + "'"); + runDDL(driver, "use " + databaseName); + String crtTbl = "create table " + tableName + + " ( " + getTableColumnsStr(colNames, colTypes) + " )" + + partLine + + " clustered by ( " + join(bucketCols, ",") + " )" + + " into " + bucketCount + " buckets " + + " stored as orc " + + " location '" + tableLoc + "'" + + " TBLPROPERTIES ('transactional'='true') "; + runDDL(driver, crtTbl); + if (partNames != null && partNames.length != 0 && partVals != null) { + return addPartition(driver, tableName, partVals, partNames); + } + return new Path(tableLoc); + } + + private static Path addPartition(IDriver driver, String tableName, List partVals, String[] partNames) + throws Exception { + String partSpec = getPartsSpec(partNames, partVals); + String addPart = "alter table " + tableName + " add partition ( " + partSpec + " )"; + runDDL(driver, addPart); + return getPartitionPath(driver, tableName, partSpec); + } + + private static Path getPartitionPath(IDriver driver, String tableName, String partSpec) throws Exception { + ArrayList res = queryTable(driver, "describe extended " + tableName + " PARTITION (" + partSpec + ")"); + String partInfo = res.get(res.size() - 1); + int start = partInfo.indexOf("location:") + "location:".length(); + int end = partInfo.indexOf(",", start); + return new Path(partInfo.substring(start, end)); + } + + private static String getTableColumnsStr(String[] colNames, String[] colTypes) { + StringBuilder sb = new StringBuilder(); + for (int i = 0; i < colNames.length; ++i) { + sb.append(colNames[i]).append(" ").append(colTypes[i]); + if (i < colNames.length - 1) { + sb.append(","); + } + } + return sb.toString(); + } + + // converts partNames into "partName1 string, partName2 string" + private static String getTablePartsStr(String[] partNames) { + if (partNames == null || partNames.length == 0) { + return ""; + } + StringBuilder sb = new StringBuilder(); + for (int i = 0; i < partNames.length; ++i) { + sb.append(partNames[i]).append(" string"); + if (i < partNames.length - 1) { + sb.append(","); + } + } + return sb.toString(); + } + + // converts partNames,partVals into "partName1=val1, partName2=val2" + private static String getPartsSpec(String[] partNames, List partVals) { + StringBuilder sb = new 
StringBuilder();
+    for (int i = 0; i < partVals.size(); ++i) {
+      sb.append(partNames[i]).append(" = '").append(partVals.get(i)).append("'");
+      if (i < partVals.size() - 1) {
+        sb.append(",");
+      }
+    }
+    return sb.toString();
+  }
+
+  private static String join(String[] values, String delimiter) {
+    if (values == null) {
+      return null;
+    }
+    StringBuilder strbuf = new StringBuilder();
+
+    boolean first = true;
+
+    for (Object value : values) {
+      if (!first) {
+        strbuf.append(delimiter);
+      } else {
+        first = false;
+      }
+      strbuf.append(value.toString());
+    }
+
+    return strbuf.toString();
+  }
+
+  private static String getPartitionStmtStr(String[] partNames) {
+    if (partNames == null || partNames.length == 0) {
+      return "";
+    }
+    return " partitioned by (" + getTablePartsStr(partNames) + " )";
+  }
+}
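For quick reference, here is a minimal sketch (not part of the patch) of the dynamic-partitioning write path exercised above. The builder, writer, and transaction calls mirror the HiveStreamingConnection API used throughout this test file; the class name, agent label, HiveConf defaults, and the literal row are illustrative assumptions.

import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hive.streaming.HiveStreamingConnection;
import org.apache.hive.streaming.StrictDelimitedInputWriter;

public class DynamicPartitionWriteSketch {
  public static void main(String[] args) throws Exception {
    // assumes a running metastore and a transactional, partitioned table like testing5.store_sales above
    HiveConf conf = new HiveConf();
    StrictDelimitedInputWriter writer = StrictDelimitedInputWriter.newBuilder()
        .withFieldDelimiter(',')
        .build();
    HiveStreamingConnection connection = HiveStreamingConnection.newBuilder()
        .withDatabase("testing5")
        .withTable("store_sales")
        .withAgentInfo("sketch-agent")   // arbitrary label, not taken from the patch
        .withRecordWriter(writer)
        .withHiveConf(conf)
        .connect();
    connection.beginTransaction();
    // 23 data columns followed by the dt partition value; the trailing 2018-04-01 creates the partition dynamically
    connection.write("1,2,3,4,5,6,7,8,9,10,11,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0,1.1,1.2,2018-04-01".getBytes());
    connection.commitTransaction();
    connection.close();
  }
}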