diff --git metastore/src/java/org/apache/hadoop/hive/metastore/txn/TxnUtils.java metastore/src/java/org/apache/hadoop/hive/metastore/txn/TxnUtils.java index 30b155f3b3..74fa21d5c7 100644 --- metastore/src/java/org/apache/hadoop/hive/metastore/txn/TxnUtils.java +++ metastore/src/java/org/apache/hadoop/hive/metastore/txn/TxnUtils.java @@ -143,88 +143,131 @@ public static boolean isAcidTable(Table table) { } /** - * Build a query (or queries if one query is too big) with specified "prefix" and "suffix", - * while populating the IN list into multiple OR clauses, e.g. id in (1,2,3) OR id in (4,5,6) - * For NOT IN case, NOT IN list is broken into multiple AND clauses. - * @param queries array of complete query strings - * @param prefix part of the query that comes before IN list - * @param suffix part of the query that comes after IN list - * @param inList the list containing IN list values - * @param inColumn column name of IN list operator - * @param addParens add a pair of parenthesis outside the IN lists - * e.g. ( id in (1,2,3) OR id in (4,5,6) ) - * @param notIn clause to be broken up is NOT IN + * Build a query (or queries if one query is too big) with + * one or multiple 'IN' or 'NOT IN' clauses with the given input parameters. + * + * Note that this method currently support only single column for + * IN/NOT IN clauses and that only covers OR-based composite 'IN' clause and + * AND-based composite 'NOT IN' clause. + * For example, for 'IN' clause case, the method will build a query with OR. + * E.g., "id in (1,2,3) OR id in (4,5,6)". + * For 'NOT IN' case, NOT IN list is broken into multiple 'NOT IN" clauses connected by AND. + * + * Note that, in this method, "a composite 'IN' clause" is defined as "a list of multiple 'IN' + * clauses in a query". + * + * @param queries OUT: Array of query strings + * @param prefix IN: Part of the query that comes before IN list + * @param suffix IN: Part of the query that comes after IN list + * @param inList IN: the list with IN list values + * @param inColumn IN: single column name of IN list operator + * @param addParens IN: add a pair of parenthesis outside the IN lists + * e.g. "(id in (1,2,3) OR id in (4,5,6))" + * @param notIn IN: is this for building a 'NOT IN' composite clause? */ - public static void buildQueryWithINClause(HiveConf conf, List queries, StringBuilder prefix, - StringBuilder suffix, List inList, - String inColumn, boolean addParens, boolean notIn) { + public static void buildQueryWithINClause(HiveConf conf, + List queries, + StringBuilder prefix, + StringBuilder suffix, + List inList, + String inColumn, + boolean addParens, + boolean notIn) { + // Check parameter set validity as a public method. if (inList == null || inList.size() == 0) { throw new IllegalArgumentException("The IN list is empty!"); } + + // Define constants and local vaiables. + int maxQueryLength = conf.getIntVar(HiveConf.ConfVars.METASTORE_DIRECT_SQL_MAX_QUERY_LENGTH); int batchSize = conf.getIntVar(HiveConf.ConfVars.METASTORE_DIRECT_SQL_MAX_ELEMENTS_IN_CLAUSE); - int numWholeBatches = inList.size() / batchSize; + int inListSize = inList.size(); StringBuilder buf = new StringBuilder(); - buf.append(prefix); - if (addParens) { - buf.append("("); - } - buf.append(inColumn); - if (notIn) { - buf.append(" not in ("); - } else { - buf.append(" in ("); - } + + int i = 0, // cursor for the "inList" array. + j = 0, // cursor for an element list per an 'IN'/'NOT IN'-clause + k = 0; // cursor for in-clause lists per a query + boolean nextItemNeeded = true; + boolean newInclauseJustAppended = false; + StringBuilder nextValue = new StringBuilder(""); + StringBuilder newInclausePrefix = new StringBuilder(notIn ? " and " + inColumn + " not in (": + " or " + inColumn + " in ("); + + // Loop over the given inList elements. + while( i < inListSize) { + if (k == 0) { + // Append prefix + buf.append(prefix); + if (addParens) { + buf.append("("); + } + buf.append(inColumn); - for (int i = 0; i <= numWholeBatches; i++) { - if (i * batchSize == inList.size()) { - // At this point we just realized we don't need another query - break; + if (notIn) { + buf.append(" not in ("); + } else { + buf.append(" in ("); + } + k++; + newInclauseJustAppended = false; } - if (needNewQuery(conf, buf)) { - // Wrap up current query string + // Get the next "inList" value element if needed. + if (nextItemNeeded) { + nextValue.setLength(0); + nextValue.append(String.valueOf(inList.get(i++))); + nextItemNeeded = false; + } + + // Compute the size of a query when the 'nextValue' is added to the current query. + int querySize = querySizeExpected(buf.length(), nextValue.length(), suffix.length(), addParens); + + if (querySize > maxQueryLength * 1024) { + // Wrap up the current query string since we can not add another "inList" element value. + if (newInclauseJustAppended) { + buf.delete(buf.length()-newInclausePrefix.length(), buf.length()); + } + + buf.setCharAt(buf.length() - 1, ')'); // replace the commar to finish a 'IN' clause string. + if (addParens) { buf.append(")"); } + buf.append(suffix); queries.add(buf.toString()); - // Prepare a new query string + // Prepare a new query string. buf.setLength(0); - } - - if (i > 0) { - if (notIn) { - if (buf.length() == 0) { - buf.append(prefix); - if (addParens) { - buf.append("("); - } - } else { - buf.append(" and "); - } - buf.append(inColumn); - buf.append(" not in ("); - } else { - if (buf.length() == 0) { - buf.append(prefix); - if (addParens) { - buf.append("("); - } - } else { - buf.append(" or "); - } - buf.append(inColumn); - buf.append(" in ("); - } - } + k = j = 0; + querySize = 0; + newInclauseJustAppended = false; + continue; + } else if (j >= batchSize-1) { + // Finish the current 'IN'/'NOT IN' clause and start a new clause. + buf.setCharAt(buf.length() - 1, ')'); // replace the commar. + buf.append(newInclausePrefix.toString()); - for (int j = i * batchSize; j < (i + 1) * batchSize && j < inList.size(); j++) { - buf.append(inList.get(j)).append(","); + newInclauseJustAppended = true; + + // increment cursor for per-query IN-clause list + k++; + j = 0; + } else { + // for (int j = i * batchSize; j < (i + 1) * batchSize && j < inList.size(); j++) + buf.append(nextValue.toString()).append(","); + nextItemNeeded = true; + newInclauseJustAppended = false; + // increment cursor for elements per 'IN'/'NOT IN' clause. + j++; } - buf.setCharAt(buf.length() - 1, ')'); } + // Finish the last query. + if (newInclauseJustAppended) { + buf.delete(buf.length()-newInclausePrefix.length(), buf.length()); + } + buf.setCharAt(buf.length() - 1, ')'); // replace the commar. if (addParens) { buf.append(")"); } @@ -232,11 +275,25 @@ public static void buildQueryWithINClause(HiveConf conf, List queries, S queries.add(buf.toString()); } - /** Estimate if the size of a string will exceed certain limit */ - private static boolean needNewQuery(HiveConf conf, StringBuilder sb) { - int queryMemoryLimit = conf.getIntVar(HiveConf.ConfVars.METASTORE_DIRECT_SQL_MAX_QUERY_LENGTH); - // http://www.javamex.com/tutorials/memory/string_memory_usage.shtml - long sizeInBytes = 8 * (((sb.length() * 2) + 45) / 8); - return sizeInBytes / 1024 > queryMemoryLimit; + /* + * Compute and return the size of a query statement with the given parameters as input variables. + * + * @param sizeSoFar size of the current contents of the buf + * @param sizeNextItem size of the next 'IN' clause element value. + * @param suffixSize size of the suffix for a quey statement + * @param addParens Do we add an additional parenthesis? + */ + private static int querySizeExpected(int sizeSoFar, + int sizeNextItem, + int suffixSize, + boolean addParens) { + + int size = sizeSoFar + sizeNextItem + suffixSize; + + if (addParens) { + size++; + } + + return size; } } diff --git metastore/src/test/org/apache/hadoop/hive/metastore/txn/TestTxnUtils.java metastore/src/test/org/apache/hadoop/hive/metastore/txn/TestTxnUtils.java index 1497c00e5d..593563032b 100644 --- metastore/src/test/org/apache/hadoop/hive/metastore/txn/TestTxnUtils.java +++ metastore/src/test/org/apache/hadoop/hive/metastore/txn/TestTxnUtils.java @@ -56,7 +56,7 @@ public void testBuildQueryWithINClause() throws Exception { conf.setIntVar(HiveConf.ConfVars.METASTORE_DIRECT_SQL_MAX_QUERY_LENGTH, 1); conf.setIntVar(HiveConf.ConfVars.METASTORE_DIRECT_SQL_MAX_ELEMENTS_IN_CLAUSE, 10); List inList = new ArrayList(); - for (long i = 1; i <= 200; i++) { + for (long i = 1; i <= 190; i++) { inList.add(i); } TxnUtils.buildQueryWithINClause(conf, queries, prefix, suffix, inList, "TXN_ID", true, false); @@ -66,7 +66,7 @@ public void testBuildQueryWithINClause() throws Exception { // Case 2 - Max in list members: 10; Max query string length: 1KB // The first query has 2 full batches, and the second query only has 1 batch which only contains 1 member queries.clear(); - inList.add((long)201); + inList.add((long)191); TxnUtils.buildQueryWithINClause(conf, queries, prefix, suffix, inList, "TXN_ID", true, false); Assert.assertEquals(2, queries.size()); runAgainstDerby(queries); @@ -75,11 +75,11 @@ public void testBuildQueryWithINClause() throws Exception { conf.setIntVar(HiveConf.ConfVars.METASTORE_DIRECT_SQL_MAX_QUERY_LENGTH, 1); conf.setIntVar(HiveConf.ConfVars.METASTORE_DIRECT_SQL_MAX_ELEMENTS_IN_CLAUSE, 1000); queries.clear(); - for (long i = 202; i <= 1000; i++) { + for (long i = 192; i <= 1000; i++) { inList.add(i); } TxnUtils.buildQueryWithINClause(conf, queries, prefix, suffix, inList, "TXN_ID", true, false); - Assert.assertEquals(1, queries.size()); + Assert.assertEquals(5, queries.size()); runAgainstDerby(queries); // Case 3.2 - Max in list members: 1000, Max query string length: 10KB, and exact 1000 members in a single IN clause @@ -97,7 +97,7 @@ public void testBuildQueryWithINClause() throws Exception { conf.setIntVar(HiveConf.ConfVars.METASTORE_DIRECT_SQL_MAX_QUERY_LENGTH, 1); queries.clear(); TxnUtils.buildQueryWithINClause(conf, queries, prefix, suffix, inList, "TXN_ID", true, false); - Assert.assertEquals(2, queries.size()); + Assert.assertEquals(10, queries.size()); runAgainstDerby(queries); conf.setIntVar(HiveConf.ConfVars.METASTORE_DIRECT_SQL_MAX_QUERY_LENGTH, 10); queries.clear();