commit f6c50005977c4dbb7e73fd12917f54b0408ad733
Author: Misha Dmitriev
Date:   Fri Jun 1 16:46:56 2018 -0700

    HIVE-19668: Over 30% of the heap wasted by duplicate org.antlr.runtime.CommonToken's and duplicate strings

diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/ASTNode.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/ASTNode.java
index 9f63f62b168a475676bd4bdc58a899e680b6e745..7b32020f1afc5695d785cb19da0c4dff42f5f31c 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/parse/ASTNode.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/ASTNode.java
@@ -24,10 +24,13 @@
 import java.util.Deque;
 import java.util.List;
 
+import com.google.common.collect.Interner;
+import com.google.common.collect.Interners;
 import org.antlr.runtime.Token;
 import org.antlr.runtime.tree.CommonTree;
 import org.antlr.runtime.tree.Tree;
 import org.apache.commons.lang3.StringUtils;
+import org.apache.hadoop.hive.common.StringInternUtils;
 import org.apache.hadoop.hive.ql.lib.Node;
 
 /**
@@ -43,17 +46,16 @@
   private transient boolean isValidASTStr;
   private transient boolean visited = false;
 
+  private static final Interner<ImmutableCommonToken> TOKEN_CACHE = Interners.newWeakInterner();
+
   public ASTNode() {
   }
 
   /**
-   * Constructor.
-   *
-   * @param t
-   *          Token for the CommonTree Node
+   * @param t Token for the CommonTree Node
    */
   public ASTNode(Token t) {
-    super(t);
+    super(internToken(t));
   }
 
   public ASTNode(ASTNode node) {
@@ -282,6 +284,13 @@ public void replaceChildren(int startChildIndex, int stopChildIndex, Object t) {
   }
 
   @Override
+  protected List createChildrenList() {
+    // Measurements show that in most situations the number of children is small.
+    // Avoid wasting memory by creating ArrayList with the default capacity of 10.
+    return new ArrayList(2);
+  }
+
+  @Override
   public String toStringTree() {
 
     // The root might have changed because of tree modifications.
@@ -346,4 +355,16 @@ private String toStringTree(ASTNode rootNode) {
     return rootNode.getMemoizedSubString(startIndx, endIndx);
   }
+
+  private static Token internToken(Token t) {
+    if (t == null) {
+      return null;
+    }
+    if (t instanceof ImmutableCommonToken) {
+      return TOKEN_CACHE.intern((ImmutableCommonToken) t);
+    } else {
+      t.setText(StringInternUtils.internIfNotNull(t.getText()));
+      return t;
+    }
+  }
 }
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/ASTNodeOrigin.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/ASTNodeOrigin.java
index 8d812e498d249c8fa157adc1c3170abc52cad56e..3964c334855d50bfdcb5f362a75c62c94d7f759f 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/parse/ASTNodeOrigin.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/ASTNodeOrigin.java
@@ -18,6 +18,8 @@
 
 package org.apache.hadoop.hive.ql.parse;
 
+import org.apache.hadoop.hive.common.StringInternUtils;
+
 /**
  * ASTNodeOrigin contains contextual information about the object from whose
  * definition a particular ASTNode originated. For example, suppose a view v is
@@ -46,7 +48,7 @@ public ASTNodeOrigin(String objectType, String objectName,
       String objectDefinition, String usageAlias, ASTNode usageNode) {
     this.objectType = objectType;
     this.objectName = objectName;
-    this.objectDefinition = objectDefinition;
+    this.objectDefinition = StringInternUtils.internIfNotNull(objectDefinition);
     this.usageAlias = usageAlias;
     this.usageNode = usageNode;
   }
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java
index e091f38bc69116b52b945f58f3243ae7c7daf13e..4e9fe2b0b356966755865ff3d4fba0d07dcc951a 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java
@@ -308,6 +308,27 @@
   private boolean disableSemJoinReordering = true;
   private EnumSet profilesCBO;
 
+  private static final CommonToken FROM_TOKEN =
+      new ImmutableCommonToken(HiveParser.TOK_FROM, "TOK_FROM");
+  private static final CommonToken DEST_TOKEN =
+      new ImmutableCommonToken(HiveParser.TOK_DESTINATION, "TOK_DESTINATION");
+  private static final CommonToken DIR_TOKEN =
+      new ImmutableCommonToken(HiveParser.TOK_DIR, "TOK_DIR");
+  private static final CommonToken TMPFILE_TOKEN =
+      new ImmutableCommonToken(HiveParser.TOK_TMP_FILE, "TOK_TMP_FILE");
+  private static final CommonToken SELECT_TOKEN =
+      new ImmutableCommonToken(HiveParser.TOK_SELECT, "TOK_SELECT");
+  private static final CommonToken SELEXPR_TOKEN =
+      new ImmutableCommonToken(HiveParser.TOK_SELEXPR, "TOK_SELEXPR");
+  private static final CommonToken TABLEORCOL_TOKEN =
+      new ImmutableCommonToken(HiveParser.TOK_TABLE_OR_COL, "TOK_TABLE_OR_COL");
+  private static final CommonToken INSERT_TOKEN =
+      new ImmutableCommonToken(HiveParser.TOK_INSERT, "TOK_INSERT");
+  private static final CommonToken QUERY_TOKEN =
+      new ImmutableCommonToken(HiveParser.TOK_QUERY, "TOK_QUERY");
+  private static final CommonToken SUBQUERY_TOKEN =
+      new ImmutableCommonToken(HiveParser.TOK_SUBQUERY, "TOK_SUBQUERY");
+
   public CalcitePlanner(QueryState queryState) throws SemanticException {
     super(queryState);
     if (!HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_CBO_ENABLED)) {
@@ -726,45 +747,45 @@ public Object post(Object t) {
         //   TOK_TMP_FILE
         //   TOK_SELECT
        //    refs
-        ASTNode from = new ASTNode(new CommonToken(HiveParser.TOK_FROM, "TOK_FROM"));
+        ASTNode from = new ASTNode(FROM_TOKEN);
         from.addChild((ASTNode) ParseDriver.adaptor.dupTree(nodeOfInterest));
-        ASTNode destination = new ASTNode(new CommonToken(HiveParser.TOK_DESTINATION, "TOK_DESTINATION"));
-        ASTNode dir = new ASTNode(new CommonToken(HiveParser.TOK_DIR, "TOK_DIR"));
-        ASTNode tmpFile = new ASTNode(new CommonToken(HiveParser.TOK_TMP_FILE, "TOK_TMP_FILE"));
+        ASTNode destination = new ASTNode(DEST_TOKEN);
+        ASTNode dir = new ASTNode(DIR_TOKEN);
+        ASTNode tmpFile = new ASTNode(TMPFILE_TOKEN);
         dir.addChild(tmpFile);
         destination.addChild(dir);
-        ASTNode select = new ASTNode(new CommonToken(HiveParser.TOK_SELECT, "TOK_SELECT"));
+        ASTNode select = new ASTNode(SELECT_TOKEN);
         int num = 0;
         for (Collection selectIdentifier : aliasNodes.asMap().values()) {
           Iterator it = selectIdentifier.iterator();
           ASTNode node = (ASTNode) it.next();
           // Add select expression
-          ASTNode selectExpr = new ASTNode(new CommonToken(HiveParser.TOK_SELEXPR, "TOK_SELEXPR"));
+          ASTNode selectExpr = new ASTNode(SELEXPR_TOKEN);
           selectExpr.addChild((ASTNode) ParseDriver.adaptor.dupTree(node)); // Identifier
           String colAlias = "col" + num;
           selectExpr.addChild(new ASTNode(new CommonToken(HiveParser.Identifier, colAlias))); // Alias
           select.addChild(selectExpr);
           // Rewrite all INSERT references (all the node values for this key)
-          ASTNode colExpr = new ASTNode(new CommonToken(HiveParser.TOK_TABLE_OR_COL, "TOK_TABLE_OR_COL"));
+          ASTNode colExpr = new ASTNode(TABLEORCOL_TOKEN);
           colExpr.addChild(new ASTNode(new CommonToken(HiveParser.Identifier, colAlias)));
           replaceASTChild(node, colExpr);
           while (it.hasNext()) { // Loop to rewrite rest of INSERT references
             node = (ASTNode) it.next();
-            colExpr = new ASTNode(new CommonToken(HiveParser.TOK_TABLE_OR_COL, "TOK_TABLE_OR_COL"));
+            colExpr = new ASTNode(TABLEORCOL_TOKEN);
             colExpr.addChild(new ASTNode(new CommonToken(HiveParser.Identifier, colAlias)));
             replaceASTChild(node, colExpr);
           }
           num++;
         }
-        ASTNode insert = new ASTNode(new CommonToken(HiveParser.TOK_INSERT, "TOK_INSERT"));
+        ASTNode insert = new ASTNode(INSERT_TOKEN);
         insert.addChild(destination);
         insert.addChild(select);
-        ASTNode newQuery = new ASTNode(new CommonToken(HiveParser.TOK_QUERY, "TOK_QUERY"));
+        ASTNode newQuery = new ASTNode(QUERY_TOKEN);
         newQuery.addChild(from);
         newQuery.addChild(insert);
         // 3. create subquery
-        ASTNode subq = new ASTNode(new CommonToken(HiveParser.TOK_SUBQUERY, "TOK_SUBQUERY"));
+        ASTNode subq = new ASTNode(SUBQUERY_TOKEN);
         subq.addChild(newQuery);
         subq.addChild(new ASTNode(new CommonToken(HiveParser.Identifier, "subq")));
         replaceASTChild(nodeOfInterest, subq);
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/ImmutableCommonToken.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/ImmutableCommonToken.java
new file mode 100644
index 0000000000000000000000000000000000000000..d8264dde9fd6261ed78772a5a58483b1ae36d3e6
--- /dev/null
+++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/ImmutableCommonToken.java
@@ -0,0 +1,107 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.parse;
+
+import org.antlr.runtime.CharStream;
+import org.antlr.runtime.CommonToken;
+import org.antlr.runtime.Token;
+
+/**
+ * This class is designed to hold "constant" CommonTokens, that have fixed type
+ * and text, and everything else equal to zero. They can therefore be reused
+ * to save memory. However, to support reuse (canonicalization) we need to
+ * implement the proper hashCode() and equals() methods.
+ */
+class ImmutableCommonToken extends CommonToken {
+
+  private static final String SETTERS_DISABLED = "All setter methods are intentionally disabled";
+
+  private final int hashCode;
+
+  ImmutableCommonToken(int type, String text) {
+    super(type, text);
+    hashCode = calculateHash();
+  }
+
+  private int calculateHash() {
+    return type * 31 + (text != null ? text.hashCode() : 0);
+  }
+
+  @Override
+  public boolean equals(Object other) {
+    if (!(other instanceof ImmutableCommonToken)) {
+      return false;
+    }
+    ImmutableCommonToken otherToken = (ImmutableCommonToken) other;
+    return type == otherToken.type &&
+        ((text == null && otherToken.text == null) ||
+          text != null && text.equals(otherToken.text));
+  }
+
+  @Override
+  public int hashCode() { return hashCode; }
+
+  // All the setter methods are overridden to throw exception, to prevent accidental
+  // attempts to modify data fields that should be immutable.
+
+  @Override
+  public void setLine(int line) {
+    throw new UnsupportedOperationException(SETTERS_DISABLED);
+  }
+
+  @Override
+  public void setText(String text) {
+    throw new UnsupportedOperationException(SETTERS_DISABLED);
+  }
+
+  @Override
+  public void setCharPositionInLine(int charPositionInLine) {
+    throw new UnsupportedOperationException(SETTERS_DISABLED);
+  }
+
+  @Override
+  public void setChannel(int channel) {
+    throw new UnsupportedOperationException(SETTERS_DISABLED);
+  }
+
+  @Override
+  public void setType(int type) {
+    throw new UnsupportedOperationException(SETTERS_DISABLED);
+  }
+
+  @Override
+  public void setStartIndex(int start) {
+    throw new UnsupportedOperationException(SETTERS_DISABLED);
+  }
+
+  @Override
+  public void setStopIndex(int stop) {
+    throw new UnsupportedOperationException(SETTERS_DISABLED);
+  }
+
+  @Override
+  public void setTokenIndex(int index) {
+    throw new UnsupportedOperationException(SETTERS_DISABLED);
+  }
+
+  @Override
+  public void setInputStream(CharStream input) {
+    throw new UnsupportedOperationException(SETTERS_DISABLED);
+  }
+}
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/ParseDriver.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/ParseDriver.java
index 895c2f2ebc068e48c05949664708d7c73e51c5e3..f7074518b3f754e5d7d9b553196134cbf4cf5579 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/parse/ParseDriver.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/ParseDriver.java
@@ -21,6 +21,7 @@ import java.util.ArrayList;
 
 import org.antlr.runtime.ANTLRStringStream;
 import org.antlr.runtime.CharStream;
+import org.antlr.runtime.CommonToken;
 import org.antlr.runtime.NoViableAltException;
 import org.antlr.runtime.RecognitionException;
 import org.antlr.runtime.Token;
@@ -147,10 +148,19 @@ public Object create(Token payload) {
     }
 
     @Override
-    public Object dupNode(Object t) {
+    public Token createToken(int tokenType, String text) {
+      if (tokenType == HiveParser.TOK_SETCOLREF) {
+        // ParseUtils.processSetColsNode() can change type of TOK_SETCOLREF nodes later
+        return new CommonToken(tokenType, text);
+      } else {
+        return new ImmutableCommonToken(tokenType, text);
+      }
+    }
 
+    @Override
+    public Object dupNode(Object t) {
       return create(((CommonTree)t).token);
-    };
+    }
 
     @Override
     public Object dupTree(Object t, Object parent) {
@@ -166,7 +176,7 @@ public Object dupTree(Object t, Object parent) {
     @Override
     public Object errorNode(TokenStream input, Token start, Token stop, RecognitionException e) {
       return new ASTErrorNode(input, start, stop, e);
-    };
+    }
   };
 
   public ASTNode parse(String command) throws ParseException {
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/QBExpr.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/QBExpr.java
index f36f7f73d90d71a3f10990320fa5d0a30def8651..e65f126a3665d0bee56b5d2eed5b9bdc1de1b04a 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/parse/QBExpr.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/QBExpr.java
@@ -18,6 +18,7 @@
 
 package org.apache.hadoop.hive.ql.parse;
 
+import org.apache.hadoop.hive.common.StringInternUtils;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -49,11 +50,11 @@ public String getAlias() {
   }
 
   public void setAlias(String alias) {
-    this.alias = alias;
+    this.alias = StringInternUtils.internIfNotNull(alias);
   }
 
   public QBExpr(String alias) {
-    this.alias = alias;
+    setAlias(alias);
   }
 
   public QBExpr(QB qb) {
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/QBParseInfo.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/QBParseInfo.java
index 5789ee021ae61f0dc45e4e14cb5f1b23e42e2c7c..ed0da84e2620160df6bd98d06090b0942bac2477 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/parse/QBParseInfo.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/QBParseInfo.java
@@ -29,6 +29,7 @@ import java.util.AbstractMap.SimpleEntry;
 
 import org.antlr.runtime.tree.Tree;
+import org.apache.hadoop.hive.common.StringInternUtils;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.apache.hadoop.hive.ql.parse.BaseSemanticAnalyzer.AnalyzeRewriteContext;
@@ -147,7 +148,7 @@ public QBParseInfo(String alias, boolean isSubQ) {
     destToWindowingExprs = new LinkedHashMap<String, LinkedHashMap<String, ASTNode>>();
     destToDistinctFuncExprs = new HashMap<String, List<ASTNode>>();
 
-    this.alias = alias;
+    this.alias = StringInternUtils.internIfNotNull(alias);
     this.isSubQ = isSubQ;
     outerQueryLimit = -1;
 
@@ -478,7 +479,7 @@ public boolean hasExprToColumnAlias(ASTNode expr) {
   }
 
   public void setExprToColumnAlias(ASTNode expr, String alias) {
-    exprToColumnAlias.put(expr, alias);
+    exprToColumnAlias.put(expr, StringInternUtils.internIfNotNull(alias));
   }
 
   public void setDestLimit(String dest, Integer offset, Integer limit) {
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java
index 0ca9b588fa0f4341b3ea4db67262784435924309..a8e235e137f997938200d51ebeef5293cd4b55dd 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java
@@ -65,6 +65,7 @@
 import org.apache.hadoop.hive.common.ObjectPair;
 import org.apache.hadoop.hive.common.StatsSetupConst;
 import org.apache.hadoop.hive.common.StatsSetupConst.StatDB;
+import org.apache.hadoop.hive.common.StringInternUtils;
 import org.apache.hadoop.hive.common.ValidTxnList;
 import org.apache.hadoop.hive.common.ValidTxnWriteIdList;
 import org.apache.hadoop.hive.common.metrics.common.MetricsConstant;
@@ -396,6 +397,15 @@
   private String invalidQueryMaterializationReason;
 
+  private static final CommonToken SELECTDI_TOKEN =
+      new ImmutableCommonToken(HiveParser.TOK_SELECTDI, "TOK_SELECTDI");
+  private static final CommonToken SELEXPR_TOKEN =
+      new ImmutableCommonToken(HiveParser.TOK_SELEXPR, "TOK_SELEXPR");
+  private static final CommonToken TABLEORCOL_TOKEN =
+      new ImmutableCommonToken(HiveParser.TOK_TABLE_OR_COL, "TOK_TABLE_OR_COL");
+  private static final CommonToken DOT_TOKEN =
+      new ImmutableCommonToken(HiveParser.DOT, ".");
+
   static class Phase1Ctx {
     String dest;
     int nextNum;
@@ -14562,7 +14572,7 @@ protected void checkAcidTxnManager(Table table) throws SemanticException {
 
   public static ASTNode genSelectDIAST(RowResolver rr) {
     LinkedHashMap<String, LinkedHashMap<String, ColumnInfo>> map = rr.getRslvMap();
-    ASTNode selectDI = new ASTNode(new CommonToken(HiveParser.TOK_SELECTDI, "TOK_SELECTDI"));
+    ASTNode selectDI = new ASTNode(SELECTDI_TOKEN);
     // Note: this will determine the order of columns in the result. For now, the columns for each
     //       table will be together; the order of the tables, as well as the columns within each
     //       table, is deterministic, but undefined - RR stores them in the order of addition.
@@ -14574,10 +14584,11 @@ public static ASTNode genSelectDIAST(RowResolver rr) {
     return selectDI;
   }
   private static ASTNode buildSelExprSubTree(String tableAlias, String col) {
-    ASTNode selexpr = new ASTNode(new CommonToken(HiveParser.TOK_SELEXPR, "TOK_SELEXPR"));
-    ASTNode tableOrCol = new ASTNode(new CommonToken(HiveParser.TOK_TABLE_OR_COL,
-        "TOK_TABLE_OR_COL"));
-    ASTNode dot = new ASTNode(new CommonToken(HiveParser.DOT, "."));
+    tableAlias = StringInternUtils.internIfNotNull(tableAlias);
+    col = StringInternUtils.internIfNotNull(col);
+    ASTNode selexpr = new ASTNode(SELEXPR_TOKEN);
+    ASTNode tableOrCol = new ASTNode(TABLEORCOL_TOKEN);
+    ASTNode dot = new ASTNode(DOT_TOKEN);
     tableOrCol.addChild(new ASTNode(new CommonToken(HiveParser.Identifier, tableAlias)));
     dot.addChild(tableOrCol);
     dot.addChild(new ASTNode(new CommonToken(HiveParser.Identifier, col)));
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/SubQueryUtils.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/SubQueryUtils.java
index e8509ee9f5179fe0edaa51f69bf0e23bcf566062..3c4e3d5f758968b8bce1f06c1d7dedd65110fffd 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/parse/SubQueryUtils.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/SubQueryUtils.java
@@ -24,6 +24,7 @@ import java.util.List;
 import java.util.Map;
 
+import org.antlr.runtime.CommonToken;
 import org.antlr.runtime.tree.CommonTreeAdaptor;
 import org.apache.hadoop.hive.ql.Context;
 import org.apache.hadoop.hive.ql.ErrorMsg;
@@ -132,11 +133,17 @@ boolean removeSubQuery(ASTNode node) {
     ASTNode child = (ASTNode) node.getChild(0);
     if (child == subQuery) {
       ASTNode sqOpType = (ASTNode) subQuery.getChild(0).getChild(0);
+      ASTNode newSqOpType;
+      // We create a new ASTNode below because its current token is likely an
+      // ImmutableCommonToken, whose type cannot be modified.
       if (sqOpType.getType() == HiveParser.KW_EXISTS) {
-        sqOpType.getToken().setType(HiveParser.TOK_SUBQUERY_OP_NOTEXISTS);
+        newSqOpType = new ASTNode(new CommonToken(
+            HiveParser.TOK_SUBQUERY_OP_NOTEXISTS, "TOK_SUBQUERY_OP_NOTEXISTS"));
       } else {
-        sqOpType.getToken().setType(HiveParser.TOK_SUBQUERY_OP_NOTIN);
+        newSqOpType = new ASTNode(new CommonToken(
+            HiveParser.TOK_SUBQUERY_OP_NOTIN, "TOK_SUBQUERY_OP_NOTIN"));
       }
+      subQuery.getChild(0).setChild(0, newSqOpType);
       ASTNode parent = getParentInWhereClause(node);
       if (parent == null) {
         root = subQuery;
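For reference, below is a minimal, self-contained Java sketch of the canonicalization pattern this patch applies to ANTLR tokens: an immutable value object whose equals()/hashCode() are based on its fields, deduplicated through a Guava weak interner. It assumes only Guava on the classpath; TokenInterningDemo and DemoToken are illustrative names, not Hive or ANTLR classes (the real code uses ImmutableCommonToken and the TOKEN_CACHE interner in ASTNode, as shown above).

import com.google.common.collect.Interner;
import com.google.common.collect.Interners;

public class TokenInterningDemo {

  // Stand-in for ImmutableCommonToken: fixed type and text, no setters,
  // equals()/hashCode() derived only from those two fields.
  static final class DemoToken {
    private final int type;
    private final String text;

    DemoToken(int type, String text) {
      this.type = type;
      this.text = text;
    }

    @Override
    public boolean equals(Object other) {
      if (!(other instanceof DemoToken)) {
        return false;
      }
      DemoToken o = (DemoToken) other;
      return type == o.type && (text == null ? o.text == null : text.equals(o.text));
    }

    @Override
    public int hashCode() {
      return type * 31 + (text != null ? text.hashCode() : 0);
    }
  }

  // Weak interner: equal instances collapse to one canonical object, and a
  // canonical object that is no longer referenced by any parse tree can still
  // be garbage-collected, so the cache does not grow without bound.
  private static final Interner<DemoToken> CACHE = Interners.newWeakInterner();

  public static void main(String[] args) {
    DemoToken a = CACHE.intern(new DemoToken(42, "TOK_SELECT"));
    DemoToken b = CACHE.intern(new DemoToken(42, "TOK_SELECT"));
    // Both callers now share a single instance instead of two duplicates.
    System.out.println(a == b); // prints "true"
  }
}

The weak interner mirrors the design choice in ASTNode.internToken(): canonical tokens are held only weakly by the cache, so deduplication never prevents tokens of completed queries from being reclaimed.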