Index: CHANGES.txt
===================================================================
--- CHANGES.txt (revision 804100)
+++ CHANGES.txt (working copy)
@@ -171,6 +171,13 @@
     reusableTokenStream.  This is now fixed, such that if reusableTokenStream
     is invoked on such a subclass, that method will forcefully fallback to
     tokenStream.  (Mike McCandless)
+
+12. LUCENE-1801: Token.clear() and Token.clearNoTermBuffer() now also clear
+    startOffset, endOffset and type.  This should normally not affect any
+    Tokenizer chains, as Tokenizers always set these three values.  This
+    change makes the new AttributeImpl.clear() and AttributeSource.clearAttributes()
+    work identically for Token as the one impl for all attributes and for
+    the 6 separate AttributeImpls.  (Uwe Schindler, Michael Busch)
 
 API Changes
 
@@ -468,6 +475,10 @@
 22. LUCENE-1805: CloseableThreadLocal did not allow a null Object in get(),
     although it does allow it in set(Object).  Fix get() to not assert the
     object is not null.  (Shai Erera via Mike McCandless)
+
+23. LUCENE-1801: Changed all Tokenizers or TokenStreams in core/contrib
+    that are the source of Tokens to always call
+    AttributeSource.clearAttributes() first.  (Uwe Schindler)
 
 New features
 
Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java
===================================================================
--- contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java (revision 804100)
+++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java (working copy)
@@ -123,6 +123,7 @@
      *
      */
     public boolean incrementToken() throws IOException {
+        clearAttributes();
         /** how many character(s) has been stored in buffer */
 
         while(true) { // loop until we find a non-empty token
Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseTokenizer.java
===================================================================
--- contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseTokenizer.java (revision 804100)
+++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseTokenizer.java (working copy)
@@ -96,6 +96,7 @@
     }
 
     public boolean incrementToken() throws IOException {
+        clearAttributes();
         length = 0;
         start = offset;
 
Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/SingleTokenTokenStream.java
===================================================================
--- contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/SingleTokenTokenStream.java (revision 804100)
+++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/SingleTokenTokenStream.java (working copy)
@@ -64,6 +64,7 @@
 
     Token clone = (Token) singleToken.clone();
 
+    clearAttributes();
     termAtt.setTermBuffer(clone.termBuffer(), 0, clone.termLength());
     offsetAtt.setOffset(clone.startOffset(), clone.endOffset());
     flagsAtt.setFlags(clone.getFlags());
Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java
===================================================================
--- contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java (revision 804100)
+++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java (working copy)
@@ -123,6 +123,7 @@
 
   /** Returns the next token in the stream, or null at EOS. */
   public final boolean incrementToken() throws IOException {
+    clearAttributes();
     // if we are just starting, read the whole input
     if (!started) {
       started = true;
Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java
===================================================================
--- contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java (revision 804100)
+++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java (working copy)
@@ -72,6 +72,7 @@
 
   /** Returns the next token in the stream, or null at EOS. */
   public final boolean incrementToken() throws IOException {
+    clearAttributes();
     if (!started) {
       started = true;
       gramSize = minGram;
Index: contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SentenceTokenizer.java
===================================================================
--- contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SentenceTokenizer.java (revision 804100)
+++ contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SentenceTokenizer.java (working copy)
@@ -54,6 +54,7 @@
   }
 
   public boolean incrementToken() throws IOException {
+    clearAttributes();
     buffer.setLength(0);
     int ci;
     char ch, pch;
Index: contrib/memory/src/java/org/apache/lucene/index/memory/PatternAnalyzer.java
===================================================================
--- contrib/memory/src/java/org/apache/lucene/index/memory/PatternAnalyzer.java (revision 804100)
+++ contrib/memory/src/java/org/apache/lucene/index/memory/PatternAnalyzer.java (working copy)
@@ -343,7 +343,7 @@
 
     public final boolean incrementToken() {
       if (matcher == null) return false;
-
+      clearAttributes();
       while (true) { // loop takes care of leading and trailing boundary cases
         int start = pos;
         int end;
@@ -401,6 +401,7 @@
     }
 
     public boolean incrementToken() {
+      clearAttributes();
       // cache loop instance vars (performance)
       String s = str;
       int len = s.length();
Index: contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizer.java
===================================================================
--- contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizer.java (revision 804100)
+++ contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizer.java (working copy)
@@ -184,6 +184,7 @@
       restoreState(state);
       return true;
     }
+    clearAttributes();
     int tokenType = scanner.getNextToken();
 
     if (tokenType == WikipediaTokenizerImpl.YYEOF) {
Index: src/java/org/apache/lucene/analysis/CharTokenizer.java
===================================================================
--- src/java/org/apache/lucene/analysis/CharTokenizer.java (revision 804100)
+++ src/java/org/apache/lucene/analysis/CharTokenizer.java (working copy)
@@ -53,9 +53,9 @@
   }
 
   public final boolean incrementToken() throws IOException {
+    clearAttributes();
     int length = 0;
     int start = bufferIndex;
-    termAtt.clear();
     char[] buffer = termAtt.termBuffer();
     while (true) {
 
Index: src/java/org/apache/lucene/analysis/KeywordTokenizer.java
===================================================================
--- src/java/org/apache/lucene/analysis/KeywordTokenizer.java (revision 804100)
+++ src/java/org/apache/lucene/analysis/KeywordTokenizer.java (working copy)
@@ -49,6 +49,7 @@
 
   public final boolean incrementToken() throws IOException {
     if (!done) {
+      clearAttributes();
       done = true;
       int upto = 0;
       char[] buffer = termAtt.termBuffer();
Index: src/java/org/apache/lucene/analysis/NumericTokenStream.java
===================================================================
--- src/java/org/apache/lucene/analysis/NumericTokenStream.java (revision 804100)
+++ src/java/org/apache/lucene/analysis/NumericTokenStream.java (working copy)
@@ -184,6 +184,7 @@
     if (shift >= valSize)
       return false;
 
+    clearAttributes();
     final char[] buffer;
     switch (valSize) {
       case 64:
Index: src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java
===================================================================
--- src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java (revision 804100)
+++ src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java (working copy)
@@ -148,6 +148,7 @@
    * @see org.apache.lucene.analysis.TokenStream#next()
    */
   public final boolean incrementToken() throws IOException {
+    clearAttributes();
     int posIncr = 1;
     while(true) {
 
Index: src/java/org/apache/lucene/analysis/Token.java
===================================================================
--- src/java/org/apache/lucene/analysis/Token.java (revision 804100)
+++ src/java/org/apache/lucene/analysis/Token.java (working copy)
@@ -117,7 +117,7 @@
   A few things to note:
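
For illustration only (not part of the patch): the contract added here is that
every Tokenizer or TokenStream that is a source of tokens calls clearAttributes()
at the top of incrementToken(), before setting any attribute values.  Below is a
minimal sketch of a custom token source following that contract, modeled on the
KeywordTokenizer change above.  The class name WholeInputTokenizer is made up;
the attribute API (TermAttribute, OffsetAttribute) is the Lucene 2.9 one used
throughout this patch.

import java.io.IOException;
import java.io.Reader;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;

/** Hypothetical example: emits the entire input as a single token. */
public final class WholeInputTokenizer extends Tokenizer {

  private final TermAttribute termAtt;
  private final OffsetAttribute offsetAtt;
  private boolean done = false;

  public WholeInputTokenizer(Reader input) {
    super(input);
    termAtt = (TermAttribute) addAttribute(TermAttribute.class);
    offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
  }

  public boolean incrementToken() throws IOException {
    if (done) {
      return false;
    }
    done = true;

    // Reset term, offsets, type, flags, etc. to their defaults before
    // producing the next token -- the rule this patch enforces (LUCENE-1801).
    clearAttributes();

    // Read the entire input into the term buffer as one token.
    int upto = 0;
    char[] buffer = termAtt.termBuffer();
    while (true) {
      final int length = input.read(buffer, upto, buffer.length - upto);
      if (length == -1) break;
      upto += length;
      if (upto == buffer.length) {
        buffer = termAtt.resizeTermBuffer(1 + buffer.length);
      }
    }
    termAtt.setTermLength(upto);
    offsetAtt.setOffset(correctOffset(0), correctOffset(upto));
    return true;
  }

  public void reset(Reader input) throws IOException {
    super.reset(input);
    done = false;
  }
}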