Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicNormalizationFilter.java =================================================================== --- contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicNormalizationFilter.java (revision 797224) +++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicNormalizationFilter.java (working copy) @@ -19,35 +19,33 @@ import java.io.IOException; -import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; /** * A TokenFilter that applies {@link ArabicNormalizer} to normalize the orthography. * */ -public class ArabicNormalizationFilter extends TokenFilter { +public final class ArabicNormalizationFilter extends TokenFilter { protected ArabicNormalizer normalizer = null; - + private TermAttribute termAtt; + public ArabicNormalizationFilter(TokenStream input) { super(input); normalizer = new ArabicNormalizer(); + termAtt = (TermAttribute) addAttribute(TermAttribute.class); } - - - public Token next(Token reusableToken) throws IOException { - if ((reusableToken = input.next(reusableToken)) == null) { - return null; + public boolean incrementToken() throws IOException { + if (input.incrementToken()) { + int newlen = normalizer.normalize(termAtt.termBuffer(), termAtt.termLength()); + termAtt.setTermLength(newlen); + return true; } else { - int oldlen = reusableToken.termLength(); - int newlen = normalizer.normalize(reusableToken.termBuffer(), oldlen); - if (oldlen != newlen) - reusableToken.setTermLength(newlen); - return reusableToken; + return false; } } } Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicStemFilter.java =================================================================== --- contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicStemFilter.java (revision 797224) +++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicStemFilter.java (working copy) @@ -19,43 +19,33 @@ import java.io.IOException; -import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; /** * A TokenFilter that applies {@link ArabicStemmer} to stem Arabic words.. * */ -public class ArabicStemFilter extends TokenFilter { +public final class ArabicStemFilter extends TokenFilter { protected ArabicStemmer stemmer = null; - + private TermAttribute termAtt; + public ArabicStemFilter(TokenStream input) { super(input); stemmer = new ArabicStemmer(); + termAtt = (TermAttribute) addAttribute(TermAttribute.class); } - - - /** - * @return Returns the next token in the stream, or null at EOS - */ - public Token next(Token reusableToken) throws IOException { - /** - * The actual token in the input stream. 
- */ - - - if ((reusableToken = input.next(reusableToken)) == null) { - return null; + public boolean incrementToken() throws IOException { + if (input.incrementToken()) { + int newlen = stemmer.stem(termAtt.termBuffer(), termAtt.termLength()); + termAtt.setTermLength(newlen); + return true; } else { - int oldlen = reusableToken.termLength(); - int newlen = stemmer.stem(reusableToken.termBuffer(), oldlen); - if (oldlen != newlen) - reusableToken.setTermLength(newlen); - return reusableToken; + return false; } } } Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/br/BrazilianStemFilter.java =================================================================== --- contrib/analyzers/common/src/java/org/apache/lucene/analysis/br/BrazilianStemFilter.java (revision 797224) +++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/br/BrazilianStemFilter.java (working copy) @@ -17,14 +17,13 @@ * limitations under the License. */ -import org.apache.lucene.analysis.Token; -import org.apache.lucene.analysis.TokenFilter; -import org.apache.lucene.analysis.TokenStream; - import java.io.IOException; -import java.util.HashSet; import java.util.Set; +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; + /** * Based on GermanStemFilter * @@ -36,10 +35,12 @@ */ private BrazilianStemmer stemmer = null; private Set exclusions = null; - + private TermAttribute termAtt; + public BrazilianStemFilter(TokenStream in) { super(in); stemmer = new BrazilianStemmer(); + termAtt = (TermAttribute) addAttribute(TermAttribute.class); } public BrazilianStemFilter(TokenStream in, Set exclusiontable) { @@ -47,26 +48,20 @@ this.exclusions = exclusiontable; } - /** - * @return Returns the next token in the stream, or null at EOS. - */ - public final Token next(final Token reusableToken) - throws IOException { - assert reusableToken != null; - Token nextToken = input.next(reusableToken); - if (nextToken == null) - return null; - - String term = nextToken.term(); - - // Check the exclusion table. - if (exclusions == null || !exclusions.contains(term)) { - String s = stemmer.stem(term); - // If not stemmed, don't waste the time adjusting the token. - if ((s != null) && !s.equals(term)) - nextToken.setTermBuffer(s); + public boolean incrementToken() throws IOException { + if (input.incrementToken()) { + String term = termAtt.term(); + // Check the exclusion table. + if (exclusions == null || !exclusions.contains(term)) { + String s = stemmer.stem(term); + // If not stemmed, don't waste the time adjusting the token. + if ((s != null) && !s.equals(term)) + termAtt.setTermBuffer(s); + } + return true; + } else { + return false; } - return nextToken; } } Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java =================================================================== --- contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java (revision 797224) +++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java (working copy) @@ -17,12 +17,15 @@ * limitations under the License. 
*/ -import org.apache.lucene.analysis.Token; +import java.io.IOException; +import java.io.Reader; + import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; +import org.apache.lucene.analysis.tokenattributes.TypeAttribute; -import java.io.Reader; - /** * CJKTokenizer was modified from StopTokenizer which does a decent job for * most European languages. It performs other token methods for double-byte @@ -88,6 +91,10 @@ */ private boolean preIsTokened = false; + private TermAttribute termAtt; + private OffsetAttribute offsetAtt; + private TypeAttribute typeAtt; + //~ Constructors ----------------------------------------------------------- /** @@ -97,25 +104,26 @@ */ public CJKTokenizer(Reader in) { super(in); + termAtt = (TermAttribute) addAttribute(TermAttribute.class); + offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class); + typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class); } //~ Methods ---------------------------------------------------------------- /** - * Returns the next token in the stream, or null at EOS. + * Returns true for the next token in the stream, or false at EOS. * See http://java.sun.com/j2se/1.3/docs/api/java/lang/Character.UnicodeBlock.html * for detail. * - * @param reusableToken a reusable token - * @return Token + * @return false for end of stream, true otherwise * * @throws java.io.IOException - throw IOException when read error
* happened in the InputStream * */ - public final Token next(final Token reusableToken) throws java.io.IOException { + public boolean incrementToken() throws IOException { /** how many character(s) has been stored in buffer */ - assert reusableToken != null; while(true) { // loop until we find a non-empty token @@ -147,7 +155,7 @@ break; } else { - return null; + return false; } } else { //get current character @@ -252,10 +260,12 @@ } if (length > 0) { - return reusableToken.reinit - (buffer, 0, length, input.correctOffset(start), input.correctOffset(start+length), TOKEN_TYPE_NAMES[tokenType]); + termAtt.setTermBuffer(buffer, 0, length); + offsetAtt.setOffset(input.correctOffset(start), input.correctOffset(start+length)); + typeAtt.setType(TOKEN_TYPE_NAMES[tokenType]); + return true; } else if (dataLen == -1) { - return null; + return false; } // Cycle back and try for the next token (don't Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseFilter.java =================================================================== --- contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseFilter.java (revision 797224) +++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseFilter.java (working copy) @@ -17,12 +17,13 @@ * limitations under the License. */ +import java.io.IOException; import java.util.HashMap; import java.util.Map; -import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; /** * Title: ChineseFilter @@ -56,19 +57,21 @@ private Map stopTable; + private TermAttribute termAtt; + public ChineseFilter(TokenStream in) { super(in); stopTable = new HashMap(STOP_WORDS.length); for (int i = 0; i < STOP_WORDS.length; i++) stopTable.put(STOP_WORDS[i], STOP_WORDS[i]); + termAtt = (TermAttribute) addAttribute(TermAttribute.class); } - public final Token next(final Token reusableToken) throws java.io.IOException { - assert reusableToken != null; + public boolean incrementToken() throws IOException { - for (Token nextToken = input.next(reusableToken); nextToken != null; nextToken = input.next(reusableToken)) { - String text = nextToken.term(); + while (input.incrementToken()) { + String text = termAtt.term(); // why not key off token type here assuming ChineseTokenizer comes first? if (stopTable.get(text) == null) { @@ -79,7 +82,7 @@ // English word/token should larger than 1 character. if (text.length()>1) { - return nextToken; + return true; } break; case Character.OTHER_LETTER: @@ -87,13 +90,13 @@ // One Chinese character as one Chinese word. // Chinese word extraction to be added later here. 
- return nextToken; + return true; } } } - return null; + return false; } } \ No newline at end of file Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseTokenizer.java =================================================================== --- contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseTokenizer.java (revision 797224) +++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseTokenizer.java (working copy) @@ -18,10 +18,12 @@ */ +import java.io.IOException; import java.io.Reader; -import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; /** @@ -56,6 +58,8 @@ public ChineseTokenizer(Reader in) { super(in); + termAtt = (TermAttribute) addAttribute(TermAttribute.class); + offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class); } private int offset = 0, bufferIndex=0, dataLen=0; @@ -68,7 +72,9 @@ private int length; private int start; - + private TermAttribute termAtt; + private OffsetAttribute offsetAtt; + private final void push(char c) { if (length == 0) start = offset-1; // start of token @@ -76,19 +82,20 @@ } - private final Token flush(final Token token) { + private final boolean flush() { if (length>0) { //System.out.println(new String(buffer, 0, //length)); - return token.reinit(buffer, 0, length, input.correctOffset(start), input.correctOffset(start+length)); + termAtt.setTermBuffer(buffer, 0, length); + offsetAtt.setOffset(input.correctOffset(start), input.correctOffset(start+length)); + return true; } else - return null; + return false; } - public final Token next(final Token reusableToken) throws java.io.IOException { - assert reusableToken != null; + public boolean incrementToken() throws IOException { length = 0; start = offset; @@ -104,7 +111,7 @@ bufferIndex = 0; } - if (dataLen == -1) return flush(reusableToken); + if (dataLen == -1) return flush(); else c = ioBuffer[bufferIndex++]; @@ -115,20 +122,20 @@ case Character.LOWERCASE_LETTER: case Character.UPPERCASE_LETTER: push(c); - if (length == MAX_WORD_LEN) return flush(reusableToken); + if (length == MAX_WORD_LEN) return flush(); break; case Character.OTHER_LETTER: if (length>0) { bufferIndex--; offset--; - return flush(reusableToken); + return flush(); } push(c); - return flush(reusableToken); + return flush(); default: - if (length>0) return flush(reusableToken); + if (length>0) return flush(); break; } } Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/de/GermanStemFilter.java =================================================================== --- contrib/analyzers/common/src/java/org/apache/lucene/analysis/de/GermanStemFilter.java (revision 797224) +++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/de/GermanStemFilter.java (working copy) @@ -17,13 +17,13 @@ * limitations under the License. */ -import org.apache.lucene.analysis.Token; -import org.apache.lucene.analysis.TokenFilter; -import org.apache.lucene.analysis.TokenStream; - import java.io.IOException; import java.util.Set; +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; + /** * A filter that stems German words. It supports a table of words that should * not be stemmed at all. 
The stemmer used can be changed at runtime after the @@ -40,10 +40,13 @@ private GermanStemmer stemmer = null; private Set exclusionSet = null; + private TermAttribute termAtt; + public GermanStemFilter( TokenStream in ) { super(in); stemmer = new GermanStemmer(); + termAtt = (TermAttribute) addAttribute(TermAttribute.class); } /** @@ -56,26 +59,22 @@ } /** - * @return Returns the next token in the stream, or null at EOS + * @return Returns true for the next token in the stream, or false at EOS */ - public final Token next(final Token reusableToken) - throws IOException - { - assert reusableToken != null; - Token nextToken = input.next(reusableToken); - - if (nextToken == null) - return null; - - String term = nextToken.term(); - // Check the exclusion table. - if (exclusionSet == null || !exclusionSet.contains(term)) { - String s = stemmer.stem(term); - // If not stemmed, don't waste the time adjusting the token. - if ((s != null) && !s.equals(term)) - nextToken.setTermBuffer(s); + public boolean incrementToken() throws IOException { + if (input.incrementToken()) { + String term = termAtt.term(); + // Check the exclusion table. + if (exclusionSet == null || !exclusionSet.contains(term)) { + String s = stemmer.stem(term); + // If not stemmed, don't waste the time adjusting the token. + if ((s != null) && !s.equals(term)) + termAtt.setTermBuffer(s); + } + return true; + } else { + return false; } - return nextToken; } /** Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/el/GreekLowerCaseFilter.java =================================================================== --- contrib/analyzers/common/src/java/org/apache/lucene/analysis/el/GreekLowerCaseFilter.java (revision 797224) +++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/el/GreekLowerCaseFilter.java (working copy) @@ -16,9 +16,11 @@ * limitations under the License. */ +import java.io.IOException; + import org.apache.lucene.analysis.TokenFilter; -import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; /** * Normalizes token text to lower case, analyzing given ("greek") charset.
@@ -28,26 +30,26 @@ { char[] charset; + private TermAttribute termAtt; + public GreekLowerCaseFilter(TokenStream in, char[] charset) { super(in); this.charset = charset; + termAtt = (TermAttribute) addAttribute(TermAttribute.class); } - public final Token next(final Token reusableToken) throws java.io.IOException - { - assert reusableToken != null; - Token nextToken = input.next(reusableToken); - - if (nextToken == null) - return null; - - char[] chArray = nextToken.termBuffer(); - int chLen = nextToken.termLength(); + public boolean incrementToken() throws IOException { + if (input.incrementToken()) { + char[] chArray = termAtt.termBuffer(); + int chLen = termAtt.termLength(); for (int i = 0; i < chLen; i++) { - chArray[i] = GreekCharsets.toLowerCase(chArray[i], charset); + chArray[i] = GreekCharsets.toLowerCase(chArray[i], charset); } - return nextToken; + return true; + } else { + return false; + } } } Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/ElisionFilter.java =================================================================== --- contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/ElisionFilter.java (revision 797224) +++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/ElisionFilter.java (working copy) @@ -25,6 +25,7 @@ import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; /** * Removes elisions from a token stream. For example, "l'avion" (the plane) will be @@ -38,7 +39,9 @@ private Set articles = null; private static char[] apostrophes = {'\'', '’'}; - + + private TermAttribute termAtt; + public void setArticles(Set articles) { this.articles = new HashSet(); Iterator iter = articles.iterator(); @@ -54,6 +57,7 @@ super(input); this.articles = new HashSet(Arrays.asList(new String[] { "l", "m", "t", "qu", "n", "s", "j" })); + termAtt = (TermAttribute) addAttribute(TermAttribute.class); } /** @@ -62,6 +66,7 @@ public ElisionFilter(TokenStream input, Set articles) { super(input); setArticles(articles); + termAtt = (TermAttribute) addAttribute(TermAttribute.class); } /** @@ -70,39 +75,50 @@ public ElisionFilter(TokenStream input, String[] articles) { super(input); setArticles(new HashSet(Arrays.asList(articles))); + termAtt = (TermAttribute) addAttribute(TermAttribute.class); } /** * Returns the next input Token with term() without elisioned start */ - public Token next(final Token reusableToken) throws IOException { - assert reusableToken != null; - Token nextToken = input.next(reusableToken); - if (nextToken == null) - return null; + public final boolean incrementToken() throws IOException { + if (input.incrementToken()) { + char[] termBuffer = termAtt.termBuffer(); + int termLength = termAtt.termLength(); - char[] termBuffer = nextToken.termBuffer(); - int termLength = nextToken.termLength(); - - int minPoz = Integer.MAX_VALUE; - for (int i = 0; i < apostrophes.length; i++) { - char apos = apostrophes[i]; - // The equivalent of String.indexOf(ch) - for (int poz = 0; poz < termLength ; poz++) { - if (termBuffer[poz] == apos) { + int minPoz = Integer.MAX_VALUE; + for (int i = 0; i < apostrophes.length; i++) { + char apos = apostrophes[i]; + // The equivalent of String.indexOf(ch) + for (int poz = 0; poz < termLength ; poz++) { + if (termBuffer[poz] == apos) { minPoz = Math.min(poz, minPoz); break; + } } } - } - // An apostrophe has been found. If the prefix is an article strip it off. 
- if (minPoz != Integer.MAX_VALUE - && articles.contains(new String(nextToken.termBuffer(), 0, minPoz).toLowerCase())) { - nextToken.setTermBuffer(nextToken.termBuffer(), minPoz + 1, nextToken.termLength() - (minPoz + 1)); + // An apostrophe has been found. If the prefix is an article strip it off. + if (minPoz != Integer.MAX_VALUE + && articles.contains(new String(termAtt.termBuffer(), 0, minPoz).toLowerCase())) { + termAtt.setTermBuffer(termAtt.termBuffer(), minPoz + 1, termAtt.termLength() - (minPoz + 1)); + } + + return true; + } else { + return false; } + } + + /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should + * not be overridden. Delegates to the backwards compatibility layer. */ + public final Token next(final Token reusableToken) throws java.io.IOException { + return super.next(reusableToken); + } - return nextToken; + /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should + * not be overridden. Delegates to the backwards compatibility layer. */ + public final Token next() throws java.io.IOException { + return super.next(); } - } Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchStemFilter.java =================================================================== --- contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchStemFilter.java (revision 797224) +++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchStemFilter.java (working copy) @@ -20,6 +20,7 @@ import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; import java.io.IOException; import java.util.HashSet; @@ -39,10 +40,13 @@ */ private FrenchStemmer stemmer = null; private Set exclusions = null; + + private TermAttribute termAtt; public FrenchStemFilter( TokenStream in ) { super(in); stemmer = new FrenchStemmer(); + termAtt = (TermAttribute) addAttribute(TermAttribute.class); } @@ -52,25 +56,23 @@ } /** - * @return Returns the next token in the stream, or null at EOS + * @return Returns true for the next token in the stream, or false at EOS */ - public final Token next(final Token reusableToken) - throws IOException { - assert reusableToken != null; - Token nextToken = input.next(reusableToken); - if (nextToken == null) - return null; + public boolean incrementToken() throws IOException { + if (input.incrementToken()) { + String term = termAtt.term(); - String term = nextToken.term(); - - // Check the exclusion table - if ( exclusions == null || !exclusions.contains( term ) ) { - String s = stemmer.stem( term ); - // If not stemmed, don't waste the time adjusting the token. - if ((s != null) && !s.equals( term ) ) - nextToken.setTermBuffer(s); - } - return nextToken; + // Check the exclusion table + if ( exclusions == null || !exclusions.contains( term ) ) { + String s = stemmer.stem( term ); + // If not stemmed, don't waste the time adjusting the token. + if ((s != null) && !s.equals( term ) ) + termAtt.setTermBuffer(s); + } + return true; + } else { + return false; + } } /** * Set a alternative/custom FrenchStemmer for this filter. 
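Every file in this patch applies the same mechanical migration: the attributes a stream produces are registered once in the constructor, and next(Token) becomes incrementToken(), which mutates those shared attributes in place and returns a boolean instead of a Token. A minimal self-contained sketch of that pattern, for reference while reading the hunks below (the filter name and its upper-casing behavior are illustrative only, not part of this patch):

import java.io.IOException;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;

/** Illustrative only: upper-cases each term using the attribute-based API. */
public final class UpperCaseExampleFilter extends TokenFilter {
  private TermAttribute termAtt;

  public UpperCaseExampleFilter(TokenStream input) {
    super(input);
    // Register the attribute once; the same instance is shared by every
    // consumer of this stream, so no per-token Token object is needed.
    termAtt = (TermAttribute) addAttribute(TermAttribute.class);
  }

  public boolean incrementToken() throws IOException {
    // Returning false replaces the old "return null" end-of-stream signal.
    if (!input.incrementToken()) {
      return false;
    }
    // Mutate the term in place instead of returning a new Token.
    char[] buffer = termAtt.termBuffer();
    int length = termAtt.termLength();
    for (int i = 0; i < length; i++) {
      buffer[i] = Character.toUpperCase(buffer[i]);
    }
    return true;
  }
}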
Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/EmptyTokenStream.java =================================================================== --- contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/EmptyTokenStream.java (revision 797224) +++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/EmptyTokenStream.java (working copy) @@ -27,8 +27,19 @@ */ public class EmptyTokenStream extends TokenStream { - public Token next(final Token reusableToken) throws IOException { - assert reusableToken != null; - return null; + public final boolean incrementToken() throws IOException { + return false; } + + /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should + * not be overridden. Delegates to the backwards compatibility layer. */ + public final Token next(final Token reusableToken) throws java.io.IOException { + return super.next(reusableToken); + } + + /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should + * not be overridden. Delegates to the backwards compatibility layer. */ + public final Token next() throws java.io.IOException { + return super.next(); + } } Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/nl/DutchStemFilter.java =================================================================== --- contrib/analyzers/common/src/java/org/apache/lucene/analysis/nl/DutchStemFilter.java (revision 797224) +++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/nl/DutchStemFilter.java (working copy) @@ -17,16 +17,16 @@ * limitations under the License. */ -import org.apache.lucene.analysis.Token; -import org.apache.lucene.analysis.TokenFilter; -import org.apache.lucene.analysis.TokenStream; - import java.io.IOException; import java.util.HashMap; import java.util.HashSet; +import java.util.Map; import java.util.Set; -import java.util.Map; +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; + /** * A filter that stems Dutch words. It supports a table of words that should * not be stemmed at all. The stemmer used can be changed at runtime after the @@ -39,10 +39,13 @@ */ private DutchStemmer stemmer = null; private Set exclusions = null; + + private TermAttribute termAtt; public DutchStemFilter(TokenStream _in) { super(_in); stemmer = new DutchStemmer(); + termAtt = (TermAttribute) addAttribute(TermAttribute.class); } /** @@ -62,24 +65,23 @@ } /** - * @return Returns the next token in the stream, or null at EOS + * Returns true for the next token in the stream, or false at EOS */ - public Token next(Token reusableToken) throws IOException { - assert reusableToken != null; - Token nextToken = input.next(reusableToken); - if (nextToken == null) - return null; + public boolean incrementToken() throws IOException { + if (input.incrementToken()) { + String term = termAtt.term(); - String term = nextToken.term(); - - // Check the exclusion table. - if (exclusions == null || !exclusions.contains(term)) { - String s = stemmer.stem(term); - // If not stemmed, don't waste the time adjusting the token. - if ((s != null) && !s.equals(term)) - nextToken.setTermBuffer(s); + // Check the exclusion table. + if (exclusions == null || !exclusions.contains(term)) { + String s = stemmer.stem(term); + // If not stemmed, don't waste the time adjusting the token.
+ if ((s != null) && !s.equals(term)) + termAtt.setTermBuffer(s); + } + return true; + } else { + return false; } - return nextToken; } /** Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/payloads/DelimitedPayloadTokenFilter.java =================================================================== --- contrib/analyzers/common/src/java/org/apache/lucene/analysis/payloads/DelimitedPayloadTokenFilter.java (revision 797224) +++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/payloads/DelimitedPayloadTokenFilter.java (working copy) @@ -16,15 +16,14 @@ * limitations under the License. */ +import java.io.IOException; + import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; import org.apache.lucene.analysis.tokenattributes.TermAttribute; -import java.io.IOException; - /** * Characters before the delimiter are the "token", those after are the payload. *
@@ -37,7 +36,7 @@ * * @see PayloadEncoder */ -public class DelimitedPayloadTokenFilter extends TokenFilter { +public final class DelimitedPayloadTokenFilter extends TokenFilter { public static final char DEFAULT_DELIMITER = '|'; protected char delimiter = DEFAULT_DELIMITER; protected TermAttribute termAtt; @@ -83,27 +82,4 @@ } return result; } - - - public Token next(Token reusableToken) throws IOException { - Token result = input.next(reusableToken); - if (result != null) { - final char[] buffer = result.termBuffer(); - final int length = result.termLength(); - boolean seen = false; - for (int i = 0; i < length; i++) { - if (buffer[i] == delimiter) { - result.setTermBuffer(buffer, 0, i); - result.setPayload(encoder.encode(buffer, i + 1, (length - (i + 1)))); - seen = true; - break;//at this point, we know the whole piece, so we can exit. If we don't see the delimiter, then the termAtt is the same - } - } - if (seen == false) { - //no delimiter - payAtt.setPayload(null); - } - } - return result; - } } Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/payloads/NumericPayloadTokenFilter.java =================================================================== --- contrib/analyzers/common/src/java/org/apache/lucene/analysis/payloads/NumericPayloadTokenFilter.java (revision 797224) +++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/payloads/NumericPayloadTokenFilter.java (working copy) @@ -20,6 +20,8 @@ import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; +import org.apache.lucene.analysis.tokenattributes.TypeAttribute; import org.apache.lucene.index.Payload; import java.io.IOException; @@ -34,19 +36,37 @@ private String typeMatch; private Payload thePayload; + private PayloadAttribute payloadAtt; + private TypeAttribute typeAtt; + public NumericPayloadTokenFilter(TokenStream input, float payload, String typeMatch) { super(input); //Need to encode the payload thePayload = new Payload(PayloadHelper.encodeFloat(payload)); this.typeMatch = typeMatch; + payloadAtt = (PayloadAttribute) addAttribute(PayloadAttribute.class); + typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class); } - public Token next(final Token reusableToken) throws IOException { - assert reusableToken != null; - Token nextToken = input.next(reusableToken); - if (nextToken != null && nextToken.type().equals(typeMatch)){ - nextToken.setPayload(thePayload); + public final boolean incrementToken() throws IOException { + if (input.incrementToken()) { + if (typeAtt.type().equals(typeMatch)) + payloadAtt.setPayload(thePayload); + return true; + } else { + return false; } - return nextToken; } + + /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should + * not be overridden. Delegates to the backwards compatibility layer. */ + public final Token next(final Token reusableToken) throws java.io.IOException { + return super.next(reusableToken); + } + + /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should + * not be overridden. Delegates to the backwards compatibility layer. 
*/ + public final Token next() throws java.io.IOException { + return super.next(); + } } Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/payloads/TokenOffsetPayloadTokenFilter.java =================================================================== --- contrib/analyzers/common/src/java/org/apache/lucene/analysis/payloads/TokenOffsetPayloadTokenFilter.java (revision 797224) +++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/payloads/TokenOffsetPayloadTokenFilter.java (working copy) @@ -17,14 +17,16 @@ */ +import java.io.IOException; + import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; import org.apache.lucene.index.Payload; -import java.io.IOException; - /** * Adds the {@link org.apache.lucene.analysis.Token#setStartOffset(int)} * and {@link org.apache.lucene.analysis.Token#setEndOffset(int)} @@ -32,22 +34,37 @@ * **/ public class TokenOffsetPayloadTokenFilter extends TokenFilter { + protected OffsetAttribute offsetAtt; + protected PayloadAttribute payAtt; - public TokenOffsetPayloadTokenFilter(TokenStream input) { super(input); + offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class); + payAtt = (PayloadAttribute) addAttribute(PayloadAttribute.class); } - public Token next(final Token reusableToken) throws IOException { - assert reusableToken != null; - Token nextToken = input.next(reusableToken); - if (nextToken != null){ + public final boolean incrementToken() throws IOException { + if (input.incrementToken()) { byte[] data = new byte[8]; - PayloadHelper.encodeInt(nextToken.startOffset(), data, 0); - PayloadHelper.encodeInt(nextToken.endOffset(), data, 4); + PayloadHelper.encodeInt(offsetAtt.startOffset(), data, 0); + PayloadHelper.encodeInt(offsetAtt.endOffset(), data, 4); Payload payload = new Payload(data); - nextToken.setPayload(payload); + payAtt.setPayload(payload); + return true; + } else { + return false; } - return nextToken; } + + /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should + * not be overridden. Delegates to the backwards compatibility layer. */ + public final Token next(final Token reusableToken) throws java.io.IOException { + return super.next(reusableToken); + } + + /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should + * not be overridden. Delegates to the backwards compatibility layer. 
*/ + public final Token next() throws java.io.IOException { + return super.next(); + } } \ No newline at end of file Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/payloads/TypeAsPayloadTokenFilter.java =================================================================== --- contrib/analyzers/common/src/java/org/apache/lucene/analysis/payloads/TypeAsPayloadTokenFilter.java (revision 797224) +++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/payloads/TypeAsPayloadTokenFilter.java (working copy) @@ -20,6 +20,8 @@ import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; +import org.apache.lucene.analysis.tokenattributes.TypeAttribute; import org.apache.lucene.index.Payload; import java.io.IOException; @@ -32,19 +34,37 @@ * **/ public class TypeAsPayloadTokenFilter extends TokenFilter { + private PayloadAttribute payloadAtt; + private TypeAttribute typeAtt; public TypeAsPayloadTokenFilter(TokenStream input) { super(input); - + payloadAtt = (PayloadAttribute) addAttribute(PayloadAttribute.class); + typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class); } - public Token next(final Token reusableToken) throws IOException { - assert reusableToken != null; - Token nextToken = input.next(reusableToken); - if (nextToken != null && nextToken.type() != null && nextToken.type().equals("") == false){ - nextToken.setPayload(new Payload(nextToken.type().getBytes("UTF-8"))); + public final boolean incrementToken() throws IOException { + if (input.incrementToken()) { + String type = typeAtt.type(); + if (type != null && type.equals("") == false) { + payloadAtt.setPayload(new Payload(type.getBytes("UTF-8"))); + } + return true; + } else { + return false; } - return nextToken; } + + /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should + * not be overridden. Delegates to the backwards compatibility layer. */ + public final Token next(final Token reusableToken) throws java.io.IOException { + return super.next(reusableToken); + } + + /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should + * not be overridden. Delegates to the backwards compatibility layer. */ + public final Token next() throws java.io.IOException { + return super.next(); + } } \ No newline at end of file Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/position/PositionFilter.java =================================================================== --- contrib/analyzers/common/src/java/org/apache/lucene/analysis/position/PositionFilter.java (revision 797224) +++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/position/PositionFilter.java (working copy) @@ -22,6 +22,7 @@ import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; /** Set the positionIncrement of all tokens to the "positionIncrement", * except the first return token which retains its original positionIncrement value. 
@@ -34,6 +35,8 @@ /** The first token must have non-zero positionIncrement **/ private boolean firstTokenPositioned = false; + + private PositionIncrementAttribute posIncrAtt; /** * Constructs a PositionFilter that assigns a position increment of zero to @@ -43,6 +46,7 @@ */ public PositionFilter(final TokenStream input) { super(input); + posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class); } /** @@ -58,20 +62,31 @@ this.positionIncrement = positionIncrement; } - public Token next(Token reusableToken) throws IOException { - - assert reusableToken != null; - reusableToken = input.next(reusableToken); - if (null != reusableToken) { + public final boolean incrementToken() throws IOException { + if (input.incrementToken()) { if (firstTokenPositioned) { - reusableToken.setPositionIncrement(positionIncrement); + posIncrAtt.setPositionIncrement(positionIncrement); } else { firstTokenPositioned = true; } + return true; + } else { + return false; } - return reusableToken; } + /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should + * not be overridden. Delegates to the backwards compatibility layer. */ + public final Token next(final Token reusableToken) throws java.io.IOException { + return super.next(reusableToken); + } + + /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should + * not be overridden. Delegates to the backwards compatibility layer. */ + public final Token next() throws java.io.IOException { + return super.next(); + } + public void reset() throws IOException { super.reset(); firstTokenPositioned = false; Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/reverse/ReverseStringFilter.java =================================================================== --- contrib/analyzers/common/src/java/org/apache/lucene/analysis/reverse/ReverseStringFilter.java (revision 797224) +++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/reverse/ReverseStringFilter.java (working copy) @@ -19,7 +19,7 @@ import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; import java.io.IOException; @@ -30,16 +30,20 @@ */ public final class ReverseStringFilter extends TokenFilter { + private TermAttribute termAtt; + public ReverseStringFilter(TokenStream in) { super(in); + termAtt = (TermAttribute) addAttribute(TermAttribute.class); } - public final Token next(Token in) throws IOException { - assert in != null; - Token token=input.next(in); - if( token == null ) return null; - reverse( token.termBuffer(), token.termLength() ); - return token; + public boolean incrementToken() throws IOException { + if (input.incrementToken()) { + reverse( termAtt.termBuffer(), termAtt.termLength() ); + return true; + } else { + return false; + } } public static String reverse( final String input ){ Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianLowerCaseFilter.java =================================================================== --- contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianLowerCaseFilter.java (revision 797224) +++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianLowerCaseFilter.java (working copy) @@ -17,9 +17,12 @@ * limitations under the License. 
*/ +import java.io.IOException; + import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; /** * Normalizes token text to lower case, analyzing given ("russian") charset. @@ -31,26 +34,27 @@ { char[] charset; + private TermAttribute termAtt; + public RussianLowerCaseFilter(TokenStream in, char[] charset) { super(in); this.charset = charset; + termAtt = (TermAttribute) addAttribute(TermAttribute.class); } - public final Token next(final Token reusableToken) throws java.io.IOException + public final boolean incrementToken() throws IOException { - assert reusableToken != null; - Token nextToken = input.next(reusableToken); - - if (nextToken == null) - return null; - - char[] chArray = nextToken.termBuffer(); - int chLen = nextToken.termLength(); + if (input.incrementToken()) { + char[] chArray = termAtt.termBuffer(); + int chLen = termAtt.termLength(); for (int i = 0; i < chLen; i++) { - chArray[i] = RussianCharsets.toLowerCase(chArray[i], charset); + chArray[i] = RussianCharsets.toLowerCase(chArray[i], charset); } - return nextToken; + return true; + } else { + return false; + } } } Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianStemFilter.java =================================================================== --- contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianStemFilter.java (revision 797224) +++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianStemFilter.java (working copy) @@ -20,6 +20,8 @@ import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; + import java.io.IOException; /** @@ -37,29 +39,32 @@ */ private RussianStemmer stemmer = null; + private TermAttribute termAtt; + public RussianStemFilter(TokenStream in, char[] charset) { super(in); stemmer = new RussianStemmer(charset); + termAtt = (TermAttribute) addAttribute(TermAttribute.class); } /** - * @return Returns the next token in the stream, or null at EOS + * Returns true for the next token in the stream, or false at EOS */ - public final Token next(final Token reusableToken) throws IOException + public final boolean incrementToken() throws IOException { - assert reusableToken != null; - Token nextToken = input.next(reusableToken); - if (nextToken == null) - return null; - - String term = nextToken.term(); + if (input.incrementToken()) { + String term = termAtt.term(); String s = stemmer.stem(term); if (s != null && !s.equals(term)) - nextToken.setTermBuffer(s); - return nextToken; + termAtt.setTermBuffer(s); + return true; + } else { + return false; + } } + /** * Set a alternative/custom RussianStemmer for this filter.
*/ Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/th/ThaiWordFilter.java =================================================================== --- contrib/analyzers/common/src/java/org/apache/lucene/analysis/th/ThaiWordFilter.java (revision 797224) +++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/th/ThaiWordFilter.java (working copy) @@ -22,6 +22,9 @@ import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; + import java.text.BreakIterator; /** @@ -32,46 +35,62 @@ public class ThaiWordFilter extends TokenFilter { private BreakIterator breaker = null; - private Token thaiToken = null; + private TermAttribute termAtt; + private OffsetAttribute offsetAtt; + + private State thaiState = null; + public ThaiWordFilter(TokenStream input) { super(input); breaker = BreakIterator.getWordInstance(new Locale("th")); + termAtt = (TermAttribute) addAttribute(TermAttribute.class); + offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class); } - public Token next(final Token reusableToken) throws IOException { - assert reusableToken != null; - if (thaiToken != null) { + public final boolean incrementToken() throws IOException { + if (thaiState != null) { int start = breaker.current(); int end = breaker.next(); if (end != BreakIterator.DONE) { - reusableToken.reinit(thaiToken, thaiToken.termBuffer(), start, end - start); - reusableToken.setStartOffset(thaiToken.startOffset()+start); - reusableToken.setEndOffset(thaiToken.startOffset()+end); - return reusableToken; + restoreState(thaiState); + termAtt.setTermBuffer(termAtt.termBuffer(), start, end - start); + offsetAtt.setOffset(offsetAtt.startOffset() + start, offsetAtt.startOffset() + end); + return true; } - thaiToken = null; + thaiState = null; } - Token nextToken = input.next(reusableToken); - if (nextToken == null || nextToken.termLength() == 0) { - return null; - } + if (input.incrementToken() == false || termAtt.termLength() == 0) + return false; - String text = nextToken.term(); + String text = termAtt.term(); if (UnicodeBlock.of(text.charAt(0)) != UnicodeBlock.THAI) { - nextToken.setTermBuffer(text.toLowerCase()); - return nextToken; + termAtt.setTermBuffer(text.toLowerCase()); + return true; } + + thaiState = captureState(); - thaiToken = (Token) nextToken.clone(); breaker.setText(text); int end = breaker.next(); if (end != BreakIterator.DONE) { - nextToken.setTermBuffer(text, 0, end); - nextToken.setEndOffset(nextToken.startOffset() + end); - return nextToken; + termAtt.setTermBuffer(text, 0, end); + offsetAtt.setOffset(offsetAtt.startOffset(), offsetAtt.startOffset() + end); + return true; } - return null; + return false; } + + /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should + * not be overridden. Delegates to the backwards compatibility layer. */ + public final Token next(final Token reusableToken) throws java.io.IOException { + return super.next(reusableToken); + } + + /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should + * not be overridden. Delegates to the backwards compatibility layer. 
*/ + public final Token next() throws java.io.IOException { + return super.next(); + } } Index: contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicNormalizationFilter.java =================================================================== --- contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicNormalizationFilter.java (revision 797224) +++ contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicNormalizationFilter.java (working copy) @@ -17,18 +17,12 @@ * limitations under the License. */ -import java.io.BufferedReader; -import java.io.File; -import java.io.FileInputStream; import java.io.IOException; -import java.io.InputStreamReader; import java.io.StringReader; import junit.framework.TestCase; -import org.apache.lucene.analysis.Token; -import org.apache.lucene.analysis.WhitespaceTokenizer; -import org.apache.lucene.analysis.standard.StandardTokenizer; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; /** * Test the Arabic Normalization Filter @@ -95,11 +89,10 @@ private void check(final String input, final String expected) throws IOException { ArabicLetterTokenizer tokenStream = new ArabicLetterTokenizer(new StringReader(input)); ArabicNormalizationFilter filter = new ArabicNormalizationFilter(tokenStream); - final Token reusableToken = new Token(); - Token nextToken = filter.next(reusableToken); - if (nextToken == null) - fail(); - assertEquals(expected, nextToken.term()); + TermAttribute termAtt = (TermAttribute) filter.getAttribute(TermAttribute.class); + + assertTrue(filter.incrementToken()); + assertEquals(expected, termAtt.term()); filter.close(); } Index: contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicStemFilter.java =================================================================== --- contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicStemFilter.java (revision 797224) +++ contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicStemFilter.java (working copy) @@ -17,17 +17,12 @@ * limitations under the License. 
*/ -import java.io.BufferedReader; -import java.io.File; -import java.io.FileInputStream; import java.io.IOException; -import java.io.InputStreamReader; import java.io.StringReader; import junit.framework.TestCase; -import org.apache.lucene.analysis.Token; -import org.apache.lucene.analysis.standard.StandardTokenizer; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; /** * Test the Arabic Normalization Filter @@ -118,11 +113,10 @@ private void check(final String input, final String expected) throws IOException { ArabicLetterTokenizer tokenStream = new ArabicLetterTokenizer(new StringReader(input)); ArabicStemFilter filter = new ArabicStemFilter(tokenStream); - final Token reusableToken = new Token(); - Token nextToken = filter.next(reusableToken); - if (nextToken == null) - fail(); - assertEquals(expected, nextToken.term()); + TermAttribute termAtt = (TermAttribute) filter.getAttribute(TermAttribute.class); + + assertTrue(filter.incrementToken()); + assertEquals(expected, termAtt.term()); filter.close(); } Index: contrib/analyzers/common/src/test/org/apache/lucene/analysis/br/TestBrazilianStemmer.java =================================================================== --- contrib/analyzers/common/src/test/org/apache/lucene/analysis/br/TestBrazilianStemmer.java (revision 797224) +++ contrib/analyzers/common/src/test/org/apache/lucene/analysis/br/TestBrazilianStemmer.java (working copy) @@ -23,8 +23,8 @@ import junit.framework.TestCase; import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; /** * Test the Brazilian Stem Filter, which only modifies the term text. @@ -122,12 +122,10 @@ private void check(final String input, final String expected) throws IOException { Analyzer analyzer = new BrazilianAnalyzer(); TokenStream stream = analyzer.tokenStream("dummy", new StringReader(input)); - final Token reusableToken = new Token(); - Token nextToken = stream.next(reusableToken); - if (nextToken == null) - fail(); - assertEquals(expected, nextToken.term()); - assertTrue(stream.next(nextToken) == null); + TermAttribute text = (TermAttribute) stream.getAttribute(TermAttribute.class); + assertTrue(stream.incrementToken()); + assertEquals(expected, text.term()); + assertFalse(stream.incrementToken()); stream.close(); } Index: contrib/analyzers/common/src/test/org/apache/lucene/analysis/cjk/TestCJKTokenizer.java =================================================================== --- contrib/analyzers/common/src/test/org/apache/lucene/analysis/cjk/TestCJKTokenizer.java (revision 797224) +++ contrib/analyzers/common/src/test/org/apache/lucene/analysis/cjk/TestCJKTokenizer.java (working copy) @@ -21,50 +21,49 @@ import java.io.StringReader; import junit.framework.TestCase; -import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; +import org.apache.lucene.analysis.tokenattributes.TypeAttribute; + public class TestCJKTokenizer extends TestCase{ + + class TestToken { + String termText; + int start; + int end; + String type; + } - public Token newToken(String termText, int start, int end, int type) { - Token token = new Token(start, end); - token.setTermBuffer(termText); - token.setType(CJKTokenizer.TOKEN_TYPE_NAMES[type]); + public TestToken newToken(String termText, int start, int end, int type) { + TestToken token = new TestToken(); 
+ token.termText = termText; + token.type = CJKTokenizer.TOKEN_TYPE_NAMES[type]; + token.start = start; + token.end = end; return token; } - public void checkCJKToken(final String str, final Token[] out_tokens) throws IOException { + public void checkCJKToken(final String str, final TestToken[] out_tokens) throws IOException { CJKTokenizer tokenizer = new CJKTokenizer(new StringReader(str)); - int i = 0; - System.out.println("string[" + str + "]"); - System.out.print("tokens["); - final Token reusableToken = new Token(); - for (Token token = tokenizer.next(reusableToken) ; - token != null ; - token = tokenizer.next(reusableToken) ) { - if (token.term().equals(out_tokens[i].term()) - && token.startOffset() == out_tokens[i].startOffset() - && token.endOffset() == out_tokens[i].endOffset() - && token.type().equals(out_tokens[i].type()) ) { - System.out.print( token.term() + " "); - } - else { - fail(token.term() + " (start: " + token.startOffset() - + " end: " + token.endOffset() + " type: " + token.type() + ") != " - + out_tokens[i].term() + " (start: " + out_tokens[i].startOffset() - + " end: " + out_tokens[i].endOffset() - + " type: " + out_tokens[i].type() + ")"); - break; - } - ++i; + TermAttribute termAtt = (TermAttribute) tokenizer.getAttribute(TermAttribute.class); + OffsetAttribute offsetAtt = (OffsetAttribute) tokenizer.getAttribute(OffsetAttribute.class); + TypeAttribute typeAtt = (TypeAttribute) tokenizer.getAttribute(TypeAttribute.class); + for (int i = 0; i < out_tokens.length; i++) { + assertTrue(tokenizer.incrementToken()); + assertEquals(termAtt.term(), out_tokens[i].termText); + assertEquals(offsetAtt.startOffset(), out_tokens[i].start); + assertEquals(offsetAtt.endOffset(), out_tokens[i].end); + assertEquals(typeAtt.type(), out_tokens[i].type); } - System.out.println("]" + System.getProperty("line.separator")); + assertFalse(tokenizer.incrementToken()); } public void testJa1() throws IOException { String str = "\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d\u5341"; - Token[] out_tokens = { + TestToken[] out_tokens = { newToken("\u4e00\u4e8c", 0, 2, CJKTokenizer.DOUBLE_TOKEN_TYPE), newToken("\u4e8c\u4e09", 1, 3, CJKTokenizer.DOUBLE_TOKEN_TYPE), newToken("\u4e09\u56db", 2, 4, CJKTokenizer.DOUBLE_TOKEN_TYPE), @@ -81,7 +80,7 @@ public void testJa2() throws IOException { String str = "\u4e00 \u4e8c\u4e09\u56db \u4e94\u516d\u4e03\u516b\u4e5d \u5341"; - Token[] out_tokens = { + TestToken[] out_tokens = { newToken("\u4e00", 0, 1, CJKTokenizer.DOUBLE_TOKEN_TYPE), newToken("\u4e8c\u4e09", 2, 4, CJKTokenizer.DOUBLE_TOKEN_TYPE), newToken("\u4e09\u56db", 3, 5, CJKTokenizer.DOUBLE_TOKEN_TYPE), @@ -97,7 +96,7 @@ public void testC() throws IOException { String str = "abc defgh ijklmn opqrstu vwxy z"; - Token[] out_tokens = { + TestToken[] out_tokens = { newToken("abc", 0, 3, CJKTokenizer.SINGLE_TOKEN_TYPE), newToken("defgh", 4, 9, CJKTokenizer.SINGLE_TOKEN_TYPE), newToken("ijklmn", 10, 16, CJKTokenizer.SINGLE_TOKEN_TYPE), @@ -111,7 +110,7 @@ public void testMix() throws IOException { String str = "\u3042\u3044\u3046\u3048\u304aabc\u304b\u304d\u304f\u3051\u3053"; - Token[] out_tokens = { + TestToken[] out_tokens = { newToken("\u3042\u3044", 0, 2, CJKTokenizer.DOUBLE_TOKEN_TYPE), newToken("\u3044\u3046", 1, 3, CJKTokenizer.DOUBLE_TOKEN_TYPE), newToken("\u3046\u3048", 2, 4, CJKTokenizer.DOUBLE_TOKEN_TYPE), @@ -128,7 +127,7 @@ public void testMix2() throws IOException { String str = "\u3042\u3044\u3046\u3048\u304aab\u3093c\u304b\u304d\u304f\u3051 \u3053"; - Token[] out_tokens = { + 
TestToken[] out_tokens = { newToken("\u3042\u3044", 0, 2, CJKTokenizer.DOUBLE_TOKEN_TYPE), newToken("\u3044\u3046", 1, 3, CJKTokenizer.DOUBLE_TOKEN_TYPE), newToken("\u3046\u3048", 2, 4, CJKTokenizer.DOUBLE_TOKEN_TYPE), @@ -147,7 +146,7 @@ public void testSingleChar() throws IOException { String str = "\u4e00"; - Token[] out_tokens = { + TestToken[] out_tokens = { newToken("\u4e00", 0, 1, CJKTokenizer.DOUBLE_TOKEN_TYPE), }; checkCJKToken(str, out_tokens); Index: contrib/analyzers/common/src/test/org/apache/lucene/analysis/cn/TestChineseTokenizer.java =================================================================== --- contrib/analyzers/common/src/test/org/apache/lucene/analysis/cn/TestChineseTokenizer.java (revision 797224) +++ contrib/analyzers/common/src/test/org/apache/lucene/analysis/cn/TestChineseTokenizer.java (working copy) @@ -22,7 +22,7 @@ import junit.framework.TestCase; -import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; public class TestChineseTokenizer extends TestCase @@ -34,12 +34,12 @@ int correctStartOffset = 0; int correctEndOffset = 1; - final Token reusableToken = new Token(); - for (Token nextToken = tokenizer.next(reusableToken); nextToken != null; nextToken = tokenizer.next(reusableToken)) { - assertEquals(correctStartOffset, nextToken.startOffset()); - assertEquals(correctEndOffset, nextToken.endOffset()); - correctStartOffset++; - correctEndOffset++; + OffsetAttribute offsetAtt = (OffsetAttribute) tokenizer.getAttribute(OffsetAttribute.class); + while (tokenizer.incrementToken()) { + assertEquals(correctStartOffset, offsetAtt.startOffset()); + assertEquals(correctEndOffset, offsetAtt.endOffset()); + correctStartOffset++; + correctEndOffset++; } } } Index: contrib/analyzers/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java =================================================================== --- contrib/analyzers/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java (revision 797224) +++ contrib/analyzers/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java (working copy) @@ -31,16 +31,15 @@ import java.util.zip.ZipEntry; import java.util.zip.ZipInputStream; -import org.apache.lucene.analysis.Token; +import junit.framework.TestCase; + import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.WhitespaceTokenizer; -import org.apache.lucene.analysis.compound.CompoundWordTokenFilterBase; -import org.apache.lucene.analysis.compound.DictionaryCompoundWordTokenFilter; -import org.apache.lucene.analysis.compound.HyphenationCompoundWordTokenFilter; import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; -import junit.framework.TestCase; - public class TestCompoundWordTokenFilter extends TestCase { private static String[] locations = { "http://dfn.dl.sourceforge.net/sourceforge/offo/offo-hyphenation.zip", @@ -155,16 +154,18 @@ private void assertFiltersTo(TokenFilter tf, String[] s, int[] startOffset, int[] endOffset, int[] posIncr) throws Exception { - final Token reusableToken = new Token(); + TermAttribute termAtt = (TermAttribute) tf.getAttribute(TermAttribute.class); + OffsetAttribute offsetAtt = (OffsetAttribute) tf.getAttribute(OffsetAttribute.class); + 
PositionIncrementAttribute posIncAtt = (PositionIncrementAttribute) tf.getAttribute(PositionIncrementAttribute.class); + for (int i = 0; i < s.length; ++i) { - Token nextToken = tf.next(reusableToken); - assertNotNull(nextToken); - assertEquals(s[i], nextToken.term()); - assertEquals(startOffset[i], nextToken.startOffset()); - assertEquals(endOffset[i], nextToken.endOffset()); - assertEquals(posIncr[i], nextToken.getPositionIncrement()); + assertTrue(tf.incrementToken()); + assertEquals(s[i], termAtt.term()); + assertEquals(startOffset[i], offsetAtt.startOffset()); + assertEquals(endOffset[i], offsetAtt.endOffset()); + assertEquals(posIncr[i], posIncAtt.getPositionIncrement()); } - assertNull(tf.next(reusableToken)); + assertFalse(tf.incrementToken()); } private void getHyphenationPatternFileContents() { Index: contrib/analyzers/common/src/test/org/apache/lucene/analysis/cz/TestCzechAnalyzer.java =================================================================== --- contrib/analyzers/common/src/test/org/apache/lucene/analysis/cz/TestCzechAnalyzer.java (revision 797224) +++ contrib/analyzers/common/src/test/org/apache/lucene/analysis/cz/TestCzechAnalyzer.java (working copy) @@ -22,8 +22,8 @@ import junit.framework.TestCase; import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; /** * Test the CzechAnalyzer @@ -39,13 +39,12 @@ private void assertAnalyzesTo(Analyzer a, String input, String[] output) throws Exception { TokenStream ts = a.tokenStream("dummy", new StringReader(input)); - final Token reusableToken = new Token(); + TermAttribute text = (TermAttribute) ts.getAttribute(TermAttribute.class); for (int i=0; i < output.length; i++) { - Token nextToken = ts.next(reusableToken); - assertNotNull(nextToken); - assertEquals(nextToken.term(), output[i]); + assertTrue(ts.incrementToken()); + assertEquals(text.term(), output[i]); } - assertNull(ts.next(reusableToken)); + assertFalse(ts.incrementToken()); ts.close(); } Index: contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SentenceTokenizer.java =================================================================== --- contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SentenceTokenizer.java (revision 797224) +++ contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SentenceTokenizer.java (working copy) -import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; +import org.apache.lucene.analysis.tokenattributes.TypeAttribute; /** * Tokenizes input text into sentences. */ -public class SentenceTokenizer extends Tokenizer { +public final class SentenceTokenizer extends Tokenizer { /** * End of sentence punctuation: 。,!?;,!?; @@ -39,12 +41,19 @@ private final StringBuffer buffer = new StringBuffer(); private int tokenStart = 0, tokenEnd = 0; + + private TermAttribute termAtt; + private OffsetAttribute offsetAtt; + private TypeAttribute typeAtt; public SentenceTokenizer(Reader reader) { super(reader); + termAtt = (TermAttribute) addAttribute(TermAttribute.class); + offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class); + typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class); } - public Token next(final Token reusableToken) throws IOException { + public boolean incrementToken() throws IOException { buffer.setLength(0); int ci; char ch, pch; @@ -83,11 +92,12 @@ } } if (buffer.length() == 0) - return null; + return false; else { - reusableToken.clear(); - reusableToken.reinit(buffer.toString(), input.correctOffset(tokenStart), input.correctOffset(tokenEnd), "sentence"); - return reusableToken; + termAtt.setTermBuffer(buffer.toString()); + offsetAtt.setOffset(input.correctOffset(tokenStart), input.correctOffset(tokenEnd)); + typeAtt.setType("sentence"); + return true; } }
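
Note: the SentenceTokenizer rewrite above publishes each sentence through TermAttribute, OffsetAttribute, and TypeAttribute instead of returning Token objects. A rough sketch of how a consumer sees this, illustrative only and not part of the patch (the demo class name and sample text are invented):

    import java.io.StringReader;
    import org.apache.lucene.analysis.Tokenizer;
    import org.apache.lucene.analysis.cn.smart.SentenceTokenizer;
    import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;

    public class SentenceTokenizerDemo {
      public static void main(String[] args) throws Exception {
        Tokenizer sentences = new SentenceTokenizer(new StringReader("他说的确实在理。"));
        // fetch each attribute once, outside the loop; the same instances are reused per token
        TermAttribute term = (TermAttribute) sentences.getAttribute(TermAttribute.class);
        OffsetAttribute offset = (OffsetAttribute) sentences.getAttribute(OffsetAttribute.class);
        while (sentences.incrementToken()) {
          // one token per sentence, typed "sentence", offsets already run through correctOffset()
          System.out.println(term.term() + " [" + offset.startOffset() + "," + offset.endOffset() + ")");
        }
        sentences.close();
      }
    }
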
Index: contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/WordSegmenter.java =================================================================== --- contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/WordSegmenter.java (revision 797224) +++ contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/WordSegmenter.java (working copy) @@ -20,7 +20,6 @@ import java.util.ArrayList; import java.util.List; -import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.cn.smart.hhmm.HHMMSegmenter; import org.apache.lucene.analysis.cn.smart.hhmm.SegToken; import org.apache.lucene.analysis.cn.smart.hhmm.SegTokenFilter; @@ -37,11 +36,11 @@ /** * Segment a sentence into words with {@link HHMMSegmenter} * - * @param sentenceToken sentence {@link Token} + * @param sentence input sentence + * @param startOffset start offset of sentence * @return {@link List} of {@link SegToken} */ - public List segmentSentence(Token sentenceToken) { - String sentence = sentenceToken.term(); + public List segmentSentence(String sentence, int startOffset) { List segTokenList = hhmmSegmenter.process(sentence); @@ -49,25 +48,25 @@ // tokens from sentence, excluding WordType.SENTENCE_BEGIN and WordType.SENTENCE_END for (int i = 1; i < segTokenList.size() - 1; i++) { - result.add(convertSegToken((SegToken) segTokenList.get(i), sentence, - sentenceToken.startOffset(), "word")); + result.add(convertSegToken((SegToken) segTokenList.get(i), sentence, startOffset)); } return result; } /** - * Convert a {@link SegToken} to a Lucene {@link Token} + * Process a {@link SegToken} so that it is ready for indexing. * + * This method calculates offsets and normalizes the token with {@link SegTokenFilter}. + * * @param st input {@link SegToken} * @param sentence associated Sentence * @param sentenceStartOffset offset into sentence - * @param type token type, default is word - * @return Lucene {@link Token} + * @return the processed {@link SegToken} */ - public Token convertSegToken(SegToken st, String sentence, - int sentenceStartOffset, String type) { - Token result; + public SegToken convertSegToken(SegToken st, String sentence, + int sentenceStartOffset) { + switch (st.wordType) { case WordType.STRING: case WordType.NUMBER: @@ -81,9 +80,8 @@ } st = tokenFilter.filter(st); - - result = new Token(st.charArray, 0, st.charArray.length, st.startOffset - + sentenceStartOffset, st.endOffset + sentenceStartOffset); - return result; + st.startOffset += sentenceStartOffset; + st.endOffset += sentenceStartOffset; + return st; } }
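
Note: as the reworked javadoc says, segmentSentence() now takes the raw sentence text plus its start offset, and convertSegToken() folds that offset into every SegToken it returns. A sketch of the resulting contract, illustrative only and not part of the patch (the demo class, sample sentence, and offset value are invented):

    import java.util.Iterator;
    import java.util.List;
    import org.apache.lucene.analysis.cn.smart.WordSegmenter;
    import org.apache.lucene.analysis.cn.smart.hhmm.SegToken;

    public class WordSegmenterDemo {
      public static void main(String[] args) {
        WordSegmenter segmenter = new WordSegmenter();
        int sentenceStart = 42; // offset of this sentence within the whole document
        List words = segmenter.segmentSentence("他说的确实在理", sentenceStart);
        for (Iterator it = words.iterator(); it.hasNext();) {
          SegToken token = (SegToken) it.next();
          // offsets come back absolute: convertSegToken() already added sentenceStart to each
          System.out.println(new String(token.charArray) + " " + token.startOffset + "-" + token.endOffset);
        }
      }
    }
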
Index: contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/WordTokenFilter.java =================================================================== --- contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/WordTokenFilter.java (revision 797224) +++ contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/WordTokenFilter.java (working copy) @@ -21,20 +21,27 @@ import java.util.Iterator; import java.util.List; -import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.cn.smart.hhmm.SegToken; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; +import org.apache.lucene.analysis.tokenattributes.TypeAttribute; /** * A {@link TokenFilter} that breaks sentences into words. */ -public class WordTokenFilter extends TokenFilter { +public final class WordTokenFilter extends TokenFilter { private WordSegmenter wordSegmenter; private Iterator tokenIter; private List tokenBuffer; + + private TermAttribute termAtt; + private OffsetAttribute offsetAtt; + private TypeAttribute typeAtt; /** * Construct a new WordTokenizer. @@ -44,32 +51,34 @@ public WordTokenFilter(TokenStream in) { super(in); this.wordSegmenter = new WordSegmenter(); + termAtt = (TermAttribute) addAttribute(TermAttribute.class); + offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class); + typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class); } - - public Token next(final Token reusableSentenceToken) throws IOException { - if (tokenIter != null && tokenIter.hasNext()) - return (Token) tokenIter.next(); - else { - Token nextToken = input.next(reusableSentenceToken); - if (processNextSentence(nextToken)) { - return (Token) tokenIter.next(); - } else - return null; - } + + public boolean incrementToken() throws IOException { + if (tokenIter == null || !tokenIter.hasNext()) { + // there are no remaining tokens from the current sentence... are there more sentences? + if (input.incrementToken()) { + // a new sentence is available: process it. + tokenBuffer = wordSegmenter.segmentSentence(termAtt.term(), offsetAtt.startOffset()); + tokenIter = tokenBuffer.iterator(); + /* + * it should not be possible to have a sentence with 0 words, check just in case. + * returning EOS isn't the best either, but it's the behavior of the original code. + */ + if (!tokenIter.hasNext()) + return false; + } else { + return false; // no more sentences, end of stream! + } + } + + // There are remaining tokens from the current sentence, return the next one. + SegToken nextWord = (SegToken) tokenIter.next(); + termAtt.setTermBuffer(nextWord.charArray, 0, nextWord.charArray.length); + offsetAtt.setOffset(nextWord.startOffset, nextWord.endOffset); + typeAtt.setType("word"); + return true; } - - /** - * Process the next input sentence, placing tokens into tokenBuffer - * - * @param reusableSentenceToken input sentence - * @return true if more tokens were placed into tokenBuffer. - * @throws IOException - */ - private boolean processNextSentence(final Token reusableSentenceToken) throws IOException { - if (reusableSentenceToken == null) - return false; - tokenBuffer = wordSegmenter.segmentSentence(reusableSentenceToken); - tokenIter = tokenBuffer.iterator(); - return tokenBuffer != null && tokenIter.hasNext(); - } }
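
Note: the incrementToken() above is the standard one-to-many buffering shape: when the iterator over the current sentence runs dry, pull the next sentence from the input, segment it, then emit one word per call. The same shape reduced to its skeleton, illustrative only and not part of the patch (the hyphen split stands in for real segmentation, and offsets/types are omitted for brevity):

    import java.io.IOException;
    import java.util.Arrays;
    import java.util.Iterator;
    import org.apache.lucene.analysis.TokenFilter;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;

    public final class HyphenPartsFilter extends TokenFilter {
      private final TermAttribute termAtt;
      private Iterator parts; // pending pieces of the current input token, or null

      public HyphenPartsFilter(TokenStream in) {
        super(in);
        termAtt = (TermAttribute) addAttribute(TermAttribute.class);
      }

      public boolean incrementToken() throws IOException {
        if (parts == null || !parts.hasNext()) {
          if (!input.incrementToken())
            return false; // input exhausted: end of stream
          // String.split() always returns at least one element, so next() below is safe
          parts = Arrays.asList(termAtt.term().split("-")).iterator();
        }
        termAtt.setTermBuffer((String) parts.next());
        return true;
      }
    }
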
Index: contrib/analyzers/smartcn/src/test/org/apache/lucene/analysis/cn/TestSmartChineseAnalyzer.java =================================================================== --- contrib/analyzers/smartcn/src/test/org/apache/lucene/analysis/cn/TestSmartChineseAnalyzer.java (revision 797224) +++ contrib/analyzers/smartcn/src/test/org/apache/lucene/analysis/cn/TestSmartChineseAnalyzer.java (working copy) @@ -29,6 +29,9 @@ import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; +import org.apache.lucene.analysis.tokenattributes.TypeAttribute; public class TestSmartChineseAnalyzer extends TestCase { @@ -108,22 +111,23 @@ public void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], String types[]) throws Exception { - TokenStream ts = a.tokenStream("dummy", new StringReader(input)); - final Token reusableToken = new Token(); - for (int i = 0; i < output.length; i++) { - Token nextToken = ts.next(reusableToken); - assertNotNull(nextToken); - assertEquals(nextToken.term(), output[i]); + TokenStream ts = a.tokenStream("dummy", new StringReader(input)); + TermAttribute termAtt = (TermAttribute) ts.getAttribute(TermAttribute.class); + OffsetAttribute offsetAtt = (OffsetAttribute) ts.getAttribute(OffsetAttribute.class); + TypeAttribute typeAtt = (TypeAttribute) ts.getAttribute(TypeAttribute.class); + for (int i = 0; i < output.length; i++) { + assertTrue(ts.incrementToken()); + assertEquals(termAtt.term(), output[i]); if (startOffsets != null) - assertEquals(nextToken.startOffset(), startOffsets[i]); + assertEquals(offsetAtt.startOffset(), startOffsets[i]); if (endOffsets != null) - assertEquals(nextToken.endOffset(), endOffsets[i]); + assertEquals(offsetAtt.endOffset(), endOffsets[i]); if (types != null) - assertEquals(nextToken.type(), types[i]); + assertEquals(typeAtt.type(), types[i]); + } + assertFalse(ts.incrementToken()); + ts.close(); } - assertNull(ts.next(reusableToken)); - ts.close(); -} public void assertAnalyzesTo(Analyzer a, String input, String[] output) throws Exception { assertAnalyzesTo(a, input, output, null, null, null); Index: contrib/collation/src/java/org/apache/lucene/collation/CollationKeyFilter.java =================================================================== --- contrib/collation/src/java/org/apache/lucene/collation/CollationKeyFilter.java (revision 797224) +++ contrib/collation/src/java/org/apache/lucene/collation/CollationKeyFilter.java (working copy) @@ -21,6 +21,7 @@ import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; import org.apache.lucene.util.IndexableBinaryStringTools; import java.io.IOException; @@ -73,8 +74,9 @@ * {@link ICUCollationKeyFilter} on the query side, or vice versa. * </p>
*/ -public class CollationKeyFilter extends TokenFilter { +public final class CollationKeyFilter extends TokenFilter { private Collator collator = null; + private TermAttribute termAtt; /** * @param input Source token stream @@ -83,25 +85,26 @@ public CollationKeyFilter(TokenStream input, Collator collator) { super(input); this.collator = collator; + termAtt = (TermAttribute) addAttribute(TermAttribute.class); } - public final Token next(final Token reusableToken) throws IOException { - assert reusableToken != null; - Token nextToken = input.next(reusableToken); - if (nextToken != null) { - char[] termBuffer = nextToken.termBuffer(); - String termText = new String(termBuffer, 0, nextToken.termLength()); + public boolean incrementToken() throws IOException { + if (input.incrementToken()) { + char[] termBuffer = termAtt.termBuffer(); + String termText = new String(termBuffer, 0, termAtt.termLength()); byte[] collationKey = collator.getCollationKey(termText).toByteArray(); ByteBuffer collationKeyBuf = ByteBuffer.wrap(collationKey); int encodedLength = IndexableBinaryStringTools.getEncodedLength(collationKeyBuf); if (encodedLength > termBuffer.length) { - nextToken.resizeTermBuffer(encodedLength); + termAtt.resizeTermBuffer(encodedLength); } - nextToken.setTermLength(encodedLength); - CharBuffer wrappedTermBuffer = CharBuffer.wrap(nextToken.termBuffer()); + termAtt.setTermLength(encodedLength); + CharBuffer wrappedTermBuffer = CharBuffer.wrap(termAtt.termBuffer()); IndexableBinaryStringTools.encode(collationKeyBuf, wrappedTermBuffer); + return true; + } else { + return false; } - return nextToken; } } Index: contrib/collation/src/java/org/apache/lucene/collation/ICUCollationKeyFilter.java =================================================================== --- contrib/collation/src/java/org/apache/lucene/collation/ICUCollationKeyFilter.java (revision 797224) +++ contrib/collation/src/java/org/apache/lucene/collation/ICUCollationKeyFilter.java (working copy) @@ -24,6 +24,7 @@ import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; import org.apache.lucene.util.IndexableBinaryStringTools; import java.io.IOException; @@ -69,9 +70,10 @@ * java.text.Collator over several languages. *
</p>
*/ -public class ICUCollationKeyFilter extends TokenFilter { +public final class ICUCollationKeyFilter extends TokenFilter { private Collator collator = null; private RawCollationKey reusableKey = new RawCollationKey(); + private TermAttribute termAtt; /** * @@ -81,25 +83,26 @@ public ICUCollationKeyFilter(TokenStream input, Collator collator) { super(input); this.collator = collator; + termAtt = (TermAttribute) addAttribute(TermAttribute.class); } - public final Token next(final Token reusableToken) throws IOException { - assert reusableToken != null; - Token nextToken = input.next(reusableToken); - if (nextToken != null) { - char[] termBuffer = nextToken.termBuffer(); - String termText = new String(termBuffer, 0, nextToken.termLength()); + public boolean incrementToken() throws IOException { + if (input.incrementToken()) { + char[] termBuffer = termAtt.termBuffer(); + String termText = new String(termBuffer, 0, termAtt.termLength()); collator.getRawCollationKey(termText, reusableKey); ByteBuffer collationKeyBuf = ByteBuffer.wrap(reusableKey.bytes, 0, reusableKey.size); int encodedLength = IndexableBinaryStringTools.getEncodedLength(collationKeyBuf); if (encodedLength > termBuffer.length) { - nextToken.resizeTermBuffer(encodedLength); + termAtt.resizeTermBuffer(encodedLength); } - nextToken.setTermLength(encodedLength); - CharBuffer wrappedTermBuffer = CharBuffer.wrap(nextToken.termBuffer()); + termAtt.setTermLength(encodedLength); + CharBuffer wrappedTermBuffer = CharBuffer.wrap(termAtt.termBuffer()); IndexableBinaryStringTools.encode(collationKeyBuf, wrappedTermBuffer); + return true; + } else { + return false; } - return nextToken; } } Index: contrib/snowball/src/java/org/apache/lucene/analysis/snowball/SnowballFilter.java =================================================================== --- contrib/snowball/src/java/org/apache/lucene/analysis/snowball/SnowballFilter.java (revision 797224) +++ contrib/snowball/src/java/org/apache/lucene/analysis/snowball/SnowballFilter.java (working copy) @@ -22,6 +22,7 @@ import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; import org.tartarus.snowball.SnowballProgram; /** @@ -33,9 +34,12 @@ private SnowballProgram stemmer; + private TermAttribute termAtt; + public SnowballFilter(TokenStream input, SnowballProgram stemmer) { super(input); this.stemmer = stemmer; + termAtt = (TermAttribute) addAttribute(TermAttribute.class); } /** @@ -56,21 +60,34 @@ } catch (Exception e) { throw new RuntimeException(e.toString()); } + termAtt = (TermAttribute) addAttribute(TermAttribute.class); } - /** Returns the next input Token, after being stemmed */ + /** Stems the term of the next input token; returns false at end of stream. */ - public final Token next(final Token reusableToken) throws IOException { - assert reusableToken != null; - Token nextToken = input.next(reusableToken); - if (nextToken == null) - return null; - String originalTerm = nextToken.term(); - stemmer.setCurrent(originalTerm); - stemmer.stem(); - String finalTerm = stemmer.getCurrent(); - // Don't bother updating, if it is unchanged. - if (!originalTerm.equals(finalTerm)) - nextToken.setTermBuffer(finalTerm); - return nextToken; + public final boolean incrementToken() throws IOException { + if (input.incrementToken()) { + String originalTerm = termAtt.term(); + stemmer.setCurrent(originalTerm); + stemmer.stem(); + String finalTerm = stemmer.getCurrent(); + // Don't bother updating, if it is unchanged. + if (!originalTerm.equals(finalTerm)) + termAtt.setTermBuffer(finalTerm); + return true; + } else { + return false; + } } + + /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should + * not be overridden. Delegates to the backwards compatibility layer. */ + public final Token next(final Token reusableToken) throws java.io.IOException { + return super.next(reusableToken); + } + + /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should + * not be overridden. Delegates to the backwards compatibility layer. */ + public final Token next() throws java.io.IOException { + return super.next(); + } }
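
Note: SnowballFilter keeps the old next() entry points only as final pass-throughs into TokenStream's backwards-compatibility layer, so pre-2.9 consumers keep working unchanged while new code calls incrementToken(). Both styles side by side, illustrative only and not part of the patch (the demo class and input text are invented):

    import java.io.StringReader;
    import org.apache.lucene.analysis.Token;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.WhitespaceTokenizer;
    import org.apache.lucene.analysis.snowball.SnowballFilter;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;

    public class SnowballFilterDemo {
      public static void main(String[] args) throws Exception {
        // new style: attribute-based iteration
        TokenStream ts = new SnowballFilter(new WhitespaceTokenizer(new StringReader("running runs")), "English");
        TermAttribute term = (TermAttribute) ts.getAttribute(TermAttribute.class);
        while (ts.incrementToken()) {
          System.out.println(term.term()); // prints the stemmed terms
        }

        // old style: still works, routed through the compatibility layer
        TokenStream old = new SnowballFilter(new WhitespaceTokenizer(new StringReader("running runs")), "English");
        Token reusable = new Token();
        for (Token t = old.next(reusable); t != null; t = old.next(reusable)) {
          System.out.println(t.term());
        }
      }
    }
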
Index: contrib/snowball/src/test/org/apache/lucene/analysis/snowball/TestSnowball.java =================================================================== --- contrib/snowball/src/test/org/apache/lucene/analysis/snowball/TestSnowball.java (revision 797224) +++ contrib/snowball/src/test/org/apache/lucene/analysis/snowball/TestSnowball.java (working copy) @@ -22,9 +22,14 @@ import junit.framework.TestCase; import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.Token; import org.apache.lucene.index.Payload; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.FlagsAttribute; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; +import org.apache.lucene.analysis.tokenattributes.TypeAttribute; public class TestSnowball extends TestCase { @@ -32,12 +37,12 @@ String input, String[] output) throws Exception { TokenStream ts = a.tokenStream("dummy", new StringReader(input)); - final Token reusableToken = new Token(); + TermAttribute termAtt = (TermAttribute) ts.getAttribute(TermAttribute.class); for (int i = 0; i < output.length; i++) { - Token nextToken = ts.next(reusableToken); - assertEquals(output[i], nextToken.term()); + assertTrue(ts.incrementToken()); + assertEquals(output[i], termAtt.term()); } - assertNull(ts.next(reusableToken)); + assertFalse(ts.incrementToken()); ts.close(); } @@ -49,33 +54,51 @@ public void testFilterTokens() throws Exception { - final Token tok = new Token(2, 7, "wrd"); - tok.setTermBuffer("accents"); - tok.setPositionIncrement(3); - Payload tokPayload = new Payload(new byte[]{0,1,2,3}); - tok.setPayload(tokPayload); - int tokFlags = 77; - tok.setFlags(tokFlags); + SnowballFilter filter = new SnowballFilter(new TestTokenStream(), "English"); + TermAttribute termAtt = (TermAttribute) filter.getAttribute(TermAttribute.class); + OffsetAttribute offsetAtt = (OffsetAttribute) filter.getAttribute(OffsetAttribute.class); + TypeAttribute typeAtt = (TypeAttribute) filter.getAttribute(TypeAttribute.class); + PayloadAttribute payloadAtt = (PayloadAttribute) filter.getAttribute(PayloadAttribute.class); + PositionIncrementAttribute posIncAtt = (PositionIncrementAttribute) filter.getAttribute(PositionIncrementAttribute.class); + FlagsAttribute flagsAtt = (FlagsAttribute) filter.getAttribute(FlagsAttribute.class); + + assertTrue(filter.incrementToken()); - SnowballFilter filter = new SnowballFilter( - new TokenStream() { - public Token next(final Token reusableToken) { - assert reusableToken != null; - return tok; - } - }, - "English" - ); - - final Token reusableToken = new Token(); - Token nextToken = filter.next(reusableToken); - - assertEquals("accent", 
nextToken.term()); - assertEquals(2, nextToken.startOffset()); - assertEquals(7, nextToken.endOffset()); - assertEquals("wrd", nextToken.type()); - assertEquals(3, nextToken.getPositionIncrement()); - assertEquals(tokFlags, nextToken.getFlags()); - assertEquals(tokPayload, nextToken.getPayload()); + assertEquals("accent", termAtt.term()); + assertEquals(2, offsetAtt.startOffset()); + assertEquals(7, offsetAtt.endOffset()); + assertEquals("wrd", typeAtt.type()); + assertEquals(3, posIncAtt.getPositionIncrement()); + assertEquals(77, flagsAtt.getFlags()); + assertEquals(new Payload(new byte[]{0,1,2,3}), payloadAtt.getPayload()); } + + private final class TestTokenStream extends TokenStream { + private TermAttribute termAtt; + private OffsetAttribute offsetAtt; + private TypeAttribute typeAtt; + private PayloadAttribute payloadAtt; + private PositionIncrementAttribute posIncAtt; + private FlagsAttribute flagsAtt; + + TestTokenStream() { + super(); + termAtt = (TermAttribute) addAttribute(TermAttribute.class); + offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class); + typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class); + payloadAtt = (PayloadAttribute) addAttribute(PayloadAttribute.class); + posIncAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class); + flagsAtt = (FlagsAttribute) addAttribute(FlagsAttribute.class); + } + + public boolean incrementToken() { + termAtt.setTermBuffer("accents"); + offsetAtt.setOffset(2, 7); + typeAtt.setType("wrd"); + posIncAtt.setPositionIncrement(3); + payloadAtt.setPayload(new Payload(new byte[]{0,1,2,3})); + flagsAtt.setFlags(77); + return true; + } + } } \ No newline at end of file
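
Note: with both smartcn classes on the new API, the whole chain wires up like any other 2.9-style pipeline: SentenceTokenizer emits sentences and WordTokenFilter replays them as words. A closing sketch of end-to-end use, illustrative only and not part of the patch (the demo class and sample text are invented):

    import java.io.StringReader;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.cn.smart.SentenceTokenizer;
    import org.apache.lucene.analysis.cn.smart.WordTokenFilter;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;
    import org.apache.lucene.analysis.tokenattributes.TypeAttribute;

    public class SmartChineseChainDemo {
      public static void main(String[] args) throws Exception {
        TokenStream ts = new WordTokenFilter(new SentenceTokenizer(new StringReader("他说的确实在理")));
        TermAttribute term = (TermAttribute) ts.getAttribute(TermAttribute.class);
        TypeAttribute type = (TypeAttribute) ts.getAttribute(TypeAttribute.class);
        while (ts.incrementToken()) {
          System.out.println(term.term() + "/" + type.type()); // every word token is typed "word"
        }
        ts.close();
      }
    }
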