Index: backwards/src/test/org/apache/lucene/analysis/TestAnalyzers.java =================================================================== --- backwards/src/test/org/apache/lucene/analysis/TestAnalyzers.java (revision 932172) +++ backwards/src/test/org/apache/lucene/analysis/TestAnalyzers.java (working copy) @@ -120,6 +120,7 @@ String[] y = StandardTokenizer.TOKEN_TYPES; } + /* StandardAnalyzer was made final in 3.1: private static class MyStandardAnalyzer extends StandardAnalyzer { public MyStandardAnalyzer() { super(org.apache.lucene.util.Version.LUCENE_CURRENT); @@ -139,6 +140,7 @@ assertTrue(ts.incrementToken()); assertFalse(ts.incrementToken()); } + */ } class PayloadSetter extends TokenFilter { Index: src/java/org/apache/lucene/analysis/ASCIIFoldingFilter.java =================================================================== --- src/java/org/apache/lucene/analysis/ASCIIFoldingFilter.java (revision 932172) +++ src/java/org/apache/lucene/analysis/ASCIIFoldingFilter.java (working copy) @@ -19,7 +19,7 @@ import java.io.IOException; -import org.apache.lucene.analysis.tokenattributes.TermAttribute; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.RamUsageEstimator; @@ -61,18 +61,17 @@ public ASCIIFoldingFilter(TokenStream input) { super(input); - termAtt = addAttribute(TermAttribute.class); } private char[] output = new char[512]; private int outputPos; - private TermAttribute termAtt; + private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); @Override public boolean incrementToken() throws IOException { if (input.incrementToken()) { - final char[] buffer = termAtt.termBuffer(); - final int length = termAtt.termLength(); + final char[] buffer = termAtt.buffer(); + final int length = termAtt.length(); // If no characters actually require rewriting then we // just return token as-is: @@ -81,7 +80,7 @@ if (c >= '\u0080') { foldToASCII(buffer, 
length); - termAtt.setTermBuffer(output, 0, outputPos); + termAtt.copyBuffer(output, 0, outputPos); break; } } Index: src/java/org/apache/lucene/analysis/CharTokenizer.java =================================================================== --- src/java/org/apache/lucene/analysis/CharTokenizer.java (revision 932172) +++ src/java/org/apache/lucene/analysis/CharTokenizer.java (working copy) @@ -21,7 +21,7 @@ import java.io.Reader; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; -import org.apache.lucene.analysis.tokenattributes.TermAttribute; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.util.AttributeSource; import org.apache.lucene.util.CharacterUtils; import org.apache.lucene.util.Version; @@ -78,10 +78,7 @@ public CharTokenizer(Version matchVersion, Reader input) { super(input); charUtils = CharacterUtils.getInstance(matchVersion); - offsetAtt = addAttribute(OffsetAttribute.class); - termAtt = addAttribute(TermAttribute.class); useOldAPI = useOldAPI(matchVersion); - ioBuffer = CharacterUtils.newCharacterBuffer(IO_BUFFER_SIZE); } @@ -99,10 +96,7 @@ Reader input) { super(source, input); charUtils = CharacterUtils.getInstance(matchVersion); - offsetAtt = addAttribute(OffsetAttribute.class); - termAtt = addAttribute(TermAttribute.class); useOldAPI = useOldAPI(matchVersion); - ioBuffer = CharacterUtils.newCharacterBuffer(IO_BUFFER_SIZE); } /** @@ -119,10 +113,7 @@ Reader input) { super(factory, input); charUtils = CharacterUtils.getInstance(matchVersion); - offsetAtt = addAttribute(OffsetAttribute.class); - termAtt = addAttribute(TermAttribute.class); useOldAPI = useOldAPI(matchVersion); - ioBuffer = CharacterUtils.newCharacterBuffer(IO_BUFFER_SIZE); } /** @@ -164,11 +155,11 @@ private static final int MAX_WORD_LEN = 255; private static final int IO_BUFFER_SIZE = 4096; - private final TermAttribute termAtt; - private final OffsetAttribute offsetAtt; + private final CharTermAttribute termAtt = 
addAttribute(CharTermAttribute.class); + private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); private final CharacterUtils charUtils; - private final CharacterBuffer ioBuffer; + private final CharacterBuffer ioBuffer = CharacterUtils.newCharacterBuffer(IO_BUFFER_SIZE); /** * @deprecated this will be removed in lucene 4.0 @@ -275,7 +266,7 @@ return incrementTokenOld(); int length = 0; int start = bufferIndex; - char[] buffer = termAtt.termBuffer(); + char[] buffer = termAtt.buffer(); while (true) { if (bufferIndex >= dataLen) { offset += dataLen; @@ -297,7 +288,7 @@ if (length == 0) // start of token start = offset + bufferIndex - 1; else if (length >= buffer.length-1) // check if a supplementary could run out of bounds - buffer = termAtt.resizeTermBuffer(2+length); // make sure a supplementary fits in the buffer + buffer = termAtt.resizeBuffer(2+length); // make sure a supplementary fits in the buffer length += Character.toChars(normalize(c), buffer, length); // buffer it, normalized if (length >= MAX_WORD_LEN) // buffer overflow!
make sure to check for >= surrogate pair could break == test break; @@ -305,7 +296,7 @@ break; // return 'em } - termAtt.setTermLength(length); + termAtt.setLength(length); offsetAtt.setOffset(correctOffset(start), correctOffset(start+length)); return true; @@ -320,7 +311,7 @@ private boolean incrementTokenOld() throws IOException { int length = 0; int start = bufferIndex; - char[] buffer = termAtt.termBuffer(); + char[] buffer = termAtt.buffer(); final char[] oldIoBuffer = ioBuffer.getBuffer(); while (true) { @@ -344,7 +335,7 @@ if (length == 0) // start of token start = offset + bufferIndex - 1; else if (length == buffer.length) - buffer = termAtt.resizeTermBuffer(1+length); + buffer = termAtt.resizeBuffer(1+length); buffer[length++] = normalize(c); // buffer it, normalized @@ -355,7 +346,7 @@ break; // return 'em } - termAtt.setTermLength(length); + termAtt.setLength(length); offsetAtt.setOffset(correctOffset(start), correctOffset(start+length)); return true; } Index: src/java/org/apache/lucene/analysis/ISOLatin1AccentFilter.java =================================================================== --- src/java/org/apache/lucene/analysis/ISOLatin1AccentFilter.java (revision 932172) +++ src/java/org/apache/lucene/analysis/ISOLatin1AccentFilter.java (working copy) @@ -1,7 +1,5 @@ package org.apache.lucene.analysis; -import org.apache.lucene.analysis.tokenattributes.TermAttribute; - /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with @@ -19,6 +17,8 @@ * limitations under the License. */ +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; + /** * A filter that replaces accented characters in the ISO Latin 1 character set * (ISO-8859-1) by their unaccented equivalent. The case will not be altered. 
@@ -35,25 +35,24 @@ public final class ISOLatin1AccentFilter extends TokenFilter { public ISOLatin1AccentFilter(TokenStream input) { super(input); - termAtt = addAttribute(TermAttribute.class); } private char[] output = new char[256]; private int outputPos; - private TermAttribute termAtt; + private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); @Override public final boolean incrementToken() throws java.io.IOException { if (input.incrementToken()) { - final char[] buffer = termAtt.termBuffer(); - final int length = termAtt.termLength(); + final char[] buffer = termAtt.buffer(); + final int length = termAtt.length(); // If no characters actually require rewriting then we // just return token as-is: for(int i=0;i<length;i++) { final char c = buffer[i]; if (c >= '\u00c0' && c <= '\uFB06') { removeAccents(buffer, length); - termAtt.setTermBuffer(output, 0, outputPos); + termAtt.copyBuffer(output, 0, outputPos); break; } } Index: src/java/org/apache/lucene/analysis/KeywordMarkerTokenFilter.java =================================================================== --- src/java/org/apache/lucene/analysis/KeywordMarkerTokenFilter.java (revision 932172) +++ src/java/org/apache/lucene/analysis/KeywordMarkerTokenFilter.java (working copy) @@ -21,7 +21,7 @@ import java.util.Set; import org.apache.lucene.analysis.tokenattributes.KeywordAttribute; -import org.apache.lucene.analysis.tokenattributes.TermAttribute; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.util.Version; /** @@ -33,8 +33,8 @@ */ public final class KeywordMarkerTokenFilter extends TokenFilter { - private final KeywordAttribute keywordAttr; - private final TermAttribute termAtt; + private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class); + private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); private final CharArraySet keywordSet; /** @@ -50,8 +50,6 @@ public KeywordMarkerTokenFilter(final TokenStream in, final CharArraySet keywordSet) {
super(in); - termAtt = addAttribute(TermAttribute.class); - keywordAttr = addAttribute(KeywordAttribute.class); this.keywordSet = keywordSet; } @@ -73,8 +71,8 @@ @Override public final boolean incrementToken() throws IOException { if (input.incrementToken()) { - keywordAttr.setKeyword(keywordSet.contains(termAtt.termBuffer(), 0, - termAtt.termLength())); + keywordAttr.setKeyword(keywordSet.contains(termAtt.buffer(), 0, + termAtt.length())); return true; } else return false; Index: src/java/org/apache/lucene/analysis/KeywordTokenizer.java =================================================================== --- src/java/org/apache/lucene/analysis/KeywordTokenizer.java (revision 932172) +++ src/java/org/apache/lucene/analysis/KeywordTokenizer.java (working copy) @@ -21,7 +21,7 @@ import java.io.Reader; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; -import org.apache.lucene.analysis.tokenattributes.TermAttribute; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.util.AttributeSource; /** @@ -33,8 +33,8 @@ private boolean done; private int finalOffset; - private TermAttribute termAtt; - private OffsetAttribute offsetAtt; + private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); + private OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); public KeywordTokenizer(Reader input) { this(input, DEFAULT_BUFFER_SIZE); @@ -57,9 +57,7 @@ private void init(int bufferSize) { this.done = false; - termAtt = addAttribute(TermAttribute.class); - offsetAtt = addAttribute(OffsetAttribute.class); - termAtt.resizeTermBuffer(bufferSize); + termAtt.resizeBuffer(bufferSize); } @Override @@ -68,15 +66,15 @@ clearAttributes(); done = true; int upto = 0; - char[] buffer = termAtt.termBuffer(); + char[] buffer = termAtt.buffer(); while (true) { final int length = input.read(buffer, upto, buffer.length-upto); if (length == -1) break; upto += length; if (upto == buffer.length) - buffer = 
termAtt.resizeTermBuffer(1+buffer.length); + buffer = termAtt.resizeBuffer(1+buffer.length); } - termAtt.setTermLength(upto); + termAtt.setLength(upto); finalOffset = correctOffset(upto); offsetAtt.setOffset(correctOffset(0), finalOffset); return true; Index: src/java/org/apache/lucene/analysis/LengthFilter.java =================================================================== --- src/java/org/apache/lucene/analysis/LengthFilter.java (revision 932172) +++ src/java/org/apache/lucene/analysis/LengthFilter.java (working copy) @@ -19,17 +19,17 @@ import java.io.IOException; -import org.apache.lucene.analysis.tokenattributes.TermAttribute; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; /** * Removes words that are too long or too short from the stream. */ public final class LengthFilter extends TokenFilter { - final int min; - final int max; + private final int min; + private final int max; - private TermAttribute termAtt; + private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); /** * Build a filter that removes words that are too long or too @@ -40,7 +40,6 @@ super(in); this.min = min; this.max = max; - termAtt = addAttribute(TermAttribute.class); } /** @@ -50,7 +49,7 @@ public final boolean incrementToken() throws IOException { // return the first non-stop word found while (input.incrementToken()) { - int len = termAtt.termLength(); + int len = termAtt.length(); if (len >= min && len <= max) { return true; } Index: src/java/org/apache/lucene/analysis/LowerCaseFilter.java =================================================================== --- src/java/org/apache/lucene/analysis/LowerCaseFilter.java (revision 932172) +++ src/java/org/apache/lucene/analysis/LowerCaseFilter.java (working copy) @@ -19,7 +19,7 @@ import java.io.IOException; -import org.apache.lucene.analysis.tokenattributes.TermAttribute; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.util.CharacterUtils; 
import org.apache.lucene.util.Version; @@ -34,7 +34,8 @@ */ public final class LowerCaseFilter extends TokenFilter { private final CharacterUtils charUtils; - + private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); + /** * Create a new LowerCaseFilter, that normalizes token text to lower case. * @@ -43,7 +44,6 @@ */ public LowerCaseFilter(Version matchVersion, TokenStream in) { super(in); - termAtt = addAttribute(TermAttribute.class); charUtils = CharacterUtils.getInstance(matchVersion); } @@ -55,13 +55,11 @@ this(Version.LUCENE_30, in); } - private TermAttribute termAtt; - @Override public final boolean incrementToken() throws IOException { if (input.incrementToken()) { - final char[] buffer = termAtt.termBuffer(); - final int length = termAtt.termLength(); + final char[] buffer = termAtt.buffer(); + final int length = termAtt.length(); for (int i = 0; i < length;) { i += Character.toChars( Character.toLowerCase( Index: src/java/org/apache/lucene/analysis/PorterStemFilter.java =================================================================== --- src/java/org/apache/lucene/analysis/PorterStemFilter.java (revision 932172) +++ src/java/org/apache/lucene/analysis/PorterStemFilter.java (working copy) @@ -20,7 +20,7 @@ import java.io.IOException; import org.apache.lucene.analysis.tokenattributes.KeywordAttribute; -import org.apache.lucene.analysis.tokenattributes.TermAttribute; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; /** Transforms the token stream as per the Porter stemming algorithm. Note: the input to the stemming filter must already be in lower case, @@ -47,15 +47,12 @@

*/ public final class PorterStemFilter extends TokenFilter { - private final PorterStemmer stemmer; - private final TermAttribute termAtt; - private final KeywordAttribute keywordAttr; + private final PorterStemmer stemmer = new PorterStemmer(); + private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); + private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class); public PorterStemFilter(TokenStream in) { super(in); - stemmer = new PorterStemmer(); - termAtt = addAttribute(TermAttribute.class); - keywordAttr = addAttribute(KeywordAttribute.class); } @Override @@ -63,8 +60,8 @@ if (!input.incrementToken()) return false; - if ((!keywordAttr.isKeyword()) && stemmer.stem(termAtt.termBuffer(), 0, termAtt.termLength())) - termAtt.setTermBuffer(stemmer.getResultBuffer(), 0, stemmer.getResultLength()); + if ((!keywordAttr.isKeyword()) && stemmer.stem(termAtt.buffer(), 0, termAtt.length())) + termAtt.copyBuffer(stemmer.getResultBuffer(), 0, stemmer.getResultLength()); return true; } } Index: src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java =================================================================== --- src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java (revision 932172) +++ src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java (working copy) @@ -42,9 +42,13 @@ * are corrected (see LUCENE-1068) * */ -public class StandardAnalyzer extends Analyzer { - private Set stopSet; +public final class StandardAnalyzer extends StopwordAnalyzerBase { + /** Default maximum allowed token length */ + public static final int DEFAULT_MAX_TOKEN_LENGTH = 255; + + private int maxTokenLength = DEFAULT_MAX_TOKEN_LENGTH; + /** * Specifies whether deprecated acronyms should be replaced with HOST type. * See {@linkplain "https://issues.apache.org/jira/browse/LUCENE-1068"} @@ -54,8 +58,16 @@ /** An unmodifiable set containing some common English words that are usually not useful for searching. 
*/ public static final Set STOP_WORDS_SET = StopAnalyzer.ENGLISH_STOP_WORDS_SET; - private final Version matchVersion; + /** Builds an analyzer with the given stop words. + * @param matchVersion Lucene version to match See {@link + * above} + * @param stopWords stop words */ + public StandardAnalyzer(Version matchVersion, Set stopWords) { + super(matchVersion, stopWords); + replaceInvalidAcronym = matchVersion.onOrAfter(Version.LUCENE_24); + } + /** Builds an analyzer with the default stop words ({@link * #STOP_WORDS_SET}). * @param matchVersion Lucene version to match See {@link @@ -65,16 +77,6 @@ this(matchVersion, STOP_WORDS_SET); } - /** Builds an analyzer with the given stop words. - * @param matchVersion Lucene version to match See {@link - * above} - * @param stopWords stop words */ - public StandardAnalyzer(Version matchVersion, Set stopWords) { - stopSet = stopWords; - replaceInvalidAcronym = matchVersion.onOrAfter(Version.LUCENE_24); - this.matchVersion = matchVersion; - } - /** Builds an analyzer with the stop words from the given file. * @see WordlistLoader#getWordSet(File) * @param matchVersion Lucene version to match See {@link @@ -93,28 +95,6 @@ this(matchVersion, WordlistLoader.getWordSet(stopwords)); } - /** Constructs a {@link StandardTokenizer} filtered by a {@link - StandardFilter}, a {@link LowerCaseFilter} and a {@link StopFilter}. 
*/ - @Override - public TokenStream tokenStream(String fieldName, Reader reader) { - StandardTokenizer tokenStream = new StandardTokenizer(matchVersion, reader); - tokenStream.setMaxTokenLength(maxTokenLength); - TokenStream result = new StandardFilter(tokenStream); - result = new LowerCaseFilter(matchVersion, result); - result = new StopFilter(matchVersion, result, stopSet); - return result; - } - - private static final class SavedStreams { - StandardTokenizer tokenStream; - TokenStream filteredTokenStream; - } - - /** Default maximum allowed token length */ - public static final int DEFAULT_MAX_TOKEN_LENGTH = 255; - - private int maxTokenLength = DEFAULT_MAX_TOKEN_LENGTH; - /** * Set maximum allowed token length. If a token is seen * that exceeds this length then it is discarded. This @@ -133,29 +113,20 @@ } @Override - public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException { - if (overridesTokenStreamMethod) { - // LUCENE-1678: force fallback to tokenStream() if we - // have been subclassed and that subclass overrides - // tokenStream but not reusableTokenStream - return tokenStream(fieldName, reader); - } - SavedStreams streams = (SavedStreams) getPreviousTokenStream(); - if (streams == null) { - streams = new SavedStreams(); - setPreviousTokenStream(streams); - streams.tokenStream = new StandardTokenizer(matchVersion, reader); - streams.filteredTokenStream = new StandardFilter(streams.tokenStream); - streams.filteredTokenStream = new LowerCaseFilter(matchVersion, - streams.filteredTokenStream); - streams.filteredTokenStream = new StopFilter(matchVersion, streams.filteredTokenStream, stopSet); - } else { - streams.tokenStream.reset(reader); - } - streams.tokenStream.setMaxTokenLength(maxTokenLength); - - streams.tokenStream.setReplaceInvalidAcronym(replaceInvalidAcronym); - - return streams.filteredTokenStream; + protected TokenStreamComponents createComponents(final String fieldName, + final Reader reader) { + final 
StandardTokenizer source = new StandardTokenizer(matchVersion, reader); + source.setMaxTokenLength(maxTokenLength); + source.setReplaceInvalidAcronym(replaceInvalidAcronym); + TokenStream tok = new StandardFilter(source); + tok = new LowerCaseFilter(matchVersion, tok); + tok = new StopFilter(matchVersion, tok, stopwords); + return new TokenStreamComponents(source, tok) { + @Override + protected boolean reset(final Reader reader) throws IOException { + source.setMaxTokenLength(maxTokenLength); + return super.reset(reader); + } + }; } } Index: src/java/org/apache/lucene/analysis/standard/StandardFilter.java =================================================================== --- src/java/org/apache/lucene/analysis/standard/StandardFilter.java (revision 932172) +++ src/java/org/apache/lucene/analysis/standard/StandardFilter.java (working copy) @@ -19,27 +19,24 @@ import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.tokenattributes.TermAttribute; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.TypeAttribute; /** Normalizes tokens extracted with {@link StandardTokenizer}. */ public final class StandardFilter extends TokenFilter { - /** Construct filtering in. 
*/ public StandardFilter(TokenStream in) { super(in); - termAtt = addAttribute(TermAttribute.class); - typeAtt = addAttribute(TypeAttribute.class); } private static final String APOSTROPHE_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.APOSTROPHE]; private static final String ACRONYM_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.ACRONYM]; // this filters uses attribute type - private final TypeAttribute typeAtt; - private final TermAttribute termAtt; + private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class); + private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); /** Returns the next token in the stream, or null at EOS. *

Removes 's from the end of words. @@ -51,16 +48,16 @@ return false; } - char[] buffer = termAtt.termBuffer(); - final int bufferLength = termAtt.termLength(); + final char[] buffer = termAtt.buffer(); + final int bufferLength = termAtt.length(); final String type = typeAtt.type(); if (type == APOSTROPHE_TYPE && // remove 's - bufferLength >= 2 && + bufferLength >= 2 && buffer[bufferLength-2] == '\'' && (buffer[bufferLength-1] == 's' || buffer[bufferLength-1] == 'S')) { // Strip last 2 characters off - termAtt.setTermLength(bufferLength - 2); + termAtt.setLength(bufferLength - 2); } else if (type == ACRONYM_TYPE) { // remove dots int upto = 0; for(int i=0;i