Index: src/java/org/apache/lucene/analysis/ASCIIFoldingFilter.java =================================================================== --- src/java/org/apache/lucene/analysis/ASCIIFoldingFilter.java (revision 793966) +++ src/java/org/apache/lucene/analysis/ASCIIFoldingFilter.java (working copy) @@ -1,5 +1,8 @@ package org.apache.lucene.analysis; +import java.io.IOException; + +import org.apache.lucene.analysis.tokenattributes.TermAttribute; import org.apache.lucene.util.ArrayUtil; /** @@ -53,25 +56,22 @@ * accents from Latin1 characters. For example, 'à' will be replaced by * 'a'. */ -public class ASCIIFoldingFilter extends TokenFilter { +public final class ASCIIFoldingFilter extends TokenFilter { public ASCIIFoldingFilter(TokenStream input) { super(input); + termAtt = (TermAttribute) addAttribute(TermAttribute.class); } private char[] output = new char[512]; private int outputPos; + private TermAttribute termAtt; - public Token next(Token result) - throws java.io.IOException - { - result = input.next(result); + public boolean incrementToken() throws IOException { + if (input.incrementToken()) { + final char[] buffer = termAtt.termBuffer(); + final int length = termAtt.termLength(); - if (result != null) - { - final char[] buffer = result.termBuffer(); - final int length = result.termLength(); - // If no characters actually require rewriting then we // just return token as-is: for(int i = 0 ; i < length ; ++i) { @@ -79,13 +79,13 @@ if (c >= '\u0080') { foldToASCII(buffer, length); - result.setTermBuffer(output, 0, outputPos); + termAtt.setTermBuffer(output, 0, outputPos); break; } } - return result; + return true; } else { - return null; + return false; } } Index: src/java/org/apache/lucene/analysis/CachingTokenFilter.java =================================================================== --- src/java/org/apache/lucene/analysis/CachingTokenFilter.java (revision 793966) +++ src/java/org/apache/lucene/analysis/CachingTokenFilter.java (working copy) @@ -22,8 +22,6 @@ import java.util.LinkedList; import java.util.List; -import org.apache.lucene.util.AttributeSource; - /** * This class can be used if the Tokens of a TokenStream * are intended to be consumed more than once. It caches @@ -33,33 +31,16 @@ * {@link TokenStream#reset()}, which repositions the * stream to the first Token. * + * @deprecated Use TODO instead */ public class CachingTokenFilter extends TokenFilter { private List cache; - private Iterator iterator; + private Iterator iterator; public CachingTokenFilter(TokenStream input) { super(input); } - public boolean incrementToken() throws IOException { - if (cache == null) { - // fill cache lazily - cache = new LinkedList(); - fillCache(); - iterator = cache.iterator(); - } - - if (!iterator.hasNext()) { - // the cache is exhausted, return null - return false; - } - // Since the TokenFilter can be reset, the tokens need to be preserved as immutable. 
- AttributeSource state = (AttributeSource) iterator.next(); - state.restoreState(this); - return true; - } - /** @deprecated */ public Token next(final Token reusableToken) throws IOException { assert reusableToken != null; @@ -85,13 +66,6 @@ } } - private void fillCache() throws IOException { - while(input.incrementToken()) { - cache.add(captureState()); - } - } - - /** @deprecated */ private void fillCache(final Token reusableToken) throws IOException { for (Token nextToken = input.next(reusableToken); nextToken != null; nextToken = input.next(reusableToken)) { cache.add(nextToken.clone()); Index: src/java/org/apache/lucene/analysis/CharTokenizer.java =================================================================== --- src/java/org/apache/lucene/analysis/CharTokenizer.java (revision 793966) +++ src/java/org/apache/lucene/analysis/CharTokenizer.java (working copy) @@ -96,47 +96,9 @@ /** @deprecated */ public final Token next(final Token reusableToken) throws IOException { - assert reusableToken != null; - reusableToken.clear(); - int length = 0; - int start = bufferIndex; - char[] buffer = reusableToken.termBuffer(); - while (true) { - - if (bufferIndex >= dataLen) { - offset += dataLen; - dataLen = input.read(ioBuffer); - if (dataLen == -1) { - if (length > 0) - break; - else - return null; - } - bufferIndex = 0; - } - - final char c = ioBuffer[bufferIndex++]; - - if (isTokenChar(c)) { // if it's a token char - - if (length == 0) // start of token - start = offset + bufferIndex - 1; - else if (length == buffer.length) - buffer = reusableToken.resizeTermBuffer(1+length); - - buffer[length++] = normalize(c); // buffer it, normalized - - if (length == MAX_WORD_LEN) // buffer overflow! - break; - - } else if (length > 0) // at non-Letter w/ chars - break; // return 'em - } - - reusableToken.setTermLength(length); - reusableToken.setStartOffset(input.correctOffset(start)); - reusableToken.setEndOffset(input.correctOffset(start+length)); - return reusableToken; + // This override exists only to keep the method final, as it was before; it has no effect on the reflection wrapper in TokenStream. + // It causes TokenStream.hasReusableNext to be true, but that flag is never used, because incrementToken() takes precedence. + return super.next(reusableToken); } public void reset(Reader input) throws IOException { Index: src/java/org/apache/lucene/analysis/ISOLatin1AccentFilter.java =================================================================== --- src/java/org/apache/lucene/analysis/ISOLatin1AccentFilter.java (revision 793966) +++ src/java/org/apache/lucene/analysis/ISOLatin1AccentFilter.java (working copy) @@ -57,28 +57,6 @@ } else return false; } - - /** @deprecated */ - public final Token next(final Token reusableToken) throws java.io.IOException { - assert reusableToken != null; - Token nextToken = input.next(reusableToken); - if (nextToken != null) { - final char[] buffer = nextToken.termBuffer(); - final int length = nextToken.termLength(); - // If no characters actually require rewriting then we - // just return token as-is: - for(int i=0;i<length;i++) { - final char c = buffer[i]; - if (c >= '\u00c0' && c <= '\uFB06') { - removeAccents(buffer, length); - nextToken.setTermBuffer(output, 0, outputPos); - break; - } - } - return nextToken; - } else - return null; - } /** * To replace accented characters in a String by unaccented equivalents.
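The hunks above all apply the same conversion: a filter stops producing Tokens from next(Token), obtains its attribute instances once in the constructor, and rewrites them in place in incrementToken(). A minimal self-contained sketch of that pattern against the 2.9 API shown here (illustrative only, not part of this patch; the UpperCaseFilter name is hypothetical):

    import java.io.IOException;

    import org.apache.lucene.analysis.TokenFilter;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;

    public final class UpperCaseFilter extends TokenFilter {
      private TermAttribute termAtt;

      public UpperCaseFilter(TokenStream input) {
        super(input);
        // one attribute instance per type is created by the stream and reused for every token
        termAtt = (TermAttribute) addAttribute(TermAttribute.class);
      }

      public boolean incrementToken() throws IOException {
        if (!input.incrementToken())
          return false; // end of stream; the deprecated next(Token) returned null here
        final char[] buffer = termAtt.termBuffer();
        final int length = termAtt.termLength();
        for (int i = 0; i < length; i++)
          buffer[i] = Character.toUpperCase(buffer[i]); // edit the term text in place
        return true; // token state now lives in the attributes, not in a Token object
      }
    }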
Index: src/java/org/apache/lucene/analysis/KeywordTokenizer.java =================================================================== --- src/java/org/apache/lucene/analysis/KeywordTokenizer.java (revision 793966) +++ src/java/org/apache/lucene/analysis/KeywordTokenizer.java (working copy) @@ -65,30 +65,6 @@ return false; } - /** @deprecated */ - public Token next(final Token reusableToken) throws IOException { - assert reusableToken != null; - if (!done) { - done = true; - int upto = 0; - reusableToken.clear(); - char[] buffer = reusableToken.termBuffer(); - while (true) { - final int length = input.read(buffer, upto, buffer.length-upto); - if (length == -1) break; - upto += length; - if (upto == buffer.length) - buffer = reusableToken.resizeTermBuffer(1+buffer.length); - } - reusableToken.setTermLength(upto); - reusableToken.setStartOffset(input.correctOffset(0)); - reusableToken.setEndOffset(input.correctOffset(upto)); - - return reusableToken; - } - return null; - } - public void reset(Reader input) throws IOException { super.reset(input); this.done = false; Index: src/java/org/apache/lucene/analysis/LengthFilter.java =================================================================== --- src/java/org/apache/lucene/analysis/LengthFilter.java (revision 793966) +++ src/java/org/apache/lucene/analysis/LengthFilter.java (working copy) @@ -61,24 +61,4 @@ // reached EOS -- return null return false; } - - /** - * Returns the next input Token whose term() is the right len - * @deprecated - */ - public final Token next(final Token reusableToken) throws IOException - { - assert reusableToken != null; - // return the first non-stop word found - for (Token nextToken = input.next(reusableToken); nextToken != null; nextToken = input.next(reusableToken)) - { - int len = nextToken.termLength(); - if (len >= min && len <= max) { - return nextToken; - } - // note: else we ignore it but should we index each part of it? - } - // reached EOS -- return null - return null; - } } Index: src/java/org/apache/lucene/analysis/LowerCaseFilter.java =================================================================== --- src/java/org/apache/lucene/analysis/LowerCaseFilter.java (revision 793966) +++ src/java/org/apache/lucene/analysis/LowerCaseFilter.java (working copy) @@ -46,20 +46,4 @@ } else return false; } - - /** @deprecated */ - public final Token next(final Token reusableToken) throws IOException { - assert reusableToken != null; - Token nextToken = input.next(reusableToken); - if (nextToken != null) { - - final char[] buffer = nextToken.termBuffer(); - final int length = nextToken.termLength(); - for(int i=0;i<length;i++) - buffer[i] = Character.toLowerCase(buffer[i]); - - return nextToken; - } else - return null; - } } Index: src/java/org/apache/lucene/analysis/NumericTokenStream.java =================================================================== --- src/java/org/apache/lucene/analysis/NumericTokenStream.java (revision 793966) +++ src/java/org/apache/lucene/analysis/NumericTokenStream.java (working copy) - // @Override - public Token next(final Token reusableToken) { - if (shift >= valSize) - return null; - - reusableToken.clear(); - - final char[] buffer; - switch (valSize) { - case 64: - buffer = reusableToken.resizeTermBuffer(NumericUtils.BUF_SIZE_LONG); - reusableToken.setTermLength(NumericUtils.longToPrefixCoded(value, shift, buffer)); - break; - - case 32: - buffer = reusableToken.resizeTermBuffer(NumericUtils.BUF_SIZE_INT); - reusableToken.setTermLength(NumericUtils.intToPrefixCoded((int) value, shift, buffer)); - break; - - default: - // should not happen - throw new IllegalArgumentException("valSize must be 32 or 64"); - } - - reusableToken.setType((shift == 0) ? TOKEN_TYPE_FULL_PREC : TOKEN_TYPE_LOWER_PREC); - reusableToken.setPositionIncrement((shift == 0) ?
1 : 0); - shift += precisionStep; - return reusableToken; - } // @Override public String toString() { Index: src/java/org/apache/lucene/analysis/PorterStemFilter.java =================================================================== --- src/java/org/apache/lucene/analysis/PorterStemFilter.java (revision 793966) +++ src/java/org/apache/lucene/analysis/PorterStemFilter.java (working copy) @@ -57,16 +57,4 @@ termAtt.setTermBuffer(stemmer.getResultBuffer(), 0, stemmer.getResultLength()); return true; } - - /** @deprecated */ - public final Token next(final Token reusableToken) throws IOException { - assert reusableToken != null; - Token nextToken = input.next(reusableToken); - if (nextToken == null) - return null; - - if (stemmer.stem(nextToken.termBuffer(), 0, nextToken.termLength())) - nextToken.setTermBuffer(stemmer.getResultBuffer(), 0, stemmer.getResultLength()); - return nextToken; - } } Index: src/java/org/apache/lucene/analysis/SinkTokenizer.java =================================================================== --- src/java/org/apache/lucene/analysis/SinkTokenizer.java (revision 793966) +++ src/java/org/apache/lucene/analysis/SinkTokenizer.java (working copy) @@ -22,19 +22,18 @@ import java.util.Iterator; import java.util.List; -import org.apache.lucene.util.AttributeSource; - /** * A SinkTokenizer can be used to cache Tokens for use in an Analyzer * * @see TeeTokenFilter + * @deprecated Use TODO instead * **/ public class SinkTokenizer extends Tokenizer { protected List/*<Token>*/ lst = new ArrayList/*<Token>*/(); protected Iterator/*<Token>*/ iter; - + public SinkTokenizer(List/*<Token>*/ input) { this.lst = input; if (this.lst == null) this.lst = new ArrayList/*<Token>*/(); @@ -64,29 +63,9 @@ } /** - * Increments this stream to the next token out of the list of cached tokens - * @throws IOException - */ - public boolean incrementToken() throws IOException { - if (iter == null) iter = lst.iterator(); - // Since this TokenStream can be reset we have to maintain the tokens as immutable - if (iter.hasNext()) { - AttributeSource state = (AttributeSource) iter.next(); - state.restoreState(this); - return true; - } - return false; - } - - public void add(AttributeSource source) throws IOException { - lst.add(source); - } - - /** * Returns the next token out of the list of cached tokens * @return The next {@link org.apache.lucene.analysis.Token} in the Sink. * @throws IOException - * @deprecated */ public Token next(final Token reusableToken) throws IOException { assert reusableToken != null; @@ -99,6 +78,8 @@ return null; } + + /** * Override this method to cache only certain tokens, or new tokens based * on the old tokens. Index: src/java/org/apache/lucene/analysis/standard/StandardFilter.java =================================================================== --- src/java/org/apache/lucene/analysis/standard/StandardFilter.java (revision 793966) +++ src/java/org/apache/lucene/analysis/standard/StandardFilter.java (working copy) @@ -73,39 +73,4 @@ return true; } - - /** Returns the next token in the stream, or null at EOS. - *

Removes 's from the end of words. - *

Removes dots from acronyms. - @deprecated - */ - public final Token next(final Token reusableToken) throws java.io.IOException { - assert reusableToken != null; - Token nextToken = input.next(reusableToken); - - if (nextToken == null) - return null; - - char[] buffer = nextToken.termBuffer(); - final int bufferLength = nextToken.termLength(); - final String type = nextToken.type(); - - if (type == APOSTROPHE_TYPE && // remove 's - bufferLength >= 2 && - buffer[bufferLength-2] == '\'' && - (buffer[bufferLength-1] == 's' || buffer[bufferLength-1] == 'S')) { - // Strip last 2 characters off - nextToken.setTermLength(bufferLength - 2); - } else if (type == ACRONYM_TYPE) { // remove dots - int upto = 0; - for(int i=0;i<bufferLength;i++) { - char c = buffer[i]; - if (c != '.') - buffer[upto++] = c; - } - nextToken.setTermLength(upto); - } - - return nextToken; - } } Index: src/java/org/apache/lucene/analysis/Token.java =================================================================== --- src/java/org/apache/lucene/analysis/Token.java (revision 793966) +++ src/java/org/apache/lucene/analysis/Token.java (working copy) A Token is an occurrence of a term from the text of a field. It consists of a term's text, the start and end offset of the term in the text of the field, and a type string. @@ -117,11 +121,13 @@

+ @deprecated Superseded by the new TokenStream API introduced with Lucene 2.9. + See Javadocs in {@link TokenStream} for further details. @see org.apache.lucene.index.Payload - @deprecated A new TokenStream API was introduced with Lucene 2.9. - See javadocs in {@link TokenStream} for further details. */ -public class Token implements Cloneable { +public class Token extends AttributeImpl + implements Cloneable, TermAttribute, TypeAttribute, PositionIncrementAttribute, + FlagsAttribute, OffsetAttribute, PayloadAttribute { public static final String DEFAULT_TYPE = "word"; @@ -134,7 +140,7 @@ /** * Characters for the term text. * @deprecated This will be made private. Instead, use: - * {@link termBuffer()}, + * {@link #termBuffer()}, * {@link #setTermBuffer(char[], int, int)}, * {@link #setTermBuffer(String)}, or * {@link #setTermBuffer(String, int, int)} @@ -144,28 +150,28 @@ /** * Length of term text in the buffer. * @deprecated This will be made private. Instead, use: - * {@link termLength()}, or @{link setTermLength(int)}. + * {@link #termLength()}, or {@link #setTermLength(int)}. */ int termLength; /** * Start in source text. * @deprecated This will be made private. Instead, use: - * {@link startOffset()}, or @{link setStartOffset(int)}. + * {@link #startOffset()}, or {@link #setStartOffset(int)}. */ int startOffset; /** * End in source text. * @deprecated This will be made private. Instead, use: - * {@link endOffset()}, or @{link setEndOffset(int)}. + * {@link #endOffset()}, or {@link #setEndOffset(int)}. */ int endOffset; /** * The lexical type of the token. * @deprecated This will be made private. Instead, use: - * {@link type()}, or @{link setType(String)}. + * {@link #type()}, or {@link #setType(String)}. */ String type = DEFAULT_TYPE; @@ -173,13 +179,13 @@ /** * @deprecated This will be made private. Instead, use: - * {@link getPayload()}, or @{link setPayload(Payload)}. + * {@link #getPayload()}, or {@link #setPayload(Payload)}. */ Payload payload; /** * @deprecated This will be made private. Instead, use: - * {@link getPositionIncrement()}, or @{link setPositionIncrement(String)}. + * {@link #getPositionIncrement()}, or {@link #setPositionIncrement(int)}. */ int positionIncrement = 1; @@ -561,6 +567,13 @@ public void setEndOffset(int offset) { this.endOffset = offset; } + + /** Set the starting and ending offset. + @see #startOffset() and #endOffset()*/ + public void setOffset(int startOffset, int endOffset) { + this.startOffset = startOffset; + this.endOffset = endOffset; + } /** Returns this Token's lexical type. Defaults to "word".
*/ public final String type() { @@ -640,19 +653,15 @@ } public Object clone() { - try { - Token t = (Token)super.clone(); - // Do a deep clone - if (termBuffer != null) { - t.termBuffer = (char[]) termBuffer.clone(); - } - if (payload != null) { - t.setPayload((Payload) payload.clone()); - } - return t; - } catch (CloneNotSupportedException e) { - throw new RuntimeException(e); // shouldn't happen + Token t = (Token)super.clone(); + // Do a deep clone + if (termBuffer != null) { + t.termBuffer = (char[]) termBuffer.clone(); } + if (payload != null) { + t.setPayload((Payload) payload.clone()); + } + return t; } /** Makes a clone, but replaces the term buffer & @@ -862,4 +871,9 @@ type = prototype.type; payload = prototype.payload; } + + public void copyTo(AttributeImpl target) { + Token to = (Token) target; + to.reinit(this); + } } Index: src/java/org/apache/lucene/analysis/tokenattributes/FlagsAttribute.java =================================================================== --- src/java/org/apache/lucene/analysis/tokenattributes/FlagsAttribute.java (revision 793966) +++ src/java/org/apache/lucene/analysis/tokenattributes/FlagsAttribute.java (working copy) @@ -17,8 +17,6 @@ * limitations under the License. */ -import java.io.Serializable; - import org.apache.lucene.util.Attribute; /** @@ -31,9 +29,7 @@ * We will make our best efforts to keep the APIs backwards-compatible. */ -public class FlagsAttribute extends Attribute implements Cloneable, Serializable { - private int flags = 0; - +public interface FlagsAttribute extends Attribute { /** * EXPERIMENTAL: While we think this is here to stay, we may want to change it to be a long. *

@@ -44,43 +40,10 @@ * * @return The bits */ - public int getFlags() { - return flags; - } + public int getFlags(); /** * @see #getFlags() */ - public void setFlags(int flags) { - this.flags = flags; - } - - public void clear() { - flags = 0; - } - - public String toString() { - return "flags=" + flags; - } - - public boolean equals(Object other) { - if (this == other) { - return true; - } - - if (other instanceof FlagsAttribute) { - return ((FlagsAttribute) other).flags == flags; - } - - return false; - } - - public int hashCode() { - return flags; - } - - public void copyTo(Attribute target) { - FlagsAttribute t = (FlagsAttribute) target; - t.setFlags(flags); - } + public void setFlags(int flags); } Index: src/java/org/apache/lucene/analysis/tokenattributes/FlagsAttributeImpl.java =================================================================== --- src/java/org/apache/lucene/analysis/tokenattributes/FlagsAttributeImpl.java (revision 0) +++ src/java/org/apache/lucene/analysis/tokenattributes/FlagsAttributeImpl.java (revision 0) @@ -0,0 +1,82 @@ +package org.apache.lucene.analysis.tokenattributes; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.Serializable; + +import org.apache.lucene.util.AttributeImpl; + +/** + * This attribute can be used to pass different flags down the tokenizer chain, + * e. g. from one TokenFilter to another one. + * + *

+ * WARNING: The status of the new TokenStream, AttributeSource and Attributes is experimental. + * The APIs introduced in these classes with Lucene 2.9 might change in the future. + * We will make our best efforts to keep the APIs backwards-compatible. + + */ +public class FlagsAttributeImpl extends AttributeImpl implements FlagsAttribute, Cloneable, Serializable { + private int flags = 0; + + /** + * EXPERIMENTAL: While we think this is here to stay, we may want to change it to be a long. + *

+ * + * Get the bitset for any bits that have been set. This is completely distinct from {@link TypeAttribute#type()}, although they do share similar purposes. + * The flags can be used to encode information about the token for use by other {@link org.apache.lucene.analysis.TokenFilter}s. + * + * + * @return The bits + */ + public int getFlags() { + return flags; + } + + /** + * @see #getFlags() + */ + public void setFlags(int flags) { + this.flags = flags; + } + + public void clear() { + flags = 0; + } + + public boolean equals(Object other) { + if (this == other) { + return true; + } + + if (other instanceof FlagsAttributeImpl) { + return ((FlagsAttributeImpl) other).flags == flags; + } + + return false; + } + + public int hashCode() { + return flags; + } + + public void copyTo(AttributeImpl target) { + FlagsAttribute t = (FlagsAttribute) target; + t.setFlags(flags); + } +} Index: src/java/org/apache/lucene/analysis/tokenattributes/OffsetAttribute.java =================================================================== --- src/java/org/apache/lucene/analysis/tokenattributes/OffsetAttribute.java (revision 793966) +++ src/java/org/apache/lucene/analysis/tokenattributes/OffsetAttribute.java (working copy) @@ -17,8 +17,6 @@ * limitations under the License. */ -import java.io.Serializable; - import org.apache.lucene.util.Attribute; /** @@ -29,67 +27,23 @@ * The APIs introduced in these classes with Lucene 2.9 might change in the future. * We will make our best efforts to keep the APIs backwards-compatible. */ -public class OffsetAttribute extends Attribute implements Cloneable, Serializable { - private int startOffset; - private int endOffset; - +public interface OffsetAttribute extends Attribute { /** Returns this Token's starting offset, the position of the first character corresponding to this token in the source text. Note that the difference between endOffset() and startOffset() may not be equal to termText.length(), as the term text may have been altered by a stemmer or some other filter. */ - public int startOffset() { - return startOffset; - } + public int startOffset(); /** Set the starting and ending offset. @see #startOffset() and #endOffset()*/ - public void setOffset(int startOffset, int endOffset) { - this.startOffset = startOffset; - this.endOffset = endOffset; - } + public void setOffset(int startOffset, int endOffset); /** Returns this Token's ending offset, one greater than the position of the last character corresponding to this token in the source text. The length of the token in the source text is (endOffset - startOffset). 
*/ - public int endOffset() { - return endOffset; - } - - - public void clear() { - startOffset = 0; - endOffset = 0; - } - - public String toString() { - return "start=" + startOffset + ",end=" + endOffset; - } - - public boolean equals(Object other) { - if (other == this) { - return true; - } - - if (other instanceof OffsetAttribute) { - OffsetAttribute o = (OffsetAttribute) other; - return o.startOffset == startOffset && o.endOffset == endOffset; - } - - return false; - } - - public int hashCode() { - int code = startOffset; - code = code * 31 + endOffset; - return code; - } - - public void copyTo(Attribute target) { - OffsetAttribute t = (OffsetAttribute) target; - t.setOffset(startOffset, endOffset); - } + public int endOffset(); } Index: src/java/org/apache/lucene/analysis/tokenattributes/OffsetAttributeImpl.java =================================================================== --- src/java/org/apache/lucene/analysis/tokenattributes/OffsetAttributeImpl.java (revision 0) +++ src/java/org/apache/lucene/analysis/tokenattributes/OffsetAttributeImpl.java (revision 0) @@ -0,0 +1,91 @@ +package org.apache.lucene.analysis.tokenattributes; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.Serializable; + +import org.apache.lucene.util.AttributeImpl; + +/** + * The start and end character offset of a Token. + * + *

+ * WARNING: The status of the new TokenStream, AttributeSource and Attributes is experimental. + * The APIs introduced in these classes with Lucene 2.9 might change in the future. + * We will make our best efforts to keep the APIs backwards-compatible. + */ +public class OffsetAttributeImpl extends AttributeImpl implements OffsetAttribute, Cloneable, Serializable { + private int startOffset; + private int endOffset; + + /** Returns this Token's starting offset, the position of the first character + corresponding to this token in the source text. + + Note that the difference between endOffset() and startOffset() may not be + equal to termText.length(), as the term text may have been altered by a + stemmer or some other filter. */ + public int startOffset() { + return startOffset; + } + + + /** Set the starting and ending offset. + @see #startOffset() and #endOffset()*/ + public void setOffset(int startOffset, int endOffset) { + this.startOffset = startOffset; + this.endOffset = endOffset; + } + + + /** Returns this Token's ending offset, one greater than the position of the + last character corresponding to this token in the source text. The length + of the token in the source text is (endOffset - startOffset). */ + public int endOffset() { + return endOffset; + } + + + public void clear() { + startOffset = 0; + endOffset = 0; + } + + public boolean equals(Object other) { + if (other == this) { + return true; + } + + if (other instanceof OffsetAttributeImpl) { + OffsetAttributeImpl o = (OffsetAttributeImpl) other; + return o.startOffset == startOffset && o.endOffset == endOffset; + } + + return false; + } + + public int hashCode() { + int code = startOffset; + code = code * 31 + endOffset; + return code; + } + + public void copyTo(AttributeImpl target) { + OffsetAttribute t = (OffsetAttribute) target; + t.setOffset(startOffset, endOffset); + } +} Index: src/java/org/apache/lucene/analysis/tokenattributes/PayloadAttribute.java =================================================================== --- src/java/org/apache/lucene/analysis/tokenattributes/PayloadAttribute.java (revision 793966) +++ src/java/org/apache/lucene/analysis/tokenattributes/PayloadAttribute.java (working copy) @@ -17,8 +17,6 @@ * limitations under the License. */ -import java.io.Serializable; - import org.apache.lucene.index.Payload; import org.apache.lucene.util.Attribute; @@ -30,80 +28,14 @@ * The APIs introduced in these classes with Lucene 2.9 might change in the future. * We will make our best efforts to keep the APIs backwards-compatible. */ -public class PayloadAttribute extends Attribute implements Cloneable, Serializable { - private Payload payload; - +public interface PayloadAttribute extends Attribute { /** - * Initialize this attribute with no payload. - */ - public PayloadAttribute() {} - - /** - * Initialize this attribute with the given payload. - */ - public PayloadAttribute(Payload payload) { - this.payload = payload; - } - - /** * Returns this Token's payload. */ - public Payload getPayload() { - return this.payload; - } + public Payload getPayload(); /** * Sets this Token's payload. 
*/ - public void setPayload(Payload payload) { - this.payload = payload; - } - - public void clear() { - payload = null; - } - - public String toString() { - if (payload == null) { - return "payload=null"; - } - - return "payload=" + payload.toString(); - } - - public Object clone() { - PayloadAttribute clone = (PayloadAttribute) super.clone(); - if (payload != null) { - clone.payload = (Payload) payload.clone(); - } - return clone; - } - - public boolean equals(Object other) { - if (other == this) { - return true; - } - - if (other instanceof PayloadAttribute) { - PayloadAttribute o = (PayloadAttribute) other; - if (o.payload == null || payload == null) { - return o.payload == null && payload == null; - } - - return o.payload.equals(payload); - } - - return false; - } - - public int hashCode() { - return (payload == null) ? 0 : payload.hashCode(); - } - - public void copyTo(Attribute target) { - PayloadAttribute t = (PayloadAttribute) target; - t.setPayload((payload == null) ? null : (Payload) payload.clone()); - } - - + public void setPayload(Payload payload); } Index: src/java/org/apache/lucene/analysis/tokenattributes/PayloadAttributeImpl.java =================================================================== --- src/java/org/apache/lucene/analysis/tokenattributes/PayloadAttributeImpl.java (revision 0) +++ src/java/org/apache/lucene/analysis/tokenattributes/PayloadAttributeImpl.java (revision 0) @@ -0,0 +1,101 @@ +package org.apache.lucene.analysis.tokenattributes; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.Serializable; + +import org.apache.lucene.index.Payload; +import org.apache.lucene.util.AttributeImpl; + +/** + * The payload of a Token. See also {@link Payload}. + * + *

+ * WARNING: The status of the new TokenStream, AttributeSource and Attributes is experimental. + * The APIs introduced in these classes with Lucene 2.9 might change in the future. + * We will make our best efforts to keep the APIs backwards-compatible. + */ +public class PayloadAttributeImpl extends AttributeImpl implements PayloadAttribute, Cloneable, Serializable { + private Payload payload; + + /** + * Initialize this attribute with no payload. + */ + public PayloadAttributeImpl() {} + + /** + * Initialize this attribute with the given payload. + */ + public PayloadAttributeImpl(Payload payload) { + this.payload = payload; + } + + /** + * Returns this Token's payload. + */ + public Payload getPayload() { + return this.payload; + } + + /** + * Sets this Token's payload. + */ + public void setPayload(Payload payload) { + this.payload = payload; + } + + public void clear() { + payload = null; + } + + public Object clone() { + PayloadAttributeImpl clone = (PayloadAttributeImpl) super.clone(); + if (payload != null) { + clone.payload = (Payload) payload.clone(); + } + return clone; + } + + public boolean equals(Object other) { + if (other == this) { + return true; + } + + if (other instanceof PayloadAttributeImpl) { + PayloadAttributeImpl o = (PayloadAttributeImpl) other; + if (o.payload == null || payload == null) { + return o.payload == null && payload == null; + } + + return o.payload.equals(payload); + } + + return false; + } + + public int hashCode() { + return (payload == null) ? 0 : payload.hashCode(); + } + + public void copyTo(AttributeImpl target) { + PayloadAttribute t = (PayloadAttribute) target; + t.setPayload((payload == null) ? null : (Payload) payload.clone()); + } + + +} Index: src/java/org/apache/lucene/analysis/tokenattributes/PositionIncrementAttribute.java =================================================================== --- src/java/org/apache/lucene/analysis/tokenattributes/PositionIncrementAttribute.java (revision 793966) +++ src/java/org/apache/lucene/analysis/tokenattributes/PositionIncrementAttribute.java (working copy) @@ -17,13 +17,10 @@ * limitations under the License. */ -import java.io.Serializable; - -import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.util.Attribute; /** The positionIncrement determines the position of this token - * relative to the previous Token in a {@link TokenStream}, used in phrase + * relative to the previous Token in a TokenStream, used in phrase * searching. * *

The default value is one. @@ -53,54 +50,15 @@ * * @see org.apache.lucene.index.TermPositions */ -public class PositionIncrementAttribute extends Attribute implements Cloneable, Serializable { - private int positionIncrement = 1; - +public interface PositionIncrementAttribute extends Attribute { /** Set the position increment. The default value is one. * * @param positionIncrement the distance from the prior term */ - public void setPositionIncrement(int positionIncrement) { - if (positionIncrement < 0) - throw new IllegalArgumentException - ("Increment must be zero or greater: " + positionIncrement); - this.positionIncrement = positionIncrement; - } + public void setPositionIncrement(int positionIncrement); /** Returns the position increment of this Token. * @see #setPositionIncrement */ - public int getPositionIncrement() { - return positionIncrement; - } - - public void clear() { - this.positionIncrement = 1; - } - - public String toString() { - return "positionIncrement=" + positionIncrement; - } - - public boolean equals(Object other) { - if (other == this) { - return true; - } - - if (other instanceof PositionIncrementAttribute) { - return positionIncrement == ((PositionIncrementAttribute) other).positionIncrement; - } - - return false; - } - - public int hashCode() { - return positionIncrement; - } - - public void copyTo(Attribute target) { - PositionIncrementAttribute t = (PositionIncrementAttribute) target; - t.setPositionIncrement(positionIncrement); - } - + public int getPositionIncrement(); } Index: src/java/org/apache/lucene/analysis/tokenattributes/PositionIncrementAttributeImpl.java =================================================================== --- src/java/org/apache/lucene/analysis/tokenattributes/PositionIncrementAttributeImpl.java (revision 0) +++ src/java/org/apache/lucene/analysis/tokenattributes/PositionIncrementAttributeImpl.java (revision 0) @@ -0,0 +1,102 @@ +package org.apache.lucene.analysis.tokenattributes; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.Serializable; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.util.AttributeImpl; + +/** The positionIncrement determines the position of this token + * relative to the previous Token in a {@link TokenStream}, used in phrase + * searching. + * + *

+ * <p>The default value is one.
+ *
+ * <p>Some common uses for this are:<ul>
+ *
+ * <li>Set it to zero to put multiple terms in the same position. This is + * useful if, e.g., a word has multiple stems. Searches for phrases + * including either stem will match. In this case, all but the first stem's + * increment should be set to zero: the increment of the first instance + * should be one. Repeating a token with an increment of zero can also be + * used to boost the scores of matches on that token.
+ *
+ * <li>Set it to values greater than one to inhibit exact phrase matches. + * If, for example, one does not want phrases to match across removed stop + * words, then one could build a stop word filter that removes stop words and + * also sets the increment to the number of stop words removed before each + * non-stop word. Then exact phrase queries will only match when the terms + * occur with no intervening stop words.
+ *
+ * </ul>

+ * WARNING: The status of the new TokenStream, AttributeSource and Attributes is experimental. + * The APIs introduced in these classes with Lucene 2.9 might change in the future. + * We will make our best efforts to keep the APIs backwards-compatible. + * + * @see org.apache.lucene.index.TermPositions + */ +public class PositionIncrementAttributeImpl extends AttributeImpl implements PositionIncrementAttribute, Cloneable, Serializable { + private int positionIncrement = 1; + + /** Set the position increment. The default value is one. + * + * @param positionIncrement the distance from the prior term + */ + public void setPositionIncrement(int positionIncrement) { + if (positionIncrement < 0) + throw new IllegalArgumentException + ("Increment must be zero or greater: " + positionIncrement); + this.positionIncrement = positionIncrement; + } + + /** Returns the position increment of this Token. + * @see #setPositionIncrement + */ + public int getPositionIncrement() { + return positionIncrement; + } + + public void clear() { + this.positionIncrement = 1; + } + + public boolean equals(Object other) { + if (other == this) { + return true; + } + + if (other instanceof PositionIncrementAttributeImpl) { + return positionIncrement == ((PositionIncrementAttributeImpl) other).positionIncrement; + } + + return false; + } + + public int hashCode() { + return positionIncrement; + } + + public void copyTo(AttributeImpl target) { + PositionIncrementAttribute t = (PositionIncrementAttribute) target; + t.setPositionIncrement(positionIncrement); + } + +} Index: src/java/org/apache/lucene/analysis/tokenattributes/TermAttribute.java =================================================================== --- src/java/org/apache/lucene/analysis/tokenattributes/TermAttribute.java (revision 793966) +++ src/java/org/apache/lucene/analysis/tokenattributes/TermAttribute.java (working copy) @@ -17,9 +17,6 @@ * limitations under the License. */ -import java.io.Serializable; - -import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.Attribute; /** @@ -30,12 +27,7 @@ * The APIs introduced in these classes with Lucene 2.9 might change in the future. * We will make our best efforts to keep the APIs backwards-compatible. */ -public class TermAttribute extends Attribute implements Cloneable, Serializable { - private static int MIN_BUFFER_SIZE = 10; - - private char[] termBuffer; - private int termLength; - +public interface TermAttribute extends Attribute { /** Returns the Token's term text. * * This method has a performance penalty @@ -45,38 +37,20 @@ * String, use this method, which is nothing more than * a convenience call to new String(token.termBuffer(), 0, token.termLength()) */ - public String term() { - initTermBuffer(); - return new String(termBuffer, 0, termLength); - } - + public String term(); + /** Copies the contents of buffer, starting at offset for * length characters, into the termBuffer array. * @param buffer the buffer to copy * @param offset the index in the buffer of the first character to copy * @param length the number of characters to copy */ - public void setTermBuffer(char[] buffer, int offset, int length) { - char[] newCharBuffer = growTermBuffer(length); - if (newCharBuffer != null) { - termBuffer = newCharBuffer; - } - System.arraycopy(buffer, offset, termBuffer, 0, length); - termLength = length; - } + public void setTermBuffer(char[] buffer, int offset, int length); /** Copies the contents of buffer into the termBuffer array. 
* @param buffer the buffer to copy */ - public void setTermBuffer(String buffer) { - int length = buffer.length(); - char[] newCharBuffer = growTermBuffer(length); - if (newCharBuffer != null) { - termBuffer = newCharBuffer; - } - buffer.getChars(0, length, termBuffer, 0); - termLength = length; - } + public void setTermBuffer(String buffer); /** Copies the contents of buffer, starting at offset and continuing * for length characters, into the termBuffer array. @@ -84,17 +58,8 @@ * @param offset the index in the buffer of the first character to copy * @param length the number of characters to copy */ - public void setTermBuffer(String buffer, int offset, int length) { - assert offset <= buffer.length(); - assert offset + length <= buffer.length(); - char[] newCharBuffer = growTermBuffer(length); - if (newCharBuffer != null) { - termBuffer = newCharBuffer; - } - buffer.getChars(offset, offset + length, termBuffer, 0); - termLength = length; - } - + public void setTermBuffer(String buffer, int offset, int length); + /** Returns the internal termBuffer character array which * you can then directly alter. If the array is too * small for your token, use {@link @@ -102,10 +67,7 @@ * altering the buffer be sure to call {@link * #setTermLength} to record the number of valid * characters that were placed into the termBuffer. */ - public char[] termBuffer() { - initTermBuffer(); - return termBuffer; - } + public char[] termBuffer(); /** Grows the termBuffer to at least size newSize, preserving the * existing content. Note: If the next operation is to change @@ -117,63 +79,12 @@ * @param newSize minimum size of the new termBuffer * @return newly created termBuffer with length >= newSize */ - public char[] resizeTermBuffer(int newSize) { - char[] newCharBuffer = growTermBuffer(newSize); - if (termBuffer == null) { - // If there were termText, then preserve it. - // note that if termBuffer is null then newCharBuffer cannot be null - assert newCharBuffer != null; - termBuffer = newCharBuffer; - } else if (newCharBuffer != null) { - // Note: if newCharBuffer != null then termBuffer needs to grow. - // If there were a termBuffer, then preserve it - System.arraycopy(termBuffer, 0, newCharBuffer, 0, termBuffer.length); - termBuffer = newCharBuffer; - } - return termBuffer; - } + public char[] resizeTermBuffer(int newSize); - /** Allocates a buffer char[] of at least newSize - * @param newSize minimum size of the buffer - * @return newly created buffer with length >= newSize or null if the current termBuffer is big enough - */ - private char[] growTermBuffer(int newSize) { - if (termBuffer != null) { - if (termBuffer.length >= newSize) - // Already big enough - return null; - else - // Not big enough; create a new array with slight - // over allocation: - return new char[ArrayUtil.getNextSize(newSize)]; - } else { - - // determine the best size - // The buffer is always at least MIN_BUFFER_SIZE - if (newSize < MIN_BUFFER_SIZE) { - newSize = MIN_BUFFER_SIZE; - } - - return new char[newSize]; - } - } - - // TODO: once we remove the deprecated termText() method - // and switch entirely to char[] termBuffer we don't need - // to use this method anymore - private void initTermBuffer() { - if (termBuffer == null) { - termBuffer = new char[MIN_BUFFER_SIZE]; - termLength = 0; - } - } - /** Return number of valid characters (length of the term) * in the termBuffer array. 
*/ - public int termLength() { - return termLength; - } - + public int termLength(); + /** Set number of valid characters (length of the term) in * the termBuffer array. Use this to truncate the termBuffer * or to synchronize with external manipulation of the termBuffer. @@ -181,61 +92,5 @@ * use {@link #resizeTermBuffer(int)} first. * @param length the truncated length */ - public void setTermLength(int length) { - initTermBuffer(); - if (length > termBuffer.length) - throw new IllegalArgumentException("length " + length + " exceeds the size of the termBuffer (" + termBuffer.length + ")"); - termLength = length; - } - - public int hashCode() { - initTermBuffer(); - int code = termLength; - code = code * 31 + ArrayUtil.hashCode(termBuffer, 0, termLength); - return code; - } - - public void clear() { - termLength = 0; - } - - public Object clone() { - TermAttribute t = (TermAttribute)super.clone(); - // Do a deep clone - if (termBuffer != null) { - t.termBuffer = (char[]) termBuffer.clone(); - } - return t; - } - - public boolean equals(Object other) { - if (other == this) { - return true; - } - - if (other instanceof TermAttribute) { - initTermBuffer(); - TermAttribute o = ((TermAttribute) other); - o.initTermBuffer(); - - for(int i=0;i + * WARNING: The status of the new TokenStream, AttributeSource and Attributes is experimental. + * The APIs introduced in these classes with Lucene 2.9 might change in the future. + * We will make our best efforts to keep the APIs backwards-compatible. + */ +public class TermAttributeImpl extends AttributeImpl implements TermAttribute, Cloneable, Serializable { + private static int MIN_BUFFER_SIZE = 10; + + private char[] termBuffer; + private int termLength; + + /** Returns the Token's term text. + * + * This method has a performance penalty + * because the text is stored internally in a char[]. If + * possible, use {@link #termBuffer()} and {@link + * #termLength()} directly instead. If you really need a + * String, use this method, which is nothing more than + * a convenience call to new String(token.termBuffer(), 0, token.termLength()) + */ + public String term() { + initTermBuffer(); + return new String(termBuffer, 0, termLength); + } + + /** Copies the contents of buffer, starting at offset for + * length characters, into the termBuffer array. + * @param buffer the buffer to copy + * @param offset the index in the buffer of the first character to copy + * @param length the number of characters to copy + */ + public void setTermBuffer(char[] buffer, int offset, int length) { + char[] newCharBuffer = growTermBuffer(length); + if (newCharBuffer != null) { + termBuffer = newCharBuffer; + } + System.arraycopy(buffer, offset, termBuffer, 0, length); + termLength = length; + } + + /** Copies the contents of buffer into the termBuffer array. + * @param buffer the buffer to copy + */ + public void setTermBuffer(String buffer) { + int length = buffer.length(); + char[] newCharBuffer = growTermBuffer(length); + if (newCharBuffer != null) { + termBuffer = newCharBuffer; + } + buffer.getChars(0, length, termBuffer, 0); + termLength = length; + } + + /** Copies the contents of buffer, starting at offset and continuing + * for length characters, into the termBuffer array. 
+ * @param buffer the buffer to copy + * @param offset the index in the buffer of the first character to copy + * @param length the number of characters to copy + */ + public void setTermBuffer(String buffer, int offset, int length) { + assert offset <= buffer.length(); + assert offset + length <= buffer.length(); + char[] newCharBuffer = growTermBuffer(length); + if (newCharBuffer != null) { + termBuffer = newCharBuffer; + } + buffer.getChars(offset, offset + length, termBuffer, 0); + termLength = length; + } + + /** Returns the internal termBuffer character array which + * you can then directly alter. If the array is too + * small for your token, use {@link + * #resizeTermBuffer(int)} to increase it. After + * altering the buffer be sure to call {@link + * #setTermLength} to record the number of valid + * characters that were placed into the termBuffer. */ + public char[] termBuffer() { + initTermBuffer(); + return termBuffer; + } + + /** Grows the termBuffer to at least size newSize, preserving the + * existing content. Note: If the next operation is to change + * the contents of the term buffer use + * {@link #setTermBuffer(char[], int, int)}, + * {@link #setTermBuffer(String)}, or + * {@link #setTermBuffer(String, int, int)} + * to optimally combine the resize with the setting of the termBuffer. + * @param newSize minimum size of the new termBuffer + * @return newly created termBuffer with length >= newSize + */ + public char[] resizeTermBuffer(int newSize) { + char[] newCharBuffer = growTermBuffer(newSize); + if (termBuffer == null) { + // If there were termText, then preserve it. + // note that if termBuffer is null then newCharBuffer cannot be null + assert newCharBuffer != null; + termBuffer = newCharBuffer; + } else if (newCharBuffer != null) { + // Note: if newCharBuffer != null then termBuffer needs to grow. + // If there were a termBuffer, then preserve it + System.arraycopy(termBuffer, 0, newCharBuffer, 0, termBuffer.length); + termBuffer = newCharBuffer; + } + return termBuffer; + } + + /** Allocates a buffer char[] of at least newSize + * @param newSize minimum size of the buffer + * @return newly created buffer with length >= newSize or null if the current termBuffer is big enough + */ + private char[] growTermBuffer(int newSize) { + if (termBuffer != null) { + if (termBuffer.length >= newSize) + // Already big enough + return null; + else + // Not big enough; create a new array with slight + // over allocation: + return new char[ArrayUtil.getNextSize(newSize)]; + } else { + + // determine the best size + // The buffer is always at least MIN_BUFFER_SIZE + if (newSize < MIN_BUFFER_SIZE) { + newSize = MIN_BUFFER_SIZE; + } + + return new char[newSize]; + } + } + + // TODO: once we remove the deprecated termText() method + // and switch entirely to char[] termBuffer we don't need + // to use this method anymore + private void initTermBuffer() { + if (termBuffer == null) { + termBuffer = new char[MIN_BUFFER_SIZE]; + termLength = 0; + } + } + + /** Return number of valid characters (length of the term) + * in the termBuffer array. */ + public int termLength() { + return termLength; + } + + /** Set number of valid characters (length of the term) in + * the termBuffer array. Use this to truncate the termBuffer + * or to synchronize with external manipulation of the termBuffer. + * Note: to grow the size of the array, + * use {@link #resizeTermBuffer(int)} first. 
+ * @param length the truncated length + */ + public void setTermLength(int length) { + initTermBuffer(); + if (length > termBuffer.length) + throw new IllegalArgumentException("length " + length + " exceeds the size of the termBuffer (" + termBuffer.length + ")"); + termLength = length; + } + + public int hashCode() { + initTermBuffer(); + int code = termLength; + code = code * 31 + ArrayUtil.hashCode(termBuffer, 0, termLength); + return code; + } + + public void clear() { + termLength = 0; + } + + public Object clone() { + TermAttributeImpl t = (TermAttributeImpl)super.clone(); + // Do a deep clone + if (termBuffer != null) { + t.termBuffer = (char[]) termBuffer.clone(); + } + return t; + } + + public boolean equals(Object other) { + if (other == this) { + return true; + } + + if (other instanceof TermAttribute) { + initTermBuffer(); + TermAttributeImpl o = ((TermAttributeImpl) other); + o.initTermBuffer(); + + for(int i=0;i + * WARNING: The status of the new TokenStream, AttributeSource and Attributes is experimental. + * The APIs introduced in these classes with Lucene 2.9 might change in the future. + * We will make our best efforts to keep the APIs backwards-compatible. + */ +public class TypeAttributeImpl extends AttributeImpl implements TypeAttribute, Cloneable, Serializable { + private String type; + public static final String DEFAULT_TYPE = "word"; + + public TypeAttributeImpl() { + this(DEFAULT_TYPE); + } + + public TypeAttributeImpl(String type) { + this.type = type; + } + + /** Returns this Token's lexical type. Defaults to "word". */ + public String type() { + return type; + } + + /** Set the lexical type. + @see #type() */ + public void setType(String type) { + this.type = type; + } + + public void clear() { + type = DEFAULT_TYPE; + } + + public boolean equals(Object other) { + if (other == this) { + return true; + } + + if (other instanceof TypeAttributeImpl) { + return type.equals(((TypeAttributeImpl) other).type); + } + + return false; + } + + public int hashCode() { + return type.hashCode(); + } + + public void copyTo(AttributeImpl target) { + TypeAttribute t = (TypeAttribute) target; + t.setType(new String(type)); + } +} Index: src/java/org/apache/lucene/analysis/TokenFilter.java =================================================================== --- src/java/org/apache/lucene/analysis/TokenFilter.java (revision 793966) +++ src/java/org/apache/lucene/analysis/TokenFilter.java (working copy) @@ -42,7 +42,7 @@ super(input); this.input = input; } - + /** Close the input TokenStream. */ public void close() throws IOException { input.close(); @@ -53,17 +53,4 @@ super.reset(); input.reset(); } - - public boolean useNewAPI() { - return input.useNewAPI(); - } - - /** - * Sets whether or not to use the new TokenStream API. Settings this - * will apply to this Filter and all TokenStream/Filters upstream. - */ - public void setUseNewAPI(boolean use) { - input.setUseNewAPI(use); - } - } Index: src/java/org/apache/lucene/analysis/Tokenizer.java =================================================================== --- src/java/org/apache/lucene/analysis/Tokenizer.java (revision 793966) +++ src/java/org/apache/lucene/analysis/Tokenizer.java (working copy) @@ -24,17 +24,10 @@

This is an abstract class.

- NOTE: In order to enable the new API the method - {@link #useNewAPI()} has to be called with useNewAPI=true. - Otherwise the deprecated method {@link #next(Token)} will - be used by Lucene consumers (indexer and queryparser) to - consume the tokens. {@link #next(Token)} will be removed - in Lucene 3.0. -

NOTE: To use the old API subclasses must override {@link #next(Token)}. It's also OK to instead override {@link #next()} but that method is slower compared to {@link #next(Token)}. -

+

NOTE: subclasses overriding {@link #next(Token)} must call {@link Token#clear()}. *

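The TokenStream section below documents the consumer side of the new API: attribute instances are acquired once, and each call to incrementToken() refills those same instances with the next token's state. A minimal sketch of such a consumer loop (illustrative only, not part of this patch; the ConsumeTokens class name is hypothetical):

    import java.io.IOException;
    import java.io.StringReader;

    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.WhitespaceTokenizer;
    import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;

    public class ConsumeTokens {
      public static void main(String[] args) throws IOException {
        TokenStream stream = new WhitespaceTokenizer(new StringReader("the quick brown fox"));
        // acquired once; the same instances are reused for every token
        TermAttribute termAtt = (TermAttribute) stream.addAttribute(TermAttribute.class);
        OffsetAttribute offsetAtt = (OffsetAttribute) stream.addAttribute(OffsetAttribute.class);
        while (stream.incrementToken()) { // returns false at end of stream
          System.out.println(termAtt.term() + " [" + offsetAtt.startOffset() + "-" + offsetAtt.endOffset() + "]");
        }
        stream.close();
      }
    }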
Index: src/java/org/apache/lucene/analysis/TokenStream.java =================================================================== --- src/java/org/apache/lucene/analysis/TokenStream.java (revision 793966) +++ src/java/org/apache/lucene/analysis/TokenStream.java (working copy) @@ -21,8 +21,10 @@ import java.util.Iterator; import org.apache.lucene.index.Payload; -import org.apache.lucene.util.Attribute; +import org.apache.lucene.util.Attribute; // javadocs +import org.apache.lucene.util.AttributeImpl; import org.apache.lucene.util.AttributeSource; +import org.apache.lucene.analysis.tokenattributes.*; /** A TokenStream enumerates the sequence of tokens, either from fields of a document or from query text. @@ -36,13 +38,13 @@ A new TokenStream API is introduced with Lucene 2.9. Since 2.9 Token is deprecated and the preferred way to store - the information of a token is to use {@link Attribute}s. + the information of a token is to use {@link AttributeImpl}s.

For that reason TokenStream extends {@link AttributeSource} - now. Note that only one instance per {@link Attribute} is + now. Note that only one instance per {@link AttributeImpl} is created and reused for every token. This approach reduces object creations and allows local caching of references to - the {@link Attribute}s. See {@link #incrementToken()} for further details. + the {@link AttributeImpl}s. See {@link #incrementToken()} for further details.

The workflow of the new TokenStream API is as follows:

    @@ -61,18 +63,7 @@ Sometimes it is desirable to capture a current state of a TokenStream, e. g. for buffering purposes (see {@link CachingTokenFilter}, {@link TeeTokenFilter}/{@link SinkTokenizer}). For this usecase - {@link AttributeSource#captureState()} and {@link AttributeSource#restoreState(AttributeSource)} can be used. -

    - NOTE: In order to enable the new API the method - {@link #useNewAPI()} has to be called with useNewAPI=true. - Otherwise the deprecated method {@link #next(Token)} will - be used by Lucene consumers (indexer and queryparser) to - consume the tokens. {@link #next(Token)} will be removed - in Lucene 3.0. -
    - NOTE: To use the old API subclasses must override {@link #next(Token)}. - It's also OK to instead override {@link #next()} but that - method is slower compared to {@link #next(Token)}. + {@link AttributeSource#captureState} and {@link AttributeSource#restoreState} can be used. *
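As a sketch of such buffering, a toy filter could emit every input token twice by capturing the current state and replaying it on the next call; the class name is invented, and position increments are left untouched for brevity:

    import java.io.IOException;

    import org.apache.lucene.analysis.TokenFilter;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.util.AttributeSource;

    public final class RepeatTokenFilter extends TokenFilter {
      private AttributeSource.State savedState = null;

      public RepeatTokenFilter(TokenStream input) {
        super(input);
      }

      public boolean incrementToken() throws IOException {
        if (savedState != null) {
          restoreState(savedState);   // replay the snapshot taken below
          savedState = null;
          return true;
        }
        if (!input.incrementToken()) {
          return false;
        }
        savedState = captureState();  // snapshot all attribute values of this token
        return true;
      }
    }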
    * WARNING: The status of the new TokenStream, AttributeSource and Attributes is experimental. * The APIs introduced in these classes with Lucene 2.9 might change in the future. @@ -80,9 +71,16 @@ */ public abstract class TokenStream extends AttributeSource { - private static boolean useNewAPIDefault = false; - private boolean useNewAPI = useNewAPIDefault; + + /** @deprecated */ + private TokenWrapper tokenWrapper; + + /** @deprecated */ + private boolean hasIncrementToken, hasReusableNext, hasNext; + /** @deprecated */ + private static boolean onlyUseNewAPI = false; + protected TokenStream() { super(); } @@ -90,68 +88,102 @@ protected TokenStream(AttributeSource input) { super(input); } - + /** - * Returns whether or not the new TokenStream APIs are used - * by default. - * (see {@link #incrementToken()}, {@link AttributeSource}). + * For extra performance you can globally enable the new {@link #incrementToken} + * API using {@link Attribute}s. There will be a small, but in most cases negligible, performance + * increase by enabling this, but it only works if all TokenStreams and -Filters + * use the new API and implement {@link #incrementToken}. This setting can only be enabled + * globally. + *
    All core analyzers are compatible with this setting; if your own + * TokenStreams/-Filters are also compatible, you can enable it. + *
    When enabled, tokenization may throw {@link UnsupportedOperationException}s + * if the whole tokenizer chain is not compatible. + *
    The default is false, so the fallback to the old API remains available. + * @deprecated This setting will default to true in Lucene 3.0, + * when {@link #incrementToken} is abstract and must always be implemented. */ - public static boolean useNewAPIDefault() { - return useNewAPIDefault; + public static void setOnlyUseNewAPI(boolean onlyUseNewAPI) { + TokenStream.onlyUseNewAPI = onlyUseNewAPI; } - - /** - * Use this API to enable or disable the new TokenStream API. - * by default. Can be overridden by calling {@link #setUseNewAPI(boolean)}. - * (see {@link #incrementToken()}, {@link AttributeSource}). - *
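In application code this switch would be flipped once at startup. A minimal fragment, assuming every TokenStream and TokenFilter in use implements incrementToken():

    // Opt in globally. This must run before any streams are instantiated,
    // because the compatibility check happens in the TokenStream constructor.
    // Incompatible chains surface as UnsupportedOperationException.
    TokenStream.setOnlyUseNewAPI(true);
    assert TokenStream.getOnlyUseNewAPI();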
    - * If set to true, the indexer will call {@link #incrementToken()} - * to consume Tokens from this stream. - *
    - * If set to false, the indexer will call {@link #next(Token)} - * instead. + + /** Returns if only the new API is used. + * @see #setOnlyUseNewAPI */ - public static void setUseNewAPIDefault(boolean use) { - useNewAPIDefault = use; + public static boolean getOnlyUseNewAPI() { + return onlyUseNewAPI; } - /** - * Returns whether or not the new TokenStream APIs are used - * for this stream. - * (see {@link #incrementToken()}, {@link AttributeSource}). - */ - public boolean useNewAPI() { - return useNewAPI; - } + protected void initialize() { + // TODO: remove this when old API is removed + tokenWrapper = null; + + // use reflection to find out, which methods this class has overridden + try { + final Class[] EMPTY_PARAMS=new Class[0]; + hasIncrementToken = this.getClass().getMethod("incrementToken",EMPTY_PARAMS).getDeclaringClass() != TokenStream.class; + hasNext = this.getClass().getMethod("next",EMPTY_PARAMS).getDeclaringClass() != TokenStream.class; + hasReusableNext = this.getClass().getMethod("next",new Class[]{Token.class}).getDeclaringClass() != TokenStream.class; + } catch (NoSuchMethodException e) { + // should not happen + throw new RuntimeException(e); + } + + /*System.out.println(this.getClass()+" supports: incrementToken()="+ + hasIncrementToken+" next()="+hasNext+" next(Token)="+hasReusableNext);*/ + // if no method was overridden, this should fail early to prevent stack overflows + + // a TokenStream subclass must at least implement one of the methods! + if (!(hasIncrementToken || hasNext || hasReusableNext)) + throw new UnsupportedOperationException(getClass().getName()+" does not implement any of incrementToken(), next(Token), next()."); + + if (onlyUseNewAPI) { + + // to only use the new API, at least incrementToken must be implemented! + if (!hasIncrementToken) + throw new UnsupportedOperationException(getClass().getName()+" does not implement incrementToken() which is needed for onlyUseNewAPI."); + + } else { - /** - * Use this API to enable or disable the new TokenStream API - * for this stream. Overrides {@link #setUseNewAPIDefault(boolean)}. - * (see {@link #incrementToken()}, {@link AttributeSource}). - *
    - * If set to true, the indexer will call {@link #incrementToken()} - * to consume Tokens from this stream. - *
    - * If set to false, the indexer will call {@link #next(Token)} - * instead. - *
    - * NOTE: All streams and filters in one chain must use the - * same API. - */ - public void setUseNewAPI(boolean use) { - useNewAPI = use; + // initialize the wrapper instance + addAttributeImpl(new TokenWrapper()); + if ( + // check that the basic attributes are all TokenWrapper instances, + // and no one registered another instance before. + getAttribute(TermAttribute.class) instanceof TokenWrapper && + getAttribute(TypeAttribute.class) instanceof TokenWrapper && + getAttribute(PositionIncrementAttribute.class) instanceof TokenWrapper && + getAttribute(FlagsAttribute.class) instanceof TokenWrapper && + getAttribute(OffsetAttribute.class) instanceof TokenWrapper && + getAttribute(PayloadAttribute.class) instanceof TokenWrapper + ) { + // get any attribute (its always the same, a Token) + tokenWrapper = (TokenWrapper) getAttribute(TermAttribute.class); + } + + } } + + /** @deprecated */ + private void checkTokenWrapper() { + if (tokenWrapper == null) throw new UnsupportedOperationException( + "The basic token attributes are not implemented by the default TokenWrapper instance, "+ + "because either onlyUseNewAPI==false or you have registered own Attribute instances. "+ + "In this case, all TokenStreams and TokenFilters in the chain must implement incrementToken() "+ + "and consumers must only call incrementToken()." + ); + } /** * Consumers (e. g. the indexer) use this method to advance the stream * to the next token. Implementing classes must implement this method - * and update the appropriate {@link Attribute}s with content of the + * and update the appropriate {@link AttributeImpl}s with content of the * next token. *
    * This method is called for every token of a document, so an efficient * implementation is crucial for good performance. To avoid calls to * {@link #addAttribute(Class)} and {@link #getAttribute(Class)} and - * downcasts, references to all {@link Attribute}s that this stream uses + * downcasts, references to all {@link AttributeImpl}s that this stream uses * should be retrieved during instantiation. *
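A sketch of such an implementation: a filter that upper-cases each term in place, with the attribute reference fetched once in the constructor rather than once per token (the class name and behavior are invented for illustration):

    import java.io.IOException;

    import org.apache.lucene.analysis.TokenFilter;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;

    public final class UpperCaseFilter extends TokenFilter {
      private final TermAttribute termAtt;

      public UpperCaseFilter(TokenStream input) {
        super(input);
        termAtt = (TermAttribute) addAttribute(TermAttribute.class);
      }

      public boolean incrementToken() throws IOException {
        if (!input.incrementToken()) {
          return false;
        }
        final char[] buffer = termAtt.termBuffer();
        final int length = termAtt.termLength();
        for (int i = 0; i < length; i++) {
          buffer[i] = Character.toUpperCase(buffer[i]); // rewrite the term in place
        }
        return true;
      }
    }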
    * To make sure that filters and consumers know which attributes are available @@ -164,29 +196,21 @@ * Note that this method will be defined abstract in Lucene 3.0. */ public boolean incrementToken() throws IOException { - // subclasses must implement this method; will be made abstract in Lucene 3.0 - return false; + checkTokenWrapper(); + + final Token token; + if (hasReusableNext) { + token = next(tokenWrapper.delegate); + } else { + assert hasNext; + token = next(); + } + if (token == null) return false; + tokenWrapper.delegate = token; + return true; } /** Returns the next token in the stream, or null at EOS. - * @deprecated The returned Token is a "full private copy" (not - * re-used across calls to next()) but will be slower - * than calling {@link #next(Token)} instead.. */ - public Token next() throws IOException { - final Token reusableToken = new Token(); - Token nextToken = next(reusableToken); - - if (nextToken != null) { - Payload p = nextToken.getPayload(); - if (p != null) { - nextToken.setPayload((Payload) p.clone()); - } - } - - return nextToken; - } - - /** Returns the next token in the stream, or null at EOS. * When possible, the input Token should be used as the * returned Token (this gives fastest tokenization * performance), but this is not required and a new Token @@ -215,14 +239,42 @@ * good idea to assert that it is not null.) * @return next token in the stream or null if end-of-stream was hit * @deprecated The new {@link #incrementToken()} and {@link AttributeSource} - * APIs should be used instead. See also {@link #useNewAPI()}. + * APIs should be used instead. */ public Token next(final Token reusableToken) throws IOException { - // We don't actually use inputToken, but still add this assert assert reusableToken != null; - return next(); + checkTokenWrapper(); + + if (hasIncrementToken) { + tokenWrapper.delegate = reusableToken; + return incrementToken() ? tokenWrapper.delegate : null; + } else { + assert hasNext; + final Token token = next(); + if (token == null) return null; + tokenWrapper.delegate = token; + return token; + } } + /** Returns the next token in the stream, or null at EOS. + * @deprecated The returned Token is a "full private copy" (not + * re-used across calls to next()) but will be slower + * than calling {@link #next(Token)} instead. */ + public Token next() throws IOException { + checkTokenWrapper(); + + if (hasIncrementToken) { + return incrementToken() ? ((Token) tokenWrapper.delegate.clone()) : null; + } else { + assert hasReusableNext; + final Token token = next(tokenWrapper.delegate); + if (token == null) return null; + tokenWrapper.delegate = token; + return (Token) token.clone(); + } + } + /** Resets this stream to the beginning. This is an * optional operation, so subclasses may or may not * implement this method. Reset() is not needed for @@ -240,24 +292,4 @@ /** Releases resources associated with this stream. 
*/ public void close() throws IOException {} - public String toString() { - StringBuffer sb = new StringBuffer(); - sb.append('('); - - if (hasAttributes()) { - // TODO Java 1.5 - //Iterator it = attributes.values().iterator(); - Iterator it = getAttributesIterator(); - if (it.hasNext()) { - sb.append(it.next().toString()); - } - while (it.hasNext()) { - sb.append(','); - sb.append(it.next().toString()); - } - } - sb.append(')'); - return sb.toString(); - } - } Index: src/java/org/apache/lucene/analysis/TokenWrapper.java =================================================================== --- src/java/org/apache/lucene/analysis/TokenWrapper.java (revision 0) +++ src/java/org/apache/lucene/analysis/TokenWrapper.java (revision 0) @@ -0,0 +1,163 @@ +package org.apache.lucene.analysis; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.FlagsAttribute; +import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; +import org.apache.lucene.analysis.tokenattributes.TypeAttribute; +import org.apache.lucene.index.Payload; +import org.apache.lucene.util.AttributeImpl; + +/** + * This class wraps a Token and supplies a single attribute instance + * where the delegate token can be replaced. + * @deprecated Will be removed, when old TokenStream API is removed. 
+ */ +final class TokenWrapper extends AttributeImpl + implements Cloneable, TermAttribute, TypeAttribute, PositionIncrementAttribute, + FlagsAttribute, OffsetAttribute, PayloadAttribute { + + Token delegate; + + TokenWrapper() { + this(new Token()); + } + + TokenWrapper(Token delegate) { + this.delegate = delegate; + } + + // TermAttribute: + + public String term() { + return delegate.term(); + } + + public void setTermBuffer(char[] buffer, int offset, int length) { + delegate.setTermBuffer(buffer, offset, length); + } + + public void setTermBuffer(String buffer) { + delegate.setTermBuffer(buffer); + } + + public void setTermBuffer(String buffer, int offset, int length) { + delegate.setTermBuffer(buffer, offset, length); + } + + public char[] termBuffer() { + return delegate.termBuffer(); + } + + public char[] resizeTermBuffer(int newSize) { + return delegate.resizeTermBuffer(newSize); + } + + public int termLength() { + return delegate.termLength(); + } + + public void setTermLength(int length) { + delegate.setTermLength(length); + } + + // TypeAttribute: + + public String type() { + return delegate.type(); + } + + public void setType(String type) { + delegate.setType(type); + } + + public void setPositionIncrement(int positionIncrement) { + delegate.setPositionIncrement(positionIncrement); + } + + public int getPositionIncrement() { + return delegate.getPositionIncrement(); + } + + // FlagsAttribute + + public int getFlags() { + return delegate.getFlags(); + } + + public void setFlags(int flags) { + delegate.setFlags(flags); + } + + // OffsetAttribute + + public int startOffset() { + return delegate.startOffset(); + } + + public void setOffset(int startOffset, int endOffset) { + delegate.setOffset(startOffset, endOffset); + } + + public int endOffset() { + return delegate.endOffset(); + } + + // PayloadAttribute + public Payload getPayload() { + return delegate.getPayload(); + } + + public void setPayload(Payload payload) { + delegate.setPayload(payload); + } + + // TokenAttribute + + public void clear() { + delegate.clear(); + } + + // AttributeImpl + + public String toString() { + return delegate.toString(); + } + + public int hashCode() { + return delegate.hashCode(); + } + + public boolean equals(Object other) { + if (other instanceof TokenWrapper) { + return ((TokenWrapper) other).delegate.equals(this.delegate); + } + return false; + } + + public Object clone() { + return new TokenWrapper((Token) delegate.clone()); + } + + public void copyTo(AttributeImpl target) { + ((TokenWrapper) target).delegate.reinit(this.delegate); + } +} Property changes on: src\java\org\apache\lucene\analysis\TokenWrapper.java ___________________________________________________________________ Added: svn:eol-style + native Index: src/java/org/apache/lucene/index/DocInverterPerField.java =================================================================== --- src/java/org/apache/lucene/index/DocInverterPerField.java (revision 793966) +++ src/java/org/apache/lucene/index/DocInverterPerField.java (working copy) @@ -20,7 +20,6 @@ import java.io.IOException; import java.io.Reader; import org.apache.lucene.document.Fieldable; -import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; @@ -83,7 +82,6 @@ final int valueLength = stringValue.length(); perThread.singleTokenTokenStream.reinit(stringValue, 0, valueLength); fieldState.attributeSource 
= perThread.singleTokenTokenStream; - perThread.localTokenStream.reset(); consumer.start(field); boolean success = false; @@ -132,21 +130,15 @@ try { int offsetEnd = fieldState.offset-1; - boolean useNewTokenStreamAPI = stream.useNewAPI(); - Token localToken = null; - - if (useNewTokenStreamAPI) { - fieldState.attributeSource = stream; - } else { - fieldState.attributeSource = perThread.localTokenStream; - localToken = perThread.localToken; - } - - consumer.start(field); + boolean hasMoreTokens = stream.incrementToken(); + fieldState.attributeSource = stream; + OffsetAttribute offsetAttribute = (OffsetAttribute) fieldState.attributeSource.addAttribute(OffsetAttribute.class); PositionIncrementAttribute posIncrAttribute = (PositionIncrementAttribute) fieldState.attributeSource.addAttribute(PositionIncrementAttribute.class); + consumer.start(field); + for(;;) { // If we hit an exception in stream.next below @@ -155,15 +147,9 @@ // non-aborting and (above) this one document // will be marked as deleted, but still // consume a docID - Token token = null; - if (useNewTokenStreamAPI) { - if (!stream.incrementToken()) break; - } else { - token = stream.next(localToken); - if (token == null) break; - perThread.localTokenStream.set(token); - } + if (!hasMoreTokens) break; + final int posIncr = posIncrAttribute.getPositionIncrement(); fieldState.position += posIncr; if (allowMinus1Position || fieldState.position > 0) { @@ -194,6 +180,8 @@ docState.infoStream.println("maxFieldLength " +maxFieldLength+ " reached for field " + fieldInfo.name + ", ignoring following tokens"); break; } + + hasMoreTokens = stream.incrementToken(); } fieldState.offset = offsetEnd+1; } finally { Index: src/java/org/apache/lucene/index/DocInverterPerThread.java =================================================================== --- src/java/org/apache/lucene/index/DocInverterPerThread.java (revision 793966) +++ src/java/org/apache/lucene/index/DocInverterPerThread.java (working copy) @@ -19,15 +19,9 @@ import java.io.IOException; -import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.tokenattributes.FlagsAttribute; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; -import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; -import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.analysis.tokenattributes.TermAttribute; -import org.apache.lucene.analysis.tokenattributes.TypeAttribute; -import org.apache.lucene.util.Attribute; /** This is a DocFieldConsumer that inverts each field, * separately, from a Document, and accepts a @@ -37,10 +31,8 @@ final DocInverter docInverter; final InvertedDocConsumerPerThread consumer; final InvertedDocEndConsumerPerThread endConsumer; - final Token localToken = new Token(); //TODO: change to SingleTokenTokenStream after Token was removed final SingleTokenTokenStream singleTokenTokenStream = new SingleTokenTokenStream(); - final BackwardsCompatibilityStream localTokenStream = new BackwardsCompatibilityStream(); static class SingleTokenTokenStream extends TokenStream { TermAttribute termAttribute; @@ -55,75 +47,12 @@ termAttribute.setTermBuffer(stringValue); offsetAttribute.setOffset(startOffset, endOffset); } - } - - /** This stream wrapper is only used to maintain backwards compatibility with the - * old TokenStream API and can be removed in Lucene 3.0 - * @deprecated - */ - static class BackwardsCompatibilityStream extends TokenStream { - private Token 
token; - - TermAttribute termAttribute = new TermAttribute() { - public String term() { - return token.term(); - } - - public char[] termBuffer() { - return token.termBuffer(); - } - - public int termLength() { - return token.termLength(); - } - }; - OffsetAttribute offsetAttribute = new OffsetAttribute() { - public int startOffset() { - return token.startOffset(); - } - - public int endOffset() { - return token.endOffset(); - } - }; - PositionIncrementAttribute positionIncrementAttribute = new PositionIncrementAttribute() { - public int getPositionIncrement() { - return token.getPositionIncrement(); - } - }; - - FlagsAttribute flagsAttribute = new FlagsAttribute() { - public int getFlags() { - return token.getFlags(); - } - }; - - PayloadAttribute payloadAttribute = new PayloadAttribute() { - public Payload getPayload() { - return token.getPayload(); - } - }; - - TypeAttribute typeAttribute = new TypeAttribute() { - public String type() { - return token.type(); - } - }; - - BackwardsCompatibilityStream() { - attributes.put(TermAttribute.class, termAttribute); - attributes.put(OffsetAttribute.class, offsetAttribute); - attributes.put(PositionIncrementAttribute.class, positionIncrementAttribute); - attributes.put(FlagsAttribute.class, flagsAttribute); - attributes.put(PayloadAttribute.class, payloadAttribute); - attributes.put(TypeAttribute.class, typeAttribute); + // this is a dummy, to not throw an UOE because this class does not implement any iteration method + public boolean incrementToken() { + throw new UnsupportedOperationException(); } - - public void set(Token token) { - this.token = token; - } - }; + } final DocumentsWriter.DocState docState; Index: src/java/org/apache/lucene/queryParser/QueryParser.java =================================================================== --- src/java/org/apache/lucene/queryParser/QueryParser.java (revision 793966) +++ src/java/org/apache/lucene/queryParser/QueryParser.java (working copy) @@ -531,67 +531,42 @@ PositionIncrementAttribute posIncrAtt = null; int numTokens = 0; - org.apache.lucene.analysis.Token reusableToken = null; - org.apache.lucene.analysis.Token nextToken = null; - - - boolean useNewAPI = TokenStream.useNewAPIDefault(); - - if (useNewAPI) { - boolean success = false; - try { - buffer.reset(); - success = true; - } catch (IOException e) { - // success==false if we hit an exception + boolean success = false; + try { + buffer.reset(); + success = true; + } catch (IOException e) { + // success==false if we hit an exception + } + if (success) { + if (buffer.hasAttribute(TermAttribute.class)) { + termAtt = (TermAttribute) buffer.getAttribute(TermAttribute.class); } - if (success) { - if (buffer.hasAttribute(TermAttribute.class)) { - termAtt = (TermAttribute) buffer.getAttribute(TermAttribute.class); - } - if (buffer.hasAttribute(PositionIncrementAttribute.class)) { - posIncrAtt = (PositionIncrementAttribute) buffer.getAttribute(PositionIncrementAttribute.class); - } + if (buffer.hasAttribute(PositionIncrementAttribute.class)) { + posIncrAtt = (PositionIncrementAttribute) buffer.getAttribute(PositionIncrementAttribute.class); } - } else { - reusableToken = new org.apache.lucene.analysis.Token(); } int positionCount = 0; boolean severalTokensAtSamePosition = false; - if (useNewAPI) { - if (termAtt != null) { - try { - while (buffer.incrementToken()) { - numTokens++; - int positionIncrement = (posIncrAtt != null) ? 
posIncrAtt.getPositionIncrement() : 1; - if (positionIncrement != 0) { - positionCount += positionIncrement; - } else { - severalTokensAtSamePosition = true; - } + boolean hasMoreTokens = false; + if (termAtt != null) { + try { + hasMoreTokens = buffer.incrementToken(); + while (hasMoreTokens) { + numTokens++; + int positionIncrement = (posIncrAtt != null) ? posIncrAtt.getPositionIncrement() : 1; + if (positionIncrement != 0) { + positionCount += positionIncrement; + } else { + severalTokensAtSamePosition = true; } - } catch (IOException e) { - // ignore + hasMoreTokens = buffer.incrementToken(); } + } catch (IOException e) { + // ignore } - } else { - while (true) { - try { - nextToken = buffer.next(reusableToken); - } - catch (IOException e) { - nextToken = null; - } - if (nextToken == null) - break; - numTokens++; - if (nextToken.getPositionIncrement() != 0) - positionCount += nextToken.getPositionIncrement(); - else - severalTokensAtSamePosition = true; - } } try { // rewind the buffer stream @@ -609,16 +584,9 @@ else if (numTokens == 1) { String term = null; try { - - if (useNewAPI) { - boolean hasNext = buffer.incrementToken(); - assert hasNext == true; - term = termAtt.term(); - } else { - nextToken = buffer.next(reusableToken); - assert nextToken != null; - term = nextToken.term(); - } + boolean hasNext = buffer.incrementToken(); + assert hasNext == true; + term = termAtt.term(); } catch (IOException e) { // safe to ignore, because we know the number of tokens } @@ -631,15 +599,9 @@ for (int i = 0; i < numTokens; i++) { String term = null; try { - if (useNewAPI) { - boolean hasNext = buffer.incrementToken(); - assert hasNext == true; - term = termAtt.term(); - } else { - nextToken = buffer.next(reusableToken); - assert nextToken != null; - term = nextToken.term(); - } + boolean hasNext = buffer.incrementToken(); + assert hasNext == true; + term = termAtt.term(); } catch (IOException e) { // safe to ignore, because we know the number of tokens } @@ -660,18 +622,11 @@ String term = null; int positionIncrement = 1; try { - if (useNewAPI) { - boolean hasNext = buffer.incrementToken(); - assert hasNext == true; - term = termAtt.term(); - if (posIncrAtt != null) { - positionIncrement = posIncrAtt.getPositionIncrement(); - } - } else { - nextToken = buffer.next(reusableToken); - assert nextToken != null; - term = nextToken.term(); - positionIncrement = nextToken.getPositionIncrement(); + boolean hasNext = buffer.incrementToken(); + assert hasNext == true; + term = termAtt.term(); + if (posIncrAtt != null) { + positionIncrement = posIncrAtt.getPositionIncrement(); } } catch (IOException e) { // safe to ignore, because we know the number of tokens @@ -707,19 +662,11 @@ int positionIncrement = 1; try { - if (useNewAPI) { - - boolean hasNext = buffer.incrementToken(); - assert hasNext == true; - term = termAtt.term(); - if (posIncrAtt != null) { - positionIncrement = posIncrAtt.getPositionIncrement(); - } - } else { - nextToken = buffer.next(reusableToken); - assert nextToken != null; - term = nextToken.term(); - positionIncrement = nextToken.getPositionIncrement(); + boolean hasNext = buffer.incrementToken(); + assert hasNext == true; + term = termAtt.term(); + if (posIncrAtt != null) { + positionIncrement = posIncrAtt.getPositionIncrement(); } } catch (IOException e) { // safe to ignore, because we know the number of tokens Index: src/java/org/apache/lucene/search/QueryTermVector.java =================================================================== --- 
src/java/org/apache/lucene/search/QueryTermVector.java (revision 793966) +++ src/java/org/apache/lucene/search/QueryTermVector.java (working copy) @@ -27,7 +27,6 @@ import java.util.Map; import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.TermAttribute; import org.apache.lucene.index.TermFreqVector; @@ -59,17 +58,15 @@ { List terms = new ArrayList(); try { - if (stream.useNewAPI()) { - stream.reset(); - TermAttribute termAtt = (TermAttribute) stream.getAttribute(TermAttribute.class); - while (stream.incrementToken()) { - terms.add(termAtt.term()); - } - } else { - final Token reusableToken = new Token(); - for (Token nextToken = stream.next(reusableToken); nextToken != null; nextToken = stream.next(reusableToken)) { - terms.add(nextToken.term()); - } + boolean hasMoreTokens = false; + + stream.reset(); + TermAttribute termAtt = (TermAttribute) stream.getAttribute(TermAttribute.class); + + hasMoreTokens = stream.incrementToken(); + while (hasMoreTokens) { + terms.add(termAtt.term()); + hasMoreTokens = stream.incrementToken(); } processTerms((String[])terms.toArray(new String[terms.size()])); } catch (IOException e) { Index: src/java/org/apache/lucene/util/Attribute.java =================================================================== --- src/java/org/apache/lucene/util/Attribute.java (revision 793966) +++ src/java/org/apache/lucene/util/Attribute.java (working copy) @@ -17,79 +17,14 @@ * limitations under the License. */ -import java.io.Serializable; - /** - * Base class for Attributes that can be added to a - * {@link org.apache.lucene.util.AttributeSource}. - *
    - * Attributes are used to add data in a dynamic, yet type-safe way to a source - * of usually streamed objects, e. g. a {@link org.apache.lucene.analysis.TokenStream}. + * TODO + * *
    * WARNING: The status of the new TokenStream, AttributeSource and Attributes is experimental. * The APIs introduced in these classes with Lucene 2.9 might change in the future. * We will make our best efforts to keep the APIs backwards-compatible. */ -public abstract class Attribute implements Cloneable, Serializable { - /** - * Clears the values in this Attribute and resets it to its - * default value. - */ - public abstract void clear(); - - /** - * Subclasses must implement this method and should follow a syntax - * similar to this one: - * - *
    -   *   public String toString() {
    -   *     return "start=" + startOffset + ",end=" + endOffset;
    -   *   }
    -   * 
    - */ - public abstract String toString(); - - /** - * Subclasses must implement this method and should compute - * a hashCode similar to this: - *
    -   *   public int hashCode() {
    -   *     int code = startOffset;
    -   *     code = code * 31 + endOffset;
    -   *     return code;
    -   *   }
    -   * 
    - * - * see also {@link #equals(Object)} - */ - public abstract int hashCode(); - - /** - * All values used for computation of {@link #hashCode()} - * should be checked here for equality. - * - * see also {@link Object#equals(Object)} - */ - public abstract boolean equals(Object other); - - /** - * Copies the values from this Attribute into the passed-in - * target attribute. The type of the target must match the type - * of this attribute. - */ - public abstract void copyTo(Attribute target); - - /** - * Shallow clone. Subclasses must override this if they - * need to clone any members deeply, - */ - public Object clone() { - Object clone = null; - try { - clone = super.clone(); - } catch (CloneNotSupportedException e) { - throw new RuntimeException(e); // shouldn't happen - } - return clone; - } +public interface Attribute { + public void clear(); } Index: src/java/org/apache/lucene/util/AttributeImpl.java =================================================================== --- src/java/org/apache/lucene/util/AttributeImpl.java (revision 0) +++ src/java/org/apache/lucene/util/AttributeImpl.java (revision 0) @@ -0,0 +1,123 @@ +package org.apache.lucene.util; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.Serializable; +import java.lang.reflect.Field; + +/** + * Base class for Attributes that can be added to a + * {@link org.apache.lucene.util.AttributeSource}. + *
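With this interface/implementation split, a custom attribute is declared as an interface extending Attribute plus a matching class whose name ends in "Impl", the naming convention the default AttributeFactory below relies on (it resolves Class.forName(attClassName + "Impl")). A sketch using an invented part-of-speech attribute; each type would live in its own source file:

    import org.apache.lucene.util.Attribute;

    public interface PartOfSpeechAttribute extends Attribute {
      public void setPartOfSpeech(String pos);
      public String getPartOfSpeech();
    }

    import org.apache.lucene.util.AttributeImpl;

    public class PartOfSpeechAttributeImpl extends AttributeImpl
        implements PartOfSpeechAttribute {
      private String pos; // null means "unset"

      public String getPartOfSpeech() { return pos; }
      public void setPartOfSpeech(String pos) { this.pos = pos; }

      public void clear() { pos = null; }

      public void copyTo(AttributeImpl target) {
        ((PartOfSpeechAttribute) target).setPartOfSpeech(pos);
      }

      public boolean equals(Object other) {
        if (other == this) return true;
        if (!(other instanceof PartOfSpeechAttributeImpl)) return false;
        final String o = ((PartOfSpeechAttributeImpl) other).pos;
        return pos == null ? o == null : pos.equals(o);
      }

      public int hashCode() {
        return pos == null ? 0 : pos.hashCode();
      }
    }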
    + * Attributes are used to add data in a dynamic, yet type-safe way to a source + * of usually streamed objects, e.g. a {@link org.apache.lucene.analysis.TokenStream}. + *
    + * WARNING: The status of the new TokenStream, AttributeSource and Attributes is experimental. + * The APIs introduced in these classes with Lucene 2.9 might change in the future. + * We will make our best efforts to keep the APIs backwards-compatible. + */ +public abstract class AttributeImpl implements Cloneable, Serializable { + /** + * Clears the values in this Attribute and resets it to its + * default value. + */ + public abstract void clear(); + + /** + * The default implementation of this method accesses all declared + * fields of this object and prints the values in the following syntax: + * + *
    +   *   public String toString() {
    +   *     return "start=" + startOffset + ",end=" + endOffset;
    +   *   }
    +   * 
    + * + * This method may be overridden by subclasses. + */ + public String toString() { + StringBuffer buffer = new StringBuffer(); + Class clazz = this.getClass(); + Field[] fields = clazz.getDeclaredFields(); + try { + for (int i = 0; i < fields.length; i++) { + Field f = fields[i]; + f.setAccessible(true); + Object value = f.get(this); + if (value == null) { + buffer.append(f.getName() + "=null"); + } else { + buffer.append(f.getName() + "=" + value); + } + if (i < fields.length - 1) { + buffer.append(','); + } + } + } catch (IllegalAccessException e) { + // this should never happen, because we're just accessing fields + // from 'this' + throw new RuntimeException(e); + } + + return buffer.toString(); + } + + /** + * Subclasses must implement this method and should compute + * a hashCode similar to this: + *
    +   *   public int hashCode() {
    +   *     int code = startOffset;
    +   *     code = code * 31 + endOffset;
    +   *     return code;
    +   *   }
    +   * 
    + * + * see also {@link #equals(Object)} + */ + public abstract int hashCode(); + + /** + * All values used for computation of {@link #hashCode()} + * should be checked here for equality. + * + * see also {@link Object#equals(Object)} + */ + public abstract boolean equals(Object other); + + /** + * Copies the values from this Attribute into the passed-in + * target attribute. The type of the target must match the type + * of this attribute. + */ + public abstract void copyTo(AttributeImpl target); + + /** + * Shallow clone. Subclasses must override this if they + * need to clone any members deeply, + */ + public Object clone() { + Object clone = null; + try { + clone = super.clone(); + } catch (CloneNotSupportedException e) { + throw new RuntimeException(e); // shouldn't happen + } + return clone; + } +} Index: src/java/org/apache/lucene/util/AttributeSource.java =================================================================== --- src/java/org/apache/lucene/util/AttributeSource.java (revision 793966) +++ src/java/org/apache/lucene/util/AttributeSource.java (working copy) @@ -19,13 +19,13 @@ import java.util.Iterator; import java.util.LinkedHashMap; +import java.util.Collections; import java.util.Map; -import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.TokenStream; // for javadocs - /** - * An AttributeSource contains a list of different {@link Attribute}s, + * An AttributeSource contains a list of different {@link AttributeImpl}s, * and methods to add and get them. There can only be a single instance * of an attribute in the same AttributeSource instance. This is ensured * by passing in the actual type of the Attribute (Class<Attribute>) to @@ -39,30 +39,44 @@ * We will make our best efforts to keep the APIs backwards-compatible.
    */ public class AttributeSource { - /** - * An AttributeAcceptor defines only a single method {@link #accept(Class)}. - * It can be used for e. g. buffering purposes to specify which attributes - * to buffer. - */ - public static abstract class AttributeAcceptor { - /** Return true, to accept this attribute; false otherwise */ - public abstract boolean accept(Class attClass); + public static abstract class AttributeFactory { + public abstract AttributeImpl createAttributeInstance(Class attClass); + + public static final AttributeFactory DEFAULT_ATTRIBUTE_FACTORY = new AttributeFactory() { + public AttributeImpl createAttributeInstance(Class attClass) { + try { + return (AttributeImpl) resolveClassName(attClass.getName()).newInstance(); + } catch (InstantiationException e) { + throw new IllegalArgumentException("Could not instantiate class " + attClass); + } catch (IllegalAccessException e) { + throw new IllegalArgumentException("Could not instantiate class " + attClass); + } + } + + protected Class resolveClassName(String attClassName) { + try { + return Class.forName(attClassName + "Impl"); + } catch (ClassNotFoundException e) { + throw new IllegalArgumentException("Could not find implementing class " + attClassName); + } + } + }; } + + // These two maps must always be in sync!!! + // So they are private, final and read-only from the outside (read-only iterators) + private final Map/*,AttributeImpl>*/ attributes; + private final Map/*,AttributeImpl>*/ attributeImpls; + + private AttributeFactory factory = AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY; - /** - * Default AttributeAcceptor that accepts all attributes. - */ - public static final AttributeAcceptor AllAcceptor = new AttributeAcceptor() { - public boolean accept(Class attClass) {return true;} - }; - - /** - * Holds the Class<Attribute> -> Attribute mapping - */ - protected Map attributes; - + protected void initialize() { + } + public AttributeSource() { this.attributes = new LinkedHashMap(); + this.attributeImpls = new LinkedHashMap(); + initialize(); } public AttributeSource(AttributeSource input) { @@ -70,33 +84,66 @@ throw new IllegalArgumentException("input AttributeSource must not be null"); } this.attributes = input.attributes; + this.attributeImpls = input.attributeImpls; + initialize(); } - /** Returns an iterator that iterates the attributes + public void setAttributeFactory(AttributeFactory factory) { + this.factory = factory; + } + + public AttributeFactory getAttributeFactory() { + return this.factory; + } + + /** Returns a new iterator that iterates the attribute classes * in the same order they were added in. */ - public Iterator getAttributesIterator() { - return attributes.values().iterator(); + public Iterator/*>*/ getAttributeClassesIterator() { + return Collections.unmodifiableSet(attributes.keySet()).iterator(); } + /** Returns a new iterator that iterates all unique Attribute implementations. + * This iterator may contain less entries that {@link #getAttributeNamesIterator}, + * if one instance implements more than one Attribute interface. 
+ */ + public Iterator/**/ getAttributeImplsIterator() { + return Collections.unmodifiableCollection(attributeImpls.values()).iterator(); + } + + public void addAttributeImpl(AttributeImpl att) { + // find all interfaces that this attribute instance implements + // and that extend the Attribute interface + Class clazz = att.getClass(); + do { + Class[] interfaces = att.getClass().getInterfaces(); + for (int i = 0; i < interfaces.length; i++) { + Class curInterface = interfaces[i]; + if (Attribute.class.isAssignableFrom(curInterface)) { + // Attribute is a superclass of this interface + if (!attributes.containsKey(curInterface)) { + // invalidate state to force recomputation in captureState() + this.currentState = null; + attributes.put(curInterface, att); + attributeImpls.put(att.getClass(), att); + } + } + } + clazz = clazz.getSuperclass(); + } while (clazz != null); + } + /** * The caller must pass in a Class<? extends Attribute> value. * This method first checks if an instance of that class is * already in this AttributeSource and returns it. Otherwise a * new instance is created, added to this AttributeSource and returned. */ - public Attribute addAttribute(Class attClass) { - Attribute att = (Attribute) attributes.get(attClass); + public AttributeImpl addAttribute(Class attClass) { + AttributeImpl att = (AttributeImpl) attributes.get(attClass); if (att == null) { - try { - att = (Attribute) attClass.newInstance(); - } catch (InstantiationException e) { - throw new IllegalArgumentException("Could not instantiate class " + attClass); - } catch (IllegalAccessException e) { - throw new IllegalArgumentException("Could not instantiate class " + attClass); - } - - attributes.put(attClass, att); + att = this.factory.createAttributeInstance(attClass); + addAttributeImpl(att); } return att; } @@ -121,10 +168,10 @@ * @throws IllegalArgumentException if this AttributeSource does not contain the * Attribute */ - public Attribute getAttribute(Class attClass) { - Attribute att = (Attribute) this.attributes.get(attClass); + public AttributeImpl getAttribute(Class attClass) { + AttributeImpl att = (AttributeImpl) this.attributes.get(attClass); if (att == null) { - throw new IllegalArgumentException("This token does not have the attribute '" + attClass + "'."); + throw new IllegalArgumentException("This AttributeSource does not have the attribute '" + attClass + "'."); } return att; @@ -132,52 +179,63 @@ /** * Resets all Attributes in this AttributeSource by calling - * {@link Attribute#clear()} on each Attribute. + * {@link AttributeImpl#clear()} on each Attribute implementation. */ public void clearAttributes() { - Iterator it = getAttributesIterator(); + Iterator it = getAttributeImplsIterator(); while (it.hasNext()) { - ((Attribute) it.next()).clear(); + ((AttributeImpl) it.next()).clear(); } } - /** - * Captures the current state of the passed in TokenStream. - *
    - * This state will contain all of the passed in TokenStream's - * {@link Attribute}s. If only a subset of the attributes is needed - * please use {@link #captureState(AttributeAcceptor)} - */ - public AttributeSource captureState() { - return captureState(AllAcceptor); - } - - /** - * Captures the current state of the passed in TokenStream. - *
    - * This state will contain all of the passed in TokenStream's - * {@link Attribute}s which the {@link AttributeAcceptor} accepts. - */ - public AttributeSource captureState(AttributeAcceptor acceptor) { - AttributeSource state = new AttributeSource(); - - Iterator it = getAttributesIterator(); - while(it.hasNext()) { - Attribute att = (Attribute) it.next(); - if (acceptor.accept(att.getClass())) { - Attribute clone = (Attribute) att.clone(); - state.attributes.put(att.getClass(), clone); + public static final class State implements Cloneable { + private AttributeImpl attribute; + private State next; + + public Object clone() { + State clone = new State(); + clone.attribute = (AttributeImpl) attribute.clone(); + + if (next != null) { + clone.next = (State) next.clone(); } + + return clone; } - - return state; } + private State currentState; + + private void computeCurrentState() { + currentState = new State(); + State c = currentState; + Iterator it = getAttributeImplsIterator(); + c.attribute = (AttributeImpl) it.next(); + while (it.hasNext()) { + c.next = new State(); + c = c.next; + c.attribute = (AttributeImpl) it.next(); + } + } + + public State captureState() { + if (!hasAttributes()) { + return null; + } + + if (currentState == null) { + computeCurrentState(); + } + return (State) this.currentState.clone(); + } + /** - * Restores this state by copying the values of all attributes - * that this state contains into the attributes of the targetStream. + * Restores this state by copying the values of all attribute implementations + * that this state contains into the attributes implementations of the targetStream. * The targetStream must contain a corresponding instance for each argument - * contained in this state. + * contained in this state (e.g. it is not possible to restore the state of + * an AttributeSource containing a TermAttribute into a AttributeSource using + * a Token instance as implementation). *
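A sketch of replaying captured states into a second stream, assuming source and sink are streams whose AttributeImpl classes match as required above; clearAttributes() runs before each restore so attributes not contained in a state do not keep stale values (see the note that follows):

    // buffer states from 'source', then replay them into 'sink'
    List states = new ArrayList();
    while (source.incrementToken()) {
      states.add(source.captureState());
    }
    for (Iterator it = states.iterator(); it.hasNext();) {
      sink.clearAttributes();   // drop values left over from the previous token
      sink.restoreState((AttributeSource.State) it.next());
      // read the sink's attributes here
    }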
    * Note that this method does not affect attributes of the targetStream * that are not contained in this state. In other words, if for example @@ -186,19 +244,22 @@ * reset its value to the default, in which case the caller should first * call {@link TokenStream#clearAttributes()} on the targetStream. */ - public void restoreState(AttributeSource target) { - Iterator it = getAttributesIterator(); - while (it.hasNext()) { - Attribute att = (Attribute) it.next(); - Attribute targetAtt = target.getAttribute(att.getClass()); - att.copyTo(targetAtt); - } + public void restoreState(State state) { + if (state == null) return; + + do { + AttributeImpl targetImpl = (AttributeImpl) attributeImpls.get(state.attribute.getClass()); + if (targetImpl == null) + throw new IllegalArgumentException("State contains an AttributeImpl that is not in this AttributeSource"); + state.attribute.copyTo(targetImpl); + state = state.next; + } while (state != null); } - + public int hashCode() { int code = 0; if (hasAttributes()) { - Iterator it = getAttributesIterator(); + Iterator it = getAttributeImplsIterator(); while (it.hasNext()) { code = code * 31 + it.next().hashCode(); } @@ -220,16 +281,17 @@ return false; } - if (attributes.size() != other.attributes.size()) { + if (this.attributeImpls.size() != other.attributeImpls.size()) { return false; } - Iterator it = getAttributesIterator(); - while (it.hasNext()) { - Class attName = it.next().getClass(); - - Attribute otherAtt = (Attribute) other.attributes.get(attName); - if (otherAtt == null || !otherAtt.equals(attributes.get(attName))) { + // it is only equal if all attribute impls are the same in the same order + Iterator thisIt = this.getAttributeImplsIterator(); + Iterator otherIt = other.getAttributeImplsIterator(); + while (thisIt.hasNext() && otherIt.hasNext()) { + AttributeImpl thisAtt = (AttributeImpl) thisIt.next(); + AttributeImpl otherAtt = (AttributeImpl) otherIt.next(); + if (otherAtt.getClass() != thisAtt.getClass() || !otherAtt.equals(thisAtt)) { return false; } } @@ -240,38 +302,23 @@ } else return false; } - -// TODO: Java 1.5 -// private Map, Attribute> attributes; -// public T addAttribute(Class attClass) { -// T att = (T) attributes.get(attClass); -// if (att == null) { -// try { -// att = attClass.newInstance(); -// } catch (InstantiationException e) { -// throw new IllegalArgumentException("Could not instantiate class " + attClass); -// } catch (IllegalAccessException e) { -// throw new IllegalArgumentException("Could not instantiate class " + attClass); -// } -// -// attributes.put(attClass, att); -// } -// return att; -// } -// -// public boolean hasAttribute(Class attClass) { -// return this.attributes.containsKey(attClass); -// } -// -// public T getAttribute(Class attClass) { -// Attribute att = this.attributes.get(attClass); -// if (att == null) { -// throw new IllegalArgumentException("This token does not have the attribute '" + attClass + "'."); -// } -// -// return (T) att; -// } -// + public String toString() { + StringBuffer sb = new StringBuffer(); + sb.append('('); + + if (hasAttributes()) { + Iterator it = getAttributeImplsIterator(); + if (it.hasNext()) { + sb.append(it.next().toString()); + } + while (it.hasNext()) { + sb.append(','); + sb.append(it.next().toString()); + } + } + sb.append(')'); + return sb.toString(); + } } Index: src/test/org/apache/lucene/analysis/TestASCIIFoldingFilter.java =================================================================== --- 
src/test/org/apache/lucene/analysis/TestASCIIFoldingFilter.java (revision 793966) +++ src/test/org/apache/lucene/analysis/TestASCIIFoldingFilter.java (working copy) @@ -17,6 +17,7 @@ * limitations under the License. */ +import org.apache.lucene.analysis.tokenattributes.TermAttribute; import org.apache.lucene.util.LuceneTestCase; import java.io.StringReader; @@ -34,84 +35,84 @@ +" ð ñ ò ó ô õ ö ø œ ß þ ù ú û ü ý ÿ fi fl")); ASCIIFoldingFilter filter = new ASCIIFoldingFilter(stream); - final Token reusableToken = new Token(); + TermAttribute termAtt = (TermAttribute) filter.getAttribute(TermAttribute.class); - assertEquals("Des", filter.next(reusableToken).term()); - assertEquals("mot", filter.next(reusableToken).term()); - assertEquals("cles", filter.next(reusableToken).term()); - assertEquals("A", filter.next(reusableToken).term()); - assertEquals("LA", filter.next(reusableToken).term()); - assertEquals("CHAINE", filter.next(reusableToken).term()); - assertEquals("A", filter.next(reusableToken).term()); - assertEquals("A", filter.next(reusableToken).term()); - assertEquals("A", filter.next(reusableToken).term()); - assertEquals("A", filter.next(reusableToken).term()); - assertEquals("A", filter.next(reusableToken).term()); - assertEquals("A", filter.next(reusableToken).term()); - assertEquals("AE", filter.next(reusableToken).term()); - assertEquals("C", filter.next(reusableToken).term()); - assertEquals("E", filter.next(reusableToken).term()); - assertEquals("E", filter.next(reusableToken).term()); - assertEquals("E", filter.next(reusableToken).term()); - assertEquals("E", filter.next(reusableToken).term()); - assertEquals("I", filter.next(reusableToken).term()); - assertEquals("I", filter.next(reusableToken).term()); - assertEquals("I", filter.next(reusableToken).term()); - assertEquals("I", filter.next(reusableToken).term()); - assertEquals("IJ", filter.next(reusableToken).term()); - assertEquals("D", filter.next(reusableToken).term()); - assertEquals("N", filter.next(reusableToken).term()); - assertEquals("O", filter.next(reusableToken).term()); - assertEquals("O", filter.next(reusableToken).term()); - assertEquals("O", filter.next(reusableToken).term()); - assertEquals("O", filter.next(reusableToken).term()); - assertEquals("O", filter.next(reusableToken).term()); - assertEquals("O", filter.next(reusableToken).term()); - assertEquals("OE", filter.next(reusableToken).term()); - assertEquals("TH", filter.next(reusableToken).term()); - assertEquals("U", filter.next(reusableToken).term()); - assertEquals("U", filter.next(reusableToken).term()); - assertEquals("U", filter.next(reusableToken).term()); - assertEquals("U", filter.next(reusableToken).term()); - assertEquals("Y", filter.next(reusableToken).term()); - assertEquals("Y", filter.next(reusableToken).term()); - assertEquals("a", filter.next(reusableToken).term()); - assertEquals("a", filter.next(reusableToken).term()); - assertEquals("a", filter.next(reusableToken).term()); - assertEquals("a", filter.next(reusableToken).term()); - assertEquals("a", filter.next(reusableToken).term()); - assertEquals("a", filter.next(reusableToken).term()); - assertEquals("ae", filter.next(reusableToken).term()); - assertEquals("c", filter.next(reusableToken).term()); - assertEquals("e", filter.next(reusableToken).term()); - assertEquals("e", filter.next(reusableToken).term()); - assertEquals("e", filter.next(reusableToken).term()); - assertEquals("e", filter.next(reusableToken).term()); - assertEquals("i", filter.next(reusableToken).term()); - 
assertEquals("i", filter.next(reusableToken).term()); - assertEquals("i", filter.next(reusableToken).term()); - assertEquals("i", filter.next(reusableToken).term()); - assertEquals("ij", filter.next(reusableToken).term()); - assertEquals("d", filter.next(reusableToken).term()); - assertEquals("n", filter.next(reusableToken).term()); - assertEquals("o", filter.next(reusableToken).term()); - assertEquals("o", filter.next(reusableToken).term()); - assertEquals("o", filter.next(reusableToken).term()); - assertEquals("o", filter.next(reusableToken).term()); - assertEquals("o", filter.next(reusableToken).term()); - assertEquals("o", filter.next(reusableToken).term()); - assertEquals("oe", filter.next(reusableToken).term()); - assertEquals("ss", filter.next(reusableToken).term()); - assertEquals("th", filter.next(reusableToken).term()); - assertEquals("u", filter.next(reusableToken).term()); - assertEquals("u", filter.next(reusableToken).term()); - assertEquals("u", filter.next(reusableToken).term()); - assertEquals("u", filter.next(reusableToken).term()); - assertEquals("y", filter.next(reusableToken).term()); - assertEquals("y", filter.next(reusableToken).term()); - assertEquals("fi", filter.next(reusableToken).term()); - assertEquals("fl", filter.next(reusableToken).term()); - assertNull(filter.next(reusableToken)); + assertTermEquals("Des", filter, termAtt); + assertTermEquals("mot", filter, termAtt); + assertTermEquals("cles", filter, termAtt); + assertTermEquals("A", filter, termAtt); + assertTermEquals("LA", filter, termAtt); + assertTermEquals("CHAINE", filter, termAtt); + assertTermEquals("A", filter, termAtt); + assertTermEquals("A", filter, termAtt); + assertTermEquals("A", filter, termAtt); + assertTermEquals("A", filter, termAtt); + assertTermEquals("A", filter, termAtt); + assertTermEquals("A", filter, termAtt); + assertTermEquals("AE", filter, termAtt); + assertTermEquals("C", filter, termAtt); + assertTermEquals("E", filter, termAtt); + assertTermEquals("E", filter, termAtt); + assertTermEquals("E", filter, termAtt); + assertTermEquals("E", filter, termAtt); + assertTermEquals("I", filter, termAtt); + assertTermEquals("I", filter, termAtt); + assertTermEquals("I", filter, termAtt); + assertTermEquals("I", filter, termAtt); + assertTermEquals("IJ", filter, termAtt); + assertTermEquals("D", filter, termAtt); + assertTermEquals("N", filter, termAtt); + assertTermEquals("O", filter, termAtt); + assertTermEquals("O", filter, termAtt); + assertTermEquals("O", filter, termAtt); + assertTermEquals("O", filter, termAtt); + assertTermEquals("O", filter, termAtt); + assertTermEquals("O", filter, termAtt); + assertTermEquals("OE", filter, termAtt); + assertTermEquals("TH", filter, termAtt); + assertTermEquals("U", filter, termAtt); + assertTermEquals("U", filter, termAtt); + assertTermEquals("U", filter, termAtt); + assertTermEquals("U", filter, termAtt); + assertTermEquals("Y", filter, termAtt); + assertTermEquals("Y", filter, termAtt); + assertTermEquals("a", filter, termAtt); + assertTermEquals("a", filter, termAtt); + assertTermEquals("a", filter, termAtt); + assertTermEquals("a", filter, termAtt); + assertTermEquals("a", filter, termAtt); + assertTermEquals("a", filter, termAtt); + assertTermEquals("ae", filter, termAtt); + assertTermEquals("c", filter, termAtt); + assertTermEquals("e", filter, termAtt); + assertTermEquals("e", filter, termAtt); + assertTermEquals("e", filter, termAtt); + assertTermEquals("e", filter, termAtt); + assertTermEquals("i", filter, termAtt); + 
assertTermEquals("i", filter, termAtt); + assertTermEquals("i", filter, termAtt); + assertTermEquals("i", filter, termAtt); + assertTermEquals("ij", filter, termAtt); + assertTermEquals("d", filter, termAtt); + assertTermEquals("n", filter, termAtt); + assertTermEquals("o", filter, termAtt); + assertTermEquals("o", filter, termAtt); + assertTermEquals("o", filter, termAtt); + assertTermEquals("o", filter, termAtt); + assertTermEquals("o", filter, termAtt); + assertTermEquals("o", filter, termAtt); + assertTermEquals("oe", filter, termAtt); + assertTermEquals("ss", filter, termAtt); + assertTermEquals("th", filter, termAtt); + assertTermEquals("u", filter, termAtt); + assertTermEquals("u", filter, termAtt); + assertTermEquals("u", filter, termAtt); + assertTermEquals("u", filter, termAtt); + assertTermEquals("y", filter, termAtt); + assertTermEquals("y", filter, termAtt); + assertTermEquals("fi", filter, termAtt); + assertTermEquals("fl", filter, termAtt); + assertFalse(filter.incrementToken()); } @@ -1891,11 +1892,16 @@ TokenStream stream = new WhitespaceTokenizer(new StringReader(inputText.toString())); ASCIIFoldingFilter filter = new ASCIIFoldingFilter(stream); - final Token reusableToken = new Token(); + TermAttribute termAtt = (TermAttribute) filter.getAttribute(TermAttribute.class); Iterator expectedIter = expectedOutputTokens.iterator(); - while (expectedIter.hasNext()) { - assertEquals(expectedIter.next(), filter.next(reusableToken).term()); + while (expectedIter.hasNext()) {; + assertTermEquals((String)expectedIter.next(), filter, termAtt); } - assertNull(filter.next(reusableToken)); + assertFalse(filter.incrementToken()); } + + void assertTermEquals(String expected, TokenStream stream, TermAttribute termAtt) throws Exception { + assertTrue(stream.incrementToken()); + assertEquals(expected, termAtt.term()); + } } Index: src/test/org/apache/lucene/analysis/TestNumericTokenStream.java =================================================================== --- src/test/org/apache/lucene/analysis/TestNumericTokenStream.java (revision 793966) +++ src/test/org/apache/lucene/analysis/TestNumericTokenStream.java (working copy) @@ -27,9 +27,8 @@ static final long lvalue = 4573245871874382L; static final int ivalue = 123456; - public void testLongStreamNewAPI() throws Exception { + public void testLongStream() throws Exception { final NumericTokenStream stream=new NumericTokenStream().setLongValue(lvalue); - stream.setUseNewAPI(true); // use getAttribute to test if attributes really exist, if not an IAE will be throwed final TermAttribute termAtt = (TermAttribute) stream.getAttribute(TermAttribute.class); final TypeAttribute typeAtt = (TypeAttribute) stream.getAttribute(TypeAttribute.class); @@ -40,22 +39,9 @@ } assertFalse("No more tokens available", stream.incrementToken()); } - - public void testLongStreamOldAPI() throws Exception { - final NumericTokenStream stream=new NumericTokenStream().setLongValue(lvalue); - stream.setUseNewAPI(false); - Token tok=new Token(); - for (int shift=0; shift<64; shift+=NumericUtils.PRECISION_STEP_DEFAULT) { - assertNotNull("New token is available", tok=stream.next(tok)); - assertEquals("Term is correctly encoded", NumericUtils.longToPrefixCoded(lvalue, shift), tok.term()); - assertEquals("Type correct", (shift == 0) ? 
Index: src/test/org/apache/lucene/analysis/TestNumericTokenStream.java =================================================================== --- src/test/org/apache/lucene/analysis/TestNumericTokenStream.java (revision 793966) +++ src/test/org/apache/lucene/analysis/TestNumericTokenStream.java (working copy) @@ -27,9 +27,8 @@ static final long lvalue = 4573245871874382L; static final int ivalue = 123456; - public void testLongStreamNewAPI() throws Exception { + public void testLongStream() throws Exception { final NumericTokenStream stream=new NumericTokenStream().setLongValue(lvalue); - stream.setUseNewAPI(true); // use getAttribute to test if attributes really exist, if not an IAE will be thrown final TermAttribute termAtt = (TermAttribute) stream.getAttribute(TermAttribute.class); final TypeAttribute typeAtt = (TypeAttribute) stream.getAttribute(TypeAttribute.class); @@ -40,22 +39,9 @@ } assertFalse("No more tokens available", stream.incrementToken()); } - - public void testLongStreamOldAPI() throws Exception { - final NumericTokenStream stream=new NumericTokenStream().setLongValue(lvalue); - stream.setUseNewAPI(false); - Token tok=new Token(); - for (int shift=0; shift<64; shift+=NumericUtils.PRECISION_STEP_DEFAULT) { - assertNotNull("New token is available", tok=stream.next(tok)); - assertEquals("Term is correctly encoded", NumericUtils.longToPrefixCoded(lvalue, shift), tok.term()); - assertEquals("Type correct", (shift == 0) ? NumericTokenStream.TOKEN_TYPE_FULL_PREC : NumericTokenStream.TOKEN_TYPE_LOWER_PREC, tok.type()); - } - assertNull("No more tokens available", stream.next(tok)); - } - public void testIntStreamNewAPI() throws Exception { + public void testIntStream() throws Exception { final NumericTokenStream stream=new NumericTokenStream().setIntValue(ivalue); - stream.setUseNewAPI(true); // use getAttribute to test if attributes really exist, if not an IAE will be thrown final TermAttribute termAtt = (TermAttribute) stream.getAttribute(TermAttribute.class); final TypeAttribute typeAtt = (TypeAttribute) stream.getAttribute(TypeAttribute.class); @@ -67,18 +53,6 @@ assertFalse("No more tokens available", stream.incrementToken()); } - public void testIntStreamOldAPI() throws Exception { - final NumericTokenStream stream=new NumericTokenStream().setIntValue(ivalue); - stream.setUseNewAPI(false); - Token tok=new Token(); - for (int shift=0; shift<32; shift+=NumericUtils.PRECISION_STEP_DEFAULT) { - assertNotNull("New token is available", tok=stream.next(tok)); - assertEquals("Term is correctly encoded", NumericUtils.intToPrefixCoded(ivalue, shift), tok.term()); - assertEquals("Type correct", (shift == 0) ? NumericTokenStream.TOKEN_TYPE_FULL_PREC : NumericTokenStream.TOKEN_TYPE_LOWER_PREC, tok.type()); - } - assertNull("No more tokens available", stream.next(tok)); - } - public void testNotInitialized() throws Exception { final NumericTokenStream stream=new NumericTokenStream(); @@ -89,21 +63,12 @@ // pass } - stream.setUseNewAPI(true); try { stream.incrementToken(); fail("incrementToken() should not succeed."); } catch (IllegalStateException e) { // pass } - - stream.setUseNewAPI(false); - try { - stream.next(new Token()); - fail("next() should not succeed."); - } catch (IllegalStateException e) { - // pass - } } }
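With the OldAPI variants gone, testLongStream and testIntStream consume NumericTokenStream through attributes alone. A hedged sketch of the same consumption pattern as application code (the class and method names are illustrative; only API already shown in this patch is used):

    import java.io.IOException;

    import org.apache.lucene.analysis.NumericTokenStream;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;
    import org.apache.lucene.analysis.tokenattributes.TypeAttribute;

    public class NumericTermsExample {
      /** Prints one prefix-coded term per precision step for the given value. */
      public static void dumpLongTerms(long value) throws IOException {
        NumericTokenStream stream = new NumericTokenStream().setLongValue(value);
        // getAttribute() (rather than addAttribute()) throws an
        // IllegalArgumentException if the stream did not register the
        // attribute -- exactly the property the tests assert.
        TermAttribute termAtt = (TermAttribute) stream.getAttribute(TermAttribute.class);
        TypeAttribute typeAtt = (TypeAttribute) stream.getAttribute(TypeAttribute.class);
        while (stream.incrementToken()) {
          // The first token is typed TOKEN_TYPE_FULL_PREC, all following
          // tokens TOKEN_TYPE_LOWER_PREC.
          System.out.println(termAtt.term() + " (" + typeAtt.type() + ")");
        }
      }
    }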
Index: src/test/org/apache/lucene/analysis/TestTeeTokenFilter.java =================================================================== --- src/test/org/apache/lucene/analysis/TestTeeTokenFilter.java (revision 793966) +++ src/test/org/apache/lucene/analysis/TestTeeTokenFilter.java (working copy) @@ -18,9 +18,6 @@ import org.apache.lucene.analysis.standard.StandardFilter; import org.apache.lucene.analysis.standard.StandardTokenizer; -import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; -import org.apache.lucene.analysis.tokenattributes.TermAttribute; -import org.apache.lucene.util.AttributeSource; import org.apache.lucene.util.English; import org.apache.lucene.util.LuceneTestCase; @@ -43,8 +40,7 @@ super(s); } - protected void setUp() throws Exception { - super.setUp(); + protected void setUp() { tokens1 = new String[]{"The", "quick", "Burgundy", "Fox", "jumped", "over", "the", "lazy", "Red", "Dogs"}; tokens2 = new String[]{"The", "Lazy", "Dogs", "should", "stay", "on", "the", "porch"}; buffer1 = new StringBuffer(); @@ -66,29 +62,24 @@ public void test() throws IOException { SinkTokenizer sink1 = new SinkTokenizer(null) { - public void add(AttributeSource a) throws IOException { - TermAttribute termAtt = null; - if (a.hasAttribute(TermAttribute.class)) { - termAtt = (TermAttribute) a.getAttribute(TermAttribute.class); + public void add(Token t) { + if (t != null && t.term().equalsIgnoreCase("The")) { + super.add(t); } - if (termAtt != null && termAtt.term().equalsIgnoreCase("The")) { - super.add(a); - } } }; TokenStream source = new TeeTokenFilter(new WhitespaceTokenizer(new StringReader(buffer1.toString())), sink1); int i = 0; - TermAttribute termAtt = (TermAttribute) source.getAttribute(TermAttribute.class); - while (source.incrementToken()) { - assertTrue(termAtt.term() + " is not equal to " + tokens1[i], termAtt.term().equals(tokens1[i]) == true); + final Token reusableToken = new Token(); + for (Token nextToken = source.next(reusableToken); nextToken != null; nextToken = source.next(reusableToken)) { + assertTrue(nextToken.term() + " is not equal to " + tokens1[i], nextToken.term().equals(tokens1[i]) == true); i++; } assertTrue(i + " does not equal: " + tokens1.length, i == tokens1.length); assertTrue("sink1 Size: " + sink1.getTokens().size() + " is not: " + 2, sink1.getTokens().size() == 2); i = 0; - termAtt = (TermAttribute) sink1.getAttribute(TermAttribute.class); - while (sink1.incrementToken()) { - assertTrue(termAtt.term() + " is not equal to " + "The", termAtt.term().equalsIgnoreCase("The") == true); + for (Token token = sink1.next(reusableToken); token != null; token = sink1.next(reusableToken)) { + assertTrue(token.term() + " is not equal to " + "The", token.term().equalsIgnoreCase("The") == true); i++; } assertTrue(i + " does not equal: " + sink1.getTokens().size(), i == sink1.getTokens().size()); @@ -96,67 +87,55 @@ public void testMultipleSources() throws Exception { SinkTokenizer theDetector = new SinkTokenizer(null) { - public void add(AttributeSource a) throws IOException { - TermAttribute termAtt = null; - if (a.hasAttribute(TermAttribute.class)) { - termAtt = (TermAttribute) a.getAttribute(TermAttribute.class); + public void add(Token t) { + if (t != null && t.term().equalsIgnoreCase("The")) { + super.add(t); } - if (termAtt != null && termAtt.term().equalsIgnoreCase("The")) { - super.add(a); - } } }; - SinkTokenizer dogDetector = new SinkTokenizer(null) { - public void add(AttributeSource a) throws IOException { - TermAttribute termAtt = null; - if (a.hasAttribute(TermAttribute.class)) { - termAtt = (TermAttribute) a.getAttribute(TermAttribute.class); + SinkTokenizer dogDetector = new SinkTokenizer(null) { + public void add(Token t) { + if (t != null && t.term().equalsIgnoreCase("Dogs")) { + super.add(t); } - if (termAtt != null && termAtt.term().equalsIgnoreCase("Dogs")) { - super.add(a); - } } }; TokenStream source1 = new CachingTokenFilter(new TeeTokenFilter(new TeeTokenFilter(new WhitespaceTokenizer(new StringReader(buffer1.toString())), theDetector), dogDetector)); TokenStream source2 = new TeeTokenFilter(new TeeTokenFilter(new WhitespaceTokenizer(new StringReader(buffer2.toString())), theDetector), dogDetector); int i = 0; - TermAttribute termAtt = (TermAttribute) source1.getAttribute(TermAttribute.class); - while (source1.incrementToken()) { - assertTrue(termAtt.term() + " is not equal to " + tokens1[i], termAtt.term().equals(tokens1[i]) == true); + final Token reusableToken = new Token(); + for (Token nextToken = source1.next(reusableToken); nextToken != null; nextToken = source1.next(reusableToken)) { + assertTrue(nextToken.term() + " is not equal to " + tokens1[i], nextToken.term().equals(tokens1[i]) == true); i++; } assertTrue(i + " does not equal: " + tokens1.length, i == tokens1.length); assertTrue("theDetector Size: " + theDetector.getTokens().size() + " is not: " + 2, theDetector.getTokens().size() == 2); assertTrue("dogDetector Size: " + dogDetector.getTokens().size() + " is not: " + 1, dogDetector.getTokens().size() == 1); i = 0; - termAtt = (TermAttribute) source2.getAttribute(TermAttribute.class); - while (source2.incrementToken()) { - assertTrue(termAtt.term() + " is not equal to " + tokens2[i],
termAtt.term().equals(tokens2[i]) == true); + for (Token nextToken = source2.next(reusableToken); nextToken != null; nextToken = source2.next(reusableToken)) { + assertTrue(nextToken.term() + " is not equal to " + tokens2[i], nextToken.term().equals(tokens2[i]) == true); i++; } assertTrue(i + " does not equal: " + tokens2.length, i == tokens2.length); assertTrue("theDetector Size: " + theDetector.getTokens().size() + " is not: " + 4, theDetector.getTokens().size() == 4); assertTrue("dogDetector Size: " + dogDetector.getTokens().size() + " is not: " + 2, dogDetector.getTokens().size() == 2); i = 0; - termAtt = (TermAttribute) theDetector.getAttribute(TermAttribute.class); - while (theDetector.incrementToken()) { - assertTrue(termAtt.term() + " is not equal to " + "The", termAtt.term().equalsIgnoreCase("The") == true); + for (Token nextToken = theDetector.next(reusableToken); nextToken != null; nextToken = theDetector.next(reusableToken)) { + assertTrue(nextToken.term() + " is not equal to " + "The", nextToken.term().equalsIgnoreCase("The") == true); i++; } assertTrue(i + " does not equal: " + theDetector.getTokens().size(), i == theDetector.getTokens().size()); i = 0; - termAtt = (TermAttribute) dogDetector.getAttribute(TermAttribute.class); - while (dogDetector.incrementToken()) { - assertTrue(termAtt.term() + " is not equal to " + "Dogs", termAtt.term().equalsIgnoreCase("Dogs") == true); + for (Token nextToken = dogDetector.next(reusableToken); nextToken != null; nextToken = dogDetector.next(reusableToken)) { + assertTrue(nextToken.term() + " is not equal to " + "Dogs", nextToken.term().equalsIgnoreCase("Dogs") == true); i++; } assertTrue(i + " does not equal: " + dogDetector.getTokens().size(), i == dogDetector.getTokens().size()); source1.reset(); TokenStream lowerCasing = new LowerCaseFilter(source1); i = 0; - termAtt = (TermAttribute) lowerCasing.getAttribute(TermAttribute.class); - while (lowerCasing.incrementToken()) { - assertTrue(termAtt.term() + " is not equal to " + tokens1[i].toLowerCase(), termAtt.term().equals(tokens1[i].toLowerCase()) == true); + for (Token nextToken = lowerCasing.next(reusableToken); nextToken != null; nextToken = lowerCasing.next(reusableToken)) { + assertTrue(nextToken.term() + " is not equal to " + tokens1[i].toLowerCase(), nextToken.term().equals(tokens1[i].toLowerCase()) == true); i++; } assertTrue(i + " does not equal: " + tokens1.length, i == tokens1.length); @@ -178,20 +157,21 @@ } //make sure we produce the same tokens ModuloSinkTokenizer sink = new ModuloSinkTokenizer(tokCount[k], 100); + final Token reusableToken = new Token(); TokenStream stream = new TeeTokenFilter(new StandardFilter(new StandardTokenizer(new StringReader(buffer.toString()))), sink); - while (stream.incrementToken()) { + while (stream.next(reusableToken) != null) { } stream = new ModuloTokenFilter(new StandardFilter(new StandardTokenizer(new StringReader(buffer.toString()))), 100); List tmp = new ArrayList(); - while (stream.incrementToken()) { - tmp.add(stream.captureState()); + for (Token nextToken = stream.next(reusableToken); nextToken != null; nextToken = stream.next(reusableToken)) { + tmp.add(nextToken.clone()); } List sinkList = sink.getTokens(); assertTrue("tmp Size: " + tmp.size() + " is not: " + sinkList.size(), tmp.size() == sinkList.size()); for (int i = 0; i < tmp.size(); i++) { - AttributeSource tfTok = (AttributeSource) tmp.get(i); - AttributeSource sinkTok = (AttributeSource) sinkList.get(i); - assertTrue(tfTok + " is not equal to " + sinkTok + " at token: " + 
i, tfTok.equals(sinkTok) == true); + Token tfTok = (Token) tmp.get(i); + Token sinkTok = (Token) sinkList.get(i); + assertTrue(tfTok.term() + " is not equal to " + sinkTok.term() + " at token: " + i, tfTok.term().equals(sinkTok.term()) == true); } //simulate two fields, each being analyzed once, for 20 documents @@ -200,14 +180,12 @@ long start = System.currentTimeMillis(); for (int i = 0; i < 20; i++) { stream = new StandardFilter(new StandardTokenizer(new StringReader(buffer.toString()))); - PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute) stream.getAttribute(PositionIncrementAttribute.class); - while (stream.incrementToken()) { - tfPos += posIncrAtt.getPositionIncrement(); + for (Token nextToken = stream.next(reusableToken); nextToken != null; nextToken = stream.next(reusableToken)) { + tfPos += nextToken.getPositionIncrement(); } stream = new ModuloTokenFilter(new StandardFilter(new StandardTokenizer(new StringReader(buffer.toString()))), modCounts[j]); - posIncrAtt = (PositionIncrementAttribute) stream.getAttribute(PositionIncrementAttribute.class); - while (stream.incrementToken()) { - tfPos += posIncrAtt.getPositionIncrement(); + for (Token nextToken = stream.next(reusableToken); nextToken != null; nextToken = stream.next(reusableToken)) { + tfPos += nextToken.getPositionIncrement(); } } long finish = System.currentTimeMillis(); @@ -218,15 +196,13 @@ for (int i = 0; i < 20; i++) { sink = new ModuloSinkTokenizer(tokCount[k], modCounts[j]); stream = new TeeTokenFilter(new StandardFilter(new StandardTokenizer(new StringReader(buffer.toString()))), sink); - PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute) stream.getAttribute(PositionIncrementAttribute.class); - while (stream.incrementToken()) { - sinkPos += posIncrAtt.getPositionIncrement(); + for (Token nextToken = stream.next(reusableToken); nextToken != null; nextToken = stream.next(reusableToken)) { + sinkPos += nextToken.getPositionIncrement(); } //System.out.println("Modulo--------"); stream = sink; - posIncrAtt = (PositionIncrementAttribute) stream.getAttribute(PositionIncrementAttribute.class); - while (stream.incrementToken()) { - sinkPos += posIncrAtt.getPositionIncrement(); + for (Token nextToken = stream.next(reusableToken); nextToken != null; nextToken = stream.next(reusableToken)) { + sinkPos += nextToken.getPositionIncrement(); } } finish = System.currentTimeMillis(); @@ -252,15 +228,15 @@ int count = 0; //return every 100 tokens - public boolean incrementToken() throws IOException { - boolean hasNext; - for (hasNext = input.incrementToken(); - hasNext && count % modCount != 0; - hasNext = input.incrementToken()) { + public Token next(final Token reusableToken) throws IOException { + Token nextToken = null; + for (nextToken = input.next(reusableToken); + nextToken != null && count % modCount != 0; + nextToken = input.next(reusableToken)) { count++; } count++; - return hasNext; + return nextToken; } } @@ -274,9 +250,9 @@ lst = new ArrayList(numToks % mc); } - public void add(AttributeSource a) throws IOException { - if (a != null && count % modCount == 0) { - super.add(a); + public void add(Token t) { + if (t != null && count % modCount == 0) { + super.add(t); } count++; } Index: src/test/org/apache/lucene/analysis/TestTokenStreamBWComp.java =================================================================== --- src/test/org/apache/lucene/analysis/TestTokenStreamBWComp.java (revision 0) +++ src/test/org/apache/lucene/analysis/TestTokenStreamBWComp.java (revision 0) @@ -0,0 +1,262 @@ 
+package org.apache.lucene.analysis; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.io.StringReader; + +import org.apache.lucene.index.Payload; +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.analysis.tokenattributes.*; + +/** This class tests some special cases of backwards compatibility when using the new TokenStream API with old analyzers */ +public class TestTokenStreamBWComp extends LuceneTestCase { + + private final String doc = "This is the new TokenStream api"; + private final String[] stopwords = new String[] {"is", "the", "this"}; + + public static class POSToken extends Token { + public static final int PROPERNOUN = 1; + public static final int NO_NOUN = 2; + + private int partOfSpeech; + + public void setPartOfSpeech(int pos) { + partOfSpeech = pos; + } + + public int getPartOfSpeech() { + return this.partOfSpeech; + } + } + + static class PartOfSpeechTaggingFilter extends TokenFilter { + + protected PartOfSpeechTaggingFilter(TokenStream input) { + super(input); + } + + public Token next() throws IOException { + Token t = input.next(); + if (t == null) return null; + + POSToken pt = new POSToken(); + pt.reinit(t); + if (pt.termLength() > 0) { + if (Character.isUpperCase(pt.termBuffer()[0])) { + pt.setPartOfSpeech(POSToken.PROPERNOUN); + } else { + pt.setPartOfSpeech(POSToken.NO_NOUN); + } + } + return pt; + } + + } + + static class PartOfSpeechAnnotatingFilter extends TokenFilter { + public final static byte PROPER_NOUN_ANNOTATION = 1; + + + protected PartOfSpeechAnnotatingFilter(TokenStream input) { + super(input); + } + + public Token next() throws IOException { + Token t = input.next(); + if (t == null) return null; + + if (t instanceof POSToken) { + POSToken pt = (POSToken) t; + if (pt.getPartOfSpeech() == POSToken.PROPERNOUN) { + pt.setPayload(new Payload(new byte[] {PROPER_NOUN_ANNOTATION})); + } + return pt; + } else { + return t; + } + } + + } + + // test the chain: The one and only term "TokenStream" should be declared as a proper noun: + + public void testTeeSinkCustomTokenNewAPI() throws IOException { + testTeeSinkCustomToken(0); + } + + public void testTeeSinkCustomTokenOldAPI() throws IOException { + testTeeSinkCustomToken(1); + } + + public void testTeeSinkCustomTokenVeryOldAPI() throws IOException { + testTeeSinkCustomToken(2); + } + + private void testTeeSinkCustomToken(int api) throws IOException { + TokenStream stream = new WhitespaceTokenizer(new StringReader(doc)); + stream = new PartOfSpeechTaggingFilter(stream); + stream = new LowerCaseFilter(stream); + stream = new StopFilter(stream, stopwords); + + SinkTokenizer sink = new SinkTokenizer(); + TokenStream stream1 = new PartOfSpeechAnnotatingFilter(sink); + + stream = new
TeeTokenFilter(stream, sink); + stream = new PartOfSpeechAnnotatingFilter(stream); + + switch (api) { + case 0: + consumeStreamNewAPI(stream); + consumeStreamNewAPI(stream1); + break; + case 1: + consumeStreamOldAPI(stream); + consumeStreamOldAPI(stream1); + break; + case 2: + consumeStreamVeryOldAPI(stream); + consumeStreamVeryOldAPI(stream1); + break; + } + } + + private static void consumeStreamNewAPI(TokenStream stream) throws IOException { + stream.reset(); + PayloadAttribute payloadAtt = (PayloadAttribute) stream.addAttribute(PayloadAttribute.class); + TermAttribute termAtt = (TermAttribute) stream.addAttribute(TermAttribute.class); + + while (stream.incrementToken()) { + String term = termAtt.term(); + Payload p = payloadAtt.getPayload(); + if (p != null && p.getData().length == 1 && p.getData()[0] == PartOfSpeechAnnotatingFilter.PROPER_NOUN_ANNOTATION) { + assertTrue("only TokenStream is a proper noun", "tokenstream".equals(term)); + } else { + assertFalse("all other tokens (if this test fails, the special POSToken subclass is not correctly passed through the chain)", "tokenstream".equals(term)); + } + } + } + + private static void consumeStreamOldAPI(TokenStream stream) throws IOException { + stream.reset(); + Token reusableToken = new Token(); + + while ((reusableToken = stream.next(reusableToken)) != null) { + String term = reusableToken.term(); + Payload p = reusableToken.getPayload(); + if (p != null && p.getData().length == 1 && p.getData()[0] == PartOfSpeechAnnotatingFilter.PROPER_NOUN_ANNOTATION) { + assertTrue("only TokenStream is a proper noun", "tokenstream".equals(term)); + } else { + assertFalse("all other tokens (if this test fails, the special POSToken subclass is not correctly passed through the chain)", "tokenstream".equals(term)); + } + } + } + + private static void consumeStreamVeryOldAPI(TokenStream stream) throws IOException { + stream.reset(); + + Token token; + while ((token = stream.next()) != null) { + String term = token.term(); + Payload p = token.getPayload(); + if (p != null && p.getData().length == 1 && p.getData()[0] == PartOfSpeechAnnotatingFilter.PROPER_NOUN_ANNOTATION) { + assertTrue("only TokenStream is a proper noun", "tokenstream".equals(term)); + } else { + assertFalse("all other tokens (if this test fails, the special POSToken subclass is not correctly passed through the chain)", "tokenstream".equals(term)); + } + } + } + + // test that tokenization fails if only the new API is allowed and an old TokenStream is in the chain + public void testOnlyNewAPI() throws IOException { + TokenStream.setOnlyUseNewAPI(true); + try { + + // this should fail with UOE + try { + TokenStream stream = new WhitespaceTokenizer(new StringReader(doc)); + stream = new PartOfSpeechTaggingFilter(stream); // <-- this one is evil!
+ stream = new LowerCaseFilter(stream); + stream = new StopFilter(stream, stopwords); + while (stream.incrementToken()); + fail("If only the new API is allowed, this should fail with an UOE"); + } catch (UnsupportedOperationException uoe) { + assertTrue((PartOfSpeechTaggingFilter.class.getName()+" does not implement incrementToken() which is needed for onlyUseNewAPI.").equals(uoe.getMessage())); + } + + // this should pass, as all core token streams support the new API + TokenStream stream = new WhitespaceTokenizer(new StringReader(doc)); + stream = new LowerCaseFilter(stream); + stream = new StopFilter(stream, stopwords); + while (stream.incrementToken()); + + // Test if all attributes are implemented by their implementation, not Token/TokenWrapper + assertTrue("TermAttribute is implemented by TermAttributeImpl", + stream.addAttribute(TermAttribute.class) instanceof TermAttributeImpl); + assertTrue("OffsetAttribute is implemented by OffsetAttributeImpl", + stream.addAttribute(OffsetAttribute.class) instanceof OffsetAttributeImpl); + assertTrue("FlagsAttribute is implemented by FlagsAttributeImpl", + stream.addAttribute(FlagsAttribute.class) instanceof FlagsAttributeImpl); + assertTrue("PayloadAttribute is implemented by PayloadAttributeImpl", + stream.addAttribute(PayloadAttribute.class) instanceof PayloadAttributeImpl); + assertTrue("PositionIncrementAttribute is implemented by PositionIncrementAttributeImpl", + stream.addAttribute(PositionIncrementAttribute.class) instanceof PositionIncrementAttributeImpl); + assertTrue("TypeAttribute is implemented by TypeAttributeImpl", + stream.addAttribute(TypeAttribute.class) instanceof TypeAttributeImpl); + + // Test if the wrapper API (onlyUseNewAPI==false) uses TokenWrapper + // as attribute instance. + // TokenWrapper encapsulates a Token instance that can be exchanged + // by another Token instance without changing the AttributeImpl instance + // itself.
+ TokenStream.setOnlyUseNewAPI(false); + stream = new WhitespaceTokenizer(new StringReader(doc)); + assertTrue("TermAttribute is implemented by TokenWrapper", + stream.addAttribute(TermAttribute.class) instanceof TokenWrapper); + assertTrue("OffsetAttribute is implemented by TokenWrapper", + stream.addAttribute(OffsetAttribute.class) instanceof TokenWrapper); + assertTrue("FlagsAttribute is implemented by TokenWrapper", + stream.addAttribute(FlagsAttribute.class) instanceof TokenWrapper); + assertTrue("PayloadAttribute is implemented by TokenWrapper", + stream.addAttribute(PayloadAttribute.class) instanceof TokenWrapper); + assertTrue("PositionIncrementAttribute is implemented by TokenWrapper", + stream.addAttribute(PositionIncrementAttribute.class) instanceof TokenWrapper); + assertTrue("TypeAttribute is implemented by TokenWrapper", + stream.addAttribute(TypeAttribute.class) instanceof TokenWrapper); + + } finally { + TokenStream.setOnlyUseNewAPI(false); + } + } + + public void testOverridesAny() throws Exception { + try { + TokenStream stream = new WhitespaceTokenizer(new StringReader(doc)); + stream = new TokenFilter(stream) { + // we implement nothing, only un-abstract it + }; + stream = new LowerCaseFilter(stream); + stream = new StopFilter(stream, stopwords); + while (stream.incrementToken()); + fail("One TokenFilter does not override any of the required methods, so it should fail."); + } catch (UnsupportedOperationException uoe) { + assertTrue(uoe.getMessage().endsWith("does not implement any of incrementToken(), next(Token), next().")); + } + } + +} \ No newline at end of file Property changes on: src\test\org\apache\lucene\analysis\TestTokenStreamBWComp.java ___________________________________________________________________ Added: svn:keywords + Date Author Id Revision HeadURL Added: svn:eol-style + native Index: src/test/org/apache/lucene/index/TestDocumentWriter.java =================================================================== --- src/test/org/apache/lucene/index/TestDocumentWriter.java (revision 793966) +++ src/test/org/apache/lucene/index/TestDocumentWriter.java (working copy) @@ -141,11 +141,11 @@ public TokenStream tokenStream(String fieldName, Reader reader) { return new TokenFilter(new WhitespaceTokenizer(reader)) { boolean first=true; - AttributeSource state; + AttributeSource.State state; public boolean incrementToken() throws IOException { if (state != null) { - state.restoreState(this); + restoreState(state); payloadAtt.setPayload(null); posIncrAtt.setPositionIncrement(0); termAtt.setTermBuffer(new char[]{'b'}, 0, 1); Index: src/test/org/apache/lucene/index/TestIndexWriter.java =================================================================== --- src/test/org/apache/lucene/index/TestIndexWriter.java (revision 793966) +++ src/test/org/apache/lucene/index/TestIndexWriter.java (working copy) @@ -28,6 +28,7 @@ import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.SinkTokenizer; +import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.WhitespaceAnalyzer; @@ -3521,47 +3522,21 @@ } } - private static class MyAnalyzer extends Analyzer { - - public TokenStream tokenStream(String fieldName, Reader reader) { - TokenStream s = new WhitespaceTokenizer(reader); - s.addAttribute(PositionIncrementAttribute.class); - return s; - } - - } - // LUCENE-1255 public void testNegativePositions() throws Throwable { SinkTokenizer tokens = new 
SinkTokenizer(); - tokens.addAttribute(TermAttribute.class); - tokens.addAttribute(PositionIncrementAttribute.class); + Token t = new Token(); + t.setTermBuffer("a"); + t.setPositionIncrement(0); + tokens.add(t); + t.setTermBuffer("b"); + t.setPositionIncrement(1); + tokens.add(t); + t.setTermBuffer("c"); + tokens.add(t); - AttributeSource state = new AttributeSource(); - TermAttribute termAtt = (TermAttribute) state.addAttribute(TermAttribute.class); - PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute) state.addAttribute(PositionIncrementAttribute.class); - termAtt.setTermBuffer("a"); - posIncrAtt.setPositionIncrement(0); - tokens.add(state); - - state = new AttributeSource(); - termAtt = (TermAttribute) state.addAttribute(TermAttribute.class); - posIncrAtt = (PositionIncrementAttribute) state.addAttribute(PositionIncrementAttribute.class); - - termAtt.setTermBuffer("b"); - posIncrAtt.setPositionIncrement(1); - tokens.add(state); - - state = new AttributeSource(); - termAtt = (TermAttribute) state.addAttribute(TermAttribute.class); - posIncrAtt = (PositionIncrementAttribute) state.addAttribute(PositionIncrementAttribute.class); - - termAtt.setTermBuffer("c"); - posIncrAtt.setPositionIncrement(1); - tokens.add(state); - MockRAMDirectory dir = new MockRAMDirectory(); - IndexWriter w = new IndexWriter(dir, new MyAnalyzer(), true, IndexWriter.MaxFieldLength.UNLIMITED); + IndexWriter w = new IndexWriter(dir, new WhitespaceAnalyzer(), true, IndexWriter.MaxFieldLength.UNLIMITED); Document doc = new Document(); doc.add(new Field("field", tokens)); w.addDocument(doc); Index: src/test/org/apache/lucene/queryParser/TestMultiFieldQueryParser.java =================================================================== --- src/test/org/apache/lucene/queryParser/TestMultiFieldQueryParser.java (revision 793966) +++ src/test/org/apache/lucene/queryParser/TestMultiFieldQueryParser.java (working copy) @@ -17,6 +17,7 @@ * limitations under the License. */ +import java.io.IOException; import java.io.Reader; import java.util.HashMap; import java.util.Map; @@ -317,8 +318,8 @@ } private static class EmptyTokenStream extends TokenStream { - public Token next(final Token reusableToken) { - return null; + public boolean incrementToken() throws IOException { + return false; } } } Index: src/test/org/apache/lucene/util/LuceneTestCase.java =================================================================== --- src/test/org/apache/lucene/util/LuceneTestCase.java (revision 793966) +++ src/test/org/apache/lucene/util/LuceneTestCase.java (working copy) @@ -44,7 +44,6 @@ protected void setUp() throws Exception { ConcurrentMergeScheduler.setTestMode(); - TokenStream.setUseNewAPIDefault(true); } protected void tearDown() throws Exception { Index: src/test/org/apache/lucene/util/TestAttributeSource.java =================================================================== --- src/test/org/apache/lucene/util/TestAttributeSource.java (revision 0) +++ src/test/org/apache/lucene/util/TestAttributeSource.java (revision 0) @@ -0,0 +1,101 @@ +package org.apache.lucene.util; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.tokenattributes.*; + +import java.util.Iterator; + +public class TestAttributeSource extends LuceneTestCase { + + public void testCaptureState() { + // init a first instance + AttributeSource src = new AttributeSource(); + TermAttribute termAtt = (TermAttribute) src.addAttribute(TermAttribute.class); + TypeAttribute typeAtt = (TypeAttribute) src.addAttribute(TypeAttribute.class); + termAtt.setTermBuffer("TestTerm"); + typeAtt.setType("TestType"); + final int hashCode = src.hashCode(); + + AttributeSource.State state = src.captureState(); + + // modify the attributes + termAtt.setTermBuffer("AnotherTestTerm"); + typeAtt.setType("AnotherTestType"); + assertTrue("Hash code should be different", hashCode != src.hashCode()); + + src.restoreState(state); + assertEquals("TestTerm", termAtt.term()); + assertEquals("TestType", typeAtt.type()); + assertEquals("Hash code should be equal after restore", hashCode, src.hashCode()); + + // restore into an exactly configured copy + AttributeSource copy = new AttributeSource(); + copy.addAttribute(TermAttribute.class); + copy.addAttribute(TypeAttribute.class); + copy.restoreState(state); + assertEquals("Both AttributeSources should have same hashCode after restore", src.hashCode(), copy.hashCode()); + assertEquals("Both AttributeSources should be equal after restore", src, copy); + + // init a second instance (with attributes in different order and one additional attribute) + AttributeSource src2 = new AttributeSource(); + typeAtt = (TypeAttribute) src2.addAttribute(TypeAttribute.class); + FlagsAttribute flagsAtt = (FlagsAttribute) src2.addAttribute(FlagsAttribute.class); + termAtt = (TermAttribute) src2.addAttribute(TermAttribute.class); + flagsAtt.setFlags(12345); + + src2.restoreState(state); + assertEquals("TestTerm", termAtt.term()); + assertEquals("TestType", typeAtt.type()); + assertEquals("FlagsAttribute should not be touched", 12345, flagsAtt.getFlags()); + + // init a third instance missing one Attribute + AttributeSource src3 = new AttributeSource(); + termAtt = (TermAttribute) src3.addAttribute(TermAttribute.class); + try { + src3.restoreState(state); + fail("The third instance is missing the TypeAttribute, so restoreState() should throw IllegalArgumentException"); + } catch (IllegalArgumentException iae) { + // pass + } + } + + public void testToStringAndMultiAttributeImplementations() { + AttributeSource src = new AttributeSource(); + TermAttribute termAtt = (TermAttribute) src.addAttribute(TermAttribute.class); + TypeAttribute typeAtt = (TypeAttribute) src.addAttribute(TypeAttribute.class); + termAtt.setTermBuffer("TestTerm"); + typeAtt.setType("TestType"); + assertEquals("Attributes should appear in original order", "("+termAtt.toString()+","+typeAtt.toString()+")", src.toString()); + + src = new AttributeSource(); + src.addAttributeImpl(new Token()); + // this should not add a new attribute as Token implements TermAttribute, too + termAtt = (TermAttribute) src.addAttribute(TermAttribute.class); + assertTrue("TermAttribute should be implemented by
Token", termAtt instanceof Token); + // get the Token attribute and check, that it is the only one + final Iterator it = src.getAttributeImplsIterator(); + Token tok = (Token) it.next(); + assertFalse("There should be only one attribute implementation instance", it.hasNext()); + + termAtt.setTermBuffer("TestTerm"); + assertEquals("Token should only printed once", "("+tok.toString()+")", src.toString()); + } + +} Property changes on: src\test\org\apache\lucene\util\TestAttributeSource.java ___________________________________________________________________ Added: svn:keywords + Date Author Id Revision HeadURL Added: svn:eol-style + native