Index: lucene/contrib/CHANGES.txt =================================================================== --- lucene/contrib/CHANGES.txt (revision 1188236) +++ lucene/contrib/CHANGES.txt (working copy) @@ -82,6 +82,9 @@ * LUCENE-3446: Removed BooleanFilter.finalResult() due to change to FixedBitSet. (Uwe Schindler) + * LUCENE-3508: Changed some method signatures in decompounding TokenFilters + to make them no longer use the Token class. (Uwe Schindler) + New Features * LUCENE-1824: Add BoundaryScanner interface and its implementation classes, @@ -129,6 +132,10 @@ Java that crash on certain inputs containing supplementary characters. (Robert Muir) + * LUCENE-3508: Decompounders based on CompoundWordTokenFilterBase can now be + used with custom attributes. All those attributes are preserved and set on all + added decompounded tokens. (Spyros Kapnissis, Uwe Schindler) + API Changes * LUCENE-3436: Add SuggestMode to the spellchecker, so you can specify the strategy Index: modules/analysis/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java =================================================================== --- modules/analysis/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java (revision 1188236) +++ modules/analysis/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java (working copy) @@ -24,20 +24,17 @@ import java.util.Locale; import java.util.Set; -import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.tokenattributes.FlagsAttribute; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; -import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; -import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; -import org.apache.lucene.analysis.tokenattributes.TypeAttribute; import org.apache.lucene.analysis.util.CharArraySet; +import org.apache.lucene.util.AttributeSource; import org.apache.lucene.util.Version; /** - * Base class for decomposition token filters. + * Base class for decomposition token filters. *

 * You must specify the required {@link Version} compatibility when creating
 * CompoundWordTokenFilterBase:
@@ -46,6 +43,13 @@
 * supplementary characters in strings and char arrays provided as compound word
 * dictionaries.
 *
+ * If you pass in a {@link org.apache.lucene.analysis.util.CharArraySet} as dictionary,
+ * it should be case-insensitive unless it contains only lowercased entries and you
+ * have {@link org.apache.lucene.analysis.core.LowerCaseFilter} before this filter in your analysis chain.
+ * For optimal performance (as this filter does lots of lookups to the dictionary),
+ * you should use the latter analysis chain/CharArraySet. Be aware: If you supply arbitrary
+ * {@link Set Sets} to the ctors or {@code String[]} dictionaries, they will be automatically
+ * transformed to case-insensitive!
 */
public abstract class CompoundWordTokenFilterBase extends TokenFilter {
  /**
@@ -64,37 +68,22 @@
  public static final int DEFAULT_MAX_SUBWORD_SIZE = 15;

  protected final CharArraySet dictionary;
-  protected final LinkedList<Token> tokens;
+  protected final LinkedList<CompoundToken> tokens;
  protected final int minWordSize;
  protected final int minSubwordSize;
  protected final int maxSubwordSize;
  protected final boolean onlyLongestMatch;

-  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
-  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
-  private final FlagsAttribute flagsAtt = addAttribute(FlagsAttribute.class);
+  protected final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+  protected final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
  private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
-  private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
-  private final PayloadAttribute payloadAtt = addAttribute(PayloadAttribute.class);
-
-  private final Token wrapper = new Token();
+  private AttributeSource.State current;

-  protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, String[] dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
-    this(matchVersion, input,makeDictionary(dictionary),minWordSize,minSubwordSize,maxSubwordSize, onlyLongestMatch);
-  }
-
-  protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, String[] dictionary, boolean onlyLongestMatch) {
-    this(matchVersion, input,makeDictionary(dictionary),DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE, onlyLongestMatch);
-  }
-
  protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, Set dictionary, boolean onlyLongestMatch) {
    this(matchVersion, input,dictionary,DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE, onlyLongestMatch);
  }

-  protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, String[] dictionary) {
-    this(matchVersion, input,makeDictionary(dictionary),DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE, false);
-  }
-
  protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, Set dictionary) {
    this(matchVersion, input,dictionary,DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE, false);
  }
@@ -102,7 +91,7 @@
  protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, Set dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
    super(input);

-    this.tokens=new LinkedList<Token>();
+    this.tokens=new LinkedList<CompoundToken>();
    this.minWordSize=minWordSize;
    this.minSubwordSize=minSubwordSize;
    this.maxSubwordSize=maxSubwordSize;
@@ -111,113 +100,95 @@
    if (dictionary==null || dictionary instanceof CharArraySet) {
      this.dictionary =
(CharArraySet) dictionary; } else { - this.dictionary = new CharArraySet(matchVersion, dictionary.size(), false); - addAllLowerCase(this.dictionary, dictionary); + this.dictionary = new CharArraySet(matchVersion, dictionary, true); } } - /** - * Create a set of words from an array - * The resulting Set does case insensitive matching - * TODO We should look for a faster dictionary lookup approach. - * @param dictionary - * @return {@link Set} of lowercased terms - */ - public static Set makeDictionary(final String[] dictionary) { - return makeDictionary(Version.LUCENE_30, dictionary); + /** @deprecated Use the constructors taking {@link Set} */ + @Deprecated + protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, String[] dictionary) { + this(matchVersion, input,makeDictionary(matchVersion,dictionary),DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE, false); } + + /** @deprecated Use the constructors taking {@link Set} */ + @Deprecated + protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, String[] dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) { + this(matchVersion, input,makeDictionary(matchVersion,dictionary),minWordSize,minSubwordSize,maxSubwordSize, onlyLongestMatch); + } - public static Set makeDictionary(final Version matchVersion, final String[] dictionary) { + /** @deprecated Use the constructors taking {@link Set} */ + @Deprecated + protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, String[] dictionary, boolean onlyLongestMatch) { + this(matchVersion, input,makeDictionary(matchVersion,dictionary),DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE, onlyLongestMatch); + } + + /** @deprecated Only available for backwards compatibility. */ + @Deprecated + public static CharArraySet makeDictionary(final Version matchVersion, final String[] dictionary) { if (dictionary == null) { return null; } - // is the below really case insensitive? 
- CharArraySet dict = new CharArraySet(matchVersion, dictionary.length, false); - addAllLowerCase(dict, Arrays.asList(dictionary)); - return dict; + return new CharArraySet(matchVersion, Arrays.asList(dictionary), true); } - private void setToken(final Token token) throws IOException { - clearAttributes(); - termAtt.copyBuffer(token.buffer(), 0, token.length()); - flagsAtt.setFlags(token.getFlags()); - typeAtt.setType(token.type()); - offsetAtt.setOffset(token.startOffset(), token.endOffset()); - posIncAtt.setPositionIncrement(token.getPositionIncrement()); - payloadAtt.setPayload(token.getPayload()); - } - @Override public final boolean incrementToken() throws IOException { - if (tokens.size() > 0) { - setToken(tokens.removeFirst()); + if (!tokens.isEmpty()) { + assert current != null; + CompoundToken token = tokens.removeFirst(); + restoreState(current); // keep all other attributes untouched + termAtt.setEmpty().append(token.txt); + offsetAtt.setOffset(token.startOffset, token.endOffset); + posIncAtt.setPositionIncrement(0); return true; } - if (!input.incrementToken()) - return false; - - wrapper.copyBuffer(termAtt.buffer(), 0, termAtt.length()); - wrapper.setStartOffset(offsetAtt.startOffset()); - wrapper.setEndOffset(offsetAtt.endOffset()); - wrapper.setFlags(flagsAtt.getFlags()); - wrapper.setType(typeAtt.type()); - wrapper.setPositionIncrement(posIncAtt.getPositionIncrement()); - wrapper.setPayload(payloadAtt.getPayload()); - - decompose(wrapper); - - if (tokens.size() > 0) { - setToken(tokens.removeFirst()); + current = null; // not really needed, but for safety + if (input.incrementToken()) { + // Only words longer than minWordSize get processed + if (termAtt.length() >= this.minWordSize) { + decompose(); + // only capture the state if we really need it for producing new tokens + if (!tokens.isEmpty()) { + current = captureState(); + } + } + // return original token: return true; } else { return false; } } - - protected static void addAllLowerCase(CharArraySet target, Collection col) { - for (Object obj : col) { - String string = (String) obj; - target.add(string.toLowerCase(Locale.ENGLISH)); - } - } - - protected static char[] makeLowerCaseCopy(final char[] buffer) { - char[] result=new char[buffer.length]; - System.arraycopy(buffer, 0, result, 0, buffer.length); - - for (int i=0;i + *

+ * You must specify the required {@link Version} compatibility when creating + * CompoundWordTokenFilterBase: + *

+ * If you pass in a {@link org.apache.lucene.analysis.util.CharArraySet} as dictionary,
+ * it should be case-insensitive unless it contains only lowercased entries and you
+ * have {@link org.apache.lucene.analysis.core.LowerCaseFilter} before this filter in your analysis chain.
+ * For optimal performance (as this filter does lots of lookups to the dictionary),
+ * you should use the latter analysis chain/CharArraySet. Be aware: If you supply arbitrary
+ * {@link Set Sets} to the ctors or {@code String[]} dictionaries, they will be automatically
+ * transformed to case-insensitive!
 */
public class DictionaryCompoundWordTokenFilter extends CompoundWordTokenFilterBase {
  /**
-   * Creates a new {@link DictionaryCompoundWordTokenFilter}
-   *
+   * Creates a new {@link DictionaryCompoundWordTokenFilter}.
   * @param matchVersion
   *          Lucene version to enable correct Unicode 4.0 behavior in the
   *          dictionaries if Version > 3.0. See dictionary) {
    super(matchVersion, input, dictionary);
  }
@@ -109,10 +122,7 @@
   * @param input
   *          the {@link TokenStream} to process
   * @param dictionary
-   *          the word dictionary to match against. If this is a
-   *          {@link org.apache.lucene.analysis.util.CharArraySet CharArraySet} it
-   *          must have set ignoreCase=false and only contain lower case
-   *          strings.
+   *          the word dictionary to match against.
   * @param minWordSize
   *          only words longer than this get processed
   * @param minSubwordSize
@@ -122,37 +132,31 @@
   * @param onlyLongestMatch
   *          Add only the longest matching subword to the stream
   */
-  public DictionaryCompoundWordTokenFilter(Version matchVersion, TokenStream input, Set dictionary,
+  public DictionaryCompoundWordTokenFilter(Version matchVersion, TokenStream input, Set dictionary,
      int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
    super(matchVersion, input, dictionary, minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch);
  }

  @Override
-  protected void decomposeInternal(final Token token) {
-    // Only words longer than minWordSize get processed
-    if (token.length() < this.minWordSize) {
-      return;
-    }
-
-    char[] lowerCaseTermBuffer=makeLowerCaseCopy(token.buffer());
-
-    for (int i=0;i<=token.length()-this.minSubwordSize;++i) {
-        Token longestMatchToken=null;
+  protected void decompose() {
+    final int len = termAtt.length();
+    for (int i=0;i<=len-this.minSubwordSize;++i) {
+        CompoundToken longestMatchToken=null;
        for (int j=this.minSubwordSize;j<=this.maxSubwordSize;++j) {
-          if(i+j>token.length()) {
+          if(i+j>len) {
            break;
          }
-          if(dictionary.contains(lowerCaseTermBuffer, i, j)) {
+          if(dictionary.contains(termAtt.buffer(), i, j)) {
            if (this.onlyLongestMatch) {
              if (longestMatchToken!=null) {
-                if (longestMatchToken.length()

+ * You must specify the required {@link Version} compatibility when creating + * CompoundWordTokenFilterBase: + *

+ * If you pass in a {@link org.apache.lucene.analysis.util.CharArraySet} as dictionary,
+ * it should be case-insensitive unless it contains only lowercased entries and you
+ * have {@link org.apache.lucene.analysis.core.LowerCaseFilter} before this filter in your analysis chain.
+ * For optimal performance (as this filter does lots of lookups to the dictionary),
+ * you should use the latter analysis chain/CharArraySet. Be aware: If you supply arbitrary
+ * {@link Set Sets} to the ctors or {@code String[]} dictionaries, they will be automatically
+ * transformed to case-insensitive!
 */
public class HyphenationCompoundWordTokenFilter extends CompoundWordTokenFilterBase {
@@ -62,7 +75,9 @@
   *          only subwords shorter than this get to the output stream
   * @param onlyLongestMatch
   *          Add only the longest matching subword to the stream
+   * @deprecated Use the constructors taking {@link Set}
   */
+  @Deprecated
  public HyphenationCompoundWordTokenFilter(Version matchVersion, TokenStream input,
      HyphenationTree hyphenator, String[] dictionary, int minWordSize,
      int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
@@ -86,10 +101,12 @@
   *          the hyphenation pattern tree to use for hyphenation
   * @param dictionary
   *          the word dictionary to match against
+   * @deprecated Use the constructors taking {@link Set}
   */
+  @Deprecated
  public HyphenationCompoundWordTokenFilter(Version matchVersion, TokenStream input,
      HyphenationTree hyphenator, String[] dictionary) {
-    this(matchVersion, input, hyphenator, makeDictionary(dictionary), DEFAULT_MIN_WORD_SIZE,
+    this(matchVersion, input, hyphenator, makeDictionary(matchVersion,dictionary), DEFAULT_MIN_WORD_SIZE,
        DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE, false);
  }
@@ -106,10 +123,7 @@
   * @param hyphenator
   *          the hyphenation pattern tree to use for hyphenation
   * @param dictionary
-   *          the word dictionary to match against. If this is a
-   *          {@link org.apache.lucene.analysis.util.CharArraySet CharArraySet} it
-   *          must have set ignoreCase=false and only contain lower case
-   *          strings.
+   *          the word dictionary to match against.
   */
  public HyphenationCompoundWordTokenFilter(Version matchVersion, TokenStream input,
      HyphenationTree hyphenator, Set dictionary) {
@@ -130,10 +144,7 @@
   * @param hyphenator
   *          the hyphenation pattern tree to use for hyphenation
   * @param dictionary
-   *          the word dictionary to match against. If this is a
-   *          {@link org.apache.lucene.analysis.util.CharArraySet CharArraySet} it
-   *          must have set ignoreCase=false and only contain lower case
-   *          strings.
+   *          the word dictionary to match against.
* @param minWordSize * only words longer than this get processed * @param minSubwordSize @@ -218,22 +229,20 @@ } @Override - protected void decomposeInternal(final Token token) { + protected void decompose() { // get the hyphenation points - Hyphenation hyphens = hyphenator.hyphenate(token.buffer(), 0, token - .length(), 1, 1); + Hyphenation hyphens = hyphenator.hyphenate(termAtt.buffer(), 0, termAtt.length(), 1, 1); // No hyphen points found -> exit if (hyphens == null) { return; } final int[] hyp = hyphens.getHyphenationPoints(); - char[] lowerCaseTermBuffer=makeLowerCaseCopy(token.buffer()); for (int i = 0; i < hyp.length; ++i) { int remaining = hyp.length - i; int start = hyp[i]; - Token longestMatchToken = null; + CompoundToken longestMatchToken = null; for (int j = 1; j < remaining; j++) { int partLength = hyp[i + j] - start; @@ -250,34 +259,33 @@ } // check the dictionary - if (dictionary == null || dictionary.contains(lowerCaseTermBuffer, start, partLength)) { + if (dictionary == null || dictionary.contains(termAtt.buffer(), start, partLength)) { if (this.onlyLongestMatch) { if (longestMatchToken != null) { - if (longestMatchToken.length() < partLength) { - longestMatchToken = createToken(start, partLength, token); + if (longestMatchToken.txt.length() < partLength) { + longestMatchToken = new CompoundToken(start, partLength); } } else { - longestMatchToken = createToken(start, partLength, token); + longestMatchToken = new CompoundToken(start, partLength); } } else { - tokens.add(createToken(start, partLength, token)); + tokens.add(new CompoundToken(start, partLength)); } - } else if (dictionary.contains(lowerCaseTermBuffer, start, - partLength - 1)) { + } else if (dictionary.contains(termAtt.buffer(), start, partLength - 1)) { // check the dictionary again with a word that is one character // shorter // to avoid problems with genitive 's characters and other binding // characters if (this.onlyLongestMatch) { if (longestMatchToken != null) { - if (longestMatchToken.length() < partLength - 1) { - longestMatchToken = createToken(start, partLength - 1, token); + if (longestMatchToken.txt.length() < partLength - 1) { + longestMatchToken = new CompoundToken(start, partLength - 1); } } else { - longestMatchToken = createToken(start, partLength - 1, token); + longestMatchToken = new CompoundToken(start, partLength - 1); } } else { - tokens.add(createToken(start, partLength - 1, token)); + tokens.add(new CompoundToken(start, partLength - 1)); } } } Index: modules/analysis/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java =================================================================== --- modules/analysis/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java (revision 1188236) +++ modules/analysis/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java (working copy) @@ -17,15 +17,20 @@ * limitations under the License. 
*/ +import java.io.IOException; import java.io.StringReader; -import org.xml.sax.InputSource; import org.apache.lucene.analysis.BaseTokenStreamTestCase; import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree; import org.apache.lucene.analysis.core.WhitespaceTokenizer; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.util.Attribute; +import org.apache.lucene.util.AttributeImpl; +import org.xml.sax.InputSource; public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase { public void testHyphenationCompoundWordsDA() throws Exception { @@ -166,45 +171,45 @@ String[] dict = {"ab", "cd", "ef"}; DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(TEST_VERSION_CURRENT, - new WhitespaceTokenizer(TEST_VERSION_CURRENT, - new StringReader( - "abcdef") - ), - dict, - CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE, - CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE, - CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false); + new WhitespaceTokenizer(TEST_VERSION_CURRENT, + new StringReader( + "abcdef") + ), + dict, + CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE, + CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE, + CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false); assertTokenStreamContents(tf, - new String[] { "abcdef", "ab", "cd", "ef" }, - new int[] { 0, 0, 2, 4}, - new int[] { 6, 2, 4, 6}, - new int[] { 1, 0, 0, 0} - ); + new String[] { "abcdef", "ab", "cd", "ef" }, + new int[] { 0, 0, 2, 4}, + new int[] { 6, 2, 4, 6}, + new int[] { 1, 0, 0, 0} + ); } public void testWordComponentWithLessThanMinimumLength() throws Exception { String[] dict = {"abc", "d", "efg"}; DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(TEST_VERSION_CURRENT, - new WhitespaceTokenizer(TEST_VERSION_CURRENT, - new StringReader( - "abcdefg") - ), - dict, - CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE, - CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE, - CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false); + new WhitespaceTokenizer(TEST_VERSION_CURRENT, + new StringReader( + "abcdefg") + ), + dict, + CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE, + CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE, + CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false); - // since "d" is shorter than the minimum subword size, it should not be added to the token stream + // since "d" is shorter than the minimum subword size, it should not be added to the token stream assertTokenStreamContents(tf, - new String[] { "abcdefg", "abc", "efg" }, - new int[] { 0, 0, 4}, - new int[] { 7, 3, 7}, - new int[] { 1, 0, 0} - ); + new String[] { "abcdefg", "abc", "efg" }, + new int[] { 0, 0, 4}, + new int[] { 7, 3, 7}, + new int[] { 1, 0, 0} + ); } - + public void testReset() throws Exception { String[] dict = { "Rind", "Fleisch", "Draht", "Schere", "Gesetz", "Aufgabe", "Überwachung" }; @@ -228,4 +233,64 @@ assertEquals("Rindfleischüberwachungsgesetz", termAtt.toString()); } + public void testRetainMockAttribute() throws Exception { + String[] dict = { "abc", "d", "efg" }; + Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, + new StringReader("abcdefg")); + TokenStream stream = new MockRetainAttributeFilter(tokenizer); + stream = new DictionaryCompoundWordTokenFilter( 
+        TEST_VERSION_CURRENT, stream, dict,
+        CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
+        CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
+        CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);
+    MockRetainAttribute retAtt = stream.addAttribute(MockRetainAttribute.class);
+    while (stream.incrementToken()) {
+      assertTrue("Custom attribute value was lost", retAtt.getRetain());
+    }
+
+  }
+
+  public static interface MockRetainAttribute extends Attribute {
+    void setRetain(boolean attr);
+    boolean getRetain();
+  }
+
+  public static final class MockRetainAttributeImpl extends AttributeImpl implements MockRetainAttribute {
+    private boolean retain = false;
+    @Override
+    public void clear() {
+      retain = false;
+    }
+    public boolean getRetain() {
+      return retain;
+    }
+    public void setRetain(boolean retain) {
+      this.retain = retain;
+    }
+    @Override
+    public void copyTo(AttributeImpl target) {
+      MockRetainAttribute t = (MockRetainAttribute) target;
+      t.setRetain(retain);
+    }
+  }
+
+  private static class MockRetainAttributeFilter extends TokenFilter {
+
+    MockRetainAttribute retainAtt = addAttribute(MockRetainAttribute.class);
+
+    MockRetainAttributeFilter(TokenStream input) {
+      super(input);
+    }
+
+    @Override
+    public boolean incrementToken() throws IOException {
+      if (input.incrementToken()){
+        retainAtt.setRetain(true);
+        return true;
+      } else {
+        return false;
+      }
+    }
+  }
+
 }
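
For reviewers, a minimal usage sketch of the analysis chain the new javadocs recommend: lowercased dictionary entries plus a LowerCaseFilter in front of the decompounder, so the CharArraySet itself can stay case-sensitive. This is illustrative only and not part of the patch; Version.LUCENE_40 and the German sample words are assumptions.

import java.io.StringReader;
import java.util.Arrays;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.compound.DictionaryCompoundWordTokenFilter;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.Version;

public class DecompoundingExample {
  public static void main(String[] args) throws Exception {
    final Version matchVersion = Version.LUCENE_40; // assumed trunk constant, adjust as needed

    // Lowercased entries + LowerCaseFilter before the decompounder, as the javadocs suggest;
    // the CharArraySet can then be created with ignoreCase=false (the faster lookup path).
    CharArraySet dict = new CharArraySet(matchVersion,
        Arrays.asList("rind", "fleisch", "überwachung", "gesetz"), false);

    Tokenizer tokenizer = new WhitespaceTokenizer(matchVersion,
        new StringReader("Rindfleischüberwachungsgesetz"));
    TokenStream stream = new LowerCaseFilter(matchVersion, tokenizer);
    stream = new DictionaryCompoundWordTokenFilter(matchVersion, stream, dict);

    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAtt = stream.addAttribute(OffsetAttribute.class);
    PositionIncrementAttribute posIncAtt = stream.addAttribute(PositionIncrementAttribute.class);

    stream.reset();
    while (stream.incrementToken()) {
      // The original token is emitted first; the decompounded subwords follow at
      // position increment 0, and attributes set by upstream filters are preserved on them.
      System.out.println(termAtt + " posInc=" + posIncAtt.getPositionIncrement()
          + " offsets=" + offsetAtt.startOffset() + "-" + offsetAtt.endOffset());
    }
    stream.end();
    stream.close();
  }
}

If you instead hand the filter an arbitrary Set or use the now-deprecated String[] constructors, the dictionary is copied into a case-insensitive CharArraySet for you, so the setup above is only an optimization, not a requirement.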
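
A similar hedged sketch for the hyphenation-based variant. The grammar file name (de_DR.xml) and dictionary words are placeholders; getHyphenationTree(InputSource) is the loader the existing tests already use, assumed unchanged by this patch.

import java.io.FileInputStream;
import java.io.StringReader;
import java.util.Arrays;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.compound.HyphenationCompoundWordTokenFilter;
import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.Version;
import org.xml.sax.InputSource;

public class HyphenationDecompoundingExample {
  public static void main(String[] args) throws Exception {
    final Version matchVersion = Version.LUCENE_40; // assumed, adjust to your release

    // Load a TeX hyphenation grammar (placeholder file name).
    InputSource grammar = new InputSource(new FileInputStream("de_DR.xml"));
    HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter.getHyphenationTree(grammar);

    CharArraySet dict = new CharArraySet(matchVersion,
        Arrays.asList("rind", "fleisch", "draht", "schere"), false);

    Tokenizer tokenizer = new WhitespaceTokenizer(matchVersion,
        new StringReader("Drahtschere Rindfleisch"));
    TokenStream stream = new LowerCaseFilter(matchVersion, tokenizer);
    // Subwords are only emitted where a hyphenation point lines up with a dictionary entry.
    stream = new HyphenationCompoundWordTokenFilter(matchVersion, stream, hyphenator, dict);

    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
      System.out.println(termAtt.toString());
    }
    stream.end();
    stream.close();
  }
}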
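
Finally, to illustrate the signature change noted in CHANGES.txt (decompose() plus CompoundToken replacing decomposeInternal(Token)), here is a toy subclass. It is only a sketch of the new extension point, not code from the patch; the class name and the fixed-width splitting rule are invented.

import java.util.Set;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.compound.CompoundWordTokenFilterBase;
import org.apache.lucene.util.Version;

/** Toy decompounder: emits every two-character slice of a long-enough token. */
public final class FixedSliceDecompounder extends CompoundWordTokenFilterBase {

  public FixedSliceDecompounder(Version matchVersion, TokenStream input, Set dictionary) {
    super(matchVersion, input, dictionary);
  }

  @Override
  protected void decompose() {
    // The base class only calls decompose() for tokens of at least minWordSize characters;
    // the current surface form is read from the protected termAtt instead of a passed-in Token.
    final int len = termAtt.length();
    for (int i = 0; i + 2 <= len; i += 2) {
      // CompoundToken(offset, length) snapshots the slice; the base class re-emits it after
      // the original token, restoring all other attributes via captureState/restoreState.
      tokens.add(new CompoundToken(i, 2));
    }
  }
}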