Index: modules/analysis/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java
===================================================================
--- modules/analysis/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java	(revision 1187651)
+++ modules/analysis/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java	(working copy)
@@ -17,15 +17,20 @@
  * limitations under the License.
  */
 
+import java.io.IOException;
 import java.io.StringReader;
 
-import org.xml.sax.InputSource;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree;
 import org.apache.lucene.analysis.core.WhitespaceTokenizer;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.util.Attribute;
+import org.apache.lucene.util.AttributeImpl;
+import org.xml.sax.InputSource;
 
 public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {
   public void testHyphenationCompoundWordsDA() throws Exception {
@@ -227,5 +232,65 @@
     assertTrue(tf.incrementToken());
     assertEquals("Rindfleischüberwachungsgesetz", termAtt.toString());
   }
+
+  public void testRetainMockAttribute() throws Exception {
+    String[] dict = { "abc", "d", "efg" };
+    Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT,
+        new StringReader("abcdefg"));
+    TokenStream stream = new MockRetainAttributeFilter(tokenizer);
+    stream = new DictionaryCompoundWordTokenFilter(
+        TEST_VERSION_CURRENT, stream, dict,
+        CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
+        CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
+        CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);
+    MockRetainAttribute retAtt = stream.addAttribute(MockRetainAttribute.class);
+    while (stream.incrementToken()) {
+      assertTrue("Custom attribute value was lost", retAtt.getRetain());
+    }
+  }
+
+  public static interface MockRetainAttribute extends Attribute {
+    void setRetain(boolean attr);
+    boolean getRetain();
+  }
+
+  public static final class MockRetainAttributeImpl extends AttributeImpl implements MockRetainAttribute {
+    private boolean retain = false;
+    @Override
+    public void clear() {
+      retain = false;
+    }
+    public boolean getRetain() {
+      return retain;
+    }
+    public void setRetain(boolean retain) {
+      this.retain = retain;
+    }
+    @Override
+    public void copyTo(AttributeImpl target) {
+      MockRetainAttribute t = (MockRetainAttribute) target;
+      t.setRetain(retain);
+    }
+  }
+
+  private static class MockRetainAttributeFilter extends TokenFilter {
+
+    MockRetainAttribute retainAtt = addAttribute(MockRetainAttribute.class);
+
+    MockRetainAttributeFilter(TokenStream input) {
+      super(input);
+    }
+
+    @Override
+    public boolean incrementToken() throws IOException {
+      if (input.incrementToken()) {
+        retainAtt.setRetain(true);
+        return true;
+      } else {
+        return false;
+      }
+    }
+  }
+
 }
Index: modules/analysis/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java
===================================================================
--- modules/analysis/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java	(revision 1187651)
+++ modules/analysis/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java	(working copy)
@@ -24,16 +24,13 @@
 import java.util.Locale;
 import java.util.Set;
 
-import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
-import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
 import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.lucene.util.AttributeSource;
 import org.apache.lucene.util.Version;
 
 /**
@@ -64,7 +61,7 @@
   public static final int DEFAULT_MAX_SUBWORD_SIZE = 15;
 
   protected final CharArraySet dictionary;
-  protected final LinkedList<Token> tokens;
+  protected final LinkedList<CompoundToken> tokens;
   protected final int minWordSize;
   protected final int minSubwordSize;
   protected final int maxSubwordSize;
@@ -72,12 +69,9 @@
 
   private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
   private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
-  private final FlagsAttribute flagsAtt = addAttribute(FlagsAttribute.class);
   private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
-  private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
-  private final PayloadAttribute payloadAtt = addAttribute(PayloadAttribute.class);
 
-  private final Token wrapper = new Token();
+  private AttributeSource.State current;
 
   protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, String[] dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
     this(matchVersion, input,makeDictionary(dictionary),minWordSize,minSubwordSize,maxSubwordSize, onlyLongestMatch);
@@ -102,7 +96,7 @@
 
   protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, Set<?> dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
     super(input);
-    this.tokens=new LinkedList<Token>();
+    this.tokens=new LinkedList<CompoundToken>();
     this.minWordSize=minWordSize;
     this.minSubwordSize=minSubwordSize;
     this.maxSubwordSize=maxSubwordSize;
@@ -137,38 +131,26 @@
     return dict;
   }
 
-  private void setToken(final Token token) throws IOException {
-    clearAttributes();
-    termAtt.copyBuffer(token.buffer(), 0, token.length());
-    flagsAtt.setFlags(token.getFlags());
-    typeAtt.setType(token.type());
-    offsetAtt.setOffset(token.startOffset(), token.endOffset());
-    posIncAtt.setPositionIncrement(token.getPositionIncrement());
-    payloadAtt.setPayload(token.getPayload());
-  }
-
   @Override
   public final boolean incrementToken() throws IOException {
     if (tokens.size() > 0) {
-      setToken(tokens.removeFirst());
+      CompoundToken token = tokens.removeFirst();
+      restoreState(current);
+      termAtt.copyBuffer(token.txt.toCharArray(), 0, token.txt.length());
+      offsetAtt.setOffset(token.startOffset, token.endOffset);
+      posIncAtt.setPositionIncrement(0);
       return true;
     }
 
     if (!input.incrementToken())
       return false;
-
-    wrapper.copyBuffer(termAtt.buffer(), 0, termAtt.length());
-    wrapper.setStartOffset(offsetAtt.startOffset());
-    wrapper.setEndOffset(offsetAtt.endOffset());
-    wrapper.setFlags(flagsAtt.getFlags());
-    wrapper.setType(typeAtt.type());
-    wrapper.setPositionIncrement(posIncAtt.getPositionIncrement());
-    wrapper.setPayload(payloadAtt.getPayload());
-
-    decompose(wrapper);
+    current = captureState();
+    decompose(new CompoundToken(termAtt.toString(), offsetAtt.startOffset(), offsetAtt.endOffset()));
+
     if (tokens.size() > 0) {
-      setToken(tokens.removeFirst());
+      CompoundToken token = tokens.removeFirst();
+      termAtt.copyBuffer(token.txt.toCharArray(), 0, token.txt.length());
       return true;
     } else {
       return false;
@@ -192,32 +174,71 @@
     return result;
   }
-
-  protected final Token createToken(final int offset, final int length,
-      final Token prototype) {
-    int newStart = prototype.startOffset() + offset;
-    Token t = prototype.clone(prototype.buffer(), offset, length, newStart, newStart+length);
-    t.setPositionIncrement(0);
-    return t;
-  }
 
-  protected void decompose(final Token token) {
+  protected void decompose(final CompoundToken token) {
     // In any case we give the original token back
-    tokens.add((Token) token.clone());
+    tokens.add(token);
 
     // Only words longer than minWordSize get processed
-    if (token.length() < this.minWordSize) {
+    if (token.txt.length() < this.minWordSize) {
       return;
     }
 
     decomposeInternal(token);
   }
 
-  protected abstract void decomposeInternal(final Token token);
+  protected abstract void decomposeInternal(final CompoundToken token);
 
   @Override
   public void reset() throws IOException {
     super.reset();
     tokens.clear();
   }
+
+  /**
+   * Helper class to hold compound token information
+   */
+  class CompoundToken {
+    private String txt;
+    private int startOffset, endOffset;
+
+    CompoundToken(String txt, int startOffset, int endOffset) {
+      this.txt = txt;
+      this.startOffset = startOffset;
+      this.endOffset = endOffset;
+    }
+
+    // Construct the compound token based on the original token
+    CompoundToken(int offset, int length, CompoundToken token) {
+      int newStart = token.startOffset + offset;
+      this.txt = token.txt.substring(offset, offset+length);
+      this.startOffset = newStart;
+      this.endOffset = newStart + length;
+    }
+
+    String getTxt() {
+      return txt;
+    }
+
+    void setTxt(String txt) {
+      this.txt = txt;
+    }
+
+    int getStartOffset() {
+      return startOffset;
+    }
+
+    void setStartOffset(int startOffset) {
+      this.startOffset = startOffset;
+    }
+
+    int getEndOffset() {
+      return endOffset;
+    }
+
+    void setEndOffset(int endOffset) {
+      this.endOffset = endOffset;
+    }
+
+  }
 }
Index: modules/analysis/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilter.java
===================================================================
--- modules/analysis/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilter.java	(revision 1187651)
+++ modules/analysis/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilter.java	(working copy)
@@ -20,7 +20,6 @@
 
 import java.util.Set;
 
-import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.util.Version;
@@ -128,31 +127,31 @@
   }
 
   @Override
-  protected void decomposeInternal(final Token token) {
+  protected void decomposeInternal(final CompoundToken token) {
    // Only words longer than minWordSize get processed
-    if (token.length() < this.minWordSize) {
+    if (token.getTxt().length() < this.minWordSize) {
      return;
    }
 
-    char[] lowerCaseTermBuffer=makeLowerCaseCopy(token.buffer());
+    char[] lowerCaseTermBuffer=makeLowerCaseCopy(token.getTxt().toCharArray());
 
-    for (int i=0;i<=token.length()-this.minSubwordSize;++i) {
-        Token longestMatchToken=null;
+    for (int i=0;i<=token.getTxt().length()-this.minSubwordSize;++i) {
+        CompoundToken longestMatchToken=null;
         for (int j=this.minSubwordSize;j<=this.maxSubwordSize;++j) {
-            if(i+j>token.length()) {
+            if(i+j>token.getTxt().length()) {
                 break;
             }
             if(dictionary.contains(lowerCaseTermBuffer, i, j)) {
                 if (this.onlyLongestMatch) {
                   if (longestMatchToken!=null) {
-                    if (longestMatchToken.length()<j) {
-                      longestMatchToken=createToken(i,j,token);
+                    if (longestMatchToken.getTxt().length()<j) {
+                      longestMatchToken=new CompoundToken(i,j,token);
                     }
                   } else {
-                    longestMatchToken=createToken(i,j,token);
+                    longestMatchToken=new CompoundToken(i,j,token);
                   }
                 } else {
-                  tokens.add(createToken(i,j,token));
+                  tokens.add(new CompoundToken(i,j,token));
                 }
             }
         }
Index: modules/analysis/common/src/java/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilter.java
===================================================================
--- modules/analysis/common/src/java/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilter.java	(revision 1187651)
+++ modules/analysis/common/src/java/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilter.java	(working copy)
@@ -217,22 +217,21 @@
   }
 
   @Override
-  protected void decomposeInternal(final Token token) {
+  protected void decomposeInternal(final CompoundToken token) {
     // get the hyphenation points
-    Hyphenation hyphens = hyphenator.hyphenate(token.buffer(), 0, token
-        .length(), 1, 1);
+    Hyphenation hyphens = hyphenator.hyphenate(token.getTxt(), 1, 1);
     // No hyphen points found -> exit
     if (hyphens == null) {
       return;
     }
 
     final int[] hyp = hyphens.getHyphenationPoints();
-    char[] lowerCaseTermBuffer=makeLowerCaseCopy(token.buffer());
+    char[] lowerCaseTermBuffer=makeLowerCaseCopy(token.getTxt().toCharArray());
 
     for (int i = 0; i < hyp.length; ++i) {
       int remaining = hyp.length - i;
       int start = hyp[i];
-      Token longestMatchToken = null;
+      CompoundToken longestMatchToken = null;
 
       for (int j = 1; j < remaining; j++) {
         int partLength = hyp[i + j] - start;
@@ -253,14 +252,14 @@
         if (dictionary == null || dictionary.contains(lowerCaseTermBuffer, start, partLength)) {
           if (this.onlyLongestMatch) {
             if (longestMatchToken != null) {
-              if (longestMatchToken.length() < partLength) {
-                longestMatchToken = createToken(start, partLength, token);
+              if (longestMatchToken.getTxt().length() < partLength) {
+                longestMatchToken = new CompoundToken(start, partLength, token);
              }
            } else {
-              longestMatchToken = createToken(start, partLength, token);
+              longestMatchToken = new CompoundToken(start, partLength, token);
            }
          } else {
-            tokens.add(createToken(start, partLength, token));
+            tokens.add(new CompoundToken(start, partLength, token));
          }
        } else if (dictionary.contains(lowerCaseTermBuffer, start,
            partLength - 1)) {
@@ -270,14 +269,14 @@
            // characters
            if (this.onlyLongestMatch) {
              if (longestMatchToken != null) {
-                if (longestMatchToken.length() < partLength - 1) {
-                  longestMatchToken = createToken(start, partLength - 1, token);
+                if (longestMatchToken.getTxt().length() < partLength - 1) {
+                  longestMatchToken = new CompoundToken(start, partLength - 1, token);
                }
              } else {
-                longestMatchToken = createToken(start, partLength - 1, token);
+                longestMatchToken = new CompoundToken(start, partLength - 1, token);
              }
            } else {
-              tokens.add(createToken(start, partLength - 1, token));
+              tokens.add(new CompoundToken(start, partLength - 1, token));
            }
          }
        }