Index: modules/analysis/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java
===================================================================
--- modules/analysis/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java	(revision 1188236)
+++ modules/analysis/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java	(working copy)
@@ -24,16 +24,14 @@
 import java.util.Locale;
 import java.util.Set;
 
-import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttributeImpl;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
-import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
 import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.lucene.util.AttributeSource;
 import org.apache.lucene.util.Version;
 
 /**
@@ -64,27 +62,24 @@
   public static final int DEFAULT_MAX_SUBWORD_SIZE = 15;
 
   protected final CharArraySet dictionary;
-  protected final LinkedList<Token> tokens;
+  protected final LinkedList<CompoundToken> tokens;
   protected final int minWordSize;
   protected final int minSubwordSize;
   protected final int maxSubwordSize;
   protected final boolean onlyLongestMatch;
 
-  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
-  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
-  private final FlagsAttribute flagsAtt = addAttribute(FlagsAttribute.class);
+  protected final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+  protected final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
   private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
-  private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
-  private final PayloadAttribute payloadAtt = addAttribute(PayloadAttribute.class);
 
-  private final Token wrapper = new Token();
+  private AttributeSource.State current;
 
   protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, String[] dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
-    this(matchVersion, input,makeDictionary(dictionary),minWordSize,minSubwordSize,maxSubwordSize, onlyLongestMatch);
+    this(matchVersion, input,makeDictionary(matchVersion,dictionary),minWordSize,minSubwordSize,maxSubwordSize, onlyLongestMatch);
   }
 
   protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, String[] dictionary, boolean onlyLongestMatch) {
-    this(matchVersion, input,makeDictionary(dictionary),DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE, onlyLongestMatch);
+    this(matchVersion, input,makeDictionary(matchVersion,dictionary),DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE, onlyLongestMatch);
   }
 
   protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, Set<?> dictionary, boolean onlyLongestMatch) {
@@ -92,7 +87,7 @@
   }
 
   protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, String[] dictionary) {
-    this(matchVersion, input,makeDictionary(dictionary),DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE, false);
+    this(matchVersion, input,makeDictionary(matchVersion,dictionary),DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE, false);
   }
 
   protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, Set<?> dictionary) {
@@ -102,7 +97,7 @@
 
   protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, Set<?> dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
     super(input);
-    this.tokens=new LinkedList<Token>();
+    this.tokens=new LinkedList<CompoundToken>();
     this.minWordSize=minWordSize;
     this.minSubwordSize=minSubwordSize;
     this.maxSubwordSize=maxSubwordSize;
@@ -111,113 +106,71 @@
     if (dictionary==null || dictionary instanceof CharArraySet) {
       this.dictionary = (CharArraySet) dictionary;
     } else {
-      this.dictionary = new CharArraySet(matchVersion, dictionary.size(), false);
-      addAllLowerCase(this.dictionary, dictionary);
+      this.dictionary = new CharArraySet(matchVersion, dictionary, true);
     }
   }
 
-  /**
-   * Create a set of words from an array
-   * The resulting Set does case insensitive matching
-   * TODO We should look for a faster dictionary lookup approach.
-   * @param dictionary
-   * @return {@link Set} of lowercased terms
-   */
-  public static Set<?> makeDictionary(final String[] dictionary) {
-    return makeDictionary(Version.LUCENE_30, dictionary);
-  }
-
-  public static Set<?> makeDictionary(final Version matchVersion, final String[] dictionary) {
+  public static CharArraySet makeDictionary(final Version matchVersion, final String[] dictionary) {
     if (dictionary == null) {
       return null;
     }
-    // is the below really case insensitive?
-    CharArraySet dict = new CharArraySet(matchVersion, dictionary.length, false);
-    addAllLowerCase(dict, Arrays.asList(dictionary));
-    return dict;
+    return new CharArraySet(matchVersion, Arrays.asList(dictionary), true);
   }
 
-  private void setToken(final Token token) throws IOException {
-    clearAttributes();
-    termAtt.copyBuffer(token.buffer(), 0, token.length());
-    flagsAtt.setFlags(token.getFlags());
-    typeAtt.setType(token.type());
-    offsetAtt.setOffset(token.startOffset(), token.endOffset());
-    posIncAtt.setPositionIncrement(token.getPositionIncrement());
-    payloadAtt.setPayload(token.getPayload());
-  }
-
   @Override
   public final boolean incrementToken() throws IOException {
-    if (tokens.size() > 0) {
-      setToken(tokens.removeFirst());
+    if (!tokens.isEmpty()) {
+      CompoundToken token = tokens.removeFirst();
+      restoreState(current);
+      termAtt.setEmpty().append(token.txt);
+      offsetAtt.setOffset(token.startOffset, token.endOffset);
+      posIncAtt.setPositionIncrement(0);
       return true;
     }
 
-    if (!input.incrementToken())
-      return false;
-
-    wrapper.copyBuffer(termAtt.buffer(), 0, termAtt.length());
-    wrapper.setStartOffset(offsetAtt.startOffset());
-    wrapper.setEndOffset(offsetAtt.endOffset());
-    wrapper.setFlags(flagsAtt.getFlags());
-    wrapper.setType(typeAtt.type());
-    wrapper.setPositionIncrement(posIncAtt.getPositionIncrement());
-    wrapper.setPayload(payloadAtt.getPayload());
-
-    decompose(wrapper);
-
-    if (tokens.size() > 0) {
-      setToken(tokens.removeFirst());
+    current = null; // not really needed, but for safety
+    if (input.incrementToken()) {
+      // Only words longer than minWordSize get processed
+      if (termAtt.length() >= this.minWordSize) {
+        decompose();
+        // only capture the state if we really need it for producing new tokens
+        if (!tokens.isEmpty()) {
+          current = captureState();
+        }
+      }
+
+      // return original token:
       return true;
     } else {
      return false;
    }
  }
 
-  protected static void addAllLowerCase(CharArraySet target, Collection<?> col) {
-    for (Object obj : col) {
-      String string = (String) obj;
-      target.add(string.toLowerCase(Locale.ENGLISH));
-    }
-  }
-
-  protected static char[] makeLowerCaseCopy(final char[] buffer) {
-    char[] result=new char[buffer.length];
-    System.arraycopy(buffer, 0, result, 0, buffer.length);
-
-    for (int i=0;i<buffer.length;) {
-       i+=Character.toChars(
-              Character.toLowerCase(
-                  Character.codePointAt(buffer, i)), result, i);
-    }
-
-    return result;
-  }
-
-  protected final Token createToken(final int offset, final int length,
-      final Token prototype) {
-    int newStart = prototype.startOffset() + offset;
-    Token t = prototype.clone(prototype.buffer(), offset, length, newStart, newStart+length);
-    t.setPositionIncrement(0);
-    return t;
-  }
-
-  protected void decompose(final Token token) {
-    // In any case we give the original token back
-    tokens.add((Token) token.clone());
-
-    // Only words longer than minWordSize get processed
-    if (token.length() < this.minWordSize) {
-      return;
-    }
-
-    decomposeInternal(token);
-  }
-
-  protected abstract void decomposeInternal(final Token token);
+  /** Decomposes the current {@link #termAtt} and places {@link CompoundToken} instances in the {@link #tokens} list. */
+  protected abstract void decompose();
+
+  /** Helper class to hold decompounded token information */
+  protected class CompoundToken {
+    public final CharSequence txt;
+    public final int startOffset, endOffset;
+
+    /** Construct the compound token based on a slice of the current {@link CompoundWordTokenFilterBase#termAtt}. */
+    public CompoundToken(int offset, int length) {
+      final int newStart = CompoundWordTokenFilterBase.this.offsetAtt.startOffset() + offset;
+      this.txt = CompoundWordTokenFilterBase.this.termAtt.subSequence(offset, offset + length);
+      this.startOffset = newStart;
+      this.endOffset = newStart + length;
+    }
+  }
 }
Index: modules/analysis/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilter.java
===================================================================
--- modules/analysis/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilter.java	(revision 1188236)
+++ modules/analysis/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilter.java	(working copy)
@@ -179,33 +179,33 @@
   @Override
-  protected void decomposeInternal(final Token token) {
-    char[] lowerCaseTermBuffer=makeLowerCaseCopy(token.buffer());
-
-    for (int i=0;i<token.length()-this.minSubwordSize;++i) {
-        Token longestMatchToken=null;
-        for (int j=this.minSubwordSize;j<this.maxSubwordSize;++j) {
-          if(i+j>token.length()) {
+  protected void decompose() {
+    final int len = termAtt.length();
+    for (int i=0;i<=len-this.minSubwordSize;++i) {
+        CompoundToken longestMatchToken=null;
+        for (int j=this.minSubwordSize;j<=this.maxSubwordSize;++j) {
+          if(i+j>len) {
             break;
           }
-          if(dictionary.contains(lowerCaseTermBuffer, i, j)) {
+          if(dictionary.contains(termAtt.buffer(), i, j)) {
             if (this.onlyLongestMatch) {
              if (longestMatchToken!=null) {
-               if (longestMatchToken.length()<j) {
-                 longestMatchToken=createToken(i,j,token);
+               if (longestMatchToken.txt.length()<j) {
+                 longestMatchToken=new CompoundToken(i,j);
                }
              } else {
-               longestMatchToken=createToken(i,j,token);
+               longestMatchToken=new CompoundToken(i,j);
              }
             } else {
-             tokens.add(createToken(i,j,token));
+             tokens.add(new CompoundToken(i,j));
             }
           }
         }
         if (this.onlyLongestMatch && longestMatchToken!=null) {
           tokens.add(longestMatchToken);
         }
     }
   }
Index: modules/analysis/common/src/java/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilter.java
===================================================================
--- modules/analysis/common/src/java/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilter.java	(revision 1188236)
+++ modules/analysis/common/src/java/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilter.java	(working copy)
@@ -218,18 +215,17 @@
   @Override
-  protected void decomposeInternal(final Token token) {
+  protected void decompose() {
     // get the hyphenation points
-    Hyphenation hyphens = hyphenator.hyphenate(token.buffer(), 0, token.length(), 1, 1);
+    Hyphenation hyphens = hyphenator.hyphenate(termAtt.buffer(), 0, termAtt.length(), 1, 1);
     // No hyphen points found -> exit
     if (hyphens == null) {
       return;
     }
 
     final int[] hyp = hyphens.getHyphenationPoints();
-    char[] lowerCaseTermBuffer=makeLowerCaseCopy(token.buffer());
 
     for (int i = 0; i < hyp.length; ++i) {
       int remaining = hyp.length - i;
       int start = hyp[i];
-      Token longestMatchToken = null;
+      CompoundToken longestMatchToken = null;
       for (int j = 1; j < remaining; j++) {
         int partLength = hyp[i + j] - start;
@@ -250,34 +247,33 @@
         }
 
         // check the dictionary
-        if (dictionary == null || dictionary.contains(lowerCaseTermBuffer, start, partLength)) {
+        if (dictionary == null || dictionary.contains(termAtt.buffer(), start, partLength)) {
           if (this.onlyLongestMatch) {
             if (longestMatchToken != null) {
-              if (longestMatchToken.length() < partLength) {
-                longestMatchToken = createToken(start, partLength, token);
+              if (longestMatchToken.txt.length() < partLength) {
+                longestMatchToken = new CompoundToken(start, partLength);
               }
             } else {
-              longestMatchToken = createToken(start, partLength, token);
+              longestMatchToken = new CompoundToken(start, partLength);
             }
           } else {
-            tokens.add(createToken(start, partLength, token));
+            tokens.add(new CompoundToken(start, partLength));
           }
-        } else if (dictionary.contains(lowerCaseTermBuffer, start,
-            partLength - 1)) {
+        } else if (dictionary.contains(termAtt.buffer(), start, partLength - 1)) {
           // check the dictionary again with a word that is one character
           // shorter
           // to avoid problems with genitive 's characters and other binding
           // characters
           if (this.onlyLongestMatch) {
             if (longestMatchToken != null) {
-              if (longestMatchToken.length() < partLength - 1) {
-                longestMatchToken = createToken(start, partLength - 1, token);
+              if (longestMatchToken.txt.length() < partLength - 1) {
+                longestMatchToken = new CompoundToken(start, partLength - 1);
              }
            } else {
-              longestMatchToken = createToken(start, partLength - 1, token);
+              longestMatchToken = new CompoundToken(start, partLength - 1);
            }
          } else {
-            tokens.add(createToken(start, partLength - 1, token));
+            tokens.add(new CompoundToken(start, partLength - 1));
          }
        }
      }
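
Note (reviewer commentary, not part of the patch): the core of the change is that the original compound token now passes through untouched, and each queued subtoken replays the complete attribute state of that original via captureState()/restoreState(), overwriting only term text, offsets, and position increment. That is what keeps custom attributes intact (see the new test below). A minimal sketch of the same pattern in isolation -- the filter class and its "expansion" rule are hypothetical; only the attribute and state APIs are real Lucene ones:

  import java.io.IOException;
  import java.util.LinkedList;

  import org.apache.lucene.analysis.TokenFilter;
  import org.apache.lucene.analysis.TokenStream;
  import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
  import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
  import org.apache.lucene.util.AttributeSource;

  // Hypothetical filter: emits each input token, then a queued variant stacked at the same position.
  public final class StackedVariantFilter extends TokenFilter {
    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
    private final LinkedList<String> pending = new LinkedList<String>();
    private AttributeSource.State current; // snapshot of the original token's attributes

    public StackedVariantFilter(TokenStream input) {
      super(input);
    }

    @Override
    public boolean incrementToken() throws IOException {
      if (!pending.isEmpty()) {
        // Replay ALL attributes of the original token (type, payload, custom ones, ...),
        // then overwrite only what this filter is responsible for.
        restoreState(current);
        termAtt.setEmpty().append(pending.removeFirst());
        posIncAtt.setPositionIncrement(0); // stack on the original position
        return true;
      }
      if (!input.incrementToken()) {
        return false;
      }
      if (termAtt.length() > 3) { // hypothetical expansion rule
        pending.add(termAtt.toString() + "s");
        current = captureState(); // capture only when more tokens will be emitted
      }
      return true; // original token passes through unchanged
    }

    @Override
    public void reset() throws IOException {
      super.reset();
      pending.clear();
      current = null;
    }
  }
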
Index: modules/analysis/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java
===================================================================
--- modules/analysis/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java	(revision 1188236)
+++ modules/analysis/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java	(working copy)
@@ -17,15 +17,20 @@
  * limitations under the License.
  */
 
+import java.io.IOException;
 import java.io.StringReader;
 
-import org.xml.sax.InputSource;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree;
 import org.apache.lucene.analysis.core.WhitespaceTokenizer;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.util.Attribute;
+import org.apache.lucene.util.AttributeImpl;
+import org.xml.sax.InputSource;
 
 public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {
 
   public void testHyphenationCompoundWordsDA() throws Exception {
@@ -166,45 +171,45 @@
     String[] dict = {"ab", "cd", "ef"};
 
     DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(TEST_VERSION_CURRENT,
-        new WhitespaceTokenizer(TEST_VERSION_CURRENT,
-            new StringReader(
-                "abcdef")
-            ),
-        dict,
-        CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
-        CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
-        CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);
+      new WhitespaceTokenizer(TEST_VERSION_CURRENT,
+          new StringReader(
+              "abcdef")
+          ),
+      dict,
+      CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
+      CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
+      CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);
 
     assertTokenStreamContents(tf,
-        new String[] { "abcdef", "ab", "cd", "ef" },
-        new int[] { 0, 0, 2, 4},
-        new int[] { 6, 2, 4, 6},
-        new int[] { 1, 0, 0, 0}
-        );
+      new String[] { "abcdef", "ab", "cd", "ef" },
+      new int[] { 0, 0, 2, 4},
+      new int[] { 6, 2, 4, 6},
+      new int[] { 1, 0, 0, 0}
+      );
   }
 
   public void testWordComponentWithLessThanMinimumLength() throws Exception {
     String[] dict = {"abc", "d", "efg"};
 
     DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(TEST_VERSION_CURRENT,
-        new WhitespaceTokenizer(TEST_VERSION_CURRENT,
-            new StringReader(
-                "abcdefg")
-            ),
-        dict,
-        CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
-        CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
-        CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);
+      new WhitespaceTokenizer(TEST_VERSION_CURRENT,
+          new StringReader(
+              "abcdefg")
+          ),
+      dict,
+      CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
+      CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
+      CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);
 
-    // since "d" is shorter than the minimum subword size, it should not be added to the token stream
+    // since "d" is shorter than the minimum subword size, it should not be added to the token stream
     assertTokenStreamContents(tf,
-        new String[] { "abcdefg", "abc", "efg" },
-        new int[] { 0, 0, 4},
-        new int[] { 7, 3, 7},
-        new int[] { 1, 0, 0}
-        );
+      new String[] { "abcdefg", "abc", "efg" },
+      new int[] { 0, 0, 4},
+      new int[] { 7, 3, 7},
+      new int[] { 1, 0, 0}
+      );
   }
-  
+
   public void testReset() throws Exception {
     String[] dict = { "Rind", "Fleisch", "Draht", "Schere", "Gesetz",
         "Aufgabe", "Überwachung" };
@@ -228,4 +233,64 @@
     assertEquals("Rindfleischüberwachungsgesetz", termAtt.toString());
   }
 
+  public void testRetainMockAttribute() throws Exception {
+    String[] dict = { "abc", "d", "efg" };
+    Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT,
+        new StringReader("abcdefg"));
+    TokenStream stream = new MockRetainAttributeFilter(tokenizer);
+    stream = new DictionaryCompoundWordTokenFilter(
+        TEST_VERSION_CURRENT, stream, dict,
+        CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
+        CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
+        CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);
+    MockRetainAttribute retAtt = stream.addAttribute(MockRetainAttribute.class);
+    while (stream.incrementToken()) {
+      assertTrue("Custom attribute value was lost", retAtt.getRetain());
+    }
+
+  }
+
+  public static interface MockRetainAttribute extends Attribute {
+    void setRetain(boolean attr);
+    boolean getRetain();
+  }
+
+  public static final class MockRetainAttributeImpl extends AttributeImpl implements MockRetainAttribute {
+    private boolean retain = false;
+    @Override
+    public void clear() {
+      retain = false;
+    }
+    public boolean getRetain() {
+      return retain;
+    }
+    public void setRetain(boolean retain) {
+      this.retain = retain;
+    }
+    @Override
+    public void copyTo(AttributeImpl target) {
+      MockRetainAttribute t = (MockRetainAttribute) target;
+      t.setRetain(retain);
+    }
+  }
+
+  private static class MockRetainAttributeFilter extends TokenFilter {
+
+    MockRetainAttribute retainAtt = addAttribute(MockRetainAttribute.class);
+
+    MockRetainAttributeFilter(TokenStream input) {
+      super(input);
+    }
+
+    @Override
+    public boolean incrementToken() throws IOException {
+      if (input.incrementToken()){
+        retainAtt.setRetain(true);
+        return true;
+      } else {
+        return false;
+      }
+    }
+  }
+
 }
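
Usage sketch (assumed, not taken from the patch or its tests; the dictionary, input, and Version constant are illustrative): after this change the filter still emits the original compound first, followed by its dictionary subwords stacked at position increment 0:

  import java.io.StringReader;

  import org.apache.lucene.analysis.TokenStream;
  import org.apache.lucene.analysis.compound.DictionaryCompoundWordTokenFilter;
  import org.apache.lucene.analysis.core.WhitespaceTokenizer;
  import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
  import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
  import org.apache.lucene.util.Version;

  public class DecompoundDemo {
    public static void main(String[] args) throws Exception {
      String[] dict = { "rind", "fleisch" }; // illustrative dictionary
      TokenStream ts = new DictionaryCompoundWordTokenFilter(Version.LUCENE_34,
          new WhitespaceTokenizer(Version.LUCENE_34, new StringReader("rindfleisch")),
          dict);
      CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
      PositionIncrementAttribute posInc = ts.addAttribute(PositionIncrementAttribute.class);
      ts.reset();
      while (ts.incrementToken()) {
        // expected: "rindfleisch" (+1), then "rind" (+0) and "fleisch" (+0)
        System.out.println(term.toString() + " +" + posInc.getPositionIncrement());
      }
      ts.end();
      ts.close();
    }
  }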