Index: modules/analysis/CHANGES.txt =================================================================== --- modules/analysis/CHANGES.txt (Revision 1092052) +++ modules/analysis/CHANGES.txt (Arbeitskopie) @@ -3,6 +3,8 @@ ======================= Trunk (not yet released) ======================= API Changes + * LUCENE-3022: Fixed the behaviour of DictionaryCompoundWordTokenFilter + for the onlyLongestMatch flag * LUCENE-2413: Deprecated PatternAnalyzer in common/miscellaneous, in favor of the pattern package (CharFilter, Tokenizer, TokenFilter). (Robert Muir) Index: modules/analysis/common/src/test/org/apache/lucene/analysis/compound/TestDictionaryCompoundWordTokenFilter.java =================================================================== --- modules/analysis/common/src/test/org/apache/lucene/analysis/compound/TestDictionaryCompoundWordTokenFilter.java (Revision 0) +++ modules/analysis/common/src/test/org/apache/lucene/analysis/compound/TestDictionaryCompoundWordTokenFilter.java (Revision 0) @@ -0,0 +1,38 @@ +package org.apache.lucene.analysis.compound; + +import java.io.StringReader; + +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.core.WhitespaceTokenizer; + +public class TestDictionaryCompoundWordTokenFilter extends BaseTokenStreamTestCase { + + /** + * When the onlyLongestMatch flag is true, only the longest matching partial word + * from the dictionary is expected as a result; otherwise you also get tokens that do not match the context + */ + public void testOnlyLongestMatch() throws Exception{ + String[] dictionary = {"streifen","reifen"}; + String test_stream_string = "streifenbluse"; + + DictionaryCompoundWordTokenFilter filter_onlylongest = new DictionaryCompoundWordTokenFilter(TEST_VERSION_CURRENT, + new WhitespaceTokenizer(TEST_VERSION_CURRENT, + new StringReader(test_stream_string)), + dictionary, + CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE, + CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE, + 
CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, + true); + DictionaryCompoundWordTokenFilter filter = new DictionaryCompoundWordTokenFilter(TEST_VERSION_CURRENT, + new WhitespaceTokenizer(TEST_VERSION_CURRENT, + new StringReader(test_stream_string)), + dictionary, + CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE, + CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE, + CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, + false); + assertTokenStreamContents(filter_onlylongest, new String[]{"streifenbluse","streifen"}); + assertTokenStreamContents(filter, new String[]{"streifenbluse","streifen","reifen"}); + } + +} Index: modules/analysis/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilter.java =================================================================== --- modules/analysis/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilter.java (Revision 1092052) +++ modules/analysis/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilter.java (Arbeitskopie) @@ -29,14 +29,14 @@ * A {@link TokenFilter} that decomposes compound words found in many Germanic languages. *
* "Donaudampfschiff" becomes Donau, dampf, schiff so that you can find - * "Donaudampfschiff" even when you only enter "schiff". + * "Donaudampfschiff" even when you only enter "schiff". * It uses a brute-force algorithm to achieve this. *
*/ public class DictionaryCompoundWordTokenFilter extends CompoundWordTokenFilterBase { /** * Creates a new {@link DictionaryCompoundWordTokenFilter} - * + * * @param matchVersion * Lucene version to enable correct Unicode 4.0 behavior in the * dictionaries if Version > 3.0. See 3.0. See CompoundWordTokenFilterBase for details. - * + * * @param input * the {@link TokenStream} to process * @param dictionary @@ -77,10 +77,10 @@ public DictionaryCompoundWordTokenFilter(Version matchVersion, TokenStream input, String[] dictionary) { super(matchVersion, input, dictionary); } - + /** * Creates a new {@link DictionaryCompoundWordTokenFilter} - * + * * @param matchVersion * Lucene version to enable correct Unicode 4.0 behavior in the * dictionaries if Version > 3.0. See 3.0. See token.length()) { break; @@ -154,11 +154,11 @@ } else { tokens.add(createToken(i,j,token)); } - } + } } - if (this.onlyLongestMatch && longestMatchToken!=null) { - tokens.add(longestMatchToken); - } } + if (this.onlyLongestMatch && longestMatchToken!=null) { + tokens.add(longestMatchToken); + } } }