Index: modules/analysis/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java =================================================================== --- modules/analysis/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java (revision 1188604) +++ modules/analysis/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java (working copy) @@ -48,7 +48,7 @@ * have {@link org.apache.lucene.analysis.core.LowerCaseFilter} before this filter in your analysis chain. * For optional performance (as this filter does lots of lookups to the dictionary, * you should use the latter analysis chain/CharArraySet). Be aware: If you supply arbitrary - * {@link Set Sets} to the ctors or {@code String[]} dictionaries, they will be automatically + * {@link Set Sets} to the ctors, they will be automatically * transformed to case-insensitive! */ public abstract class CompoundWordTokenFilterBase extends TokenFilter { @@ -103,34 +103,7 @@ this.dictionary = new CharArraySet(matchVersion, dictionary, true); } } - - /** @deprecated Use the constructors taking {@link Set} */ - @Deprecated - protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, String[] dictionary) { - this(matchVersion, input,makeDictionary(matchVersion,dictionary),DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE, false); - } - - /** @deprecated Use the constructors taking {@link Set} */ - @Deprecated - protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, String[] dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) { - this(matchVersion, input,makeDictionary(matchVersion,dictionary),minWordSize,minSubwordSize,maxSubwordSize, onlyLongestMatch); - } - /** @deprecated Use the constructors taking {@link Set} */ - @Deprecated - protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, String[] dictionary, boolean onlyLongestMatch) { - this(matchVersion, input,makeDictionary(matchVersion,dictionary),DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE, onlyLongestMatch); - } - - /** @deprecated Only available for backwards compatibility. */ - @Deprecated - public static CharArraySet makeDictionary(final Version matchVersion, final String[] dictionary) { - if (dictionary == null) { - return null; - } - return new CharArraySet(matchVersion, Arrays.asList(dictionary), true); - } - @Override public final boolean incrementToken() throws IOException { if (!tokens.isEmpty()) { @@ -190,5 +163,5 @@ this.endOffset = newStart + length; } - } + } } Index: modules/analysis/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilter.java =================================================================== --- modules/analysis/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilter.java (revision 1188604) +++ modules/analysis/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilter.java (working copy) @@ -43,56 +43,10 @@ * have {@link org.apache.lucene.analysis.core.LowerCaseFilter} before this filter in your analysis chain. * For optional performance (as this filter does lots of lookups to the dictionary, * you should use the latter analysis chain/CharArraySet). Be aware: If you supply arbitrary - * {@link Set Sets} to the ctors or {@code String[]} dictionaries, they will be automatically + * {@link Set Sets} to the ctors, they will be automatically * transformed to case-insensitive! */ public class DictionaryCompoundWordTokenFilter extends CompoundWordTokenFilterBase { - /** - * Creates a new {@link DictionaryCompoundWordTokenFilter}. - * @param matchVersion - * Lucene version to enable correct Unicode 4.0 behavior in the - * dictionaries if Version > 3.0. See CompoundWordTokenFilterBase for details. - * @param input - * the {@link TokenStream} to process - * @param dictionary - * the word dictionary to match against - * @param minWordSize - * only words longer than this get processed - * @param minSubwordSize - * only subwords longer than this get to the output stream - * @param maxSubwordSize - * only subwords shorter than this get to the output stream - * @param onlyLongestMatch - * Add only the longest matching subword to the stream - * @deprecated Use the constructors taking {@link Set} - */ - @Deprecated - public DictionaryCompoundWordTokenFilter(Version matchVersion, TokenStream input, String[] dictionary, - int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) { - super(matchVersion, input, dictionary, minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch); - } - - /** - * Creates a new {@link DictionaryCompoundWordTokenFilter} - * - * @param matchVersion - * Lucene version to enable correct Unicode 4.0 behavior in the - * dictionaries if Version > 3.0. See CompoundWordTokenFilterBase for details. - * - * @param input - * the {@link TokenStream} to process - * @param dictionary - * the word dictionary to match against - * @deprecated Use the constructors taking {@link Set} - */ - @Deprecated - public DictionaryCompoundWordTokenFilter(Version matchVersion, TokenStream input, String[] dictionary) { - super(matchVersion, input, dictionary); - } /** * Creates a new {@link DictionaryCompoundWordTokenFilter} Index: modules/analysis/common/src/java/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilter.java =================================================================== --- modules/analysis/common/src/java/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilter.java (revision 1188604) +++ modules/analysis/common/src/java/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilter.java (working copy) @@ -46,71 +46,14 @@ * have {@link org.apache.lucene.analysis.core.LowerCaseFilter} before this filter in your analysis chain. * For optional performance (as this filter does lots of lookups to the dictionary, * you should use the latter analysis chain/CharArraySet). Be aware: If you supply arbitrary - * {@link Set Sets} to the ctors or {@code String[]} dictionaries, they will be automatically + * {@link Set Sets} to the ctors, they will be automatically * transformed to case-insensitive! */ public class HyphenationCompoundWordTokenFilter extends CompoundWordTokenFilterBase { private HyphenationTree hyphenator; - - /** - * Creates a new {@link HyphenationCompoundWordTokenFilter} instance. - * - * @param matchVersion - * Lucene version to enable correct Unicode 4.0 behavior in the - * dictionaries if Version > 3.0. See CompoundWordTokenFilterBase for details. - * @param input - * the {@link TokenStream} to process - * @param hyphenator - * the hyphenation pattern tree to use for hyphenation - * @param dictionary - * the word dictionary to match against - * @param minWordSize - * only words longer than this get processed - * @param minSubwordSize - * only subwords longer than this get to the output stream - * @param maxSubwordSize - * only subwords shorter than this get to the output stream - * @param onlyLongestMatch - * Add only the longest matching subword to the stream - * @deprecated Use the constructors taking {@link Set} - */ - @Deprecated - public HyphenationCompoundWordTokenFilter(Version matchVersion, TokenStream input, - HyphenationTree hyphenator, String[] dictionary, int minWordSize, - int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) { - super(matchVersion, input, dictionary, minWordSize, minSubwordSize, maxSubwordSize, - onlyLongestMatch); - this.hyphenator = hyphenator; - } - /** - * Creates a new {@link HyphenationCompoundWordTokenFilter} instance. - * - * @param matchVersion - * Lucene version to enable correct Unicode 4.0 behavior in the - * dictionaries if Version > 3.0. See CompoundWordTokenFilterBase for details. - * @param input - * the {@link TokenStream} to process - * @param hyphenator - * the hyphenation pattern tree to use for hyphenation - * @param dictionary - * the word dictionary to match against - * @deprecated Use the constructors taking {@link Set} - */ - @Deprecated - public HyphenationCompoundWordTokenFilter(Version matchVersion, TokenStream input, - HyphenationTree hyphenator, String[] dictionary) { - this(matchVersion, input, hyphenator, makeDictionary(matchVersion,dictionary), DEFAULT_MIN_WORD_SIZE, - DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE, false); - } - - /** * Creates a new {@link HyphenationCompoundWordTokenFilter} instance. * * @param matchVersion Index: modules/analysis/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java =================================================================== --- modules/analysis/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java (revision 1188604) +++ modules/analysis/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java (working copy) @@ -19,6 +19,7 @@ import java.io.IOException; import java.io.StringReader; +import java.util.Arrays; import org.apache.lucene.analysis.BaseTokenStreamTestCase; import org.apache.lucene.analysis.MockTokenizer; @@ -27,14 +28,20 @@ import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree; import org.apache.lucene.analysis.core.WhitespaceTokenizer; +import org.apache.lucene.analysis.util.CharArraySet; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.util.Attribute; import org.apache.lucene.util.AttributeImpl; import org.xml.sax.InputSource; public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase { + + private static CharArraySet makeDictionary(String... dictionary) { + return new CharArraySet(TEST_VERSION_CURRENT, Arrays.asList(dictionary), true); + } + public void testHyphenationCompoundWordsDA() throws Exception { - String[] dict = { "læse", "hest" }; + CharArraySet dict = makeDictionary("læse", "hest"); InputSource is = new InputSource(getClass().getResource("da_UTF8.xml").toExternalForm()); HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter @@ -53,7 +60,7 @@ } public void testHyphenationCompoundWordsDELongestMatch() throws Exception { - String[] dict = { "basketball", "basket", "ball", "kurv" }; + CharArraySet dict = makeDictionary("basketball", "basket", "ball", "kurv"); InputSource is = new InputSource(getClass().getResource("da_UTF8.xml").toExternalForm()); HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter @@ -121,9 +128,9 @@ } public void testDumbCompoundWordsSE() throws Exception { - String[] dict = { "Bil", "Dörr", "Motor", "Tak", "Borr", "Slag", "Hammar", + CharArraySet dict = makeDictionary("Bil", "Dörr", "Motor", "Tak", "Borr", "Slag", "Hammar", "Pelar", "Glas", "Ögon", "Fodral", "Bas", "Fiol", "Makare", "Gesäll", - "Sko", "Vind", "Rute", "Torkare", "Blad" }; + "Sko", "Vind", "Rute", "Torkare", "Blad"); DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(TEST_VERSION_CURRENT, new MockTokenizer( @@ -151,9 +158,9 @@ } public void testDumbCompoundWordsSELongestMatch() throws Exception { - String[] dict = { "Bil", "Dörr", "Motor", "Tak", "Borr", "Slag", "Hammar", + CharArraySet dict = makeDictionary("Bil", "Dörr", "Motor", "Tak", "Borr", "Slag", "Hammar", "Pelar", "Glas", "Ögon", "Fodral", "Bas", "Fiols", "Makare", "Gesäll", - "Sko", "Vind", "Rute", "Torkare", "Blad", "Fiolsfodral" }; + "Sko", "Vind", "Rute", "Torkare", "Blad", "Fiolsfodral"); DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(TEST_VERSION_CURRENT, new MockTokenizer(new StringReader("Basfiolsfodralmakaregesäll"), MockTokenizer.WHITESPACE, false), @@ -168,7 +175,7 @@ } public void testTokenEndingWithWordComponentOfMinimumLength() throws Exception { - String[] dict = {"ab", "cd", "ef"}; + CharArraySet dict = makeDictionary("ab", "cd", "ef"); DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(TEST_VERSION_CURRENT, new WhitespaceTokenizer(TEST_VERSION_CURRENT, @@ -189,7 +196,7 @@ } public void testWordComponentWithLessThanMinimumLength() throws Exception { - String[] dict = {"abc", "d", "efg"}; + CharArraySet dict = makeDictionary("abc", "d", "efg"); DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(TEST_VERSION_CURRENT, new WhitespaceTokenizer(TEST_VERSION_CURRENT, @@ -211,8 +218,8 @@ } public void testReset() throws Exception { - String[] dict = { "Rind", "Fleisch", "Draht", "Schere", "Gesetz", - "Aufgabe", "Überwachung" }; + CharArraySet dict = makeDictionary("Rind", "Fleisch", "Draht", "Schere", "Gesetz", + "Aufgabe", "Überwachung"); Tokenizer wsTokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader( "Rindfleischüberwachungsgesetz")); @@ -234,7 +241,7 @@ } public void testRetainMockAttribute() throws Exception { - String[] dict = { "abc", "d", "efg" }; + CharArraySet dict = makeDictionary("abc", "d", "efg"); Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("abcdefg")); TokenStream stream = new MockRetainAttributeFilter(tokenizer);