Index: modules/analysis/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java =================================================================== --- modules/analysis/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java (revision 945275) +++ modules/analysis/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java (working copy) @@ -70,6 +70,54 @@ } + /** + * With hyphenation-only, you can get a lot of nonsense tokens. + * This can be controlled with the min/max subword size. + */ + public void testHyphenationOnly() throws Exception { + Reader reader = getHyphenationReader(); + HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter + .getHyphenationTree(reader); + + HyphenationCompoundWordTokenFilter tf = new HyphenationCompoundWordTokenFilter( + TEST_VERSION_CURRENT, + new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("basketballkurv")), + hyphenator, + CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE, + 2, 4); + + // min=2, max=4 + assertTokenStreamContents(tf, + new String[] { "basketballkurv", "ba", "sket", "bal", "ball", "kurv" } + ); + + tf = new HyphenationCompoundWordTokenFilter( + TEST_VERSION_CURRENT, + new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("basketballkurv")), + hyphenator, + CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE, + 4, 6); + + // min=4, max=6 + assertTokenStreamContents(tf, + new String[] { "basketballkurv", "basket", "sket", "ball", "lkurv", "kurv" } + ); + + tf = new HyphenationCompoundWordTokenFilter( + TEST_VERSION_CURRENT, + new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("basketballkurv")), + hyphenator, + CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE, + 4, 10); + + // min=4, max=10 + assertTokenStreamContents(tf, + new String[] { "basketballkurv", "basket", "basketbal", "basketball", "sket", + "sketbal", "sketball", "ball", "ballkurv", "lkurv", "kurv" } + ); + + } + public void 
testDumbCompoundWordsSE() throws Exception { String[] dict = { "Bil", "Dörr", "Motor", "Tak", "Borr", "Slag", "Hammar", "Pelar", "Glas", "Ögon", "Fodral", "Bas", "Fiol", "Makare", "Gesäll", Index: modules/analysis/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java =================================================================== --- modules/analysis/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java (revision 945275) +++ modules/analysis/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java (working copy) @@ -154,7 +154,7 @@ this.maxSubwordSize=maxSubwordSize; this.onlyLongestMatch=onlyLongestMatch; - if (dictionary instanceof CharArraySet) { + if (dictionary==null || dictionary instanceof CharArraySet) { this.dictionary = (CharArraySet) dictionary; } else { this.dictionary = new CharArraySet(matchVersion, dictionary.size(), false); @@ -181,6 +181,9 @@ } public static final Set<?> makeDictionary(final Version matchVersion, final String[] dictionary) { + if (dictionary == null) { + return null; + } // is the below really case insensitive? CharArraySet dict = new CharArraySet(matchVersion, dictionary.length, false); addAllLowerCase(dict, Arrays.asList(dictionary)); Index: modules/analysis/common/src/java/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilter.java =================================================================== --- modules/analysis/common/src/java/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilter.java (revision 945275) +++ modules/analysis/common/src/java/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilter.java (working copy) @@ -154,6 +154,33 @@ } /** + * Create a HyphenationCompoundWordTokenFilter with no dictionary. + *
+ * Calls {@link #HyphenationCompoundWordTokenFilter(Version, TokenStream, HyphenationTree, Set, int, int, int, boolean) + * HyphenationCompoundWordTokenFilter(matchVersion, input, hyphenator, + * null, minWordSize, minSubwordSize, maxSubwordSize) } + */ + public HyphenationCompoundWordTokenFilter(Version matchVersion, TokenStream input, + HyphenationTree hyphenator, int minWordSize, int minSubwordSize, + int maxSubwordSize) { + this(matchVersion, input, hyphenator, (Set<?>) null, minWordSize, minSubwordSize, + maxSubwordSize, false); + } + + /** + * Create a HyphenationCompoundWordTokenFilter with no dictionary. + *
+ * Calls {@link #HyphenationCompoundWordTokenFilter(Version, TokenStream, HyphenationTree, int, int, int) + * HyphenationCompoundWordTokenFilter(matchVersion, input, hyphenator, + * DEFAULT_MIN_WORD_SIZE, DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE) } + */ + public HyphenationCompoundWordTokenFilter(Version matchVersion, TokenStream input, + HyphenationTree hyphenator) { + this(matchVersion, input, hyphenator, DEFAULT_MIN_WORD_SIZE, DEFAULT_MIN_SUBWORD_SIZE, + DEFAULT_MAX_SUBWORD_SIZE); + } + + /** * Creates a new {@link HyphenationCompoundWordTokenFilter} instance. * * @param input the {@link TokenStream} to process @@ -305,7 +332,7 @@ } // check the dictionary - if (dictionary.contains(lowerCaseTermBuffer, start, partLength)) { + if (dictionary == null || dictionary.contains(lowerCaseTermBuffer, start, partLength)) { if (this.onlyLongestMatch) { if (longestMatchToken != null) { if (longestMatchToken.termLength() < partLength) { Index: modules/analysis/common/src/java/org/apache/lucene/analysis/compound/package.html =================================================================== --- modules/analysis/common/src/java/org/apache/lucene/analysis/compound/package.html (revision 945275) +++ modules/analysis/common/src/java/org/apache/lucene/analysis/compound/package.html (working copy) @@ -81,8 +81,9 @@ The {@link org.apache.lucene.analysis.compound.HyphenationCompoundWordTokenFilter HyphenationCompoundWordTokenFilter} uses hyphenation grammars to find -potential subwords that a worth to check against the dictionary. The -quality of the output tokens is directly connected to the quality of the +potential subwords that are worth checking against the dictionary. It can be used +without a dictionary as well, but then produces a lot of "nonword" tokens. +The quality of the output tokens is directly connected to the quality of the grammar file you use. For languages like German they are quite good.