Index: modules/analysis/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java =================================================================== --- modules/analysis/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java (revision 945275) +++ modules/analysis/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java (working copy) @@ -70,6 +70,54 @@ } + /** + * With hyphenation-only, you can get a lot of nonsense tokens. + * This can be controlled with the min/max subword size. + */ + public void testHyphenationOnly() throws Exception { + Reader reader = getHyphenationReader(); + HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter + .getHyphenationTree(reader); + + HyphenationCompoundWordTokenFilter tf = new HyphenationCompoundWordTokenFilter( + TEST_VERSION_CURRENT, + new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("basketballkurv")), + hyphenator, + CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE, + 2, 4); + + // min=2, max=4 + assertTokenStreamContents(tf, + new String[] { "basketballkurv", "ba", "sket", "bal", "ball", "kurv" } + ); + + tf = new HyphenationCompoundWordTokenFilter( + TEST_VERSION_CURRENT, + new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("basketballkurv")), + hyphenator, + CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE, + 4, 6); + + // min=4, max=6 + assertTokenStreamContents(tf, + new String[] { "basketballkurv", "basket", "sket", "ball", "lkurv", "kurv" } + ); + + tf = new HyphenationCompoundWordTokenFilter( + TEST_VERSION_CURRENT, + new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("basketballkurv")), + hyphenator, + CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE, + 4, 10); + + // min=4, max=10 + assertTokenStreamContents(tf, + new String[] { "basketballkurv", "basket", "basketbal", "basketball", "sket", + "sketbal", "sketball", "ball", "ballkurv", "lkurv", "kurv" } + ); + + } + public void testDumbCompoundWordsSE() throws Exception { String[] dict = { "Bil", "Dörr", "Motor", "Tak", "Borr", "Slag", "Hammar", "Pelar", "Glas", "Ögon", "Fodral", "Bas", "Fiol", "Makare", "Gesäll", Index: modules/analysis/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java =================================================================== --- modules/analysis/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java (revision 945275) +++ modules/analysis/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java (working copy) @@ -154,7 +154,7 @@ this.maxSubwordSize=maxSubwordSize; this.onlyLongestMatch=onlyLongestMatch; - if (dictionary instanceof CharArraySet) { + if (dictionary==null || dictionary instanceof CharArraySet) { this.dictionary = (CharArraySet) dictionary; } else { this.dictionary = new CharArraySet(matchVersion, dictionary.size(), false); @@ -181,6 +181,9 @@ } public static final Set makeDictionary(final Version matchVersion, final String[] dictionary) { + if (dictionary == null) { + return null; + } // is the below really case insensitive? CharArraySet dict = new CharArraySet(matchVersion, dictionary.length, false); addAllLowerCase(dict, Arrays.asList(dictionary)); Index: modules/analysis/common/src/java/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilter.java =================================================================== --- modules/analysis/common/src/java/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilter.java (revision 945275) +++ modules/analysis/common/src/java/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilter.java (working copy) @@ -154,6 +154,33 @@ } /** + * Create a HyphenationCompoundWordTokenFilter with no dictionary. + *

+ * Calls {@link #HyphenationCompoundWordTokenFilter(Version, TokenStream, HyphenationTree, Set, int, int, int, boolean) + * HyphenationCompoundWordTokenFilter(matchVersion, input, hyphenator, + * null, minWordSize, minSubwordSize, maxSubwordSize } + */ + public HyphenationCompoundWordTokenFilter(Version matchVersion, TokenStream input, + HyphenationTree hyphenator, int minWordSize, int minSubwordSize, + int maxSubwordSize) { + this(matchVersion, input, hyphenator, (Set) null, minWordSize, minSubwordSize, + maxSubwordSize, false); + } + + /** + * Create a HyphenationCompoundWordTokenFilter with no dictionary. + *

+ * Calls {@link #HyphenationCompoundWordTokenFilter(Version, TokenStream, HyphenationTree, int, int, int) + * HyphenationCompoundWordTokenFilter(matchVersion, input, hyphenator, + * DEFAULT_MIN_WORD_SIZE, DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE } + */ + public HyphenationCompoundWordTokenFilter(Version matchVersion, TokenStream input, + HyphenationTree hyphenator) { + this(matchVersion, input, hyphenator, DEFAULT_MIN_WORD_SIZE, DEFAULT_MIN_SUBWORD_SIZE, + DEFAULT_MAX_SUBWORD_SIZE); + } + + /** * Creates a new {@link HyphenationCompoundWordTokenFilter} instance. * * @param input the {@link TokenStream} to process @@ -305,7 +332,7 @@ } // check the dictionary - if (dictionary.contains(lowerCaseTermBuffer, start, partLength)) { + if (dictionary == null || dictionary.contains(lowerCaseTermBuffer, start, partLength)) { if (this.onlyLongestMatch) { if (longestMatchToken != null) { if (longestMatchToken.termLength() < partLength) { Index: modules/analysis/common/src/java/org/apache/lucene/analysis/compound/package.html =================================================================== --- modules/analysis/common/src/java/org/apache/lucene/analysis/compound/package.html (revision 945275) +++ modules/analysis/common/src/java/org/apache/lucene/analysis/compound/package.html (working copy) @@ -81,8 +81,9 @@ The {@link org.apache.lucene.analysis.compound.HyphenationCompoundWordTokenFilter HyphenationCompoundWordTokenFilter} uses hyphenation grammars to find -potential subwords that a worth to check against the dictionary. The -quality of the output tokens is directly connected to the quality of the +potential subwords that a worth to check against the dictionary. It can be used +without a dictionary as well but then produces a lot of "nonword" tokens. +The quality of the output tokens is directly connected to the quality of the grammar file you use. For languages like German they are quite good.

Grammar file
Unfortunately we cannot bundle the hyphenation grammar files with Lucene @@ -157,11 +158,27 @@ CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE, CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false); - Token t; - while ((t=tf.next())!=null) { + CharTermAttribute t = tf.addAttribute(CharTermAttribute.class); + while (tf.incrementToken()) { System.out.println(t); } } + + public void testHyphenationCompoundWordsWithoutDictionaryDE() throws Exception { + Reader reader = new FileReader("de_DR.xml"); + + HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter + .getHyphenationTree(reader); + + HyphenationCompoundWordTokenFilter tf = new HyphenationCompoundWordTokenFilter( + new WhitespaceTokenizer(new StringReader( + "Rindfleischüberwachungsgesetz Drahtschere abba")), hyphenator); + + CharTermAttribute t = tf.addAttribute(CharTermAttribute.class); + while (tf.incrementToken()) { + System.out.println(t); + } + } public void testDumbCompoundWordsSE() throws Exception { String[] dict = { "Bil", "Dörr", "Motor", "Tak", "Borr", "Slag", "Hammar", @@ -173,8 +190,8 @@ new StringReader( "Bildörr Bilmotor Biltak Slagborr Hammarborr Pelarborr Glasögonfodral Basfiolsfodral Basfiolsfodralmakaregesäll Skomakare Vindrutetorkare Vindrutetorkarblad abba")), dict); - Token t; - while ((t=tf.next())!=null) { + CharTermAttribute t = tf.addAttribute(CharTermAttribute.class); + while (tf.incrementToken()) { System.out.println(t); } } Index: lucene/contrib/CHANGES.txt =================================================================== --- lucene/contrib/CHANGES.txt (revision 945275) +++ lucene/contrib/CHANGES.txt (working copy) @@ -160,6 +160,9 @@ * LUCENE-2463: Add a Greek inflectional stemmer. GreekAnalyzer will now stem words when Version is set to 3.1 or higher. (Robert Muir) + * LUCENE-1287: Allow usage of HyphenationCompoundWordTokenFilter without dictionary. + (Thomas Peuss via Robert Muir) + Build * LUCENE-2124: Moved the JDK-based collation support from contrib/collation