+ *
+ * You must specify the required {@link Version} compatibility when creating
+ * HyphenationCompoundWordTokenFilter:
+ *
+ * - As of 3.1, CompoundWordTokenFilterBase correctly handles Unicode 4.0
+ * supplementary characters in strings and char arrays provided as compound word
+ * dictionaries.
+ *
+ * If you pass in a {@link org.apache.lucene.analysis.util.CharArraySet} as dictionary,
+ * it should be case-insensitive unless it contains only lowercased entries and you
+ * have {@link org.apache.lucene.analysis.core.LowerCaseFilter} before this filter in your analysis chain.
+ * For optimal performance (as this filter does lots of lookups against the
+ * dictionary), you should use the latter setup: a lowercased, case-sensitive
+ * CharArraySet behind a LowerCaseFilter. Be aware: if you supply arbitrary
+ * {@link Set Sets} to the ctors or {@code String[]} dictionaries, they will be
+ * automatically transformed to case-insensitive!
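+ *
+ * <p>For example, the recommended fast setup looks like the following sketch
+ * (the dictionary entries, the {@code reader}, and the {@code hyphenator} are
+ * assumptions for illustration):
+ *
+ * <pre>
+ *   // case-sensitive set whose entries are already lowercased
+ *   CharArraySet dict = new CharArraySet(matchVersion,
+ *       Arrays.asList("rind", "fleisch"), false);
+ *   TokenStream stream = new LowerCaseFilter(matchVersion,
+ *       new WhitespaceTokenizer(matchVersion, reader));
+ *   stream = new HyphenationCompoundWordTokenFilter(matchVersion, stream,
+ *       hyphenator, dict);
+ * </pre>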
*/
public class HyphenationCompoundWordTokenFilter extends
CompoundWordTokenFilterBase {
@@ -62,7 +75,9 @@
* only subwords shorter than this get to the output stream
* @param onlyLongestMatch
* Add only the longest matching subword to the stream
+ * @deprecated Use the constructors taking {@link Set}
*/
+ @Deprecated
public HyphenationCompoundWordTokenFilter(Version matchVersion, TokenStream input,
HyphenationTree hyphenator, String[] dictionary, int minWordSize,
int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
@@ -86,10 +101,12 @@
* the hyphenation pattern tree to use for hyphenation
* @param dictionary
* the word dictionary to match against
+ * @deprecated Use the constructors taking {@link Set}
*/
+ @Deprecated
public HyphenationCompoundWordTokenFilter(Version matchVersion, TokenStream input,
HyphenationTree hyphenator, String[] dictionary) {
- this(matchVersion, input, hyphenator, makeDictionary(dictionary), DEFAULT_MIN_WORD_SIZE,
+ this(matchVersion, input, hyphenator, makeDictionary(matchVersion, dictionary), DEFAULT_MIN_WORD_SIZE,
DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE, false);
}
@@ -106,10 +123,7 @@
* @param hyphenator
* the hyphenation pattern tree to use for hyphenation
* @param dictionary
- * the word dictionary to match against. If this is a
- * {@link org.apache.lucene.analysis.util.CharArraySet CharArraySet} it
- * must have set ignoreCase=false and only contain lower case
- * strings.
+ * the word dictionary to match against.
*/
public HyphenationCompoundWordTokenFilter(Version matchVersion, TokenStream input,
HyphenationTree hyphenator, Set<?> dictionary) {
@@ -130,10 +144,7 @@
* @param hyphenator
* the hyphenation pattern tree to use for hyphenation
* @param dictionary
- * the word dictionary to match against. If this is a
- * {@link org.apache.lucene.analysis.util.CharArraySet CharArraySet} it
- * must have set ignoreCase=false and only contain lower case
- * strings.
+ * the word dictionary to match against.
* @param minWordSize
* only words longer than this get processed
* @param minSubwordSize
@@ -218,22 +229,20 @@
}
@Override
- protected void decomposeInternal(final Token token) {
+ protected void decompose() {
// get the hyphenation points
- Hyphenation hyphens = hyphenator.hyphenate(token.buffer(), 0, token
- .length(), 1, 1);
+ Hyphenation hyphens = hyphenator.hyphenate(termAtt.buffer(), 0, termAtt.length(), 1, 1);
// No hyphen points found -> exit
if (hyphens == null) {
return;
}
final int[] hyp = hyphens.getHyphenationPoints();
- char[] lowerCaseTermBuffer=makeLowerCaseCopy(token.buffer());
for (int i = 0; i < hyp.length; ++i) {
int remaining = hyp.length - i;
int start = hyp[i];
- Token longestMatchToken = null;
+ CompoundToken longestMatchToken = null;
for (int j = 1; j < remaining; j++) {
int partLength = hyp[i + j] - start;
@@ -250,34 +259,33 @@
}
// check the dictionary
- if (dictionary == null || dictionary.contains(lowerCaseTermBuffer, start, partLength)) {
+ if (dictionary == null || dictionary.contains(termAtt.buffer(), start, partLength)) {
if (this.onlyLongestMatch) {
if (longestMatchToken != null) {
- if (longestMatchToken.length() < partLength) {
- longestMatchToken = createToken(start, partLength, token);
+ if (longestMatchToken.txt.length() < partLength) {
+ longestMatchToken = new CompoundToken(start, partLength);
}
} else {
- longestMatchToken = createToken(start, partLength, token);
+ longestMatchToken = new CompoundToken(start, partLength);
}
} else {
- tokens.add(createToken(start, partLength, token));
+ tokens.add(new CompoundToken(start, partLength));
}
- } else if (dictionary.contains(lowerCaseTermBuffer, start,
- partLength - 1)) {
+ } else if (dictionary.contains(termAtt.buffer(), start, partLength - 1)) {
// check the dictionary again with a word that is one character
// shorter
// to avoid problems with genitive 's characters and other binding
// characters
if (this.onlyLongestMatch) {
if (longestMatchToken != null) {
- if (longestMatchToken.length() < partLength - 1) {
- longestMatchToken = createToken(start, partLength - 1, token);
+ if (longestMatchToken.txt.length() < partLength - 1) {
+ longestMatchToken = new CompoundToken(start, partLength - 1);
}
} else {
- longestMatchToken = createToken(start, partLength - 1, token);
+ longestMatchToken = new CompoundToken(start, partLength - 1);
}
} else {
- tokens.add(createToken(start, partLength - 1, token));
+ tokens.add(new CompoundToken(start, partLength - 1));
}
}
}
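
For illustration, a minimal consumer-side sketch of the reworked decompose(),
including the one-character-shorter retry for binding characters such as the
German genitive "s". The grammar file name "de_DR.xml", the dictionary entries,
and the Version constant are assumptions, not part of the patch:

    import java.io.StringReader;
    import java.util.Arrays;

    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.compound.HyphenationCompoundWordTokenFilter;
    import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree;
    import org.apache.lucene.analysis.core.WhitespaceTokenizer;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
    import org.apache.lucene.analysis.util.CharArraySet;
    import org.apache.lucene.util.Version;

    public class DecomposeSketch {
      public static void main(String[] args) throws Exception {
        // Illustrative grammar file; any FOP-compatible hyphenation grammar works.
        HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter
            .getHyphenationTree("de_DR.xml");

        // Case-insensitive dictionary, as the class javadoc recommends, because
        // decompose() now matches subwords against the original term buffer.
        CharArraySet dictionary = new CharArraySet(Version.LUCENE_CURRENT,
            Arrays.asList("rind", "fleisch"), true);

        TokenStream stream = new HyphenationCompoundWordTokenFilter(Version.LUCENE_CURRENT,
            new WhitespaceTokenizer(Version.LUCENE_CURRENT, new StringReader("Rindsfleisch")),
            hyphenator, dictionary);

        CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
          // Prints the whole compound first, then dictionary subwords; "Rinds"
          // still matches "rind" via the one-character-shorter retry.
          System.out.println(termAtt);
        }
        stream.end();
        stream.close();
      }
    }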
Index: modules/analysis/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java
===================================================================
--- modules/analysis/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java (revision 1188236)
+++ modules/analysis/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java (working copy)
@@ -17,15 +17,20 @@
* limitations under the License.
*/
+import java.io.IOException;
import java.io.StringReader;
-import org.xml.sax.InputSource;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.util.Attribute;
+import org.apache.lucene.util.AttributeImpl;
+import org.xml.sax.InputSource;
public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {
public void testHyphenationCompoundWordsDA() throws Exception {
@@ -166,45 +171,45 @@
String[] dict = {"ab", "cd", "ef"};
DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(TEST_VERSION_CURRENT,
- new WhitespaceTokenizer(TEST_VERSION_CURRENT,
- new StringReader(
- "abcdef")
- ),
- dict,
- CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
- CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
- CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);
+ new WhitespaceTokenizer(TEST_VERSION_CURRENT,
+ new StringReader(
+ "abcdef")
+ ),
+ dict,
+ CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
+ CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
+ CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);
assertTokenStreamContents(tf,
- new String[] { "abcdef", "ab", "cd", "ef" },
- new int[] { 0, 0, 2, 4},
- new int[] { 6, 2, 4, 6},
- new int[] { 1, 0, 0, 0}
- );
+ new String[] { "abcdef", "ab", "cd", "ef" },
+ new int[] { 0, 0, 2, 4},
+ new int[] { 6, 2, 4, 6},
+ new int[] { 1, 0, 0, 0}
+ );
}
public void testWordComponentWithLessThanMinimumLength() throws Exception {
String[] dict = {"abc", "d", "efg"};
DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(TEST_VERSION_CURRENT,
- new WhitespaceTokenizer(TEST_VERSION_CURRENT,
- new StringReader(
- "abcdefg")
- ),
- dict,
- CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
- CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
- CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);
+ new WhitespaceTokenizer(TEST_VERSION_CURRENT,
+ new StringReader(
+ "abcdefg")
+ ),
+ dict,
+ CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
+ CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
+ CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);
- // since "d" is shorter than the minimum subword size, it should not be added to the token stream
+ // since "d" is shorter than the minimum subword size, it should not be added to the token stream
assertTokenStreamContents(tf,
- new String[] { "abcdefg", "abc", "efg" },
- new int[] { 0, 0, 4},
- new int[] { 7, 3, 7},
- new int[] { 1, 0, 0}
- );
+ new String[] { "abcdefg", "abc", "efg" },
+ new int[] { 0, 0, 4},
+ new int[] { 7, 3, 7},
+ new int[] { 1, 0, 0}
+ );
}
-
+
public void testReset() throws Exception {
String[] dict = { "Rind", "Fleisch", "Draht", "Schere", "Gesetz",
"Aufgabe", "Überwachung" };
@@ -228,4 +233,64 @@
assertEquals("Rindfleischüberwachungsgesetz", termAtt.toString());
}
+ public void testRetainMockAttribute() throws Exception {
+ String[] dict = { "abc", "d", "efg" };
+ Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT,
+ new StringReader("abcdefg"));
+ TokenStream stream = new MockRetainAttributeFilter(tokenizer);
+ stream = new DictionaryCompoundWordTokenFilter(
+ TEST_VERSION_CURRENT, stream, dict,
+ CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
+ CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
+ CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);
+ MockRetainAttribute retAtt = stream.addAttribute(MockRetainAttribute.class);
+    stream.reset();
+    while (stream.incrementToken()) {
+ assertTrue("Custom attribute value was lost", retAtt.getRetain());
+ }
+
+ }
+
+ public static interface MockRetainAttribute extends Attribute {
+ void setRetain(boolean attr);
+ boolean getRetain();
+ }
+
+ public static final class MockRetainAttributeImpl extends AttributeImpl implements MockRetainAttribute {
+ private boolean retain = false;
+ @Override
+ public void clear() {
+ retain = false;
+ }
+ public boolean getRetain() {
+ return retain;
+ }
+ public void setRetain(boolean retain) {
+ this.retain = retain;
+ }
+ @Override
+ public void copyTo(AttributeImpl target) {
+ MockRetainAttribute t = (MockRetainAttribute) target;
+ t.setRetain(retain);
+ }
+ }
+
+ private static class MockRetainAttributeFilter extends TokenFilter {
+
+ MockRetainAttribute retainAtt = addAttribute(MockRetainAttribute.class);
+
+ MockRetainAttributeFilter(TokenStream input) {
+ super(input);
+ }
+
+ @Override
+ public boolean incrementToken() throws IOException {
+ if (input.incrementToken()){
+ retainAtt.setRetain(true);
+ return true;
+ } else {
+ return false;
+ }
+ }
+ }
+
}