commit 6402a5556efe3aeb828990b460f9a6fe0757824b
Author: synhershko
Date:   Mon Dec 24 21:20:38 2012 +0200

    LUCENE-2841 Add option to CommonGramsFilter to not unigram common words

diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/commongrams/CommonGramsFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/commongrams/CommonGramsFilter.java
index 2b56245..82e7391 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/commongrams/CommonGramsFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/commongrams/CommonGramsFilter.java
@@ -48,6 +48,7 @@ public final class CommonGramsFilter extends TokenFilter {
   private static final char SEPARATOR = '_';
 
   private final CharArraySet commonWords;
+  private final boolean keepOrigin;
 
   private final StringBuilder buffer = new StringBuilder();
@@ -69,10 +70,27 @@ public final class CommonGramsFilter extends TokenFilter {
    *
    * @param input TokenStream input in filter chain
    * @param commonWords The set of common words.
+   * @deprecated Use the constructor that also takes a keepOrigin argument.
    */
+  @Deprecated
   public CommonGramsFilter(Version matchVersion, TokenStream input, CharArraySet commonWords) {
+    this(matchVersion, input, commonWords, true);
+  }
+
+  /**
+   * Construct a token stream filtering the given input using a Set of common
+   * words to create bigrams. Outputs both unigrams (position increment 1) and
+   * bigrams (position increment 0, type "gram") where one or both of the words
+   * in a potential bigram are in the set of common words.
+   *
+   * @param input TokenStream input in filter chain
+   * @param commonWords The set of common words.
+   * @param keepOrigin Whether to also emit the original common words as unigrams.
+   */
+  public CommonGramsFilter(Version matchVersion, TokenStream input, CharArraySet commonWords, boolean keepOrigin) {
     super(input);
     this.commonWords = commonWords;
+    this.keepOrigin = keepOrigin;
   }
 
   /**
@@ -93,13 +111,22 @@ public final class CommonGramsFilter extends TokenFilter {
    */
   @Override
   public boolean incrementToken() throws IOException {
-    // get the next piece of input
-    if (savedState != null) {
+    // if we have a token from a previous iteration, return it now
+    if (restoreMaintainedToken()) {
+      saveTermBuffer();
+      if (!isCommon())
+        return true;
+    }
+    else if (savedState != null) { // only relevant if we are keeping originals
       restoreState(savedState);
       savedState = null;
       saveTermBuffer();
+      lastWasCommon = isCommon();
       return true;
-    } else if (!input.incrementToken()) {
+    }
+
+    // get the next piece of input
+    if (!input.incrementToken()) {
       return false;
     }
 
@@ -107,10 +134,29 @@ public final class CommonGramsFilter extends TokenFilter {
     * When valid, the buffer always contains at least the separator.
     * If its empty, there is nothing before this stopword.
     */
-    if (lastWasCommon || (isCommon() && buffer.length() > 0)) {
-      savedState = captureState();
-      gramToken();
-      return true;
+    boolean isCommon = isCommon();
+    if (keepOrigin) {
+      if (lastWasCommon || (isCommon && buffer.length() > 0)) {
+        savedState = captureState();
+        gramToken();
+        return true;
+      }
+      lastWasCommon = isCommon;
+    } else {
+      if (!lastWasCommon && isCommon && buffer.length() == 0) {
+        lastWasCommon = true;
+        saveTermBuffer();
+        if (!input.incrementToken())
+          return false;
+        isCommon = isCommon();
+      }
+
+      if (lastWasCommon || (isCommon && buffer.length() > 0)) {
+        lastWasCommon = isCommon;
+        rememberCurrentToken();
+        gramToken();
+        return true;
+      }
     }
 
     saveTermBuffer();
@@ -126,6 +172,7 @@ public final class CommonGramsFilter extends TokenFilter {
     lastWasCommon = false;
     savedState = null;
     buffer.setLength(0);
+    maintainedToken = false;
   }
 
   // ================================================= Helper Methods ================================================
@@ -147,7 +194,6 @@ public final class CommonGramsFilter extends TokenFilter {
     buffer.append(termAttribute.buffer(), 0, termAttribute.length());
     buffer.append(SEPARATOR);
     lastStartOffset = offsetAttribute.startOffset();
-    lastWasCommon = isCommon();
   }
 
   /**
@@ -167,10 +213,53 @@ public final class CommonGramsFilter extends TokenFilter {
     buffer.getChars(0, length, termText, 0);
 
     termAttribute.setLength(length);
-    posIncAttribute.setPositionIncrement(0);
+    posIncAttribute.setPositionIncrement(keepOrigin ? 0 : 1);
     posLenAttribute.setPositionLength(2); // bigram
     offsetAttribute.setOffset(lastStartOffset, endOffset);
     typeAttribute.setType(GRAM_TYPE);
     buffer.setLength(0);
   }
+
+  boolean maintainedToken = false;
+  int maintainedTokenTextLen, maintainedTokenPosInc, maintainedTokenPosLen;
+  int maintainedTokenStartOffset, maintainedTokenEndOffset;
+  String maintainedTokenType;
+  char maintainedTokenText[] = new char[Byte.MAX_VALUE];
+
+  private void rememberCurrentToken() {
+    maintainedTokenStartOffset = offsetAttribute.startOffset();
+    maintainedTokenEndOffset = offsetAttribute.endOffset();
+    maintainedTokenPosInc = posIncAttribute.getPositionIncrement();
+    maintainedTokenPosLen = posLenAttribute.getPositionLength();
+    maintainedTokenType = typeAttribute.type();
+
+    if (maintainedTokenText.length < termAttribute.length())
+      maintainedTokenText = new char[termAttribute.length()];
+    System.arraycopy(termAttribute.buffer(), 0, maintainedTokenText, 0, termAttribute.length());
+    maintainedTokenTextLen = termAttribute.length();
+    maintainedToken = true;
+  }
+
+  private boolean restoreMaintainedToken() {
+    if (!maintainedToken)
+      return false;
+
+    clearAttributes();
+
+    char termText[] = termAttribute.buffer();
+    if (maintainedTokenTextLen > termText.length) {
+      termText = termAttribute.resizeBuffer(maintainedTokenTextLen);
+    }
+
+    System.arraycopy(maintainedTokenText, 0, termText, 0, maintainedTokenTextLen);
+    termAttribute.setLength(maintainedTokenTextLen);
+    posIncAttribute.setPositionIncrement(maintainedTokenPosInc);
+    posLenAttribute.setPositionLength(maintainedTokenPosLen);
+    offsetAttribute.setOffset(maintainedTokenStartOffset, maintainedTokenEndOffset);
+    typeAttribute.setType(maintainedTokenType);
+    buffer.setLength(0);
+
+    maintainedToken = false;
+    return true;
+  }
 }
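For orientation, a minimal usage sketch of the new constructor follows. This is illustrative only and not part of the patch: the demo class name and sample text are invented, and it assumes the Lucene 4.x analysis APIs this patch is written against (WhitespaceTokenizer, CharArraySet, Version). The expected tokens in the comments mirror the test expectations below.

    import java.io.StringReader;

    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.commongrams.CommonGramsFilter;
    import org.apache.lucene.analysis.core.WhitespaceTokenizer;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
    import org.apache.lucene.analysis.util.CharArraySet;
    import org.apache.lucene.util.Version;

    public class KeepOriginDemo {
      public static void main(String[] args) throws Exception {
        CharArraySet commonWords = new CharArraySet(Version.LUCENE_40, 1, true);
        commonWords.add("the");

        // keepOrigin = true  -> the, the_quick, quick, brown  (common word kept as a unigram)
        // keepOrigin = false -> the_quick, quick, brown       (common word appears only in the bigram)
        for (boolean keepOrigin : new boolean[] { true, false }) {
          TokenStream ts = new CommonGramsFilter(Version.LUCENE_40,
              new WhitespaceTokenizer(Version.LUCENE_40, new StringReader("the quick brown")),
              commonWords, keepOrigin);
          CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
          ts.reset();
          while (ts.incrementToken()) {
            System.out.println("keepOrigin=" + keepOrigin + ": " + term);
          }
          ts.end();
          ts.close();
        }
      }
    }

The tests below exercise exactly this contrast, including the position increments (bigrams get increment 0 when originals are kept, 1 when they replace them).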
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/commongrams/CommonGramsFilterTest.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/commongrams/CommonGramsFilterTest.java
index b9be9d6..29dfa23 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/commongrams/CommonGramsFilterTest.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/commongrams/CommonGramsFilterTest.java
@@ -36,7 +36,7 @@ public class CommonGramsFilterTest extends BaseTokenStreamTestCase {
   public void testReset() throws Exception {
     final String input = "How the s a brown s cow d like A B thing?";
     WhitespaceTokenizer wt = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
-    CommonGramsFilter cgf = new CommonGramsFilter(TEST_VERSION_CURRENT, wt, commonWords);
+    CommonGramsFilter cgf = new CommonGramsFilter(TEST_VERSION_CURRENT, wt, commonWords, true);
     
     CharTermAttribute term = cgf.addAttribute(CharTermAttribute.class);
     cgf.reset();
@@ -58,7 +58,7 @@ public class CommonGramsFilterTest extends BaseTokenStreamTestCase {
   public void testQueryReset() throws Exception {
     final String input = "How the s a brown s cow d like A B thing?";
     WhitespaceTokenizer wt = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
-    CommonGramsFilter cgf = new CommonGramsFilter(TEST_VERSION_CURRENT, wt, commonWords);
+    CommonGramsFilter cgf = new CommonGramsFilter(TEST_VERSION_CURRENT, wt, commonWords, true);
     CommonGramsQueryFilter nsf = new CommonGramsQueryFilter(cgf);
     
     CharTermAttribute term = wt.addAttribute(CharTermAttribute.class);
@@ -90,7 +90,7 @@ public class CommonGramsFilterTest extends BaseTokenStreamTestCase {
       public TokenStreamComponents createComponents(String field, Reader in) {
         Tokenizer tokenizer = new MockTokenizer(in, MockTokenizer.WHITESPACE, false);
         return new TokenStreamComponents(tokenizer, new CommonGramsQueryFilter(new CommonGramsFilter(TEST_VERSION_CURRENT,
-            tokenizer, commonWords)));
+            tokenizer, commonWords, true)));
       }
     };
@@ -153,15 +153,15 @@ public class CommonGramsFilterTest extends BaseTokenStreamTestCase {
     assertAnalyzesTo(a, "of the of", new String[] { "of_the", "the_of" });
   }
-  
+
   public void testCommonGramsFilter() throws Exception {
     Analyzer a = new Analyzer() {
       @Override
       public TokenStreamComponents createComponents(String field, Reader in) {
-        Tokenizer tokenizer = new MockTokenizer(in, MockTokenizer.WHITESPACE, false);
+        Tokenizer tokenizer = new MockTokenizer(in, MockTokenizer.WHITESPACE, true);
         return new TokenStreamComponents(tokenizer, new CommonGramsFilter(TEST_VERSION_CURRENT, 
-            tokenizer, commonWords));
-      }
+            tokenizer, commonWords, true));
+      }
     };
 
     // Stop words used below are "of" "the" and "s"
@@ -170,76 +170,163 @@ public class CommonGramsFilterTest extends BaseTokenStreamTestCase {
     assertAnalyzesTo(a, "foo", new String[] { "foo" });
 
     // two word queries
-    assertAnalyzesTo(a, "brown fox", 
-        new String[] { "brown", "fox" }, 
+    assertAnalyzesTo(a, "brown fox",
+        new String[] { "brown", "fox" },
         new int[] { 1, 1 });
-    assertAnalyzesTo(a, "the fox", 
-        new String[] { "the", "the_fox", "fox" }, 
+    assertAnalyzesTo(a, "the fox",
+        new String[] { "the", "the_fox", "fox" },
         new int[] { 1, 0, 1 });
-    assertAnalyzesTo(a, "fox of", 
-        new String[] { "fox", "fox_of", "of" }, 
+    assertAnalyzesTo(a, "fox of",
+        new String[] { "fox", "fox_of", "of" },
        new int[] { 1, 0, 1 });
-    assertAnalyzesTo(a, "of the", 
-        new String[] { "of", "of_the", "the" }, 
+    assertAnalyzesTo(a, "of the",
+        new String[] { "of", "of_the", "the" },
         new int[] { 1, 0, 1 });
 
     // 3 word combinations s=stopword/common word n=not a stop word
-    assertAnalyzesTo(a, "n n n", 
-        new String[] { "n", "n", "n" }, 
+    assertAnalyzesTo(a, "n n n",
+        new String[] { "n", "n", "n" },
         new int[] { 1, 1, 1 });
-    assertAnalyzesTo(a, "quick brown fox", 
-        new String[] { "quick", "brown", "fox" }, 
+    assertAnalyzesTo(a, "quick brown fox",
+        new String[] { "quick", "brown", "fox" },
         new int[] { 1, 1, 1 });
-    assertAnalyzesTo(a, "n n s", 
-        new String[] { "n", "n", "n_s", "s" }, 
+    assertAnalyzesTo(a, "n n s",
+        new String[] { "n", "n", "n_s", "s" },
         new int[] { 1, 1, 0, 1 });
-    assertAnalyzesTo(a, "quick brown the", 
-        new String[] { "quick", "brown", "brown_the", "the" }, 
+    assertAnalyzesTo(a, "quick brown the",
+        new String[] { "quick", "brown", "brown_the", "the" },
         new int[] { 1, 1, 0, 1 });
-    assertAnalyzesTo(a, "n s n", 
-        new String[] { "n", "n_s", "s", "s_n", "n" }, 
+    assertAnalyzesTo(a, "n s n",
+        new String[] { "n", "n_s", "s", "s_n", "n" },
         new int[] { 1, 0, 1, 0, 1 });
-    assertAnalyzesTo(a, "quick the fox", 
-        new String[] { "quick", "quick_the", "the", "the_fox", "fox" }, 
+    assertAnalyzesTo(a, "quick the fox",
+        new String[] { "quick", "quick_the", "the", "the_fox", "fox" },
         new int[] { 1, 0, 1, 0, 1 });
-    assertAnalyzesTo(a, "n s s", 
-        new String[] { "n", "n_s", "s", "s_s", "s" }, 
+    assertAnalyzesTo(a, "n s s",
+        new String[] { "n", "n_s", "s", "s_s", "s" },
         new int[] { 1, 0, 1, 0, 1 });
-    assertAnalyzesTo(a, "fox of the", 
-        new String[] { "fox", "fox_of", "of", "of_the", "the" }, 
+    assertAnalyzesTo(a, "fox of the",
+        new String[] { "fox", "fox_of", "of", "of_the", "the" },
         new int[] { 1, 0, 1, 0, 1 });
-    assertAnalyzesTo(a, "s n n", 
-        new String[] { "s", "s_n", "n", "n" }, 
+    assertAnalyzesTo(a, "s n n",
+        new String[] { "s", "s_n", "n", "n" },
         new int[] { 1, 0, 1, 1 });
-    assertAnalyzesTo(a, "the quick brown", 
-        new String[] { "the", "the_quick", "quick", "brown" }, 
+    assertAnalyzesTo(a, "the quick brown",
+        new String[] { "the", "the_quick", "quick", "brown" },
         new int[] { 1, 0, 1, 1 });
-    assertAnalyzesTo(a, "s n s", 
-        new String[] { "s", "s_n", "n", "n_s", "s" }, 
+    assertAnalyzesTo(a, "s n s",
+        new String[] { "s", "s_n", "n", "n_s", "s" },
         new int[] { 1, 0, 1, 0, 1 });
-    assertAnalyzesTo(a, "the fox of", 
-        new String[] { "the", "the_fox", "fox", "fox_of", "of" }, 
+    assertAnalyzesTo(a, "the fox of",
+        new String[] { "the", "the_fox", "fox", "fox_of", "of" },
         new int[] { 1, 0, 1, 0, 1 });
-    assertAnalyzesTo(a, "s s n", 
-        new String[] { "s", "s_s", "s", "s_n", "n" }, 
+    assertAnalyzesTo(a, "s s n",
+        new String[] { "s", "s_s", "s", "s_n", "n" },
         new int[] { 1, 0, 1, 0, 1 });
-    assertAnalyzesTo(a, "of the fox", 
-        new String[] { "of", "of_the", "the", "the_fox", "fox" }, 
+    assertAnalyzesTo(a, "of the fox",
+        new String[] { "of", "of_the", "the", "the_fox", "fox" },
         new int[] { 1, 0, 1, 0, 1 });
-    assertAnalyzesTo(a, "s s s", 
-        new String[] { "s", "s_s", "s", "s_s", "s" }, 
+    assertAnalyzesTo(a, "s s s",
+        new String[] { "s", "s_s", "s", "s_s", "s" },
         new int[] { 1, 0, 1, 0, 1 });
-    assertAnalyzesTo(a, "of the of", 
-        new String[] { "of", "of_the", "the", "the_of", "of" }, 
+    assertAnalyzesTo(a, "of the of",
+        new String[] { "of", "of_the", "the", "the_of", "of" },
         new int[] { 1, 0, 1, 0, 1 });
   }
+
+  public void testCommonGramsFilterWithoutKeepOrig() throws Exception {
+    Analyzer a = new Analyzer() {
+      @Override
+      public TokenStreamComponents createComponents(String field, Reader in) {
+        Tokenizer tokenizer = new MockTokenizer(in, MockTokenizer.WHITESPACE, false);
+        return new TokenStreamComponents(tokenizer, new CommonGramsFilter(TEST_VERSION_CURRENT,
+            tokenizer, commonWords, false));
+      }
+    };
+
+    // Stop words used below are "of" "the" and "s"
+    // one word queries
+    assertAnalyzesTo(a, "the", new String[] { });
+    assertAnalyzesTo(a, "foo", new String[] { "foo" });
+
+    // two word queries
+    assertAnalyzesTo(a, "brown fox",
+        new String[] { "brown", "fox" },
+        new int[] { 1, 1 });
+    assertAnalyzesTo(a, "the fox",
+        new String[] { "the_fox", "fox" },
+        new int[] { 1, 1 });
+    assertAnalyzesTo(a, "fox of",
+        new String[] { "fox", "fox_of" },
+        new int[] { 1, 1 });
+    assertAnalyzesTo(a, "of the",
+        new String[] { "of_the" },
+        new int[] { 1 });
+
+    // 3 word combinations s=stopword/common word n=not a stop word
+    assertAnalyzesTo(a, "n n n",
+        new String[] { "n", "n", "n" },
+        new int[] { 1, 1, 1 });
+    assertAnalyzesTo(a, "quick brown fox",
+        new String[] { "quick", "brown", "fox" },
+        new int[] { 1, 1, 1 });
+
+    assertAnalyzesTo(a, "n n s",
+        new String[] { "n", "n", "n_s" },
+        new int[] { 1, 1, 1 });
+    assertAnalyzesTo(a, "quick brown the",
+        new String[] { "quick", "brown", "brown_the" },
+        new int[] { 1, 1, 1 });
+
+    assertAnalyzesTo(a, "n s n",
+        new String[] { "n", "n_s", "s_n", "n" },
+        new int[] { 1, 1, 1, 1 });
+    assertAnalyzesTo(a, "quick the fox",
+        new String[] { "quick", "quick_the", "the_fox", "fox" },
+        new int[] { 1, 1, 1, 1 });
+
+    assertAnalyzesTo(a, "n s s",
+        new String[] { "n", "n_s", "s_s" },
+        new int[] { 1, 1, 1 });
+    assertAnalyzesTo(a, "fox of the",
+        new String[] { "fox", "fox_of", "of_the" },
+        new int[] { 1, 1, 1 });
+
+    assertAnalyzesTo(a, "s n n",
+        new String[] { "s_n", "n", "n" },
+        new int[] { 1, 1, 1 });
+    assertAnalyzesTo(a, "the quick brown",
+        new String[] { "the_quick", "quick", "brown" },
+        new int[] { 1, 1, 1 });
+
+    assertAnalyzesTo(a, "s n s",
+        new String[] { "s_n", "n", "n_s" },
+        new int[] { 1, 1, 1 });
+    assertAnalyzesTo(a, "the fox of",
+        new String[] { "the_fox", "fox", "fox_of" },
+        new int[] { 1, 1, 1 });
+
+    assertAnalyzesTo(a, "s s n",
+        new String[] { "s_s", "s_n", "n" },
+        new int[] { 1, 1, 1 });
+    assertAnalyzesTo(a, "of the fox",
+        new String[] { "of_the", "the_fox", "fox" },
+        new int[] { 1, 1, 1 });
+
+    assertAnalyzesTo(a, "s s s",
+        new String[] { "s_s", "s_s" },
+        new int[] { 1, 1 });
+    assertAnalyzesTo(a, "of the of",
+        new String[] { "of_the", "the_of" },
+        new int[] { 1, 1 });
+  }
 
   /**
    * Test that CommonGramsFilter works correctly in case-insensitive mode
@@ -247,10 +334,16 @@ public class CommonGramsFilterTest extends BaseTokenStreamTestCase {
   public void testCaseSensitive() throws Exception {
     final String input = "How The s a brown s cow d like A B thing?";
     MockTokenizer wt = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false);
-    TokenFilter cgf = new CommonGramsFilter(TEST_VERSION_CURRENT, wt, commonWords);
+    TokenFilter cgf = new CommonGramsFilter(TEST_VERSION_CURRENT, wt, commonWords, true);
     assertTokenStreamContents(cgf, new String[] {"How", "The", "The_s", "s",
         "s_a", "a", "a_brown", "brown", "brown_s", "s", "s_cow", "cow",
         "cow_d", "d", "d_like", "like", "A", "B", "thing?"});
+
+    wt = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false);
+    cgf = new CommonGramsFilter(TEST_VERSION_CURRENT, wt, commonWords, false);
+    assertTokenStreamContents(cgf, new String[] {"How", "The", "The_s",
+        "s_a", "a_brown", "brown", "brown_s", "s_cow", "cow",
+        "cow_d", "d_like", "like", "A", "B", "thing?"});
   }
 
   /**
@@ -259,9 +352,14 @@ public class CommonGramsFilterTest extends BaseTokenStreamTestCase {
   public void testLastWordisStopWord() throws Exception {
     final String input = "dog the";
     MockTokenizer wt = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false);
-    CommonGramsFilter cgf = new CommonGramsFilter(TEST_VERSION_CURRENT, wt, commonWords);
+    CommonGramsFilter cgf = new CommonGramsFilter(TEST_VERSION_CURRENT, wt, commonWords, true);
     TokenFilter nsf = new CommonGramsQueryFilter(cgf);
     assertTokenStreamContents(nsf, new String[] { "dog_the" });
+
+    wt = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false);
+    cgf = new CommonGramsFilter(TEST_VERSION_CURRENT, wt, commonWords, false);
+    nsf = new CommonGramsQueryFilter(cgf);
+    assertTokenStreamContents(nsf, new String[] { "dog_the" });
   }
 
   /**
@@ -270,9 +368,14 @@ public class CommonGramsFilterTest extends BaseTokenStreamTestCase {
   public void testFirstWordisStopWord() throws Exception {
     final String input = "the dog";
     MockTokenizer wt = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false);
-    CommonGramsFilter cgf = new CommonGramsFilter(TEST_VERSION_CURRENT, wt, commonWords);
+    CommonGramsFilter cgf = new CommonGramsFilter(TEST_VERSION_CURRENT, wt, commonWords, false);
     TokenFilter nsf = new CommonGramsQueryFilter(cgf);
     assertTokenStreamContents(nsf, new String[] { "the_dog" });
+
+    wt = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false);
+    cgf = new CommonGramsFilter(TEST_VERSION_CURRENT, wt, commonWords, true);
+    nsf = new CommonGramsQueryFilter(cgf);
+    assertTokenStreamContents(nsf, new String[] { "the_dog" });
   }
 
   /**
@@ -281,9 +384,14 @@ public class CommonGramsFilterTest extends BaseTokenStreamTestCase {
   public void testOneWordQueryStopWord() throws Exception {
     final String input = "the";
     MockTokenizer wt = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false);
-    CommonGramsFilter cgf = new CommonGramsFilter(TEST_VERSION_CURRENT, wt, commonWords);
+    CommonGramsFilter cgf = new CommonGramsFilter(TEST_VERSION_CURRENT, wt, commonWords, true);
     TokenFilter nsf = new CommonGramsQueryFilter(cgf);
     assertTokenStreamContents(nsf, new String[] { "the" });
+
+    wt = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false);
+    cgf = new CommonGramsFilter(TEST_VERSION_CURRENT, wt, commonWords, false);
+    nsf = new CommonGramsQueryFilter(cgf);
+    assertTokenStreamContents(nsf, new String[] { });
   }
 
   /**
@@ -292,9 +400,14 @@ public class CommonGramsFilterTest extends BaseTokenStreamTestCase {
   public void testOneWordQuery() throws Exception {
     final String input = "monster";
     MockTokenizer wt = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false);
-    CommonGramsFilter cgf = new CommonGramsFilter(TEST_VERSION_CURRENT, wt, commonWords);
+    CommonGramsFilter cgf = new CommonGramsFilter(TEST_VERSION_CURRENT, wt, commonWords, true);
     TokenFilter nsf = new CommonGramsQueryFilter(cgf);
     assertTokenStreamContents(nsf, new String[] { "monster" });
+
+    wt = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false);
+    cgf = new CommonGramsFilter(TEST_VERSION_CURRENT, wt, commonWords, false);
+    nsf = new CommonGramsQueryFilter(cgf);
+    assertTokenStreamContents(nsf, new String[] { "monster" });
   }
 
   /**
@@ -303,9 +416,14 @@ public class CommonGramsFilterTest extends BaseTokenStreamTestCase {
   public void TestFirstAndLastStopWord() throws Exception {
     final String input = "the of";
     MockTokenizer wt = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false);
-    CommonGramsFilter cgf = new CommonGramsFilter(TEST_VERSION_CURRENT, wt, commonWords);
+    CommonGramsFilter cgf = new CommonGramsFilter(TEST_VERSION_CURRENT, wt, commonWords, true);
     TokenFilter nsf = new CommonGramsQueryFilter(cgf);
     assertTokenStreamContents(nsf, new String[] { "the_of" });
+
+    wt = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false);
+    cgf = new CommonGramsFilter(TEST_VERSION_CURRENT, wt, commonWords, false);
+    nsf = new CommonGramsQueryFilter(cgf);
+    assertTokenStreamContents(nsf, new String[] { "the_of" });
   }
 
   /** blast some random strings through the analyzer */
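The query-side tests above pair the filter with CommonGramsQueryFilter. A companion sketch of that chain follows; again illustrative only, not part of the commit, with an invented driver class and sample inputs, assuming the same Lucene 4.x APIs. It highlights the edge case exercised by testOneWordQueryStopWord: with keepOrigin=false, a lone common word produces no tokens at all.

    import java.io.StringReader;

    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.commongrams.CommonGramsFilter;
    import org.apache.lucene.analysis.commongrams.CommonGramsQueryFilter;
    import org.apache.lucene.analysis.core.WhitespaceTokenizer;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
    import org.apache.lucene.analysis.util.CharArraySet;
    import org.apache.lucene.util.Version;

    public class QueryChainDemo {
      static void dump(String text, boolean keepOrigin, CharArraySet commonWords) throws Exception {
        // query-time chain: tokenizer -> CommonGramsFilter -> CommonGramsQueryFilter
        TokenStream ts = new CommonGramsQueryFilter(new CommonGramsFilter(Version.LUCENE_40,
            new WhitespaceTokenizer(Version.LUCENE_40, new StringReader(text)),
            commonWords, keepOrigin));
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        System.out.print("\"" + text + "\" (keepOrigin=" + keepOrigin + "):");
        while (ts.incrementToken()) {
          System.out.print(" " + term);
        }
        System.out.println();
        ts.end();
        ts.close();
      }

      public static void main(String[] args) throws Exception {
        CharArraySet commonWords = new CharArraySet(Version.LUCENE_40, 2, true);
        commonWords.add("the");
        commonWords.add("of");

        // expected outputs, per the tests in this patch:
        dump("the dog", true, commonWords);   // the_dog
        dump("the dog", false, commonWords);  // the_dog
        dump("the", true, commonWords);       // the
        dump("the", false, commonWords);      // (no tokens)
      }
    }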