Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestKeywordRepeatFilter.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestKeywordRepeatFilter.java (revision 0) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestKeywordRepeatFilter.java (revision 0) @@ -0,0 +1,46 @@ +package org.apache.lucene.analysis.miscellaneous; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.snowball.SnowballFilter; +import org.tartarus.snowball.SnowballProgram; + +import java.io.IOException; +import java.io.StringReader; + +public class TestKeywordRepeatFilter extends BaseTokenStreamTestCase { + + public void testBasic() throws IOException { + TokenStream ts = new RemoveDuplicatesTokenFilter(new SnowballFilter(new KeywordRepeatFilter( + new MockTokenizer(new StringReader("the birds are flying"), MockTokenizer.WHITESPACE, false)), "English")); + assertTokenStreamContents(ts, new String[] { "the", "birds", "bird", "are", "flying", "fli"}, new int[] {1,1,0,1,1,0}); + } + + + public void testComposition() throws IOException { + TokenStream ts = new RemoveDuplicatesTokenFilter(new SnowballFilter(new KeywordRepeatFilter(new KeywordRepeatFilter( + new MockTokenizer(new StringReader("the birds are flying"), MockTokenizer.WHITESPACE, false))), "English")); + assertTokenStreamContents(ts, new String[] { "the", "birds", "bird", "are", "flying", "fli"}, new int[] {1,1,0,1,1,0}); + } + + + +} Property changes on: lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestKeywordRepeatFilter.java ___________________________________________________________________ Added: svn:keywords + Date Author Id Revision HeadURL Added: svn:eol-style + native Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/KeywordRepeatFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/KeywordRepeatFilterFactory.java (revision 0) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/KeywordRepeatFilterFactory.java (revision 0) @@ -0,0 +1,38 @@ +package org.apache.lucene.analysis.miscellaneous; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.util.TokenFilterFactory; + +/** + * Factory for {@link KeywordRepeatFilter}. + *
+ * <fieldType name="text_keyword" class="solr.TextField" positionIncrementGap="100">
+ *   <analyzer>
+ *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ *     <filter class="solr.KeywordRepeatFilter"/>
+ *   </analyzer>
+ * </fieldType>
+ */ +public final class KeywordRepeatFilterFactory extends TokenFilterFactory { + @Override + public TokenStream create(TokenStream input) { + return new KeywordRepeatFilter(input); + } +} Property changes on: lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/KeywordRepeatFilterFactory.java ___________________________________________________________________ Added: svn:keywords + Date Author Id Revision HeadURL Added: svn:eol-style + native Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/KeywordRepeatFilter.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/KeywordRepeatFilter.java (revision 0) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/KeywordRepeatFilter.java (revision 0) @@ -0,0 +1,68 @@ +package org.apache.lucene.analysis.miscellaneous; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.KeywordAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; + +import java.io.IOException; + + +/** + * This TokenFilterĀ emits each incoming token twice once as keyword and once non-keyword, in other words once with + * {@link KeywordAttribute#setKeyword(boolean)} set to true and once set to false. + * This is useful if used with a stem filter that respects the {@link KeywordAttribute} to index the stemmed and the + * un-stemmed version of a term into the same field. + */ +public final class KeywordRepeatFilter extends TokenFilter { + + private final KeywordAttribute keywordAttribute = addAttribute(KeywordAttribute.class); + private final PositionIncrementAttribute posIncAttr = addAttribute(PositionIncrementAttribute.class); + private State state; + /** + * Construct a token stream filtering the given input. + */ + public KeywordRepeatFilter(TokenStream input) { + super(input); + } + + @Override + public boolean incrementToken() throws IOException { + if (state != null) { + restoreState(state); + posIncAttr.setPositionIncrement(0); + keywordAttribute.setKeyword(false); + state = null; + return true; + } + if (input.incrementToken()) { + state = captureState(); + keywordAttribute.setKeyword(true); + return true; + } + return false; + } + + @Override + public void reset() throws IOException { + super.reset(); + state = null; + } +} Property changes on: lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/KeywordRepeatFilter.java ___________________________________________________________________ Added: svn:keywords + Date Author Id Revision HeadURL Added: svn:eol-style + native Index: lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory =================================================================== --- lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory (revision 1454297) +++ lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory (working copy) @@ -58,6 +58,7 @@ org.apache.lucene.analysis.miscellaneous.HyphenatedWordsFilterFactory org.apache.lucene.analysis.miscellaneous.KeepWordFilterFactory org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilterFactory +org.apache.lucene.analysis.miscellaneous.KeywordRepeatFilterFactory org.apache.lucene.analysis.miscellaneous.LengthFilterFactory org.apache.lucene.analysis.miscellaneous.LimitTokenCountFilterFactory org.apache.lucene.analysis.miscellaneous.RemoveDuplicatesTokenFilterFactory Index: lucene/CHANGES.txt =================================================================== --- lucene/CHANGES.txt (revision 1454297) +++ lucene/CHANGES.txt (working copy) @@ -32,6 +32,10 @@ * LUCENE-4815: DrillSideways now allows more than one FacetRequest per dimension (Mike McCandless) +* LUCENE-4817: Added KeywordRepeatFilter that allows to emit a token twice + once as a keyword and once as an ordinary token allow stemmers to emit + a stemmed version along with the un-stemmed version. (Simon Willnauer) + ======================= Lucene 4.2.0 ======================= Changes in backwards compatibility policy