Index: contrib/analyzers/src/test/org/apache/lucene/analysis/ngram/TestCombinedNGramTokenFilter.java =================================================================== --- contrib/analyzers/src/test/org/apache/lucene/analysis/ngram/TestCombinedNGramTokenFilter.java (revision 0) +++ contrib/analyzers/src/test/org/apache/lucene/analysis/ngram/TestCombinedNGramTokenFilter.java (revision 0) @@ -0,0 +1,134 @@ +package org.apache.lucene.analysis.ngram; + +import junit.framework.TestCase; +import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.WhitespaceTokenizer; + +import java.io.IOException; +import java.io.StringReader; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+public class TestCombinedNGramTokenFilter extends TestCase {
+
+  /**
+   * Checks three things: that the constructor rejects a minimum size greater
+   * than the maximum size, the grams produced with the default sizes for two
+   * input tokens, and the grams produced when both sizes are fixed to 2.
+   */
+  public void test() throws Exception {
+
+    try {
+      new CombinedNGramTokenFilter(null, 4, 1);
+      fail("Minimum size greater than maximum size must be rejected!");
+    } catch (IllegalArgumentException e) {
+      // all good
+    }
+
+    TokenStream ts;
+
+    // Default sizes: each token is wrapped as ^token$ and every gram of every
+    // length is expected, ordered by start position and then by size. The
+    // prefix alone and the suffix alone never show up as grams.
+    ts = new WhitespaceTokenizer(new StringReader("hello world"));
+    ts = new CombinedNGramTokenFilter(ts);
+
+    assertGrams(ts, new String[] {
+        "^h", "^he", "^hel", "^hell", "^hello", "^hello$",
+        "h", "he", "hel", "hell", "hello", "hello$",
+        "e", "el", "ell", "ello", "ello$",
+        "l", "ll", "llo", "llo$",
+        "l", "lo", "lo$",
+        "o", "o$",
+
+        "^w", "^wo", "^wor", "^worl", "^world", "^world$",
+        "w", "wo", "wor", "worl", "world", "world$",
+        "o", "or", "orl", "orld", "orld$",
+        "r", "rl", "rld", "rld$",
+        "l", "ld", "ld$",
+        "d", "d$"
+    });
+
+    assertNull(ts.next());
+
+    // Minimum and maximum size both 2: bigrams only.
+    ts = new WhitespaceTokenizer(new StringReader("hello"));
+    ts = new CombinedNGramTokenFilter(ts, 2, 2);
+
+    assertGrams(ts, new String[] {"^h", "he", "el", "ll", "lo", "o$"});
+
+    assertNull(ts.next());
+
+  }
+
+  /**
+   * Asserts that the next tokens of the stream carry the given term texts,
+   * in the given order.
+   *
+   * @param ts    stream to read from
+   * @param grams expected term texts
+   */
+  private void assertGrams(TokenStream ts, String[] grams) throws IOException {
+    for (int i = 0; i < grams.length; i++) {
+      assertNext(ts, grams[i]);
+    }
+  }
+
+  /**
+   * Reads the next token from the stream and asserts its term text.
+   *
+   * @param ts   stream to read from
+   * @param text expected term text of the next token
+   * @return the token that was read
+   */
+  private Token assertNext(TokenStream ts, String text) throws IOException {
+    Token token = ts.next();
+    // fail with a readable message instead of a NullPointerException when the stream ends early
+    assertNotNull("Stream ended but expected gram '" + text + "'", token);
+    assertEquals(text, new String(token.termBuffer(), 0, token.termLength()));
+    return token;
+  }
+
+}
Index: contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/CombinedNGramTokenFilter.java
===================================================================
--- contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/CombinedNGramTokenFilter.java (revision 0)
+++ contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/CombinedNGramTokenFilter.java (revision 0)
@@ -0,0 +1,162 @@
+package org.apache.lucene.analysis.ngram;
+
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+
+import java.io.IOException;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Token filter that replaces every input token with character n-grams of its
+ * term text.
+ * <p>
+ * The term text is first wrapped between a prefix character (default '^') and
+ * a suffix character (default '$'). Grams are then produced for each start
+ * position in the wrapped text, from the minimum size up to the maximum size,
+ * as long as they fit inside the wrapped text. A gram made of the prefix alone
+ * or of the suffix alone is never produced.
+ * <p>
+ * Every produced token has type {@link #type}, offsets and flags set to 0 and
+ * no payload; subclasses can adjust the token in
+ * {@link #updateGramToken(Token, int, int, char[], int)}.
+ */
+public class CombinedNGramTokenFilter extends TokenFilter {
+
+  /** Token type set on every gram produced by this filter. */
+  public static final String type = "gram";
+
+  private int minimumSize = 1;
+  private int maximumSize = Integer.MAX_VALUE;
+
+  private char prefix = '^';
+  private char suffix = '$';
+
+  /** Start index of the current gram in {@link #text}; -1 means a new input token must be read. */
+  private int start = -1;
+  /** Size of the gram considered last at {@link #start}. */
+  private int size;
+
+  /** Reusable buffer holding prefix + term text + suffix of the current input token. */
+  private char[] text = new char[100];
+  /** Number of valid chars in {@link #text}. */
+  private int textLength;
+
+  /**
+   * Creates a filter using the default sizes (minimum 1, maximum
+   * {@link Integer#MAX_VALUE}).
+   *
+   * @param input stream whose tokens are turned into grams
+   */
+  public CombinedNGramTokenFilter(TokenStream input) {
+    super(input);
+  }
+
+  /**
+   * Creates a filter producing grams of the given sizes.
+   *
+   * @param input       stream whose tokens are turned into grams
+   * @param minimumSize smallest gram size, at least 1
+   * @param maximumSize largest gram size, not smaller than minimumSize
+   * @throws IllegalArgumentException if minimumSize is less than 1 or greater than maximumSize
+   */
+  public CombinedNGramTokenFilter(TokenStream input, int minimumSize, int maximumSize) {
+    super(input);
+
+    // validate before touching any state
+    if (minimumSize < 1) {
+      throw new IllegalArgumentException("Minimum size must be greater than zero!");
+    }
+    if (minimumSize > maximumSize) {
+      throw new IllegalArgumentException("Minimum size must be less than or equal to maximum size!");
+    }
+
+    this.minimumSize = minimumSize;
+    this.maximumSize = maximumSize;
+  }
+
+  /**
+   * Returns the next gram, reading a new token from the input whenever the
+   * grams of the current one are exhausted.
+   *
+   * @param token reusable token; it is passed on to the input and filled with the gram
+   * @return the token holding the next gram, or null when the input has no more tokens
+   * @throws IOException if reading from the input fails
+   */
+  public Token next(Token token) throws IOException {
+
+    while (true) {
+      if (start == -1) {
+        token = input.next(token);
+        if (token == null) {
+          return null;
+        }
+        wrapTerm(token);
+        start = 0;
+        size = minimumSize;
+      } else {
+        size++;
+      }
+
+      // This check also covers the very first gram of a token: when the
+      // minimum size is larger than the wrapped text, no gram may be cut out
+      // of the stale part of the buffer. Written as a subtraction so that a
+      // very large size cannot overflow.
+      if (size > maximumSize || size > textLength - start) {
+        start++;
+        if (start >= textLength) {
+          start = -1;
+        }
+        // incremented back to minimumSize on the next pass, unless a new token is read
+        size = minimumSize - 1;
+        continue;
+      }
+
+      // don't create ngrams with only prefix or suffix!
+      if (size == 1 && start == 0) {
+        continue;
+      }
+      if (start == textLength - 1) {
+        start = -1;
+        continue;
+      }
+
+      token.setTermBuffer(text, start, size);
+      token.setTermLength(size);
+      token.setStartOffset(0);
+      token.setEndOffset(0);
+      token.setPayload(null);
+      token.setFlags(0);
+      token.setType(type);
+
+      updateGramToken(token, start, size, text, textLength);
+
+      return token;
+    }
+  }
+
+  /** Copies prefix + term text of the token + suffix into the text buffer, growing it if needed. */
+  private void wrapTerm(Token token) {
+    int termLength = token.termLength();
+    textLength = 1 + termLength + 1;
+    if (textLength > text.length) {
+      text = new char[textLength];
+    }
+    text[0] = prefix;
+    System.arraycopy(token.termBuffer(), 0, text, 1, termLength);
+    text[textLength - 1] = suffix;
+  }
+
+  /**
+   * Hook called for every gram right before it is returned; does nothing by
+   * default. Override it to adjust the token, for instance to calculate a
+   * payload weight.
+   *
+   * @param token      ngram token
+   * @param start      ngram start index in current input text
+   * @param size       ngram size
+   * @param text       input text, including prefix and suffix
+   * @param textLength input text length, including prefix and suffix
+   */
+  public void updateGramToken(Token token, int start, int size, char[] text, int textLength) {
+  }
+
+  /** @return the smallest gram size */
+  public int getMinimumSize() {
+    return minimumSize;
+  }
+
+  /**
+   * Sets the smallest gram size. The value is not validated.
+   *
+   * @param minimumSize smallest gram size
+   */
+  public void setMinimumSize(int minimumSize) {
+    this.minimumSize = minimumSize;
+  }
+
+  /** @return the largest gram size */
+  public int getMaximumSize() {
+    return maximumSize;
+  }
+
+  /**
+   * Sets the largest gram size. The value is not validated.
+   *
+   * @param maximumSize largest gram size
+   */
+  public void setMaximumSize(int maximumSize) {
+    this.maximumSize = maximumSize;
+  }
+
+  /** @return the char placed in front of the term text */
+  public char getPrefix() {
+    return prefix;
+  }
+
+  /** @param prefix the char placed in front of the term text */
+  public void setPrefix(char prefix) {
+    this.prefix = prefix;
+  }
+
+  /** @return the char placed after the term text */
+  public char getSuffix() {
+    return suffix;
+  }
+
+  /** @param suffix the char placed after the term text */
+  public void setSuffix(char suffix) {
+    this.suffix = suffix;
+  }
+
+  /** @return the class name followed by the configured sizes, prefix and suffix */
+  public String toString() {
+    return "CombinedNGramTokenFilter{" +
+        "minimumSize=" + minimumSize +
+        ", maximumSize=" + maximumSize +
+        ", prefix=" + prefix +
+        ", suffix=" + suffix +
+        '}';
+  }
+}