Index: contrib/analyzers/src/test/org/apache/lucene/analysis/ngram/TestCombinedNGramTokenFilter.java
===================================================================
--- contrib/analyzers/src/test/org/apache/lucene/analysis/ngram/TestCombinedNGramTokenFilter.java	(revision 0)
+++ contrib/analyzers/src/test/org/apache/lucene/analysis/ngram/TestCombinedNGramTokenFilter.java	(revision 0)
@@ -0,0 +1,233 @@
+package org.apache.lucene.analysis.ngram;
+
+import junit.framework.TestCase;
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.WhitespaceTokenizer;
+
+import java.io.IOException;
+import java.io.PrintWriter;
+import java.io.StringReader;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+public class TestCombinedNGramTokenFilter extends TestCase {
+
+  public void test() throws Exception {
+
+    try {
+      new CombinedNGramTokenFilter(null, 4, 1);
+      fail("Minimum size must not be greater than maximum size!");
+    } catch (IllegalArgumentException e) {
+      // all good
+    }
+
+    TokenStream ts;
+
+    ts = new WhitespaceTokenizer(new StringReader("0123 hello world"));
+    ts = new CombinedNGramTokenFilter(ts);
+
+    assertNext(ts, "^0", 0, 1, "^gram", 0);
+    assertNext(ts, "^01", 0, 2, "^gram", 0);
+    assertNext(ts, "^012", 0, 3, "^gram", 0);
+    assertNext(ts, "^0123", 0, 4, "^gram", 0);
+    assertNext(ts, "^0123$", 0, 4, "^gram$", 0);
+
+    assertNext(ts, "0", 0, 1, "gram", 0);
+    assertNext(ts, "01", 0, 2, "gram", 0);
+    assertNext(ts, "012", 0, 3, "gram", 0);
+    assertNext(ts, "0123", 0, 4, "gram", 0);
+    assertNext(ts, "0123$", 0, 4, "gram$", 0);
+
+    assertNext(ts, "1", 1, 2, "gram", 0);
+    assertNext(ts, "12", 1, 3, "gram", 0);
+    assertNext(ts, "123", 1, 4, "gram", 0);
+    assertNext(ts, "123$", 1, 4, "gram$", 0);
+
+    assertNext(ts, "2", 2, 3, "gram", 0);
+    assertNext(ts, "23", 2, 4, "gram", 0);
+    assertNext(ts, "23$", 2, 4, "gram$", 0);
+
+    assertNext(ts, "3", 3, 4, "gram", 0);
+    assertNext(ts, "3$", 3, 4, "gram$", 0);
+
+    assertNext(ts, "^h", 0, 1, "^gram", 0);
+    assertNext(ts, "^he", 0, 2, "^gram", 0);
+    assertNext(ts, "^hel", 0, 3, "^gram", 0);
+    assertNext(ts, "^hell", 0, 4, "^gram", 0);
+    assertNext(ts, "^hello", 0, 5, "^gram", 0);
+    assertNext(ts, "^hello$", 0, 5, "^gram$", 0);
+
+    assertNext(ts, "h", 0, 1, "gram", 0);
+    assertNext(ts, "he", 0, 2, "gram", 0);
+    assertNext(ts, "hel", 0, 3, "gram", 0);
+    assertNext(ts, "hell", 0, 4, "gram", 0);
+    assertNext(ts, "hello", 0, 5, "gram", 0);
+    assertNext(ts, "hello$", 0, 5, "gram$", 0);
+
+    assertNext(ts, "e", 1, 2, "gram", 0);
+    assertNext(ts, "el", 1, 3, "gram", 0);
+    assertNext(ts, "ell", 1, 4, "gram", 0);
+    assertNext(ts, "ello", 1, 5, "gram", 0);
+    assertNext(ts, "ello$", 1, 5, "gram$", 0);
+
+    assertNext(ts, "l", 2, 3, "gram", 0);
+    assertNext(ts, "ll", 2, 4, "gram", 0);
+    assertNext(ts, "llo", 2, 5, "gram", 0);
+    assertNext(ts, "llo$", 2, 5, "gram$", 0);
+
+    assertNext(ts, "l", 3, 4, "gram", 0);
+    assertNext(ts, "lo", 3, 5, "gram", 0);
+    assertNext(ts, "lo$", 3, 5, "gram$", 0);
+
+    assertNext(ts, "o", 4, 5, "gram", 0);
+    assertNext(ts, "o$", 4, 5, "gram$", 0);
+
+    assertNext(ts, "^w", 0, 1, "^gram", 0);
+    assertNext(ts, "^wo", 0, 2, "^gram", 0);
+    assertNext(ts, "^wor", 0, 3, "^gram", 0);
+    assertNext(ts, "^worl", 0, 4, "^gram", 0);
+    assertNext(ts, "^world", 0, 5, "^gram", 0);
+    assertNext(ts, "^world$", 0, 5, "^gram$", 0);
+
+    assertNext(ts, "w", 0, 1, "gram", 0);
+    assertNext(ts, "wo", 0, 2, "gram", 0);
+    assertNext(ts, "wor", 0, 3, "gram", 0);
+    assertNext(ts, "worl", 0, 4, "gram", 0);
+    assertNext(ts, "world", 0, 5, "gram", 0);
+    assertNext(ts, "world$", 0, 5, "gram$", 0);
+
+    assertNext(ts, "o", 1, 2, "gram", 0);
+    assertNext(ts, "or", 1, 3, "gram", 0);
+    assertNext(ts, "orl", 1, 4, "gram", 0);
+    assertNext(ts, "orld", 1, 5, "gram", 0);
+    assertNext(ts, "orld$", 1, 5, "gram$", 0);
+
+    assertNext(ts, "r", 2, 3, "gram", 0);
+    assertNext(ts, "rl", 2, 4, "gram", 0);
+    assertNext(ts, "rld", 2, 5, "gram", 0);
+    assertNext(ts, "rld$", 2, 5, "gram$", 0);
+
+    assertNext(ts, "l", 3, 4, "gram", 0);
+    assertNext(ts, "ld", 3, 5, "gram", 0);
+    assertNext(ts, "ld$", 3, 5, "gram$", 0);
+
+    assertNext(ts, "d", 4, 5, "gram", 0);
+    assertNext(ts, "d$", 4, 5, "gram$", 0);
+
+    assertNull(ts.next());
+
+
+    ts = new WhitespaceTokenizer(new StringReader("hello"));
+    ts = new CombinedNGramTokenFilter(ts, 2, 2);
+
+    assertNext(ts, "^h");
+    assertNext(ts, "he");
+    assertNext(ts, "el");
+    assertNext(ts, "ll");
+    assertNext(ts, "lo");
+    assertNext(ts, "o$");
+
+    assertNull(ts.next());
+
+  }
+
+  private Token assertNext(TokenStream ts, String text) throws IOException {
+    Token token = ts.next();
+    assertEquals(text, new String(token.termBuffer(), 0, token.termLength()));
+    return token;
+  }
+
+  private Token assertNext(TokenStream ts, String text, int startOffset, int endOffset) throws IOException {
+    Token token = ts.next();
+    assertEquals(text, new String(token.termBuffer(), 0, token.termLength()));
+    assertEquals(startOffset, token.startOffset());
+    assertEquals(endOffset, token.endOffset());
+    return token;
+  }
+
+  private Token assertNext(TokenStream ts, String text, int startOffset, int endOffset, String type) throws IOException {
+    Token token = ts.next();
+    assertEquals(text, new String(token.termBuffer(), 0, token.termLength()));
+    assertEquals(startOffset, token.startOffset());
+    assertEquals(endOffset, token.endOffset());
+    assertEquals(type, token.type());
+    return token;
+  }
+
+  private Token assertNext(TokenStream ts, String text, int startOffset, int endOffset, String type, int flags) throws IOException {
+    Token token = ts.next();
+    assertEquals(text, new String(token.termBuffer(), 0, token.termLength()));
+    assertEquals(startOffset, token.startOffset());
+    assertEquals(endOffset, token.endOffset());
+    assertEquals(type, token.type());
+    assertEquals(flags, token.getFlags());
+    return token;
+  }
+
+//  public void testCreate() throws IOException {
+//    PrintWriter out = new PrintWriter(System.out);
+//    out.append("    TokenStream ts;\n");
+//    appendTest("0123 hello world", out);
+//    out.flush();
+//  }
+//
+//  protected TokenStream tokenize(String text) {
+//    return tokenize(new WhitespaceTokenizer(new StringReader(text)));
+//  }
+//
+//  protected TokenStream tokenize(TokenStream ts) {
+//    //return new EdgeNGramTokenFilter(ts, EdgeNGramTokenFilter.Side.BACK, 1, 100);
+//    //return new NGramTokenFilter(ts, 1, 100);
+//    return new CombinedNGramTokenFilter(ts);
+//  }
+//
+//  private void appendTest(String text, PrintWriter out) throws IOException {
+//    out.append("    ts = tokenize(\"");
+//    out.append(text);
+//    out.append("\");\n");
+//
+//    TokenStream ts = tokenize(text);
+//    Token token = new Token();
+//    while ((token = ts.next(token)) != null) {
+//      appendNextAssert(out, token);
+//    }
+//    out.append("assertNull(ts.next());\n");
+//  }
+//
+//  private void appendNextAssert(PrintWriter out, Token token) throws IOException {
+//    out.append("    ");
+//    out.append("assertNext(ts, \"");
+//    out.append(new String(token.termBuffer(), 0, token.termLength()));
+//    out.append("\", ");
+//    out.append(String.valueOf(token.startOffset()));
+//    out.append(", ");
+//    out.append(String.valueOf(token.endOffset()));
+//    out.append(", \"");
+//    out.append(token.type());
+//    out.append("\", ");
+//    out.append(String.valueOf(token.getFlags()));
+//    out.append(");\n");
+//  }
+
+}
Index: contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/CombinedNGramTokenFilter.java
===================================================================
--- contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/CombinedNGramTokenFilter.java	(revision 0)
+++ contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/CombinedNGramTokenFilter.java	(revision 0)
@@ -0,0 +1,224 @@
+package org.apache.lucene.analysis.ngram;
+
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+
+import java.io.IOException;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Tokenizes the input into n-grams of the given size(s) with composite prefix and/or suffix markers.
+ */
+public class CombinedNGramTokenFilter extends TokenFilter {
+
+  private int minimumSize;
+  private int maximumSize;
+
+  private char prefix;
+  private char suffix;
+
+// todo
+//  /**
+//   * if true, the prefix and suffix do not count as part of the ngram size.
+//   * E.g. '^he' has an n of 2 if true, 3 if false
+//   */
+//  private boolean usingBoundaryCharsAsPartOfN;
+
+  /** current token text ngrams are created from */
+  private char[] text = new char[100];
+  private int textLength;
+
+  /** previous ngram start position in text */
+  private int start = -1;
+
+  /** previous ngram text size */
+  private int size;
+
+  private static final int defaultMinimumN = 1;
+  private static final int defaultMaximumN = Integer.MAX_VALUE;
+
+  public static final char defaultPrefix = '^';
+  public static final char defaultSuffix = '$';
+
+//  public static final boolean defaultUsingBoundaryCharsAsPartOfN = true;
+
+  public CombinedNGramTokenFilter(TokenStream input) {
+    this(input, defaultMinimumN, defaultMaximumN);
+  }
+
+  public CombinedNGramTokenFilter(TokenStream input, int minimumSize, int maximumSize) {
+    this(input, minimumSize, maximumSize, defaultPrefix, defaultSuffix/*, defaultUsingBoundaryCharsAsPartOfN*/);
+  }
+
+  public CombinedNGramTokenFilter(TokenStream input, int minimumSize, int maximumSize, char prefix, char suffix/*, boolean usingBoundaryCharsAsPartOfN*/) {
+    super(input);
+    this.minimumSize = minimumSize;
+    this.maximumSize = maximumSize;
+
+    this.prefix = prefix;
+    this.suffix = suffix;
+
+//    this.usingBoundaryCharsAsPartOfN = usingBoundaryCharsAsPartOfN;
+
+    if (minimumSize > maximumSize) {
+      throw new IllegalArgumentException("Minimum size must be less than or equal to maximum size!");
+    }
+  }
+
+  public Token next(Token token) throws IOException {
+
+    if (start == -1) {
+      // fetch the next input token and frame it with the prefix and suffix markers
+      token = input.next(token);
+      if (token == null) {
+        return null;
+      }
+
+      textLength = 1 + token.termLength() + 1;
+      if (textLength > text.length) {
+        text = new char[textLength];
+      }
+      text[0] = prefix;
+      System.arraycopy(token.termBuffer(), 0, text, 1, token.termLength());
+      text[textLength - 1] = suffix;
+      start = 0;
+      size = minimumSize;
+    } else {
+      size++;
+
+      if (size > maximumSize || size + start > textLength) {
+        // grams exhausted at this start position; move one step to the right
+        start++;
+        if (start >= textLength) {
+          start = -1;
+        }
+        size = minimumSize - 1;
+        return next(token);
+      }
+    }
+
+    // don't create ngrams with only prefix or suffix!
+    if (size == 1 && start == 0) {
+      return next(token);
+    } else if (start == textLength - 1) {
+      start = -1;
+      return next(token);
+    }
+
+    token.setTermBuffer(text, start, size);
+    token.setTermLength(size);
+
+    if (start == 0) {
+      token.setStartOffset(start);
+      if (size == textLength) {
+        token.setType(completeGramType);
+        token.setEndOffset(start + size - 2);
+      } else {
+        token.setType(prefixGramType);
+        token.setEndOffset(start + size - 1);
+      }
+    } else {
+      token.setStartOffset(start - 1);
+      if (size == textLength - start) {
+        token.setType(suffixGramType);
+        token.setEndOffset(start + size - 2);
+      } else {
+        token.setType(innerGramType);
+        token.setEndOffset(start + size - 1);
+      }
+    }
+
+    token.setPayload(null);
+    token.setFlags(0);
+
+    updateGramToken(token, start, size, text, textLength);
+
+    return token;
+  }
+
+  public static final String innerGramType = "gram";
+  public static final String prefixGramType = "^gram";
+  public static final String suffixGramType = "gram$";
+  public static final String completeGramType = "^gram$";
+
+  /**
+   * You might want to override this to
+   * calculate payload weight,
+   * set the token type or a token flag bit when the prefix or suffix is part of the gram,
+   * etc.
+   *
+   * @param token ngram token
+   * @param start ngram start index in current input text
+   * @param size ngram size
+   * @param text input text the ngram token is based on. don't modify this!
+   * @param textLength input text length
+   */
+  public void updateGramToken(Token token, int start, int size, char[] text, int textLength) {
+  }
+
+  public int getMinimumSize() {
+    return minimumSize;
+  }
+
+  public void setMinimumSize(int minimumSize) {
+    this.minimumSize = minimumSize;
+  }
+
+  public int getMaximumSize() {
+    return maximumSize;
+  }
+
+  public void setMaximumSize(int maximumSize) {
+    this.maximumSize = maximumSize;
+  }
+
+  public char getPrefix() {
+    return prefix;
+  }
+
+  public void setPrefix(char prefix) {
+    this.prefix = prefix;
+  }
+
+  public char getSuffix() {
+    return suffix;
+  }
+
+  public void setSuffix(char suffix) {
+    this.suffix = suffix;
+  }
+
+//  public boolean isUsingBoundaryCharsAsPartOfN() {
+//    return usingBoundaryCharsAsPartOfN;
+//  }
+//
+//  public void setUsingBoundaryCharsAsPartOfN(boolean usingBoundaryCharsAsPartOfN) {
+//    this.usingBoundaryCharsAsPartOfN = usingBoundaryCharsAsPartOfN;
+//  }
+
+  public String toString() {
+    return "CombinedNGramTokenFilter{" +
+        "minimumSize=" + minimumSize +
+        ", maximumSize=" + maximumSize +
+        ", prefix=" + prefix +
+        ", suffix=" + suffix +
+//        ", usingBoundaryCharsAsPartOfN=" + usingBoundaryCharsAsPartOfN +
+        '}';
+  }
+}
\ No newline at end of file
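For reference, a minimal usage sketch (not part of the patch) showing how the filter is driven; it mirrors the second case in the test above, and the class name CombinedNGramUsage is illustrative only:

import java.io.StringReader;

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.ngram.CombinedNGramTokenFilter;

// Illustrative driver, not part of the patch: prints each gram with its type.
public class CombinedNGramUsage {
  public static void main(String[] args) throws Exception {
    TokenStream ts = new WhitespaceTokenizer(new StringReader("hello"));
    ts = new CombinedNGramTokenFilter(ts, 2, 2);
    // With min = max = 2 the test above expects: ^h he el ll lo o$
    Token token;
    while ((token = ts.next()) != null) {
      System.out.println(new String(token.termBuffer(), 0, token.termLength())
          + " [" + token.type() + "]");
    }
  }
}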
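And a sketch of the updateGramToken extension point that the javadoc describes: a hypothetical subclass that marks complete grams (type "^gram$") with a payload. The subclass name and the one-byte payload scheme are assumptions, not part of the patch; note that next(Token) clears the payload and flags before calling updateGramToken, so whatever the override sets is what the consumer sees.

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.ngram.CombinedNGramTokenFilter;
import org.apache.lucene.index.Payload;

// Hypothetical subclass: attaches a marker payload to grams spanning
// prefix + whole term + suffix, e.g. so a scorer could boost exact matches.
public class PayloadNGramTokenFilter extends CombinedNGramTokenFilter {
  public PayloadNGramTokenFilter(TokenStream input) {
    super(input);
  }

  public void updateGramToken(Token token, int start, int size, char[] text, int textLength) {
    // size == textLength means the gram covers the entire framed token text.
    if (size == textLength) {
      token.setPayload(new Payload(new byte[]{1}));
    }
  }
}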