Index: contrib/analyzers/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenizerTest.java
===================================================================
--- contrib/analyzers/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenizerTest.java	(revision 508508)
+++ contrib/analyzers/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenizerTest.java	(working copy)
@@ -37,7 +37,7 @@
   public void testInvalidInput() throws Exception {
     boolean gotException = false;
     try {
-      new EdgeNGramTokenizer(input, EdgeNGramTokenizer.Side.FRONT, 0);
+      new EdgeNGramTokenizer(input, EdgeNGramTokenizer.Side.FRONT, 0, 0);
     } catch (IllegalArgumentException e) {
       gotException = true;
     }
@@ -45,9 +45,19 @@
   }
 
   public void testInvalidInput2() throws Exception {
+    boolean gotException = false;
+    try {
+      new EdgeNGramTokenizer(input, EdgeNGramTokenizer.Side.FRONT, 2, 1);
+    } catch (IllegalArgumentException e) {
+      gotException = true;
+    }
+    assertTrue(gotException);
+  }
+
+  public void testInvalidInput3() throws Exception {
     boolean gotException = false;
     try {
-      new EdgeNGramTokenizer(input, EdgeNGramTokenizer.Side.FRONT, -1);
+      new EdgeNGramTokenizer(input, EdgeNGramTokenizer.Side.FRONT, -1, 2);
     } catch (IllegalArgumentException e) {
       gotException = true;
     }
@@ -55,7 +65,7 @@
   }
 
   public void testFrontUnigram() throws Exception {
-    EdgeNGramTokenizer tokenizer = new EdgeNGramTokenizer(input, EdgeNGramTokenizer.Side.FRONT, 1);
+    EdgeNGramTokenizer tokenizer = new EdgeNGramTokenizer(input, EdgeNGramTokenizer.Side.FRONT, 1, 1);
     Token token = null;
     token = tokenizer.next();
     assertEquals("(a,0,1)", token.toString());
@@ -64,7 +74,7 @@
   }
 
   public void testBackUnigram() throws Exception {
-    EdgeNGramTokenizer tokenizer = new EdgeNGramTokenizer(input, EdgeNGramTokenizer.Side.BACK, 1);
+    EdgeNGramTokenizer tokenizer = new EdgeNGramTokenizer(input, EdgeNGramTokenizer.Side.BACK, 1, 1);
     Token token = null;
     token = tokenizer.next();
     assertEquals("(e,4,5)", token.toString());
@@ -73,9 +83,69 @@
   }
 
   public void testOversizedNgrams() throws Exception {
-    EdgeNGramTokenizer tokenizer = new EdgeNGramTokenizer(input, EdgeNGramTokenizer.Side.FRONT, 6);
+    EdgeNGramTokenizer tokenizer = new EdgeNGramTokenizer(input, EdgeNGramTokenizer.Side.FRONT, 6, 6);
     Token token = null;
     token = tokenizer.next();
     assertNull(token);
   }
+
+  public void testFrontRangeOfNgrams() throws Exception {
+    EdgeNGramTokenizer tokenizer = new EdgeNGramTokenizer(input, EdgeNGramTokenizer.Side.FRONT, 1, 3);
+    Token token = null;
+    token = tokenizer.next();
+    assertEquals("(a,0,1)", token.toString());
+    token = tokenizer.next();
+    assertEquals("(ab,0,2)", token.toString());
+    token = tokenizer.next();
+    assertEquals("(abc,0,3)", token.toString());
+    token = tokenizer.next();
+    assertEquals("(b,1,2)", token.toString());
+    token = tokenizer.next();
+    assertEquals("(bc,1,3)", token.toString());
+    token = tokenizer.next();
+    assertEquals("(bcd,1,4)", token.toString());
+    token = tokenizer.next();
+    assertEquals("(c,2,3)", token.toString());
+    token = tokenizer.next();
+    assertEquals("(cd,2,4)", token.toString());
+    token = tokenizer.next();
+    assertEquals("(cde,2,5)", token.toString());
+    token = tokenizer.next();
+    assertEquals("(d,3,4)", token.toString());
+    token = tokenizer.next();
+    assertEquals("(de,3,5)", token.toString());
+    token = tokenizer.next();
+    assertNull(token);
+  }
+
+  public void testBackRangeOfNgrams() throws Exception {
+    EdgeNGramTokenizer tokenizer = new EdgeNGramTokenizer(input, EdgeNGramTokenizer.Side.BACK, 1, 3);
+    Token token = null;
+    token = tokenizer.next();
+    assertEquals("(e,4,5)", token.toString());
+    token = tokenizer.next();
+    assertEquals("(de,3,5)", token.toString());
+    token = tokenizer.next();
+    assertEquals("(cde,2,5)", token.toString());
+    token = tokenizer.next();
+    assertEquals("(d,3,4)", token.toString());
+    token = tokenizer.next();
+    assertEquals("(cd,2,4)", token.toString());
+    token = tokenizer.next();
+    assertEquals("(bcd,1,4)", token.toString());
+    token = tokenizer.next();
+    assertEquals("(c,2,3)", token.toString());
+    token = tokenizer.next();
+    assertEquals("(bc,1,3)", token.toString());
+    token = tokenizer.next();
+    assertEquals("(abc,0,3)", token.toString());
+    token = tokenizer.next();
+    assertEquals("(b,1,2)", token.toString());
+    token = tokenizer.next();
+    assertEquals("(ab,0,2)", token.toString());
+    token = tokenizer.next();
+    assertEquals("(a,0,1)", token.toString());
+    token = tokenizer.next();
+    assertNull(token);
+  }
 }
Index: contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java
===================================================================
--- contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java	(revision 508508)
+++ contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java	(working copy)
@@ -24,75 +24,157 @@
 import java.io.Reader;
 
 /**
- * Tokenizes the input into n-grams of the given size.
+ * Tokenizes the input from an edge into n-grams of given size(s).
  * @author Otis Gospodnetic
+ * @author Adam Hiatt
 */
 public class EdgeNGramTokenizer extends Tokenizer {
-  // which side to get the n-gram from
-  // TODO: switch to using this enum when we move to 1.5+
-//  public enum Side {
-//    FRONT (),
-//    BACK ();
-//  }
+  public static final Side DEFAULT_SIDE = Side.FRONT;
+
+  public static final int DEFAULT_MAX_GRAM_SIZE = 1;
+
+  public static final int DEFAULT_MIN_GRAM_SIZE = 1;
+
+  // Replace this with an enum when the Java 1.5 upgrade is made; the implementation will be simplified
 
   /** Specifies which side of the input the n-gram should be generated from */
   public static class Side {
+    private String label;
+
     /** Get the n-gram from the front of the input */
     public static Side FRONT = new Side("front");
+
     /** Get the n-gram from the end of the input */
     public static Side BACK = new Side("back");
-    private Side(String label) {}
+
+    // Private ctor
+    private Side(String label) {
+      this.label = label;
+    }
+
+    public String getLabel() {
+      return label;
+    }
+
+    // Get the appropriate Side from a string
+    public static Side getSide(String sideName) {
+      if (FRONT.getLabel().equals(sideName)) {
+        return FRONT;
+      } else if (BACK.getLabel().equals(sideName)) {
+        return BACK;
+      }
+      return null;
+    }
   }
+
+  private int minGram;
+  private int maxGram;
   private int gramSize;
   private Side side;
+  private boolean started = false;
+  private int currPos;
   private int inLen;
   private String inStr;
-  private boolean started = false;
 
   /**
-   * Creates EdgeNGramTokenizer that can generate an n-gram of the given size.
+   * Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range.
+   *
    * @param input Reader holding the input to be tokenized
-   * @param side the {@link Side} from which to chop off an n-gram
-   * @param gramSize the size of the n-gram to generate
+   * @param side the {@link Side} from which to chop off an n-gram
+   * @param minGram the smallest n-gram to generate
+   * @param maxGram the largest n-gram to generate
    */
-  public EdgeNGramTokenizer(Reader input, Side side, int gramSize) {
+  public EdgeNGramTokenizer(Reader input, Side side, int minGram, int maxGram) {
     super(input);
-    if (gramSize < 1) {
-      throw new IllegalArgumentException("gramSize must be greater than zero");
+
+    if (side == null) {
+      throw new IllegalArgumentException("sideLabel must be either front or back");
     }
-    this.gramSize = gramSize;
+
+    if (minGram < 1) {
+      throw new IllegalArgumentException("minGram must be greater than zero");
+    }
+
+    if (minGram > maxGram) {
+      throw new IllegalArgumentException("minGram must not be greater than maxGram");
+    }
+
+    this.minGram = minGram;
+    this.maxGram = maxGram;
     this.side = side;
+    this.currPos = 0;
   }
 
-  public EdgeNGramTokenizer(Reader input, String side, int gramSize) {
+  /**
+   * Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range.
+   *
+   * @param input Reader holding the input to be tokenized
+   * @param sideLabel the name of the {@link Side} from which to chop off an n-gram
+   * @param minGram the smallest n-gram to generate
+   * @param maxGram the largest n-gram to generate
+   */
+  public EdgeNGramTokenizer(Reader input, String sideLabel, int minGram, int maxGram) {
+    this(input, Side.getSide(sideLabel), minGram, maxGram);
   }
 
   /** Returns the next token in the stream, or null at EOS. */
   public final Token next() throws IOException {
-    // if we already returned the edge n-gram, we are done
-    if (started)
-      return null;
+    // on the first call, read the input and initialize the n-gram state
     if (!started) {
       started = true;
       char[] chars = new char[1024];
       input.read(chars);
-      inStr = new String(chars).trim();  // remove any trailing empty strings
+      inStr = new String(chars).trim();  // strip the trailing '\0' padding (and any surrounding whitespace)
       inLen = inStr.length();
+      gramSize = minGram;
+      if (side == Side.FRONT)
+        currPos = 0;
+      else
+        currPos = inLen;
     }
 
-    // if the input is too short, we can't generate any n-grams
-    if (gramSize > inLen)
-      return null;
-
-    if (side == Side.FRONT)
-      return new Token(inStr.substring(0, gramSize), 0, gramSize);
-    else
-      return new Token(inStr.substring(inLen-gramSize), inLen-gramSize, inLen);
-  }
-
-  static Side side(String label) {
-    if (label == null || label.trim().length() == 0)
-      throw new IllegalArgumentException("Label must be either 'front' or 'back'");
-    if (label.equals("front"))
-      return Side.FRONT;
-    else
-      return Side.BACK;
+    // if we have hit the end of our n-gram size range, go back to minGram
+    // and move to the next position in the input
+    if (gramSize > maxGram) {
+      gramSize = minGram;
+      if (side == Side.FRONT)
+        currPos++;
+      else
+        currPos--;
+    }
+
+    if (side == Side.FRONT) {
+      // not enough characters left for an n-gram of this size: we are done
+      if (inLen - currPos < gramSize) {
+        return null;
+      }
+    } else {
+      // the n-gram would run past the start of the input
+      if (currPos - gramSize < 0) {
+        if (gramSize == minGram) {
+          // not even the smallest n-gram fits at this position: we are done
+          return null;
+        }
+        // otherwise move one position toward the start and restart at the smallest size
+        currPos--;
+        gramSize = minGram;
+        if (currPos - gramSize < 0) {
+          return null;
+        }
+      }
+    }
+
+    Token tok;
+    if (side == Side.FRONT) {
+      tok = new Token(inStr.substring(currPos, currPos + gramSize), currPos, currPos + gramSize);
+    } else {
+      tok = new Token(inStr.substring(currPos - gramSize, currPos), currPos - gramSize, currPos);
+    }
+
+    gramSize++;
+    return tok;
   }
 }
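
For reference, a minimal sketch of how the patched tokenizer might be driven outside the unit tests. This is not part of the patch; the class name EdgeNGramDemo is hypothetical, and it assumes the patch is applied against the Lucene 2.1-era Token/Tokenizer API used above.

    import java.io.StringReader;
    import org.apache.lucene.analysis.Token;
    import org.apache.lucene.analysis.ngram.EdgeNGramTokenizer;

    public class EdgeNGramDemo {
      public static void main(String[] args) throws Exception {
        // front-edge n-grams of sizes 1 through 3 over "abcde"
        EdgeNGramTokenizer tokenizer = new EdgeNGramTokenizer(
            new StringReader("abcde"), EdgeNGramTokenizer.Side.FRONT, 1, 3);
        for (Token token = tokenizer.next(); token != null; token = tokenizer.next()) {
          System.out.println(token);  // (a,0,1), (ab,0,2), (abc,0,3), (b,1,2), ...
        }
      }
    }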