Index: src/test/org/apache/lucene/analysis/TestStopFilter.java =================================================================== --- src/test/org/apache/lucene/analysis/TestStopFilter.java (revision 590752) +++ src/test/org/apache/lucene/analysis/TestStopFilter.java (working copy) @@ -16,11 +16,12 @@ * limitations under the License. */ +import org.apache.lucene.util.LuceneTestCase; + import java.io.IOException; import java.io.StringReader; +import java.util.Set; -import org.apache.lucene.util.LuceneTestCase; - /** * @author yonik */ @@ -45,4 +46,14 @@ assertEquals(null,stream.next()); } + public void testStopFilt() throws IOException { + StringReader reader = new StringReader("Now is The Time"); + String[] stopWords = new String[] { "is", "the", "Time" }; + Set stopSet = StopFilter.makeStopSet(stopWords); + TokenStream stream = new StopFilter(new WhitespaceTokenizer(reader), stopSet); + assertEquals("Now", stream.next().termText()); + assertEquals("The", stream.next().termText()); + assertEquals(null, stream.next()); + } + } Index: src/java/org/apache/lucene/analysis/CharArraySet.java =================================================================== --- src/java/org/apache/lucene/analysis/CharArraySet.java (revision 590752) +++ src/java/org/apache/lucene/analysis/CharArraySet.java (working copy) @@ -1,5 +1,9 @@ package org.apache.lucene.analysis; +import java.util.AbstractSet; +import java.util.Collection; +import java.util.Iterator; + /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with @@ -19,131 +23,265 @@ /** - * A simple class that can store & retrieve char[]'s in a + * A simple class that stores Strings as char[]'s in a * hash table. Note that this is not a general purpose - * class. For example, it cannot remove char[]'s from the + * class. For example, it cannot remove items from the * set, nor does it resize its hash table to be smaller, - * etc. It is designed for use with StopFilter to enable - * quick filtering based on the char[] termBuffer in a - * Token. + * etc. It is designed to be quick to test if a char[] + * is in the set without the necessity of converting it + * to a String first. */ -final class CharArraySet { - +public class CharArraySet extends AbstractSet { private final static int INIT_SIZE = 8; - private final static double MAX_LOAD_FACTOR = 0.75; - private int mask; private char[][] entries; private int count; - private boolean ignoreCase; + private final boolean ignoreCase; /** Create set with enough capacity to hold startSize * terms */ public CharArraySet(int startSize, boolean ignoreCase) { this.ignoreCase = ignoreCase; int size = INIT_SIZE; - while(((double) startSize)/size >= MAX_LOAD_FACTOR) - size *= 2; - mask = size-1; + while(startSize + (startSize>>2) > size) + size <<= 1; entries = new char[size][]; } - /** Returns true if the characters in text up to length - * len is present in the set. */ - public boolean contains(char[] text, int len) { + /** Create set from a Collection of char[] or String */ + public CharArraySet(Collection c, boolean ignoreCase) { + this(c.size(), ignoreCase); + addAll(c); + } + + /** true if the len chars of text starting at off + * are in the set */ + public boolean contains(char[] text, int off, int len) { + return entries[getSlot(text, off, len)] != null; + } + + /** true if the CharSequence is in the set */ + public boolean contains(CharSequence cs) { + return entries[getSlot(cs)] != null; + } + + private int getSlot(char[] text, int off, int len) { int code = getHashCode(text, len); - int pos = code & mask; + int pos = code & (entries.length-1); char[] text2 = entries[pos]; - if (text2 != null && !equals(text, len, text2)) { + if (text2 != null && !equals(text, off, len, text2)) { final int inc = ((code>>8)+code)|1; do { code += inc; - pos = code & mask; + pos = code & (entries.length-1); text2 = entries[pos]; - } while (text2 != null && !equals(text, len, text2)); + } while (text2 != null && !equals(text, off, len, text2)); } - return text2 != null; + return pos; } + /** Returns true if the String is in the set */ + private int getSlot(CharSequence text) { + int code = getHashCode(text); + int pos = code & (entries.length-1); + char[] text2 = entries[pos]; + if (text2 != null && !equals(text, text2)) { + final int inc = ((code>>8)+code)|1; + do { + code += inc; + pos = code & (entries.length-1); + text2 = entries[pos]; + } while (text2 != null && !equals(text, text2)); + } + return pos; + } + + /** Add this CharSequence into the set */ + public boolean add(CharSequence text) { + return add(text.toString()); // could be more efficient + } + /** Add this String into the set */ - public void add(String text) { - add(text.toCharArray()); + public boolean add(String text) { + return add(text.toCharArray()); } - /** Add this text into the set */ - public void add(char[] text) { + /** Add this char[] directly to the set. + * If ignoreCase is true for this Set, the text array will be modified. + * The user should never modify this text array after calling this method. + */ + public boolean add(char[] text) { if (ignoreCase) for(int i=0;i>8)+code)|1; - do { - code += inc; - pos = code & mask; - text2 = entries[pos]; - } while (text2 != null); - } - entries[pos] = text; + int slot = getSlot(text, 0, text.length); + if (entries[slot] != null) return false; + entries[slot] = text; count++; - if (((double) count)/entries.length > MAX_LOAD_FACTOR) { + if (count > entries.length + (entries.length>>2) ) { rehash(); } + + return true; } - private boolean equals(char[] text1, int len, char[] text2) { + private boolean equals(char[] text1, int off, int len, char[] text2) { if (len != text2.length) return false; - for(int i=0;i>8)+code)|1; - do { - code += inc; - pos = code & mask; - } while (newEntries[pos] != null); - } - newEntries[pos] = text; + // todo: could be faster... no need to compare strings on collision + entries[ getSlot(text,0,text.length) ] = text; } } - - entries = newEntries; } private int getHashCode(char[] text, int len) { - int downto = len; int code = 0; - while (downto > 0) { - final char c; - if (ignoreCase) - c = Character.toLowerCase(text[--downto]); - else - c = text[--downto]; - code = (code*31) + c; + if (ignoreCase) { + for (int i=0; i for this set. Strings are constructed on the fly, so + * use nextCharArray for more efficient access. */ + public class CharArraySetIterator implements Iterator { + int pos=-1; + char[] next; + CharArraySetIterator() { + goNext(); + } + + private void goNext() { + next = null; + pos++; + while (pos < entries.length && (next=entries[pos]) == null) pos++; + } + + public boolean hasNext() { + return next != null; + } + + /** do not modify the returned char[] */ + public char[] nextCharArray() { + char[] ret = next; + goNext(); + return ret; + } + + /** Returns the next String, as a Set would... + * use nextCharArray() for better efficiency. */ + public Object next() { + return new String(nextCharArray()); + } + + public void remove() { + throw new UnsupportedOperationException(); + } + } + + + public Iterator iterator() { + return new CharArraySetIterator(); + } + } Index: src/java/org/apache/lucene/analysis/StopFilter.java =================================================================== --- src/java/org/apache/lucene/analysis/StopFilter.java (revision 590752) +++ src/java/org/apache/lucene/analysis/StopFilter.java (working copy) @@ -18,7 +18,7 @@ */ import java.io.IOException; -import java.util.HashSet; +import java.util.Arrays; import java.util.Iterator; import java.util.Set; @@ -29,7 +29,6 @@ public final class StopFilter extends TokenFilter { private final CharArraySet stopWords; - private final boolean ignoreCase; /** * Construct a token stream filtering the given input. @@ -45,13 +44,17 @@ */ public StopFilter(TokenStream in, String[] stopWords, boolean ignoreCase) { super(in); - this.ignoreCase = ignoreCase; - this.stopWords = makeStopCharArraySet(stopWords, ignoreCase); + this.stopWords = (CharArraySet)makeStopSet(stopWords, ignoreCase); } /** * Construct a token stream filtering the given input. + * If stopWords is an instance of {@link CharArraySet} or a Set constructed from + * makeStopSet() it will be directly used + * otherwise a new CharArraySet will be constructed from the given set. + * + * * @param input * @param stopWords The set of Stop Words, as Strings. If ignoreCase is true, all strings should be lower cased * @param ignoreCase -Ignore case when stopping. The stopWords set must be setup to contain only lower case words @@ -59,11 +62,12 @@ public StopFilter(TokenStream input, Set stopWords, boolean ignoreCase) { super(input); - this.ignoreCase = ignoreCase; - this.stopWords = new CharArraySet(stopWords.size(), ignoreCase); - Iterator it = stopWords.iterator(); - while(it.hasNext()) - this.stopWords.add((String) it.next()); + if (stopWords instanceof CharArraySet) { + this.stopWords = (CharArraySet)stopWords; + } else { + this.stopWords = new CharArraySet(stopWords.size(), ignoreCase); + this.stopWords.addAll(stopWords); + } } /** @@ -97,18 +101,9 @@ * @return a Set containing the words */ public static final Set makeStopSet(String[] stopWords, boolean ignoreCase) { - HashSet stopTable = new HashSet(stopWords.length); - for (int i = 0; i < stopWords.length; i++) - stopTable.add(ignoreCase ? stopWords[i].toLowerCase() : stopWords[i]); - return stopTable; - } - - private static final CharArraySet makeStopCharArraySet(String[] stopWords, boolean ignoreCase) { CharArraySet stopSet = new CharArraySet(stopWords.length, ignoreCase); - for (int i = 0; i < stopWords.length; i++) - stopSet.add(ignoreCase ? stopWords[i].toLowerCase() : stopWords[i]); - return stopSet; - } + stopSet.addAll(Arrays.asList(stopWords)); + return stopSet; } /** * Returns the next input Token whose termText() is not a stop word. @@ -116,7 +111,7 @@ public final Token next(Token result) throws IOException { // return the first non-stop word found while((result = input.next(result)) != null) { - if (!stopWords.contains(result.termBuffer(), result.termLength)) + if (!stopWords.contains(result.termBuffer(), 0, result.termLength)) return result; } // reached EOS -- return null