Index: src/test/org/apache/lucene/analysis/TestStopFilter.java
===================================================================
--- src/test/org/apache/lucene/analysis/TestStopFilter.java (revision 590752)
+++ src/test/org/apache/lucene/analysis/TestStopFilter.java (working copy)
@@ -16,11 +16,12 @@
* limitations under the License.
*/
+import org.apache.lucene.util.LuceneTestCase;
+
import java.io.IOException;
import java.io.StringReader;
+import java.util.Set;
-import org.apache.lucene.util.LuceneTestCase;
-
/**
* @author yonik
*/
@@ -45,4 +46,14 @@
assertEquals(null,stream.next());
}
+ public void testStopFilt() throws IOException {
+ StringReader reader = new StringReader("Now is The Time");
+ String[] stopWords = new String[] { "is", "the", "Time" };
+ Set stopSet = StopFilter.makeStopSet(stopWords);
+ TokenStream stream = new StopFilter(new WhitespaceTokenizer(reader), stopSet);
+ assertEquals("Now", stream.next().termText());
+ assertEquals("The", stream.next().termText());
+ assertEquals(null, stream.next());
+ }
+
}
Index: src/java/org/apache/lucene/analysis/CharArraySet.java
===================================================================
--- src/java/org/apache/lucene/analysis/CharArraySet.java (revision 590752)
+++ src/java/org/apache/lucene/analysis/CharArraySet.java (working copy)
@@ -1,5 +1,9 @@
package org.apache.lucene.analysis;
+import java.util.AbstractSet;
+import java.util.Collection;
+import java.util.Iterator;
+
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@@ -19,131 +23,265 @@
/**
- * A simple class that can store & retrieve char[]'s in a
+ * A simple class that stores Strings as char[]'s in a
* hash table. Note that this is not a general purpose
- * class. For example, it cannot remove char[]'s from the
+ * class. For example, it cannot remove items from the
* set, nor does it resize its hash table to be smaller,
- * etc. It is designed for use with StopFilter to enable
- * quick filtering based on the char[] termBuffer in a
- * Token.
+ * etc. It is designed to be quick to test if a char[]
+ * is in the set without the necessity of converting it
+ * to a String first.
*/
-final class CharArraySet {
-
+public class CharArraySet extends AbstractSet {
private final static int INIT_SIZE = 8;
- private final static double MAX_LOAD_FACTOR = 0.75;
- private int mask;
private char[][] entries;
private int count;
- private boolean ignoreCase;
+ private final boolean ignoreCase;
/** Create set with enough capacity to hold startSize
* terms */
public CharArraySet(int startSize, boolean ignoreCase) {
this.ignoreCase = ignoreCase;
int size = INIT_SIZE;
- while(((double) startSize)/size >= MAX_LOAD_FACTOR)
- size *= 2;
- mask = size-1;
+ while(startSize + (startSize>>2) > size)
+ size <<= 1;
entries = new char[size][];
}
- /** Returns true if the characters in text up to length
- * len is present in the set. */
- public boolean contains(char[] text, int len) {
+ /** Create set from a Collection of char[] or String */
+ public CharArraySet(Collection c, boolean ignoreCase) {
+ this(c.size(), ignoreCase);
+ addAll(c);
+ }
+
+ /** true if the len chars of text starting at off
+ * are in the set */
+ public boolean contains(char[] text, int off, int len) {
+ return entries[getSlot(text, off, len)] != null;
+ }
+
+ /** true if the CharSequence is in the set */
+ public boolean contains(CharSequence cs) {
+ return entries[getSlot(cs)] != null;
+ }
+
+ private int getSlot(char[] text, int off, int len) {
int code = getHashCode(text, len);
- int pos = code & mask;
+ int pos = code & (entries.length-1);
char[] text2 = entries[pos];
- if (text2 != null && !equals(text, len, text2)) {
+ if (text2 != null && !equals(text, off, len, text2)) {
final int inc = ((code>>8)+code)|1;
do {
code += inc;
- pos = code & mask;
+ pos = code & (entries.length-1);
text2 = entries[pos];
- } while (text2 != null && !equals(text, len, text2));
+ } while (text2 != null && !equals(text, off, len, text2));
}
- return text2 != null;
+ return pos;
}
+ /** Returns true if the String is in the set */
+ private int getSlot(CharSequence text) {
+ int code = getHashCode(text);
+ int pos = code & (entries.length-1);
+ char[] text2 = entries[pos];
+ if (text2 != null && !equals(text, text2)) {
+ final int inc = ((code>>8)+code)|1;
+ do {
+ code += inc;
+ pos = code & (entries.length-1);
+ text2 = entries[pos];
+ } while (text2 != null && !equals(text, text2));
+ }
+ return pos;
+ }
+
+ /** Add this CharSequence into the set */
+ public boolean add(CharSequence text) {
+ return add(text.toString()); // could be more efficient
+ }
+
/** Add this String into the set */
- public void add(String text) {
- add(text.toCharArray());
+ public boolean add(String text) {
+ return add(text.toCharArray());
}
- /** Add this text into the set */
- public void add(char[] text) {
+ /** Add this char[] directly to the set.
+ * If ignoreCase is true for this Set, the text array will be modified.
+ * The user should never modify this text array after calling this method.
+ */
+ public boolean add(char[] text) {
if (ignoreCase)
for(int i=0;i>8)+code)|1;
- do {
- code += inc;
- pos = code & mask;
- text2 = entries[pos];
- } while (text2 != null);
- }
- entries[pos] = text;
+ int slot = getSlot(text, 0, text.length);
+ if (entries[slot] != null) return false;
+ entries[slot] = text;
count++;
- if (((double) count)/entries.length > MAX_LOAD_FACTOR) {
+ if (count > entries.length + (entries.length>>2) ) {
rehash();
}
+
+ return true;
}
- private boolean equals(char[] text1, int len, char[] text2) {
+ private boolean equals(char[] text1, int off, int len, char[] text2) {
if (len != text2.length)
return false;
- for(int i=0;i>8)+code)|1;
- do {
- code += inc;
- pos = code & mask;
- } while (newEntries[pos] != null);
- }
- newEntries[pos] = text;
+ // todo: could be faster... no need to compare strings on collision
+ entries[ getSlot(text,0,text.length) ] = text;
}
}
-
- entries = newEntries;
}
private int getHashCode(char[] text, int len) {
- int downto = len;
int code = 0;
- while (downto > 0) {
- final char c;
- if (ignoreCase)
- c = Character.toLowerCase(text[--downto]);
- else
- c = text[--downto];
- code = (code*31) + c;
+ if (ignoreCase) {
+ for (int i=0; i for this set. Strings are constructed on the fly, so
+ * use nextCharArray for more efficient access. */
+ public class CharArraySetIterator implements Iterator {
+ int pos=-1;
+ char[] next;
+ CharArraySetIterator() {
+ goNext();
+ }
+
+ private void goNext() {
+ next = null;
+ pos++;
+ while (pos < entries.length && (next=entries[pos]) == null) pos++;
+ }
+
+ public boolean hasNext() {
+ return next != null;
+ }
+
+ /** do not modify the returned char[] */
+ public char[] nextCharArray() {
+ char[] ret = next;
+ goNext();
+ return ret;
+ }
+
+ /** Returns the next String, as a Set would...
+ * use nextCharArray() for better efficiency. */
+ public Object next() {
+ return new String(nextCharArray());
+ }
+
+ public void remove() {
+ throw new UnsupportedOperationException();
+ }
+ }
+
+
+ public Iterator iterator() {
+ return new CharArraySetIterator();
+ }
+
}
Index: src/java/org/apache/lucene/analysis/StopFilter.java
===================================================================
--- src/java/org/apache/lucene/analysis/StopFilter.java (revision 590752)
+++ src/java/org/apache/lucene/analysis/StopFilter.java (working copy)
@@ -18,7 +18,7 @@
*/
import java.io.IOException;
-import java.util.HashSet;
+import java.util.Arrays;
import java.util.Iterator;
import java.util.Set;
@@ -29,7 +29,6 @@
public final class StopFilter extends TokenFilter {
private final CharArraySet stopWords;
- private final boolean ignoreCase;
/**
* Construct a token stream filtering the given input.
@@ -45,13 +44,17 @@
*/
public StopFilter(TokenStream in, String[] stopWords, boolean ignoreCase) {
super(in);
- this.ignoreCase = ignoreCase;
- this.stopWords = makeStopCharArraySet(stopWords, ignoreCase);
+ this.stopWords = (CharArraySet)makeStopSet(stopWords, ignoreCase);
}
/**
* Construct a token stream filtering the given input.
+ * If stopWords is an instance of {@link CharArraySet} or a Set constructed from
+ * makeStopSet() it will be directly used
+ * otherwise a new CharArraySet will be constructed from the given set.
+ *
+ *
* @param input
* @param stopWords The set of Stop Words, as Strings. If ignoreCase is true, all strings should be lower cased
* @param ignoreCase -Ignore case when stopping. The stopWords set must be setup to contain only lower case words
@@ -59,11 +62,12 @@
public StopFilter(TokenStream input, Set stopWords, boolean ignoreCase)
{
super(input);
- this.ignoreCase = ignoreCase;
- this.stopWords = new CharArraySet(stopWords.size(), ignoreCase);
- Iterator it = stopWords.iterator();
- while(it.hasNext())
- this.stopWords.add((String) it.next());
+ if (stopWords instanceof CharArraySet) {
+ this.stopWords = (CharArraySet)stopWords;
+ } else {
+ this.stopWords = new CharArraySet(stopWords.size(), ignoreCase);
+ this.stopWords.addAll(stopWords);
+ }
}
/**
@@ -97,18 +101,9 @@
* @return a Set containing the words
*/
public static final Set makeStopSet(String[] stopWords, boolean ignoreCase) {
- HashSet stopTable = new HashSet(stopWords.length);
- for (int i = 0; i < stopWords.length; i++)
- stopTable.add(ignoreCase ? stopWords[i].toLowerCase() : stopWords[i]);
- return stopTable;
- }
-
- private static final CharArraySet makeStopCharArraySet(String[] stopWords, boolean ignoreCase) {
CharArraySet stopSet = new CharArraySet(stopWords.length, ignoreCase);
- for (int i = 0; i < stopWords.length; i++)
- stopSet.add(ignoreCase ? stopWords[i].toLowerCase() : stopWords[i]);
- return stopSet;
- }
+ stopSet.addAll(Arrays.asList(stopWords));
+ return stopSet; }
/**
* Returns the next input Token whose termText() is not a stop word.
@@ -116,7 +111,7 @@
public final Token next(Token result) throws IOException {
// return the first non-stop word found
while((result = input.next(result)) != null) {
- if (!stopWords.contains(result.termBuffer(), result.termLength))
+ if (!stopWords.contains(result.termBuffer(), 0, result.termLength))
return result;
}
// reached EOS -- return null