Index: src/test/org/apache/lucene/analysis/TestCharArraySet.java =================================================================== --- src/test/org/apache/lucene/analysis/TestCharArraySet.java (revision 0) +++ src/test/org/apache/lucene/analysis/TestCharArraySet.java (revision 0) @@ -0,0 +1,36 @@ +package org.apache.lucene.analysis; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.analysis.StopAnalyzer; +import org.apache.lucene.analysis.CharArraySet; + +public class TestCharArraySet extends LuceneTestCase +{ + public void testRehash() throws Exception { + CharArraySet cas = new CharArraySet(0, true); + for(int i=0;i= MAX_LOAD_FACTOR) - size *= 2; - mask = size-1; + while(startSize + (startSize>>2) > size) + size <<= 1; entries = new char[size][]; } - /** Returns true if the characters in text up to length - * len is present in the set. */ - public boolean contains(char[] text, int len) { + /** Create set from a Collection of char[] or String */ + public CharArraySet(Collection c, boolean ignoreCase) { + this(c.size(), ignoreCase); + addAll(c); + } + + /** true if the len chars of text starting at off + * are in the set */ + public boolean contains(char[] text, int off, int len) { + return entries[getSlot(text, off, len)] != null; + } + + /** true if the CharSequence is in the set */ + public boolean contains(CharSequence cs) { + return entries[getSlot(cs)] != null; + } + + private int getSlot(char[] text, int off, int len) { int code = getHashCode(text, len); - int pos = code & mask; + int pos = code & (entries.length-1); char[] text2 = entries[pos]; - if (text2 != null && !equals(text, len, text2)) { + if (text2 != null && !equals(text, off, len, text2)) { final int inc = ((code>>8)+code)|1; do { code += inc; - pos = code & mask; + pos = code & (entries.length-1); text2 = entries[pos]; - } while (text2 != null && !equals(text, len, text2)); + } while (text2 != null && !equals(text, off, len, text2)); } - return text2 != null; + return pos; } + private int getSlot(CharSequence text) { + int code = getHashCode(text); + int pos = code & (entries.length-1); + char[] text2 = entries[pos]; + if (text2 != null && !equals(text, text2)) { + final int inc = ((code>>8)+code)|1; + do { + code += inc; + pos = code & (entries.length-1); + text2 = entries[pos]; + } while (text2 != null && !equals(text, text2)); + } + return pos; + } + + /** Add this CharSequence into the set */ + public boolean add(CharSequence text) { + return add(text.toString()); // could be more efficient + } + /** Add this String into the set */ - public void add(String text) { - add(text.toCharArray()); + public boolean add(String text) { + return add(text.toCharArray()); } - /** Add this text into the set */ - public void add(char[] text) { + /** Add this char[] directly to the set. + * If ignoreCase is true for this Set, the text array will be modified. + * The user should never modify this text array after calling this method. + */ + public boolean add(char[] text) { if (ignoreCase) for(int i=0;i>8)+code)|1; - do { - code += inc; - pos = code & mask; - text2 = entries[pos]; - } while (text2 != null); - } - entries[pos] = text; + int slot = getSlot(text, 0, text.length); + if (entries[slot] != null) return false; + entries[slot] = text; count++; - if (((double) count)/entries.length > MAX_LOAD_FACTOR) { + if (count + (count>>2) > entries.length) { rehash(); } + + return true; } - private boolean equals(char[] text1, int len, char[] text2) { + // Assumes text2 is already lower-cased if ignoreCase==true + private boolean equals(char[] text1, int off, int len, char[] text2) { if (len != text2.length) return false; - for(int i=0;i>8)+code)|1; - do { - code += inc; - pos = code & mask; - } while (newEntries[pos] != null); - } - newEntries[pos] = text; + // todo: could be faster... no need to compare strings on collision + entries[getSlot(text,0,text.length)] = text; } } - - entries = newEntries; } private int getHashCode(char[] text, int len) { - int downto = len; int code = 0; - while (downto > 0) { - final char c; - if (ignoreCase) - c = Character.toLowerCase(text[--downto]); - else - c = text[--downto]; - code = (code*31) + c; + if (ignoreCase) { + for (int i=0; i for this set. Strings are constructed on the fly, so + * use nextCharArray for more efficient access. */ + public class CharArraySetIterator implements Iterator { + int pos=-1; + char[] next; + CharArraySetIterator() { + goNext(); + } + + private void goNext() { + next = null; + pos++; + while (pos < entries.length && (next=entries[pos]) == null) pos++; + } + + public boolean hasNext() { + return next != null; + } + + /** do not modify the returned char[] */ + public char[] nextCharArray() { + char[] ret = next; + goNext(); + return ret; + } + + /** Returns the next String, as a Set would... + * use nextCharArray() for better efficiency. */ + public Object next() { + return new String(nextCharArray()); + } + + public void remove() { + throw new UnsupportedOperationException(); + } + } + + + public Iterator iterator() { + return new CharArraySetIterator(); + } + } Index: src/java/org/apache/lucene/analysis/StopFilter.java =================================================================== --- src/java/org/apache/lucene/analysis/StopFilter.java (revision 590944) +++ src/java/org/apache/lucene/analysis/StopFilter.java (working copy) @@ -18,7 +18,7 @@ */ import java.io.IOException; -import java.util.HashSet; +import java.util.Arrays; import java.util.Iterator; import java.util.Set; @@ -29,7 +29,6 @@ public final class StopFilter extends TokenFilter { private final CharArraySet stopWords; - private final boolean ignoreCase; /** * Construct a token stream filtering the given input. @@ -45,13 +44,17 @@ */ public StopFilter(TokenStream in, String[] stopWords, boolean ignoreCase) { super(in); - this.ignoreCase = ignoreCase; - this.stopWords = makeStopCharArraySet(stopWords, ignoreCase); + this.stopWords = (CharArraySet)makeStopSet(stopWords, ignoreCase); } /** * Construct a token stream filtering the given input. + * If stopWords is an instance of {@link CharArraySet} or a Set constructed from + * makeStopSet() it will be directly used + * otherwise a new CharArraySet will be constructed from the given set. + * + * * @param input * @param stopWords The set of Stop Words, as Strings. If ignoreCase is true, all strings should be lower cased * @param ignoreCase -Ignore case when stopping. The stopWords set must be setup to contain only lower case words @@ -59,11 +62,12 @@ public StopFilter(TokenStream input, Set stopWords, boolean ignoreCase) { super(input); - this.ignoreCase = ignoreCase; - this.stopWords = new CharArraySet(stopWords.size(), ignoreCase); - Iterator it = stopWords.iterator(); - while(it.hasNext()) - this.stopWords.add((String) it.next()); + if (stopWords instanceof CharArraySet) { + this.stopWords = (CharArraySet)stopWords; + } else { + this.stopWords = new CharArraySet(stopWords.size(), ignoreCase); + this.stopWords.addAll(stopWords); + } } /** @@ -97,16 +101,8 @@ * @return a Set containing the words */ public static final Set makeStopSet(String[] stopWords, boolean ignoreCase) { - HashSet stopTable = new HashSet(stopWords.length); - for (int i = 0; i < stopWords.length; i++) - stopTable.add(ignoreCase ? stopWords[i].toLowerCase() : stopWords[i]); - return stopTable; - } - - private static final CharArraySet makeStopCharArraySet(String[] stopWords, boolean ignoreCase) { CharArraySet stopSet = new CharArraySet(stopWords.length, ignoreCase); - for (int i = 0; i < stopWords.length; i++) - stopSet.add(ignoreCase ? stopWords[i].toLowerCase() : stopWords[i]); + stopSet.addAll(Arrays.asList(stopWords)); return stopSet; } @@ -116,7 +112,7 @@ public final Token next(Token result) throws IOException { // return the first non-stop word found while((result = input.next(result)) != null) { - if (!stopWords.contains(result.termBuffer(), result.termLength)) + if (!stopWords.contains(result.termBuffer(), 0, result.termLength)) return result; } // reached EOS -- return null