Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java
===================================================================
--- lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java (revision 1458840)
+++ lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java (working copy)
@@ -71,8 +71,10 @@
import org.apache.lucene.analysis.miscellaneous.KeepWordFilter;
import org.apache.lucene.analysis.miscellaneous.LengthFilter;
import org.apache.lucene.analysis.miscellaneous.LimitTokenCountFilter;
+import org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilter;
import org.apache.lucene.analysis.miscellaneous.TrimFilter;
import org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter;
+import org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilter.StemmerOverrideMap;
import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter;
import org.apache.lucene.analysis.ngram.EdgeNGramTokenizer;
import org.apache.lucene.analysis.ngram.NGramTokenFilter;
@@ -580,6 +582,29 @@
return map;
}
});
+ put(StemmerOverrideMap.class, new ArgProducer() {
+ @Override public Object create(Random random) {
+ int num = random.nextInt(10);
+ StemmerOverrideFilter.Builder builder = new StemmerOverrideFilter.Builder();
+ for (int i = 0; i < num; i++) {
+ String input = "";
+ do {
+ input = _TestUtil.randomRealisticUnicodeString(random);
+ } while(input.isEmpty());
+          String out = "";
+ do {
+ out = _TestUtil.randomRealisticUnicodeString(random);
+ } while(out.isEmpty());
+ builder.add(input, out);
+ }
+ try {
+ return builder.build();
+ } catch (Exception ex) {
+ Rethrow.rethrow(ex);
+ return null; // unreachable code
+ }
+ }
+ });
put(SynonymMap.class, new ArgProducer() {
@Override public Object create(Random random) {
SynonymMap.Builder b = new SynonymMap.Builder(random.nextBoolean());
Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestStemmerOverrideFilter.java
===================================================================
--- lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestStemmerOverrideFilter.java (revision 1458857)
+++ lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestStemmerOverrideFilter.java (working copy)
@@ -1,15 +1,4 @@
package org.apache.lucene.analysis.miscellaneous;
-
-import java.io.IOException;
-import java.io.StringReader;
-
-import org.apache.lucene.analysis.BaseTokenStreamTestCase;
-import org.apache.lucene.analysis.core.KeywordTokenizer;
-import org.apache.lucene.analysis.en.PorterStemFilter;
-import org.apache.lucene.analysis.util.CharArrayMap;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.Tokenizer;
-
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@@ -26,17 +15,112 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Map.Entry;
+import java.util.Set;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.KeywordTokenizer;
+import org.apache.lucene.analysis.core.WhitespaceTokenizer;
+import org.apache.lucene.analysis.en.PorterStemFilter;
+import org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilter.StemmerOverrideMap;
+import org.apache.lucene.util._TestUtil;
+
+/**
+ * Tests for {@link StemmerOverrideFilter} and its FST-backed {@link StemmerOverrideFilter.Builder}.
+ */
public class TestStemmerOverrideFilter extends BaseTokenStreamTestCase {
public void testOverride() throws IOException {
// lets make booked stem to books
// the override filter will convert "booked" to "books",
// but also mark it with KeywordAttribute so Porter will not change it.
- CharArrayMap dictionary = new CharArrayMap(TEST_VERSION_CURRENT, 1, false);
- dictionary.put("booked", "books");
+ StemmerOverrideFilter.Builder builder = new StemmerOverrideFilter.Builder();
+ builder.add("booked", "books");
Tokenizer tokenizer = new KeywordTokenizer(new StringReader("booked"));
- TokenStream stream = new PorterStemFilter(
- new StemmerOverrideFilter(tokenizer, dictionary));
- assertTokenStreamContents(stream, new String[] { "books" });
+ TokenStream stream = new PorterStemFilter(new StemmerOverrideFilter(
+ tokenizer, builder.build(), false));
+ assertTokenStreamContents(stream, new String[] {"books"});
}
+
+ public void testRandomRealisticWhiteSpace() throws IOException {
+ Map map = new HashMap();
+ int numTerms = atLeast(50);
+ for (int i = 0; i < numTerms; i++) {
+ String randomRealisticUnicodeString = _TestUtil
+ .randomRealisticUnicodeString(random());
+ char[] charArray = randomRealisticUnicodeString.toCharArray();
+ StringBuilder builder = new StringBuilder();
+ for (int j = 0; j < charArray.length;) {
+ int cp = Character.codePointAt(charArray, j);
+ if (!Character.isWhitespace(cp)) {
+ builder.appendCodePoint(cp);
+ }
+ j += Character.charCount(cp);
+ }
+ if (builder.length() > 0) {
+ String value = _TestUtil.randomSimpleString(random());
+ map.put(builder.toString(),
+ value.isEmpty() ? "a" : value);
+
+ }
+ }
+ if (map.isEmpty()) {
+ map.put("booked", "books");
+ }
+ StemmerOverrideFilter.Builder builder = new StemmerOverrideFilter.Builder();
+ Set> entrySet = map.entrySet();
+ StringBuilder input = new StringBuilder();
+ List output = new ArrayList();
+ for (Entry entry : entrySet) {
+ builder.add(entry.getKey(), entry.getValue());
+ if (random().nextBoolean() || output.isEmpty()) {
+ input.append(entry.getKey()).append(" ");
+ output.add(entry.getValue());
+ }
+ }
+ Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT,
+ new StringReader(input.toString()));
+ TokenStream stream = new PorterStemFilter(new StemmerOverrideFilter(
+ tokenizer, builder.build(), false));
+ assertTokenStreamContents(stream, output.toArray(new String[0]));
+ }
+
+ public void testRandomRealisticKeyword() throws IOException {
+ Map map = new HashMap();
+ int numTerms = atLeast(50);
+ for (int i = 0; i < numTerms; i++) {
+ String randomRealisticUnicodeString = _TestUtil
+ .randomRealisticUnicodeString(random());
+ if (randomRealisticUnicodeString.length() > 0) {
+ String value = _TestUtil.randomSimpleString(random());
+ map.put(randomRealisticUnicodeString,
+ value.isEmpty() ? "a" : value);
+ }
+ }
+ if (map.isEmpty()) {
+ map.put("booked", "books");
+ }
+ StemmerOverrideFilter.Builder builder = new StemmerOverrideFilter.Builder();
+ Set> entrySet = map.entrySet();
+ for (Entry entry : entrySet) {
+ builder.add(entry.getKey(), entry.getValue());
+ }
+ StemmerOverrideMap build = builder.build();
+ for (Entry entry : entrySet) {
+ if (random().nextBoolean()) {
+ Tokenizer tokenizer = new KeywordTokenizer(new StringReader(
+ entry.getKey()));
+ TokenStream stream = new PorterStemFilter(new StemmerOverrideFilter(
+ tokenizer, build, false));
+ assertTokenStreamContents(stream, new String[] {entry.getValue()});
+ }
+ }
+ }
}
Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/StemmerOverrideFilter.java
===================================================================
--- lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/StemmerOverrideFilter.java (revision 1458848)
+++ lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/StemmerOverrideFilter.java (working copy)
@@ -18,22 +18,36 @@
*/
import java.io.IOException;
+import java.util.ArrayList;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.analysis.util.CharArrayMap;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.BytesRefHash;
+import org.apache.lucene.util.CharsRef;
+import org.apache.lucene.util.IntsRef;
+import org.apache.lucene.util.UnicodeUtil;
+import org.apache.lucene.util.fst.ByteSequenceOutputs;
+import org.apache.lucene.util.fst.FST;
+import org.apache.lucene.util.fst.FST.Arc;
+import org.apache.lucene.util.fst.FST.BytesReader;
/**
* Provides the ability to override any {@link KeywordAttribute} aware stemmer
* with custom dictionary-based stemming.
*/
public final class StemmerOverrideFilter extends TokenFilter {
- private final CharArrayMap dictionary;
+ private final FST fst;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final KeywordAttribute keywordAtt = addAttribute(KeywordAttribute.class);
+ private final BytesReader fstReader;
+  private final Arc scratchArc = new FST.Arc();
+
+ private final CharsRef spare = new CharsRef();
+ private final boolean ignoreCase;
/**
* Create a new StemmerOverrideFilter, performing dictionary-based stemming
@@ -43,19 +57,25 @@
* so that they will not be stemmed with stemmers down the chain.
*
*/
- public StemmerOverrideFilter(TokenStream input,
- CharArrayMap dictionary) {
+ public StemmerOverrideFilter(TokenStream input, StemmerOverrideMap stemmerOverrideMap, boolean ignoreCase) {
super(input);
- this.dictionary = dictionary;
+ this.fst = stemmerOverrideMap.fst;
+ fstReader = fst.getBytesReader();
+ this.ignoreCase = ignoreCase;
}
-
+
@Override
public boolean incrementToken() throws IOException {
if (input.incrementToken()) {
if (!keywordAtt.isKeyword()) { // don't muck with already-keyworded terms
- String stem = dictionary.get(termAtt.buffer(), 0, termAtt.length());
+ final BytesRef stem = getStem(termAtt.buffer(), termAtt.length());
if (stem != null) {
- termAtt.setEmpty().append(stem);
+ final char[] buffer = spare.chars = termAtt.buffer();
+ UnicodeUtil.UTF8toUTF16(stem.bytes, stem.offset, stem.length, spare);
+ if (spare.chars != buffer) {
+ termAtt.copyBuffer(spare.chars, spare.offset, spare.length);
+ }
+ termAtt.setLength(spare.length);
keywordAtt.setKeyword(true);
}
}
@@ -64,4 +84,79 @@
return false;
}
}
+
+ private BytesRef getStem(char[] buffer, int bufferLen) throws IOException {
+ BytesRef pendingOutput = fst.outputs.getNoOutput();
+ BytesRef matchOutput = null;
+ int bufUpto = 0;
+ fst.getFirstArc(scratchArc);
+ while (bufUpto < bufferLen) {
+ final int codePoint = Character.codePointAt(buffer, bufUpto, bufferLen);
+ if (fst.findTargetArc(ignoreCase ? Character.toLowerCase(codePoint) : codePoint, scratchArc, scratchArc, fstReader) == null) {
+ return null;
+ }
+ pendingOutput = fst.outputs.add(pendingOutput, scratchArc.output);
+ bufUpto += Character.charCount(codePoint);
+ }
+ if (scratchArc.isFinal()) {
+ matchOutput = fst.outputs.add(pendingOutput, scratchArc.nextFinalOutput);
+ }
+ return matchOutput;
+ }
+
+
+ public static class StemmerOverrideMap {
+ final FST fst;
+
+ StemmerOverrideMap(FST fst) {
+ this.fst = fst;
+ }
+
+ }
+ /**
+ * This builder builds an {@link FST} for the {@link StemmerOverrideFilter}
+ */
+ public static class Builder {
+ private final BytesRefHash hash = new BytesRefHash();
+ private final BytesRef spare = new BytesRef();
+ private final ArrayList outputValues = new ArrayList();
+ /**
+     * Adds an input string and its stemmer override output to this builder.
+ *
+ * @param input the input char sequence
+ * @param output the stemmer override output char sequence
+ * @return false iff the input has already been added to this builder otherwise true.
+ */
+ public boolean add(CharSequence input, CharSequence output) {
+ UnicodeUtil.UTF16toUTF8(input, 0, input.length(), spare);
+ int id = hash.add(spare);
+ if (id >= 0) {
+ outputValues.add(output);
+ return true;
+ }
+ return false;
+ }
+
+ /**
+     * Returns a {@link StemmerOverrideMap} to be used with the {@link StemmerOverrideFilter}
+     * @return a {@link StemmerOverrideMap} to be used with the {@link StemmerOverrideFilter}
+     * @throws IOException if an I/O error occurs while building the FST
+ */
+ public StemmerOverrideMap build() throws IOException {
+ ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
+ org.apache.lucene.util.fst.Builder builder = new org.apache.lucene.util.fst.Builder(
+ FST.INPUT_TYPE.BYTE4, outputs);
+ final int[] sort = hash.sort(BytesRef.getUTF8SortedAsUnicodeComparator());
+ IntsRef intsSpare = new IntsRef();
+ final int size = hash.size();
+ for (int i = 0; i < size; i++) {
+ int id = sort[i];
+ BytesRef bytesRef = hash.get(id, spare);
+ UnicodeUtil.UTF8toUTF32(bytesRef, intsSpare);
+ builder.add(intsSpare, new BytesRef(outputValues.get(id)));
+ }
+ return new StemmerOverrideMap(builder.finish());
+ }
+
+ }
}
Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/StemmerOverrideFilterFactory.java
===================================================================
--- lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/StemmerOverrideFilterFactory.java (revision 1458848)
+++ lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/StemmerOverrideFilterFactory.java (working copy)
@@ -19,10 +19,15 @@
import java.io.IOException;
import java.util.List;
+import java.util.Locale;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilter;
+import org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilter.StemmerOverrideMap;
import org.apache.lucene.analysis.util.*;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.CharsRef;
+import org.apache.lucene.util.fst.FST;
/**
* Factory for {@link StemmerOverrideFilter}.
@@ -36,7 +41,7 @@
*
*/
public class StemmerOverrideFilterFactory extends TokenFilterFactory implements ResourceLoaderAware {
- private CharArrayMap dictionary = null;
+ private StemmerOverrideMap dictionary;
private boolean ignoreCase;
@Override
@@ -47,15 +52,15 @@
assureMatchVersion();
List files = splitFileNames(dictionaryFiles);
if (files.size() > 0) {
- dictionary = new CharArrayMap(luceneMatchVersion,
- files.size() * 10, ignoreCase);
+ StemmerOverrideFilter.Builder builder = new StemmerOverrideFilter.Builder();
for (String file : files) {
List list = getLines(loader, file.trim());
for (String line : list) {
String[] mapping = line.split("\t", 2);
- dictionary.put(mapping[0], mapping[1]);
+ builder.add(ignoreCase? mapping[0].toLowerCase(Locale.ROOT) : mapping[0], mapping[1]);
}
}
+ dictionary = builder.build();
}
}
}
@@ -66,6 +71,6 @@
@Override
public TokenStream create(TokenStream input) {
- return dictionary == null ? input : new StemmerOverrideFilter(input, dictionary);
+ return dictionary == null ? input : new StemmerOverrideFilter(input, dictionary, ignoreCase);
}
}
Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/nl/DutchAnalyzer.java
===================================================================
--- lucene/analysis/common/src/java/org/apache/lucene/analysis/nl/DutchAnalyzer.java (revision 1458857)
+++ lucene/analysis/common/src/java/org/apache/lucene/analysis/nl/DutchAnalyzer.java (working copy)
@@ -21,6 +21,7 @@
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
+import org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilter.StemmerOverrideMap;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilter;
@@ -30,9 +31,13 @@
import org.apache.lucene.analysis.standard.StandardAnalyzer; // for javadoc
import org.apache.lucene.analysis.util.CharArrayMap;
import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.lucene.analysis.util.CharacterUtils;
import org.apache.lucene.analysis.util.WordlistLoader;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.Version;
+import org.apache.lucene.util.fst.FST;
import java.io.IOException;
import java.io.Reader;
@@ -96,7 +101,7 @@
*/
private CharArraySet excltable = CharArraySet.EMPTY_SET;
- private final CharArrayMap stemdict;
+ private final StemmerOverrideMap stemdict;
private final Version matchVersion;
/**
@@ -120,7 +125,25 @@
this.matchVersion = matchVersion;
this.stoptable = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stopwords));
this.excltable = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stemExclusionTable));
- this.stemdict = CharArrayMap.unmodifiableMap(CharArrayMap.copy(matchVersion, stemOverrideDict));
+ if (stemOverrideDict.isEmpty()) {
+ this.stemdict = null;
+ } else {
+ StemmerOverrideFilter.Builder builder = new StemmerOverrideFilter.Builder();
+ CharArrayMap.EntryIterator iter = stemOverrideDict.entrySet().iterator();
+ CharacterUtils instance = CharacterUtils.getInstance(matchVersion);
+ CharsRef spare = new CharsRef();
+ while (iter.hasNext()) {
+ char[] nextKey = iter.nextKey();
+ spare.copyChars(nextKey, 0, nextKey.length);
+ instance.toLowerCase(spare.chars, spare.offset, spare.length);
+ builder.add(spare, iter.currentValue());
+ }
+ try {
+ this.stemdict = builder.build();
+ } catch (IOException ex) {
+ throw new RuntimeException("can not build stem dict", ex);
+ }
+ }
}
/**
@@ -141,8 +164,8 @@
result = new StopFilter(matchVersion, result, stoptable);
if (!excltable.isEmpty())
result = new SetKeywordMarkerFilter(result, excltable);
- if (!stemdict.isEmpty())
- result = new StemmerOverrideFilter(result, stemdict);
+ if (stemdict != null)
+ result = new StemmerOverrideFilter(result, stemdict, false);
result = new SnowballFilter(result, new org.tartarus.snowball.ext.DutchStemmer());
return new TokenStreamComponents(source, result);
}
Index: lucene/CHANGES.txt
===================================================================
--- lucene/CHANGES.txt (revision 1458848)
+++ lucene/CHANGES.txt (working copy)
@@ -110,6 +110,9 @@
takes int[] docIDs instead of TopDocs. (Robert Muir, Mike
McCandless)
+* LUCENE-4863: StemmerOverrideFilter now uses an FST to represent its overrides in memory.
+ (Simon Willnauer)
+
API Changes
* LUCENE-4844: removed TaxonomyReader.getParent(), you should use