Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java (revision 1458959) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java (working copy) @@ -71,8 +71,10 @@ import org.apache.lucene.analysis.miscellaneous.KeepWordFilter; import org.apache.lucene.analysis.miscellaneous.LengthFilter; import org.apache.lucene.analysis.miscellaneous.LimitTokenCountFilter; +import org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilter; import org.apache.lucene.analysis.miscellaneous.TrimFilter; import org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter; +import org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilter.StemmerOverrideMap; import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter; import org.apache.lucene.analysis.ngram.EdgeNGramTokenizer; import org.apache.lucene.analysis.ngram.NGramTokenFilter; @@ -580,6 +582,29 @@ return map; } }); + put(StemmerOverrideMap.class, new ArgProducer() { + @Override public Object create(Random random) { + int num = random.nextInt(10); + StemmerOverrideFilter.Builder builder = new StemmerOverrideFilter.Builder(); + for (int i = 0; i < num; i++) { + String input = ""; + do { + input = _TestUtil.randomRealisticUnicodeString(random); + } while(input.isEmpty()); + String out = ""; + do { + out = _TestUtil.randomRealisticUnicodeString(random); + } while(out.isEmpty()); + builder.add(input, out); + } + try { + return builder.build(); + } catch (Exception ex) { + Rethrow.rethrow(ex); + return null; // unreachable code + } + } + }); put(SynonymMap.class, new ArgProducer() { @Override public Object create(Random random) { SynonymMap.Builder b = new SynonymMap.Builder(random.nextBoolean()); Index: 
lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestStemmerOverrideFilter.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestStemmerOverrideFilter.java (revision 1458959) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestStemmerOverrideFilter.java (working copy) @@ -1,15 +1,4 @@ package org.apache.lucene.analysis.miscellaneous; - -import java.io.IOException; -import java.io.StringReader; - -import org.apache.lucene.analysis.BaseTokenStreamTestCase; -import org.apache.lucene.analysis.core.KeywordTokenizer; -import org.apache.lucene.analysis.en.PorterStemFilter; -import org.apache.lucene.analysis.util.CharArrayMap; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.Tokenizer; - /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with @@ -26,17 +15,112 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ +import java.io.IOException; +import java.io.StringReader; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Map.Entry; +import java.util.Set; +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.core.KeywordTokenizer; +import org.apache.lucene.analysis.core.WhitespaceTokenizer; +import org.apache.lucene.analysis.en.PorterStemFilter; +import org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilter.StemmerOverrideMap; +import org.apache.lucene.util._TestUtil; + +/** + * + */ public class TestStemmerOverrideFilter extends BaseTokenStreamTestCase { public void testOverride() throws IOException { // lets make booked stem to books // the override filter will convert "booked" to "books", // but also mark it with KeywordAttribute so Porter will not change it. - CharArrayMap dictionary = new CharArrayMap(TEST_VERSION_CURRENT, 1, false); - dictionary.put("booked", "books"); + StemmerOverrideFilter.Builder builder = new StemmerOverrideFilter.Builder(); + builder.add("booked", "books"); Tokenizer tokenizer = new KeywordTokenizer(new StringReader("booked")); - TokenStream stream = new PorterStemFilter( - new StemmerOverrideFilter(tokenizer, dictionary)); - assertTokenStreamContents(stream, new String[] { "books" }); + TokenStream stream = new PorterStemFilter(new StemmerOverrideFilter( + tokenizer, builder.build(), false)); + assertTokenStreamContents(stream, new String[] {"books"}); } + + public void testRandomRealisticWhiteSpace() throws IOException { + Map map = new HashMap(); + int numTerms = atLeast(50); + for (int i = 0; i < numTerms; i++) { + String randomRealisticUnicodeString = _TestUtil + .randomRealisticUnicodeString(random()); + char[] charArray = randomRealisticUnicodeString.toCharArray(); + StringBuilder builder = new StringBuilder(); + for (int j = 
0; j < charArray.length;) { + int cp = Character.codePointAt(charArray, j); + if (!Character.isWhitespace(cp)) { + builder.appendCodePoint(cp); + } + j += Character.charCount(cp); + } + if (builder.length() > 0) { + String value = _TestUtil.randomSimpleString(random()); + map.put(builder.toString(), + value.isEmpty() ? "a" : value); + + } + } + if (map.isEmpty()) { + map.put("booked", "books"); + } + StemmerOverrideFilter.Builder builder = new StemmerOverrideFilter.Builder(); + Set> entrySet = map.entrySet(); + StringBuilder input = new StringBuilder(); + List output = new ArrayList(); + for (Entry entry : entrySet) { + builder.add(entry.getKey(), entry.getValue()); + if (random().nextBoolean() || output.isEmpty()) { + input.append(entry.getKey()).append(" "); + output.add(entry.getValue()); + } + } + Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, + new StringReader(input.toString())); + TokenStream stream = new PorterStemFilter(new StemmerOverrideFilter( + tokenizer, builder.build(), false)); + assertTokenStreamContents(stream, output.toArray(new String[0])); + } + + public void testRandomRealisticKeyword() throws IOException { + Map map = new HashMap(); + int numTerms = atLeast(50); + for (int i = 0; i < numTerms; i++) { + String randomRealisticUnicodeString = _TestUtil + .randomRealisticUnicodeString(random()); + if (randomRealisticUnicodeString.length() > 0) { + String value = _TestUtil.randomSimpleString(random()); + map.put(randomRealisticUnicodeString, + value.isEmpty() ? 
"a" : value); + } + } + if (map.isEmpty()) { + map.put("booked", "books"); + } + StemmerOverrideFilter.Builder builder = new StemmerOverrideFilter.Builder(); + Set> entrySet = map.entrySet(); + for (Entry entry : entrySet) { + builder.add(entry.getKey(), entry.getValue()); + } + StemmerOverrideMap build = builder.build(); + for (Entry entry : entrySet) { + if (random().nextBoolean()) { + Tokenizer tokenizer = new KeywordTokenizer(new StringReader( + entry.getKey())); + TokenStream stream = new PorterStemFilter(new StemmerOverrideFilter( + tokenizer, build, false)); + assertTokenStreamContents(stream, new String[] {entry.getValue()}); + } + } + } } Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/StemmerOverrideFilter.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/StemmerOverrideFilter.java (revision 1458959) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/StemmerOverrideFilter.java (working copy) @@ -18,22 +18,36 @@ */ import java.io.IOException; +import java.util.ArrayList; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.KeywordAttribute; -import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; -import org.apache.lucene.analysis.util.CharArrayMap; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.BytesRefHash; +import org.apache.lucene.util.CharsRef; +import org.apache.lucene.util.IntsRef; +import org.apache.lucene.util.UnicodeUtil; +import org.apache.lucene.util.fst.ByteSequenceOutputs; +import org.apache.lucene.util.fst.FST; +import org.apache.lucene.util.fst.FST.Arc; +import org.apache.lucene.util.fst.FST.BytesReader; /** * Provides the ability to override any {@link KeywordAttribute} aware stemmer * 
with custom dictionary-based stemming. */ public final class StemmerOverrideFilter extends TokenFilter { - private final CharArrayMap dictionary; + private final FST fst; private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); private final KeywordAttribute keywordAtt = addAttribute(KeywordAttribute.class); + private final BytesReader fstReader; + private final Arc scratchArc = new FST.Arc(); +; + private final CharsRef spare = new CharsRef(); + private final boolean ignoreCase; /** * Create a new StemmerOverrideFilter, performing dictionary-based stemming @@ -43,19 +57,25 @@ * so that they will not be stemmed with stemmers down the chain. *

*/ - public StemmerOverrideFilter(TokenStream input, - CharArrayMap dictionary) { + public StemmerOverrideFilter(TokenStream input, StemmerOverrideMap stemmerOverrideMap, boolean ignoreCase) { super(input); - this.dictionary = dictionary; + this.fst = stemmerOverrideMap.fst; + fstReader = fst.getBytesReader(); + this.ignoreCase = ignoreCase; } - + @Override public boolean incrementToken() throws IOException { if (input.incrementToken()) { if (!keywordAtt.isKeyword()) { // don't muck with already-keyworded terms - String stem = dictionary.get(termAtt.buffer(), 0, termAtt.length()); + final BytesRef stem = getStem(termAtt.buffer(), termAtt.length()); if (stem != null) { - termAtt.setEmpty().append(stem); + final char[] buffer = spare.chars = termAtt.buffer(); + UnicodeUtil.UTF8toUTF16(stem.bytes, stem.offset, stem.length, spare); + if (spare.chars != buffer) { + termAtt.copyBuffer(spare.chars, spare.offset, spare.length); + } + termAtt.setLength(spare.length); keywordAtt.setKeyword(true); } } @@ -64,4 +84,79 @@ return false; } } + + private BytesRef getStem(char[] buffer, int bufferLen) throws IOException { + BytesRef pendingOutput = fst.outputs.getNoOutput(); + BytesRef matchOutput = null; + int bufUpto = 0; + fst.getFirstArc(scratchArc); + while (bufUpto < bufferLen) { + final int codePoint = Character.codePointAt(buffer, bufUpto, bufferLen); + if (fst.findTargetArc(ignoreCase ? 
Character.toLowerCase(codePoint) : codePoint, scratchArc, scratchArc, fstReader) == null) { + return null; + } + pendingOutput = fst.outputs.add(pendingOutput, scratchArc.output); + bufUpto += Character.charCount(codePoint); + } + if (scratchArc.isFinal()) { + matchOutput = fst.outputs.add(pendingOutput, scratchArc.nextFinalOutput); + } + return matchOutput; + } + + + public static class StemmerOverrideMap { + final FST fst; + + StemmerOverrideMap(FST fst) { + this.fst = fst; + } + + } + /** + * This builder builds an {@link FST} for the {@link StemmerOverrideFilter} + */ + public static class Builder { + private final BytesRefHash hash = new BytesRefHash(); + private final BytesRef spare = new BytesRef(); + private final ArrayList outputValues = new ArrayList(); + /** + * Adds an input string and its stemmer override output to this builder. + * + * @param input the input char sequence + * @param output the stemmer override output char sequence + * @return false iff the input has already been added to this builder, otherwise true. 
+ */ + public boolean add(CharSequence input, CharSequence output) { + UnicodeUtil.UTF16toUTF8(input, 0, input.length(), spare); + int id = hash.add(spare); + if (id >= 0) { + outputValues.add(output); + return true; + } + return false; + } + + /** + * Returns an {@link StemmerOverrideMap} to be used with the {@link StemmerOverrideFilter} + * @return an {@link StemmerOverrideMap} to be used with the {@link StemmerOverrideFilter} + * @throws IOException if an {@link IOException} occurs; + */ + public StemmerOverrideMap build() throws IOException { + ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton(); + org.apache.lucene.util.fst.Builder builder = new org.apache.lucene.util.fst.Builder( + FST.INPUT_TYPE.BYTE4, outputs); + final int[] sort = hash.sort(BytesRef.getUTF8SortedAsUnicodeComparator()); + IntsRef intsSpare = new IntsRef(); + final int size = hash.size(); + for (int i = 0; i < size; i++) { + int id = sort[i]; + BytesRef bytesRef = hash.get(id, spare); + UnicodeUtil.UTF8toUTF32(bytesRef, intsSpare); + builder.add(intsSpare, new BytesRef(outputValues.get(id))); + } + return new StemmerOverrideMap(builder.finish()); + } + + } } Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/StemmerOverrideFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/StemmerOverrideFilterFactory.java (revision 1458959) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/StemmerOverrideFilterFactory.java (working copy) @@ -19,10 +19,15 @@ import java.io.IOException; import java.util.List; +import java.util.Locale; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilter; +import org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilter.StemmerOverrideMap; import org.apache.lucene.analysis.util.*; +import org.apache.lucene.util.BytesRef; +import 
org.apache.lucene.util.CharsRef; +import org.apache.lucene.util.fst.FST; /** * Factory for {@link StemmerOverrideFilter}. @@ -36,7 +41,7 @@ * */ public class StemmerOverrideFilterFactory extends TokenFilterFactory implements ResourceLoaderAware { - private CharArrayMap dictionary = null; + private StemmerOverrideMap dictionary; private boolean ignoreCase; @Override @@ -47,15 +52,15 @@ assureMatchVersion(); List files = splitFileNames(dictionaryFiles); if (files.size() > 0) { - dictionary = new CharArrayMap(luceneMatchVersion, - files.size() * 10, ignoreCase); + StemmerOverrideFilter.Builder builder = new StemmerOverrideFilter.Builder(); for (String file : files) { List list = getLines(loader, file.trim()); for (String line : list) { String[] mapping = line.split("\t", 2); - dictionary.put(mapping[0], mapping[1]); + builder.add(ignoreCase? mapping[0].toLowerCase(Locale.ROOT) : mapping[0], mapping[1]); } } + dictionary = builder.build(); } } } @@ -66,6 +71,6 @@ @Override public TokenStream create(TokenStream input) { - return dictionary == null ? input : new StemmerOverrideFilter(input, dictionary); + return dictionary == null ? 
input : new StemmerOverrideFilter(input, dictionary, ignoreCase); } } Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/nl/DutchAnalyzer.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/nl/DutchAnalyzer.java (revision 1458959) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/nl/DutchAnalyzer.java (working copy) @@ -21,6 +21,7 @@ import org.apache.lucene.analysis.core.LowerCaseFilter; import org.apache.lucene.analysis.core.StopFilter; import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter; +import org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilter.StemmerOverrideMap; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilter; @@ -30,9 +31,13 @@ import org.apache.lucene.analysis.standard.StandardAnalyzer; // for javadoc import org.apache.lucene.analysis.util.CharArrayMap; import org.apache.lucene.analysis.util.CharArraySet; +import org.apache.lucene.analysis.util.CharacterUtils; import org.apache.lucene.analysis.util.WordlistLoader; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.CharsRef; import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.Version; +import org.apache.lucene.util.fst.FST; import java.io.IOException; import java.io.Reader; @@ -96,7 +101,7 @@ */ private CharArraySet excltable = CharArraySet.EMPTY_SET; - private final CharArrayMap stemdict; + private final StemmerOverrideMap stemdict; private final Version matchVersion; /** @@ -120,7 +125,25 @@ this.matchVersion = matchVersion; this.stoptable = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stopwords)); this.excltable = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stemExclusionTable)); - this.stemdict = CharArrayMap.unmodifiableMap(CharArrayMap.copy(matchVersion, stemOverrideDict)); + if 
(stemOverrideDict.isEmpty()) { + this.stemdict = null; + } else { + StemmerOverrideFilter.Builder builder = new StemmerOverrideFilter.Builder(); + CharArrayMap.EntryIterator iter = stemOverrideDict.entrySet().iterator(); + CharacterUtils instance = CharacterUtils.getInstance(matchVersion); + CharsRef spare = new CharsRef(); + while (iter.hasNext()) { + char[] nextKey = iter.nextKey(); + spare.copyChars(nextKey, 0, nextKey.length); + instance.toLowerCase(spare.chars, spare.offset, spare.length); + builder.add(spare, iter.currentValue()); + } + try { + this.stemdict = builder.build(); + } catch (IOException ex) { + throw new RuntimeException("can not build stem dict", ex); + } + } } /** @@ -141,8 +164,8 @@ result = new StopFilter(matchVersion, result, stoptable); if (!excltable.isEmpty()) result = new SetKeywordMarkerFilter(result, excltable); - if (!stemdict.isEmpty()) - result = new StemmerOverrideFilter(result, stemdict); + if (stemdict != null) + result = new StemmerOverrideFilter(result, stemdict, false); result = new SnowballFilter(result, new org.tartarus.snowball.ext.DutchStemmer()); return new TokenStreamComponents(source, result); } Index: lucene/CHANGES.txt =================================================================== --- lucene/CHANGES.txt (revision 1458959) +++ lucene/CHANGES.txt (working copy) @@ -116,6 +116,9 @@ * LUCENE-4859: IndexReader now exposes Terms statistics: getDocCount, getSumDocFreq, getSumTotalTermFreq. (Shai Erera) +* LUCENE-4863: StemmerOverrideFilter now uses an FST to represent its overrides in memory. + (Simon Willnauer) + API Changes * LUCENE-4844: removed TaxonomyReader.getParent(), you should use