Index: lucene/contrib/CHANGES.txt
--- lucene/contrib/CHANGES.txt	Tue Jul 05 15:59:45 2011 -0400
+++ lucene/contrib/CHANGES.txt	Tue Jul 05 17:45:59 2011 -0400
@@ -78,6 +78,10 @@
     documents must be indexed as a document block, using
     IndexWriter.add/UpdateDocuments (Mark Harwood, Mike McCandless)
 
+ * LUCENE-3233: FSTSynonymFilter for applying multi-word synonyms
+   during indexing, using far less RAM than the current
+   SynonymFilter.  (Robert Muir, Mike McCandless)
+
 API Changes
 
 Bug Fixes

Index: lucene/src/java/org/apache/lucene/store/ByteArrayDataOutput.java
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ lucene/src/java/org/apache/lucene/store/ByteArrayDataOutput.java	Tue Jul 05 17:45:59 2011 -0400
@@ -0,0 +1,52 @@
+package org.apache.lucene.store;
+
+import org.apache.lucene.util.BytesRef;
+
+/**
+ * DataOutput backed by a caller-supplied byte[].
+ * @lucene.experimental
+ */
+public class ByteArrayDataOutput extends DataOutput {
+  private byte[] bytes;
+
+  private int pos;
+  private int limit;
+
+  public ByteArrayDataOutput(byte[] bytes) {
+    reset(bytes);
+  }
+
+  public ByteArrayDataOutput(byte[] bytes, int offset, int len) {
+    reset(bytes, offset, len);
+  }
+
+  public ByteArrayDataOutput() {
+    reset(BytesRef.EMPTY_BYTES);
+  }
+
+  public void reset(byte[] bytes) {
+    reset(bytes, 0, bytes.length);
+  }
+
+  public void reset(byte[] bytes, int offset, int len) {
+    this.bytes = bytes;
+    pos = offset;
+    limit = offset + len;
+  }
+
+  public int getPosition() {
+    return pos;
+  }
+
+  @Override
+  public void writeByte(byte b) {
+    assert pos < limit;
+    bytes[pos++] = b;
+  }
+
+  @Override
+  public void writeBytes(byte[] b, int offset, int length) {
+    assert pos + length <= limit;
+    System.arraycopy(b, offset, bytes, pos, length);
+    pos += length;
+  }
+}
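A quick usage sketch for the new class (editorial illustration, not part of the patch; the buffer size is arbitrary, and the read-back assumes the existing org.apache.lucene.store.ByteArrayDataInput's reset(byte[])):

  byte[] buffer = new byte[16];
  ByteArrayDataOutput out = new ByteArrayDataOutput(buffer);
  out.writeVInt(17);                 // vInt support is inherited from DataOutput
  out.writeByte((byte) 42);
  int numBytes = out.getPosition();  // how many bytes were written so far

  ByteArrayDataInput in = new ByteArrayDataInput();
  in.reset(buffer);
  assert in.readVInt() == 17;
  assert in.readByte() == 42;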
Index: lucene/src/java/org/apache/lucene/util/CharsRef.java
--- lucene/src/java/org/apache/lucene/util/CharsRef.java	Tue Jul 05 15:59:45 2011 -0400
+++ lucene/src/java/org/apache/lucene/util/CharsRef.java	Tue Jul 05 17:45:59 2011 -0400
@@ -1,5 +1,7 @@
 package org.apache.lucene.util;
 
+import java.util.Comparator;
+
 /**
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
@@ -167,7 +169,11 @@
    * the {@link CharsRef} to copy
    */
   public void copy(CharsRef other) {
-    chars = ArrayUtil.grow(chars, other.length);
+    if (chars == null) {
+      chars = new char[other.length];
+    } else {
+      chars = ArrayUtil.grow(chars, other.length);
+    }
     System.arraycopy(other.chars, other.offset, chars, 0, other.length);
     length = other.length;
     offset = 0;
@@ -213,4 +219,56 @@
   public CharSequence subSequence(int start, int end) {
     return new CharsRef(chars, offset + start, offset + end - 1);
   }
+
+  private final static Comparator<CharsRef> utf16SortedAsUTF8SortOrder = new UTF16SortedAsUTF8Comparator();
+
+  public static Comparator<CharsRef> getUTF16SortedAsUTF8Comparator() {
+    return utf16SortedAsUTF8SortOrder;
+  }
+
+  private static class UTF16SortedAsUTF8Comparator implements Comparator<CharsRef> {
+    // Only singleton
+    private UTF16SortedAsUTF8Comparator() {};
+
+    public int compare(CharsRef a, CharsRef b) {
+      if (a == b)
+        return 0;
+
+      final char[] aChars = a.chars;
+      int aUpto = a.offset;
+      final char[] bChars = b.chars;
+      int bUpto = b.offset;
+
+      final int aStop = aUpto + Math.min(a.length, b.length);
+
+      while (aUpto < aStop) {
+        char aChar = aChars[aUpto++];
+        char bChar = bChars[bUpto++];
+        if (aChar != bChar) {
+          // http://icu-project.org/docs/papers/utf16_code_point_order.html
+
+          /* aChar != bChar, fix up each one if they're both in or above the surrogate range, then compare them */
+          if (aChar >= 0xd800 && bChar >= 0xd800) {
+            if (aChar >= 0xe000) {
+              aChar -= 0x800;
+            } else {
+              aChar += 0x2000;
+            }
+
+            if (bChar >= 0xe000) {
+              bChar -= 0x800;
+            } else {
+              bChar += 0x2000;
+            }
+          }
+
+          /* now aChar and bChar are in code point order */
+          return (int)aChar - (int)bChar; /* int must be 32 bits wide */
+        }
+      }
+
+      // One is a prefix of the other, or, they are equal:
+      return a.length - b.length;
+    }
+  }
}
\ No newline at end of file
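A worked example of why the fix-up in compare() is needed (editorial illustration, not from the patch; the byte values are standard UTF-8).  Take s1 = "\uD800\uDC00", ie U+10000 written as a surrogate pair, and s2 = "\uE000", an ordinary BMP character:

  // raw UTF-16 code units:  0xD800 < 0xE000, so s1 would sort first
  // UTF-8 encodings:        s1 = F0 90 80 80,  s2 = EE 80 80,
  //                         so s2 must sort first (0xEE < 0xF0)
  //
  // with the fix-up: 0xD800 (a surrogate, ie part of a code point
  // >= U+10000) becomes 0xD800 + 0x2000 = 0xF800, while 0xE000
  // becomes 0xE000 - 0x800 = 0xD800; now 0xF800 > 0xD800, which
  // matches UTF-8 (ie code point) order.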
Index: lucene/src/java/org/apache/lucene/util/fst/FST.java
--- lucene/src/java/org/apache/lucene/util/fst/FST.java	Tue Jul 05 15:59:45 2011 -0400
+++ lucene/src/java/org/apache/lucene/util/fst/FST.java	Tue Jul 05 17:45:59 2011 -0400
@@ -113,7 +113,7 @@
     int target;
     byte flags;
-    T nextFinalOutput;
+    public T nextFinalOutput;
     int nextArc;
 
     // This is non-zero if current arcs are fixed array:
@@ -754,6 +754,9 @@
     // Linear scan
     readFirstTargetArc(follow, arc);
     while(true) {
+      // TODO: we should fix this code to not have to create an
+      // object for the output of every arc we scan... only
+      // for the matching arc, if found
       if (arc.label == labelToMatch) {
         return arc;
       } else if (arc.label > labelToMatch) {

Index: lucene/src/test-framework/org/apache/lucene/analysis/BaseTokenStreamTestCase.java
--- lucene/src/test-framework/org/apache/lucene/analysis/BaseTokenStreamTestCase.java	Tue Jul 05 15:59:45 2011 -0400
+++ lucene/src/test-framework/org/apache/lucene/analysis/BaseTokenStreamTestCase.java	Tue Jul 05 17:45:59 2011 -0400
@@ -260,7 +260,11 @@
         default: text = _TestUtil.randomUnicodeString(random, maxWordLength);
       }
-      
+
+      if (VERBOSE) {
+        System.out.println("NOTE: BaseTokenStreamTestCase: get first token stream now text=" + text);
+      }
+
       TokenStream ts = a.reusableTokenStream("dummy", new StringReader(text));
       assertTrue("has no CharTermAttribute", ts.hasAttribute(CharTermAttribute.class));
       CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
@@ -286,6 +290,9 @@
       ts.close();
       // verify reusing is "reproducable" and also get the normal tokenstream sanity checks
       if (!tokens.isEmpty()) {
+        if (VERBOSE) {
+          System.out.println("NOTE: BaseTokenStreamTestCase: re-run analysis");
+        }
         if (typeAtt != null && posIncAtt != null && offsetAtt != null) {
           // offset + pos + type
           assertAnalyzesToReuse(a, text,

Index: lucene/src/test/org/apache/lucene/util/TestCharsRef.java
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ lucene/src/test/org/apache/lucene/util/TestCharsRef.java	Tue Jul 05 17:45:59 2011 -0400
@@ -0,0 +1,41 @@
+package org.apache.lucene.util;
+
+import java.util.Arrays;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+public class TestCharsRef extends LuceneTestCase {
+  public void testUTF16InUTF8Order() {
+    final int numStrings = atLeast(1000);
+    BytesRef[] utf8 = new BytesRef[numStrings];
+    CharsRef[] utf16 = new CharsRef[numStrings];
+
+    for (int i = 0; i < numStrings; i++) {
+      String s = _TestUtil.randomUnicodeString(random);
+      utf8[i] = new BytesRef(s);
+      utf16[i] = new CharsRef(s);
+    }
+
+    Arrays.sort(utf8);
+    Arrays.sort(utf16, CharsRef.getUTF16SortedAsUTF8Comparator());
+
+    for (int i = 0; i < numStrings; i++) {
+      assertEquals(utf8[i].utf8ToString(), utf16[i].toString());
+    }
+  }
+}

Index: modules/analysis/common/src/java/org/apache/lucene/analysis/synonym/FSTSynonymFilter.java
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ modules/analysis/common/src/java/org/apache/lucene/analysis/synonym/FSTSynonymFilter.java	Tue Jul 05 17:45:59 2011 -0400
@@ -0,0 +1,564 @@
+package org.apache.lucene.analysis.synonym;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+import org.apache.lucene.store.ByteArrayDataInput;
+import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.AttributeSource;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.CharsRef;
+import org.apache.lucene.util.RamUsageEstimator;
+import org.apache.lucene.util.UnicodeUtil;
+import org.apache.lucene.util.fst.FST;
+
+// nocommit -- need better name?  we may not always use FST
+// going forward (eg Aho/Corasick)
+
+// Maybe call this one SynonymFilter!
+
+/**
+ * Matches single or multi word synonyms in a token stream.
+ * This token stream cannot properly handle position
+ * increments != 1, ie, you should place this filter before
+ * filtering out stop words.
+ *
+ * <p>Note that with the current implementation, parsing is
+ * greedy, so whenever multiple parses would apply, the rule
+ * starting the earliest and parsing the most tokens wins.
+ * For example if you have these rules:
+ *
+ * <pre>
+ *   a -> x
+ *   a b -> y
+ *   b c d -> z
+ * </pre>
+ *
+ * Then input <code>a b c d e</code> parses to <code>y b c
+ * d</code>, ie the 2nd rule "wins" because it started
+ * earliest and matched more input tokens than the other
+ * rules starting at that point.</p>
+ *
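+ * <p>For instance (an illustrative sketch; <code>tokens</code>
+ * stands for any upstream TokenStream), the rules above could
+ * be registered with FSTSynonymMapBuilder's convenience add(),
+ * where words within a phrase are separated by \u0000:
+ *
+ * <pre>
+ *   FSTSynonymMapBuilder b = new FSTSynonymMapBuilder();
+ *   b.add(new CharsRef("a"), new CharsRef("x"), false);
+ *   b.add(new CharsRef("a\u0000b"), new CharsRef("y"), false);
+ *   b.add(new CharsRef("b\u0000c\u0000d"), new CharsRef("z"), false);
+ *   TokenStream syns = new FSTSynonymFilter(tokens, b.build(), false);
+ * </pre>
+ *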
+ * <p>A future improvement to this filter could allow
+ * non-greedy parsing, such that the 3rd rule would win, and
+ * also separately allow multiple parses, such that all 3
+ * rules would match, perhaps even on a rule by rule
+ * basis.</p>
+ *
+ * <p><b>NOTE</b>: when a match occurs, the output tokens
+ * associated with the matching rule are "stacked" on top of
+ * the input stream (if the rule had
+ * <code>keepOrig=true</code>) and also on top of another
+ * matched rule's output tokens.  This is not a correct
+ * solution, as really the output should be an arbitrary
+ * graph/lattice.  For example, with the above match, you
+ * would expect an exact PhraseQuery <code>"y b c"</code> to
+ * match the parsed tokens, but it will fail to do so.  This
+ * limitation is necessary because Lucene's TokenStream (and
+ * index) cannot yet represent an arbitrary graph.</p>
+ *
+ * <p><b>NOTE</b>: If multiple incoming tokens arrive on the
+ * same position, only the first token at that position is
+ * used for parsing.  Subsequent tokens simply pass through
+ * and are not parsed.  A future improvement would be to
+ * allow these tokens to also be matched.</p>
+ */
+
+// TODO: maybe we should resolve token -> wordID then run
+// FST on wordIDs, for better perf?
+
+// TODO: a more efficient approach would be Aho/Corasick's
+// algorithm
+// http://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_string_matching_algorithm
+// It improves over the current approach here
+// because it does not fully re-start matching at every
+// token.  For example, if one pattern is "a b c x"
+// and another is "b c d" and the input is "a b c d", on
+// trying to parse "a b c x" but failing when you got to x,
+// rather than starting over again you really should
+// immediately recognize that "b c d" matches at the next
+// input.  I suspect this won't matter that much in
+// practice, but it's possible on some set of synonyms it
+// will.  We'd have to modify Aho/Corasick to enforce our
+// conflict resolving (eg greedy matching) because that algo
+// finds all matches.
+
+public final class FSTSynonymFilter extends TokenFilter {
+
+  public static final String TYPE_SYNONYM = "SYNONYM";
+
+  private final FSTSynonymMap synonyms;
+
+  private final boolean ignoreCase;
+  private final int rollBufferSize;
+
+  private int captureCount;
+
+  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+  private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
+  private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
+
+  // How many future input tokens have already been matched
+  // to a synonym; because the matching is "greedy" we don't
+  // try to do any more matching for such tokens:
+  private int inputSkipCount;
+
+  // Hold all buffered (read ahead) stacked input tokens for
+  // a future position.  When multiple tokens are at the
+  // same position, we only store (and match against) the
+  // term for the first token at the position, but capture
+  // state for (and enumerate) all other tokens at this
+  // position:
+  private static class PendingInput {
+    final CharsRef term = new CharsRef();
+    AttributeSource.State state;
+    boolean keepOrig;
+    boolean consumed = true;
+
+    public void reset() {
+      state = null;
+      consumed = true;
+      keepOrig = false;
+    }
+  };
+
+  // Rolling buffer, holding pending input tokens we had to
+  // clone because we needed to look ahead, indexed by
+  // position:
+  private final PendingInput[] futureInputs;
+
+  // Holds pending output synonyms for one future position:
+  private static class PendingOutputs {
+    CharsRef[] outputs;
+    int upto;
+    int count;
+    int posIncr = 1;
+
+    public PendingOutputs() {
+      outputs = new CharsRef[1];
+    }
+
+    public void reset() {
+      upto = count = 0;
+      posIncr = 1;
+    }
+
+    public CharsRef pullNext() {
+      assert upto < count;
+      final CharsRef result = outputs[upto++];
+      posIncr = 0;
+      if (upto == count) {
+        reset();
+      }
+      return result;
+    }
+
+    public void add(char[] output, int offset, int len) {
+      if (count == outputs.length) {
+        // grow the stack, oversizing to amortize future adds:
+        final CharsRef[] next = new CharsRef[ArrayUtil.oversize(1+count, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
+        System.arraycopy(outputs, 0, next, 0, count);
+        outputs = next;
+      }
+      if (outputs[count] == null) {
+        outputs[count] = new CharsRef();
+      }
+      outputs[count].copy(output, offset, len);
+      count++;
+    }
+  };
+
+  private final ByteArrayDataInput bytesReader = new ByteArrayDataInput();
+
+  // Rolling buffer, holding stack of pending synonym
+  // outputs, indexed by position:
+  private final PendingOutputs[] futureOutputs;
+
+  // Where (in rolling buffers) to write next input saved state:
+  private int nextWrite;
+  // Where (in rolling buffers) to read next input saved state:
+  private int nextRead;
+
+  // True once we've read last token
+  private boolean finished;
+
+  private final FST.Arc<BytesRef> scratchArc;
+
+  private final FST<BytesRef> fst;
+
+  private final BytesRef scratchBytes = new BytesRef();
+  private final CharsRef scratchChars = new CharsRef();
+
+  /**
+   * @param input input tokenstream
+   * @param synonyms synonym map
+   * @param ignoreCase case-folds input for matching with {@link Character#toLowerCase(int)}.
+   *                   Note, if you set this to true, it's your responsibility to lowercase
+   *                   the input entries when you create the {@link FSTSynonymMap}
+   */
+  public FSTSynonymFilter(TokenStream input, FSTSynonymMap synonyms, boolean ignoreCase) {
+    super(input);
+    this.synonyms = synonyms;
+    this.ignoreCase = ignoreCase;
+    this.fst = synonyms.fst;
+
+    if (fst == null) {
+      throw new IllegalArgumentException("fst must be non-null");
+    }
+
+    // Must be 1+ so that when roll buffer is at full
+    // lookahead we can distinguish this full buffer from
+    // the empty buffer:
+    rollBufferSize = 1+synonyms.maxHorizontalContext;
+
+    futureInputs = new PendingInput[rollBufferSize];
+    futureOutputs = new PendingOutputs[rollBufferSize];
+    for(int pos=0;pos<rollBufferSize;pos++) {
+      futureInputs[pos] = new PendingInput();
+      futureOutputs[pos] = new PendingOutputs();
+    }
+
+    scratchArc = new FST.Arc<BytesRef>();
+  }

[... the remainder of FSTSynonymFilter.java (the parsing/capture logic,
incrementToken(), getCaptureCount(), reset(), etc.) did not survive in
this copy of the patch ...]

Index: modules/analysis/common/src/java/org/apache/lucene/analysis/synonym/FSTSynonymMap.java
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ modules/analysis/common/src/java/org/apache/lucene/analysis/synonym/FSTSynonymMap.java	Tue Jul 05 17:45:59 2011 -0400
+package org.apache.lucene.analysis.synonym;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.BytesRefHash;
+import org.apache.lucene.util.fst.FST;
+
+public class FSTSynonymMap {
+  /** for multiword support, you must separate words with this separator */
+  public static final char WORD_SEPARATOR = 0;
+  /** map<input phrase,ord(s)> */
+  public final FST<BytesRef> fst;
+  /** map<ord,output phrase> */
+  public final BytesRefHash words;
+  /** maxHorizontalContext: maximum context we need on the tokenstream */
+  public final int maxHorizontalContext;
+  /** maxVerticalContext: maximum number of synonym entries for a single input */
+  public final int maxVerticalContext;
+
+  public FSTSynonymMap(FST<BytesRef> fst, BytesRefHash words, int maxHorizontalContext, int maxVerticalContext) {
+    this.fst = fst;
+    this.words = words;
+    this.maxHorizontalContext = maxHorizontalContext;
+    this.maxVerticalContext = maxVerticalContext;
+  }
+}

Index: modules/analysis/common/src/java/org/apache/lucene/analysis/synonym/FSTSynonymMapBuilder.java
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ modules/analysis/common/src/java/org/apache/lucene/analysis/synonym/FSTSynonymMapBuilder.java	Tue Jul 05 17:45:59 2011 -0400
@@ -0,0 +1,264 @@
+package org.apache.lucene.analysis.synonym;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+import java.util.TreeMap;
+
+import org.apache.lucene.store.ByteArrayDataOutput;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.BytesRefHash;
+import org.apache.lucene.util.CharsRef;
+import org.apache.lucene.util.UnicodeUtil;
+import org.apache.lucene.util.fst.Builder;
+import org.apache.lucene.util.fst.ByteSequenceOutputs;
+import org.apache.lucene.util.fst.FST;
+
+/**
+ * Builds an FSTSynonymMap.
+ * <p>
+ * Call add() until you have added all the mappings, then call build() to get an FSTSynonymMap.
+ * @lucene.experimental
+ */
+public class FSTSynonymMapBuilder {
+  private final TreeMap<CharsRef,MapEntry> workingSet = new TreeMap<CharsRef,MapEntry>(CharsRef.getUTF16SortedAsUTF8Comparator());
+  private final BytesRefHash words = new BytesRefHash();
+  private final BytesRef utf8Scratch = new BytesRef(8);
+  private int maxHorizontalContext;
+  private int maxVerticalContext;
+  private final boolean dedup;
+
+  public FSTSynonymMapBuilder() {
+    this(true);
+  }
+
+  /** If dedup is true then identical rules (same input,
+   *  same output) will be added only once. */
+  public FSTSynonymMapBuilder(boolean dedup) {
+    this.dedup = dedup;
+  }
+
+  private static class MapEntry {
+    boolean includeOrig;
+    // we could sort for better sharing ultimately, but it could confuse people
+    ArrayList<Integer> ords = new ArrayList<Integer>();
+  }
+
+  /** Sugar: just joins the provided terms with {@link
+   *  FSTSynonymMap#WORD_SEPARATOR}.  reuse and its chars
+   *  must not be null. */
+  public static CharsRef join(String[] words, CharsRef reuse) {
+    int upto = 0;
+    char[] buffer = reuse.chars;
+    for(String word : words) {
+      if (upto > 0) {
+        if (upto >= buffer.length) {
+          // grow to upto+1 so the separator fits:
+          reuse.grow(upto+1);
+          buffer = reuse.chars;
+        }
+        buffer[upto++] = FSTSynonymMap.WORD_SEPARATOR;
+      }
+
+      final int wordLen = word.length();
+      final int needed = upto + wordLen;
+      if (needed > buffer.length) {
+        reuse.grow(needed);
+        buffer = reuse.chars;
+      }
+
+      word.getChars(0, wordLen, buffer, upto);
+      upto += wordLen;
+    }
+
+    return reuse;
+  }
+
+  private boolean hasHoles(CharsRef chars) {
+    final int end = chars.offset + chars.length;
+    // a "hole" is an empty word: two adjacent separators, or a
+    // separator at the start or end of the phrase:
+    for(int idx=chars.offset+1;idx<end;idx++) {
+      if (chars.chars[idx] == FSTSynonymMap.WORD_SEPARATOR && chars.chars[idx-1] == FSTSynonymMap.WORD_SEPARATOR) {
+        return true;
+      }
+    }
+    if (chars.chars[chars.offset] == FSTSynonymMap.WORD_SEPARATOR) {
+      return true;
+    }
+    if (chars.chars[chars.offset + chars.length - 1] == FSTSynonymMap.WORD_SEPARATOR) {
+      return true;
+    }
+    return false;
+  }
+
+  /**
+   * Add a phrase->phrase synonym mapping.
+   * Phrases are character sequences where words are
+   * separated with character zero (\u0000).  Empty words
+   * (two \u0000s in a row) are not allowed in either the
+   * input or the output!
+   *
+   * @param input input phrase
+   * @param numInputWords number of input words in the input phrase
+   * @param output output phrase
+   * @param numOutputWords number of output words in the output phrase
+   * @param includeOrig true if the original should be included
+   */
+  public void add(CharsRef input, int numInputWords, CharsRef output, int numOutputWords, boolean includeOrig) {
+    if (numInputWords <= 0) {
+      throw new IllegalArgumentException("numInputWords must be > 0 (got " + numInputWords + ")");
+    }
+    if (input.length <= 0) {
+      throw new IllegalArgumentException("input.length must be > 0 (got " + input.length + ")");
+    }
+    if (numOutputWords <= 0) {
+      throw new IllegalArgumentException("numOutputWords must be > 0 (got " + numOutputWords + ")");
+    }
+    if (output.length <= 0) {
+      throw new IllegalArgumentException("output.length must be > 0 (got " + output.length + ")");
+    }
+
+    assert !hasHoles(input): "input has holes: " + input;
+    assert !hasHoles(output): "output has holes: " + output;
+
+    // first convert to UTF-8
+    //System.out.println("fmap.add input=" + input + " numInputWords=" + numInputWords + " output=" + output + " numOutputWords=" + numOutputWords);
+    final int hashCode = UnicodeUtil.UTF16toUTF8WithHash(output.chars, output.offset, output.length, utf8Scratch);
+    // lookup in hash
+    int ord = words.add(utf8Scratch, hashCode);
+    if (ord < 0) {
+      // already exists in our hash
+      ord = (-ord)-1;
+      //System.out.println("  output=" + output + " old ord=" + ord);
+    } else {
+      //System.out.println("  output=" + output + " new ord=" + ord);
+    }
+
+    MapEntry e = workingSet.get(input);
+    if (e == null) {
+      e = new MapEntry();
+      workingSet.put(new CharsRef(input), e); // make a copy, since we will keep around in our map
+    }
+
+    e.ords.add(ord);
+    e.includeOrig |= includeOrig;
+    maxVerticalContext = Math.max(maxVerticalContext, e.ords.size());
+    maxHorizontalContext = Math.max(maxHorizontalContext, numInputWords);
+    maxHorizontalContext = Math.max(maxHorizontalContext, numOutputWords);
+  }
+
+  private int countWords(CharsRef chars) {
+    int wordCount = 1;
+    int upto = chars.offset;
+    final int limit = chars.offset + chars.length;
+    while(upto < limit) {
+      final int codePoint = Character.codePointAt(chars.chars, upto, limit);
+      if (codePoint == FSTSynonymMap.WORD_SEPARATOR) {
+        wordCount++;
+      }
+      upto += Character.charCount(codePoint);
+    }
+    return wordCount;
+  }
+
+  /**
+   * Helper for {@link #add(CharsRef, int, CharsRef, int, boolean)}, except it counts
+   * the words in the input phrase for you.
+   * <p>
+   * Chances are your parser can already count the words itself,
+   * in which case you should just use the other method.
+   */
+  public void add(CharsRef input, CharsRef output, boolean includeOrig) {
+    add(input, countWords(input), output, countWords(output), includeOrig);
+  }
+
+  /**
+   * Builds an {@link FSTSynonymMap} and returns it.
+   */
+  public FSTSynonymMap build() throws IOException {
+    ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
+    // TODO: are we using the best sharing options?
+    Builder<BytesRef> builder = new Builder<BytesRef>(FST.INPUT_TYPE.BYTE4, 0, 0, true, outputs);
+
+    BytesRef scratch = new BytesRef(64);
+    ByteArrayDataOutput scratchOutput = new ByteArrayDataOutput();
+
+    final Set<Integer> dedupSet;
+
+    if (dedup) {
+      dedupSet = new HashSet<Integer>();
+    } else {
+      dedupSet = null;
+    }
+
+    final byte[] spare = new byte[5];
+
+    //System.out.println("fmap.build");
+    for (Map.Entry<CharsRef,MapEntry> e : workingSet.entrySet()) {
+      CharsRef input = e.getKey();
+      MapEntry output = e.getValue();
+
+      int numEntries = output.ords.size();
+
+      // output size, assume the worst case
+      int estimatedSize = 5 + numEntries * 5; // numEntries + one ord for each entry
+
+      scratch.grow(estimatedSize);
+      scratchOutput.reset(scratch.bytes, scratch.offset, scratch.bytes.length);
+      assert scratch.offset == 0;
+
+      // now write our output data:
+      int count = 0;
+      for (int i = 0; i < numEntries; i++) {
+        if (dedupSet != null) {
+          // box once
+          final Integer ent = output.ords.get(i);
+          if (dedupSet.contains(ent)) {
+            continue;
+          }
+          dedupSet.add(ent);
+        }
+        scratchOutput.writeVInt(output.ords.get(i));
+        count++;
+      }
+
+      final int pos = scratchOutput.getPosition();
+      scratchOutput.writeVInt(count << 1 | (output.includeOrig ? 0 : 1));
+      final int pos2 = scratchOutput.getPosition();
+      final int vIntLen = pos2-pos;
+
+      // Move the count + includeOrig to the front of the byte[]:
+      System.arraycopy(scratch.bytes, pos, spare, 0, vIntLen);
+      System.arraycopy(scratch.bytes, 0, scratch.bytes, vIntLen, pos);
+      System.arraycopy(spare, 0, scratch.bytes, 0, vIntLen);
+
+      if (dedupSet != null) {
+        dedupSet.clear();
+      }
+
+      scratch.length = scratchOutput.getPosition() - scratch.offset;
+      //System.out.println("  add input=" + input + " output=" + scratch + " offset=" + scratch.offset);
+      builder.add(input, new BytesRef(scratch));
+    }
+
+    FST<BytesRef> fst = builder.finish();
+    return new FSTSynonymMap(fst, words, maxHorizontalContext, maxVerticalContext);
+  }
+}
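A worked example of the serialized value that build() stores for one input phrase (editorial illustration; the ords are hypothetical but the layout follows directly from the code above).  For a rule with includeOrig=false and output ords {17, 100}, after the count/includeOrig vInt is moved to the front the stored BytesRef is three bytes:

  // 0x05   vInt(count << 1 | 1) = vInt(2 << 1 | 1); the low bit is 1
  //        because the original token is NOT kept
  // 0x11   vInt(17)   first output ord
  // 0x64   vInt(100)  second output ord
  //
  // FSTSynonymFilter reads this back with its ByteArrayDataInput and
  // resolves each ord to its output phrase via FSTSynonymMap.words.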
Index: modules/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestFSTSynonymMapFilter.java
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ modules/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestFSTSynonymMapFilter.java	Tue Jul 05 17:45:59 2011 -0400
@@ -0,0 +1,388 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.synonym;
+
+import java.io.Reader;
+import java.io.StringReader;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.tokenattributes.*;
+import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
+import org.apache.lucene.util.CharsRef;
+import org.apache.lucene.util._TestUtil;
+
+public class TestFSTSynonymMapFilter extends BaseTokenStreamTestCase {
+
+  private FSTSynonymMapBuilder b;
+  private Tokenizer tokensIn;
+  private FSTSynonymFilter tokensOut;
+  private CharTermAttribute termAtt;
+  private PositionIncrementAttribute posIncrAtt;
+
+  private void add(String input, String output, boolean keepOrig) {
+    b.add(new CharsRef(input.replaceAll(" +", "\u0000")),
+          new CharsRef(output.replaceAll(" +", "\u0000")),
+          keepOrig);
+  }
+
+  private void assertEquals(CharTermAttribute term, String expected) {
+    assertEquals(expected.length(), term.length());
+    final char[] buffer = term.buffer();
+    for(int chIDX=0;chIDX<expected.length();chIDX++) {
+      assertEquals(expected.charAt(chIDX), buffer[chIDX]);
+    }
+  }
+
+  private void verify(String input, String output) throws Exception {
+    if (VERBOSE) {
+      System.out.println("TEST: verify input=" + input + " expectedOutput=" + output);
+    }
+
+    tokensIn.reset(new StringReader(input));
+    tokensOut.reset();
+    final String[] expected = output.split(" ");
+    int expectedUpto = 0;
+    while (tokensOut.incrementToken()) {
+      if (VERBOSE) {
+        System.out.println("  incr token=" + termAtt.toString() + " posIncr=" + posIncrAtt.getPositionIncrement());
+      }
+      assertTrue(expectedUpto < expected.length);
+      final String[] expectedAtPos = expected[expectedUpto++].split("/");
+      for(int atPos=0;atPos<expectedAtPos.length;atPos++) {
+        if (atPos > 0) {
+          assertTrue(tokensOut.incrementToken());
+          if (VERBOSE) {
+            System.out.println("  incr token=" + termAtt.toString() + " posIncr=" + posIncrAtt.getPositionIncrement());
+          }
+        }
+        assertEquals(termAtt, expectedAtPos[atPos]);
+        assertEquals(atPos == 0 ? 1 : 0,
+                     posIncrAtt.getPositionIncrement());
+      }
+    }
+    tokensOut.end();
+    tokensOut.close();
+    if (VERBOSE) {
+      System.out.println("  incr: END");
+    }
+    assertEquals(expectedUpto, expected.length);
+  }
+
+  public void testBasic() throws Exception {
+    b = new FSTSynonymMapBuilder();
+    add("a", "foo", true);
+    add("a b", "bar fee", true);
+    add("b c", "dog collar", true);
+    add("c d", "dog harness holder extras", true);
+    add("m c e", "dog barks loudly", false);
+
+    add("e f", "foo bar", false);
+    add("e f", "baz bee", false);
+
+    add("z", "boo", false);
+    add("y", "bee", true);
+
+    tokensIn = new MockTokenizer(new StringReader("a"),
+                                 MockTokenizer.WHITESPACE,
+                                 true);
+    tokensIn.reset();
+    assertTrue(tokensIn.incrementToken());
+    assertFalse(tokensIn.incrementToken());
+    tokensIn.end();
+    tokensIn.close();
+
+    tokensOut = new FSTSynonymFilter(tokensIn,
+                                     b.build(),
+                                     true);
+    termAtt = tokensOut.addAttribute(CharTermAttribute.class);
+    posIncrAtt = tokensOut.addAttribute(PositionIncrementAttribute.class);
+
+    verify("a b c", "a/bar b/fee c");
+
+    // syn output extends beyond input tokens
+    verify("x a b c d", "x a/bar b/fee c/dog d/harness holder extras");
+
+    verify("a b a", "a/bar b/fee a/foo");
+
+    // outputs that add to one another:
+    verify("c d c d", "c/dog d/harness c/holder/dog d/extras/harness holder extras");
+
+    // two outputs for same input
+    verify("e f", "foo/baz bar/bee");
+
+    // mixed keepOrig true/false:
+    verify("a m c e x", "a/foo dog barks loudly x");
+    verify("c d m c e x", "c/dog d/harness m/holder/dog c/extras/barks loudly x");
+    assertTrue(tokensOut.getCaptureCount() > 0);
+
+    // no captureStates when no syns matched
+    verify("p q r s t", "p q r s t");
+    assertEquals(0, tokensOut.getCaptureCount());
+
+    // no captureStates when only single-input syns, w/ no
+    // lookahead needed, matched
+    verify("p q z y t", "p q boo y/bee t");
+    assertEquals(0, tokensOut.getCaptureCount());
+  }
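+  // (Illustrative note, not in the patch: verify()'s expected-output
+  // notation separates positions with spaces and tokens stacked at the
+  // same position with '/'.  So "a/bar b/fee c" above asserts that
+  // position 0 holds "a" and "bar" (posIncr 1 then 0), position 1 holds
+  // "b" and "fee", and position 2 holds just "c".)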
+  private String getRandomString(char start, int alphabetSize, int length) {
+    assert alphabetSize <= 26;
+    // each token is one random letter followed by a space, so a doc of
+    // N tokens is 2*N chars long:
+    char[] s = new char[2*length];
+    for(int charIDX=0;charIDX<length;charIDX++) {
+      s[2*charIDX] = (char) (start + random.nextInt(alphabetSize));
+      s[2*charIDX+1] = ' ';
+    }
+    return new String(s);
+  }
+
+  private static class OneSyn {
+    String in;
+    List<String> out;
+    boolean keepOrig;
+  }
+
+  public String slowSynMatcher(String doc, List<OneSyn> syns, int maxOutputLength) {
+    assertTrue(doc.length() % 2 == 0);
+    final int numInputs = doc.length()/2;
+    boolean[] keepOrigs = new boolean[numInputs];
+    Arrays.fill(keepOrigs, false);
+    String[] outputs = new String[numInputs + maxOutputLength];
+    OneSyn[] matches = new OneSyn[numInputs];
+    for(OneSyn syn : syns) {
+      int idx = -1;
+      while(true) {
+        idx = doc.indexOf(syn.in, 1+idx);
+        if (idx == -1) {
+          break;
+        }
+        assertTrue(idx % 2 == 0);
+        final int matchIDX = idx/2;
+        assertTrue(syn.in.length() % 2 == 1);
+        if (matches[matchIDX] == null) {
+          matches[matchIDX] = syn;
+        } else if (syn.in.length() > matches[matchIDX].in.length()) {
+          // Greedy conflict resolution: longer match wins:
+          matches[matchIDX] = syn;
+        } else {
+          assertTrue(syn.in.length() < matches[matchIDX].in.length());
+        }
+      }
+    }
+
+    // Greedy conflict resolution: if syn matches a range of inputs,
+    // it prevents other syns from matching that range
+    for(int inputIDX=0;inputIDX<numInputs;inputIDX++) {
+      final OneSyn match = matches[inputIDX];
+      if (match != null) {
+        final int synInLength = (1+match.in.length())/2;
+        for(int nextInputIDX=inputIDX+1;nextInputIDX<Math.min(numInputs, inputIDX+synInLength);nextInputIDX++) {
+          matches[nextInputIDX] = null;
+        }
+      }
+    }
+
+    // Stack the expected outputs on top of the input positions:
+    for(int inputIDX=0;inputIDX<numInputs;inputIDX++) {
+      final OneSyn syn = matches[inputIDX];
+      if (syn == null) {
+        continue;
+      }
+      for(int idx=0;idx<(1+syn.in.length())/2;idx++) {
+        keepOrigs[inputIDX+idx] |= syn.keepOrig;
+      }
+      for(String synOut : syn.out) {
+        final String[] synOutputs = synOut.split(" ");
+        assertEquals(synOutputs.length, (1+synOut.length())/2);
+        final int matchEnd = inputIDX + synOutputs.length;
+        int synUpto = 0;
+        for(int matchIDX=inputIDX;matchIDX<matchEnd;matchIDX++) {
+          if (outputs[matchIDX] == null) {
+            outputs[matchIDX] = synOutputs[synUpto];
+          } else {
+            outputs[matchIDX] = outputs[matchIDX] + "/" + synOutputs[synUpto];
+          }
+          synUpto++;
+        }
+      }
+    }
+
+    StringBuilder sb = new StringBuilder();
+    String[] inputTokens = doc.split(" ");
+    final int limit = inputTokens.length + maxOutputLength;
+    for(int inputIDX=0;inputIDX<limit;inputIDX++) {
+      boolean posHasOutput = false;
+      if (inputIDX >= numInputs && outputs[inputIDX] == null) {
+        break;
+      }
+      if (inputIDX < numInputs && (outputs[inputIDX] == null || keepOrigs[inputIDX])) {
+        sb.append(inputTokens[inputIDX]);
+        posHasOutput = true;
+      }
+
+      if (outputs[inputIDX] != null) {
+        if (posHasOutput) {
+          sb.append('/');
+        }
+        sb.append(outputs[inputIDX]);
+      }
+      if (inputIDX < limit-1) {
+        sb.append(' ');
+      }
+    }
+
+    return sb.toString();
+  }
+
+  public void testRandom() throws Exception {
+
+    final int alphabetSize = _TestUtil.nextInt(random, 2, 7);
+
+    final int docLen = atLeast(3000);
+    //final int docLen = 50;
+
+    final String document = getRandomString('a', alphabetSize, docLen);
+
+    if (VERBOSE) {
+      System.out.println("TEST: doc=" + document);
+    }
+
+    final int numSyn = atLeast(5);
+    //final int numSyn = 2;
+
+    final Map<String,OneSyn> synMap = new HashMap<String,OneSyn>();
+    final List<OneSyn> syns = new ArrayList<OneSyn>();
+    final boolean dedup = random.nextBoolean();
+    if (VERBOSE) {
+      System.out.println("  dedup=" + dedup);
+    }
+    b = new FSTSynonymMapBuilder(dedup);
+    for(int synIDX=0;synIDX<numSyn;synIDX++) {
+      final String synIn = getRandomString('a', alphabetSize, _TestUtil.nextInt(random, 1, 5)).trim();
+      OneSyn s = synMap.get(synIn);
+      if (s == null) {
+        s = new OneSyn();
+        s.in = synIn;
+        syns.add(s);
+        s.out = new ArrayList<String>();
+        synMap.put(synIn, s);
+        s.keepOrig = random.nextBoolean();
+      }
+      final String synOut = getRandomString('0', 10, _TestUtil.nextInt(random, 1, 5)).trim();
+      s.out.add(synOut);
+      add(synIn, synOut, s.keepOrig);
+      if (VERBOSE) {
+        System.out.println("  syns[" + synIDX + "] = " + s.in + " -> " + s.out + " keepOrig=" + s.keepOrig);
+      }
+    }
+
+    tokensIn = new MockTokenizer(new StringReader("a"),
+                                 MockTokenizer.WHITESPACE,
+                                 true);
+    tokensIn.reset();
+    assertTrue(tokensIn.incrementToken());
+    assertFalse(tokensIn.incrementToken());
+    tokensIn.end();
+    tokensIn.close();
+
+    tokensOut = new FSTSynonymFilter(tokensIn,
+                                     b.build(),
+                                     true);
+    termAtt = tokensOut.addAttribute(CharTermAttribute.class);
+    posIncrAtt = tokensOut.addAttribute(PositionIncrementAttribute.class);
+
+    if (dedup) {
+      pruneDups(syns);
+    }
+
+    final String expected = slowSynMatcher(document, syns, 5);
+
+    if (VERBOSE) {
+      System.out.println("TEST: expected=" + expected);
+    }
+
+    verify(document, expected);
+  }
+
+  private void pruneDups(List<OneSyn> syns) {
+    Set<String> seen = new HashSet<String>();
+    for(OneSyn syn : syns) {
+      int idx = 0;
+      while(idx < syn.out.size()) {
+        String out = syn.out.get(idx);
+        if (!seen.contains(out)) {
+          seen.add(out);
+          idx++;
+        } else {
+          syn.out.remove(idx);
+        }
+      }
+      seen.clear();
+    }
+  }
+
+  private String randomNonEmptyString() {
+    while(true) {
+      final String s = _TestUtil.randomUnicodeString(random).trim();
+      //final String s = _TestUtil.randomSimpleString(random).trim();
+      if (s.length() != 0 && s.indexOf('\u0000') == -1) {
+        return s;
+      }
+    }
+  }
+
+  /** simple random test, doesn't verify correctness.
+   *  does verify that it doesn't throw exceptions, and that the
+   *  stream doesn't misbehave
+   */
+  public void testRandom2() throws Exception {
+    final int numIters = atLeast(10);
+    for (int i = 0; i < numIters; i++) {
+      b = new FSTSynonymMapBuilder(random.nextBoolean());
+      final int numEntries = atLeast(10);
+      for (int j = 0; j < numEntries; j++) {
+        // nocommit: better random strings here (e.g. lots of spaces and ascii?)
+        add(randomNonEmptyString(), randomNonEmptyString(), random.nextBoolean());
+      }
+      final FSTSynonymMap map = b.build();
+      final boolean ignoreCase = random.nextBoolean();
+
+      final Analyzer analyzer = new ReusableAnalyzerBase() {
+        @Override
+        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+          Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.SIMPLE, true);
+          return new TokenStreamComponents(tokenizer, new FSTSynonymFilter(tokenizer, map, ignoreCase));
+        }
+      };
+
+      checkRandomData(random, analyzer, 1000*RANDOM_MULTIPLIER);
+      //checkRandomData(random, analyzer, 10*RANDOM_MULTIPLIER);
+    }
+  }
+}
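Beyond the tests, a minimal sketch of wiring the new filter into a real analysis chain, mirroring the testRandom2 setup above (editorial illustration, not part of the patch; WhitespaceTokenizer and Version.LUCENE_40 are assumptions about the surrounding trunk API, and `map` is an FSTSynonymMap built as shown earlier):

  Analyzer analyzer = new ReusableAnalyzerBase() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      // tokenize on whitespace, then stack synonyms on top (ignoreCase=true):
      Tokenizer tokenizer = new WhitespaceTokenizer(Version.LUCENE_40, reader);
      return new TokenStreamComponents(tokenizer, new FSTSynonymFilter(tokenizer, map, true));
    }
  };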