Index: lucene/contrib/CHANGES.txt --- lucene/contrib/CHANGES.txt Thu Jul 07 06:12:14 2011 -0400 +++ lucene/contrib/CHANGES.txt Thu Jul 07 09:06:52 2011 -0400 @@ -78,6 +78,10 @@ documents must be indexed as a document block, using IndexWriter.add/UpdateDocuments (Mark Harwood, Mike McCandless) + * LUCENE-3233: FSTSynonymFilter for applying multi-word synonyms + during indexing, using far less RAM than the current + SynonymFilter. (Robert Muir, Mike McCandless) + API Changes Bug Fixes Index: lucene/src/java/org/apache/lucene/index/codecs/memory/MemoryCodec.java --- lucene/src/java/org/apache/lucene/index/codecs/memory/MemoryCodec.java Thu Jul 07 06:12:14 2011 -0400 +++ lucene/src/java/org/apache/lucene/index/codecs/memory/MemoryCodec.java Thu Jul 07 09:06:52 2011 -0400 @@ -94,9 +94,6 @@ this.out = out; this.field = field; builder = new Builder(FST.INPUT_TYPE.BYTE1, outputs); - - // The byte[] output we create can easily be > 255 bytes: - builder.setAllowArrayArcs(false); } private class PostingsWriter extends PostingsConsumer { Index: lucene/src/java/org/apache/lucene/store/ByteArrayDataOutput.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ lucene/src/java/org/apache/lucene/store/ByteArrayDataOutput.java Thu Jul 07 09:06:52 2011 -0400 @@ -0,0 +1,52 @@ +package org.apache.lucene.store; + +import org.apache.lucene.util.BytesRef; + +/** + * @lucene.experimental + */ +public class ByteArrayDataOutput extends DataOutput { + private byte[] bytes; + + private int pos; + private int limit; + + public ByteArrayDataOutput(byte[] bytes) { + reset(bytes); + } + + public ByteArrayDataOutput(byte[] bytes, int offset, int len) { + reset(bytes, offset, len); + } + + public ByteArrayDataOutput() { + reset(BytesRef.EMPTY_BYTES); + } + + public void reset(byte[] bytes) { + reset(bytes, 0, bytes.length); + } + + public void reset(byte[] bytes, int offset, int len) { + this.bytes = bytes; + pos = offset; + limit = offset + len; + } + + public int getPosition() { + return pos; + } + + @Override + public void writeByte(byte b) { + assert pos < limit; + bytes[pos++] = b; + } + + @Override + public void writeBytes(byte[] b, int offset, int length) { + assert pos + length <= limit; + System.arraycopy(b, offset, bytes, pos, length); + pos += length; + } +} Index: lucene/src/java/org/apache/lucene/util/CharsRef.java --- lucene/src/java/org/apache/lucene/util/CharsRef.java Thu Jul 07 06:12:14 2011 -0400 +++ lucene/src/java/org/apache/lucene/util/CharsRef.java Thu Jul 07 09:06:52 2011 -0400 @@ -1,5 +1,7 @@ package org.apache.lucene.util; +import java.util.Comparator; + /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. 
See the NOTICE file distributed with @@ -167,7 +169,11 @@ * the {@link CharsRef} to copy */ public void copy(CharsRef other) { - chars = ArrayUtil.grow(chars, other.length); + if (chars == null) { + chars = new char[other.length]; + } else { + chars = ArrayUtil.grow(chars, other.length); + } System.arraycopy(other.chars, other.offset, chars, 0, other.length); length = other.length; offset = 0; @@ -213,4 +219,56 @@ public CharSequence subSequence(int start, int end) { return new CharsRef(chars, offset + start, offset + end - 1); } + + private final static Comparator utf16SortedAsUTF8SortOrder = new UTF16SortedAsUTF8Comparator(); + + public static Comparator getUTF16SortedAsUTF8Comparator() { + return utf16SortedAsUTF8SortOrder; + } + + private static class UTF16SortedAsUTF8Comparator implements Comparator { + // Only singleton + private UTF16SortedAsUTF8Comparator() {}; + + public int compare(CharsRef a, CharsRef b) { + if (a == b) + return 0; + + final char[] aChars = a.chars; + int aUpto = a.offset; + final char[] bChars = b.chars; + int bUpto = b.offset; + + final int aStop = aUpto + Math.min(a.length, b.length); + + while (aUpto < aStop) { + char aChar = aChars[aUpto++]; + char bChar = bChars[bUpto++]; + if (aChar != bChar) { + // http://icu-project.org/docs/papers/utf16_code_point_order.html + + /* aChar != bChar, fix up each one if they're both in or above the surrogate range, then compare them */ + if (aChar >= 0xd800 && bChar >= 0xd800) { + if (aChar >= 0xe000) { + aChar -= 0x800; + } else { + aChar += 0x2000; + } + + if (bChar >= 0xe000) { + bChar -= 0x800; + } else { + bChar += 0x2000; + } + } + + /* now aChar and bChar are in code point order */ + return (int)aChar - (int)bChar; /* int must be 32 bits wide */ + } + } + + // One is a prefix of the other, or, they are equal: + return a.length - b.length; + } + } } \ No newline at end of file Index: lucene/src/java/org/apache/lucene/util/fst/FST.java --- lucene/src/java/org/apache/lucene/util/fst/FST.java Thu Jul 07 06:12:14 2011 -0400 +++ lucene/src/java/org/apache/lucene/util/fst/FST.java Thu Jul 07 09:06:52 2011 -0400 @@ -106,6 +106,8 @@ private boolean allowArrayArcs = true; + private Arc cachedRootArcs[]; + public final static class Arc { public int label; public T output; @@ -113,7 +115,7 @@ int target; byte flags; - T nextFinalOutput; + public T nextFinalOutput; int nextArc; // This is non-zero if current arcs are fixed array: @@ -209,6 +211,8 @@ bytes = new byte[in.readVInt()]; in.readBytes(bytes, 0, bytes.length); NO_OUTPUT = outputs.getNoOutput(); + + cacheRootArcs(); } public INPUT_TYPE getInputType() { @@ -220,7 +224,7 @@ return bytes.length; } - void finish(int startNode) { + void finish(int startNode) throws IOException { if (startNode == FINAL_END_NODE && emptyOutput != null) { startNode = 0; } @@ -231,6 +235,32 @@ System.arraycopy(bytes, 0, finalBytes, 0, writer.posWrite); bytes = finalBytes; this.startNode = startNode; + + cacheRootArcs(); + } + + // Caches first 128 labels + @SuppressWarnings("unchecked") + private void cacheRootArcs() throws IOException { + cachedRootArcs = (FST.Arc[]) new FST.Arc[0x80]; + final FST.Arc arc = new FST.Arc(); + getFirstArc(arc); + final BytesReader in = getBytesReader(0); + if (targetHasArcs(arc)) { + readFirstRealArc(arc.target, arc); + while(true) { + assert arc.label != END_LABEL; + if (arc.label < cachedRootArcs.length) { + cachedRootArcs[arc.label] = new Arc().copyFrom(arc); + } else { + break; + } + if (arc.isLast()) { + break; + } + readNextRealArc(arc, in); + } + } } void 
setEmptyOutput(T v) throws IOException { @@ -345,8 +375,9 @@ writer.writeByte((byte) BIT_ARCS_AS_FIXED_ARRAY); writer.writeVInt(node.numArcs); // placeholder -- we'll come back and write the number - // of bytes per arc here: - writer.writeByte((byte) 0); + // of bytes per arc (int) here: + // TODO: we could make this a vInt instead + writer.writeInt(0); fixedArrayStart = writer.posWrite; //System.out.println(" do fixed arcs array arcsStart=" + fixedArrayStart); } else { @@ -427,9 +458,10 @@ // byte size final int sizeNeeded = fixedArrayStart + node.numArcs * maxBytesPerArc; bytes = ArrayUtil.grow(bytes, sizeNeeded); - if (maxBytesPerArc > 255) { - throw new IllegalStateException("max arc size is too large (" + maxBytesPerArc + "); disable array arcs by calling Builder.setAllowArrayArcs(false)"); - } + // TODO: we could make this a vInt instead + bytes[fixedArrayStart-4] = (byte) (maxBytesPerArc >> 24); + bytes[fixedArrayStart-3] = (byte) (maxBytesPerArc >> 16); + bytes[fixedArrayStart-2] = (byte) (maxBytesPerArc >> 8); bytes[fixedArrayStart-1] = (byte) maxBytesPerArc; // expand the arcs in place, backwards @@ -502,7 +534,7 @@ if (arc.flag(BIT_ARCS_AS_FIXED_ARRAY)) { // array: jump straight to end arc.numArcs = in.readVInt(); - arc.bytesPerArc = in.readByte() & 0xFF; + arc.bytesPerArc = in.readInt(); //System.out.println(" array numArcs=" + arc.numArcs + " bpa=" + arc.bytesPerArc); arc.posArcsStart = in.pos; arc.arcIdx = arc.numArcs - 2; @@ -528,7 +560,7 @@ } arc.nextArc = in.pos+1; } - readNextRealArc(arc); + readNextRealArc(arc, in); assert arc.isLast(); return arc; } @@ -572,7 +604,7 @@ //System.out.println(" fixedArray"); // this is first arc in a fixed-array arc.numArcs = in.readVInt(); - arc.bytesPerArc = in.readByte() & 0xFF; + arc.bytesPerArc = in.readInt(); arc.arcIdx = -1; arc.nextArc = arc.posArcsStart = in.pos; //System.out.println(" bytesPer=" + arc.bytesPerArc + " numArcs=" + arc.numArcs + " arcsStart=" + pos); @@ -580,7 +612,7 @@ arc.nextArc = address; arc.bytesPerArc = 0; } - return readNextRealArc(arc); + return readNextRealArc(arc, in); } /** @@ -609,7 +641,7 @@ } return readFirstRealArc(arc.nextArc, arc); } else { - return readNextRealArc(arc); + return readNextRealArc(arc, getBytesReader(0)); } } @@ -627,7 +659,7 @@ //System.out.println(" nextArc fake array"); in.pos--; in.readVInt(); - in.readByte(); + in.readInt(); } } else { if (arc.bytesPerArc != 0) { @@ -645,17 +677,16 @@ return readLabel(in); } - Arc readNextRealArc(Arc arc) throws IOException { + Arc readNextRealArc(Arc arc, final BytesReader in) throws IOException { // this is a continuing arc in a fixed array - final BytesReader in; if (arc.bytesPerArc != 0) { // arcs are at fixed entries arc.arcIdx++; assert arc.arcIdx < arc.numArcs; - in = getBytesReader(arc.posArcsStart - arc.arcIdx*arc.bytesPerArc); + in.pos = arc.posArcsStart - arc.arcIdx*arc.bytesPerArc; } else { // arcs are packed - in = getBytesReader(arc.nextArc); + in.pos = arc.nextArc; } arc.flags = in.readByte(); arc.label = readLabel(in); @@ -701,7 +732,18 @@ /** Finds an arc leaving the incoming arc, replacing the arc in place. * This returns null if the arc was not found, else the incoming arc. 
*/ public Arc findTargetArc(int labelToMatch, Arc follow, Arc arc) throws IOException { - + assert cachedRootArcs != null; + // Short-circuit if this arc is in the root arc cache: + if (follow.target == startNode && labelToMatch != END_LABEL && labelToMatch < cachedRootArcs.length) { + final Arc result = cachedRootArcs[labelToMatch]; + if (result == null) { + return result; + } else { + arc.copyFrom(result); + return arc; + } + } + if (labelToMatch == END_LABEL) { if (follow.isFinal()) { if (follow.target <= 0) { @@ -726,14 +768,18 @@ // reusable stuff eg BytesReader: final BytesReader in = getBytesReader(follow.target); + // System.out.println("fta label=" + (char) labelToMatch); + if ((in.readByte() & BIT_ARCS_AS_FIXED_ARRAY) != 0) { // Arcs are full array; do binary search: arc.numArcs = in.readVInt(); - arc.bytesPerArc = in.readByte() & 0xFF; + //System.out.println(" bs " + arc.numArcs); + arc.bytesPerArc = in.readInt(); arc.posArcsStart = in.pos; int low = 0; int high = arc.numArcs-1; while (low <= high) { + //System.out.println(" cycle"); int mid = (low + high) >>> 1; in.pos = arc.posArcsStart - arc.bytesPerArc*mid - 1; int midLabel = readLabel(in); @@ -744,7 +790,8 @@ high = mid - 1; else { arc.arcIdx = mid-1; - return readNextRealArc(arc); + //System.out.println(" found!"); + return readNextRealArc(arc, in); } } @@ -754,7 +801,12 @@ // Linear scan readFirstTargetArc(follow, arc); while(true) { + //System.out.println(" non-bs cycle"); + // TODO: we should fix this code to not have to create + // object for the output of every arc we scan... only + // for the matching arc, if found if (arc.label == labelToMatch) { + //System.out.println(" found!"); return arc; } else if (arc.label > labelToMatch) { return null; @@ -863,7 +915,7 @@ } // Non-static: reads byte[] from FST - class BytesReader extends DataInput { + final class BytesReader extends DataInput { int pos; public BytesReader(int pos) { Index: lucene/src/java/org/apache/lucene/util/fst/FSTEnum.java --- lucene/src/java/org/apache/lucene/util/fst/FSTEnum.java Thu Jul 07 06:12:14 2011 -0400 +++ lucene/src/java/org/apache/lucene/util/fst/FSTEnum.java Thu Jul 07 09:06:52 2011 -0400 @@ -170,7 +170,7 @@ if (found) { // Match arc.arcIdx = mid-1; - fst.readNextRealArc(arc); + fst.readNextRealArc(arc, in); assert arc.arcIdx == mid; assert arc.label == targetLabel: "arc.label=" + arc.label + " vs targetLabel=" + targetLabel + " mid=" + mid; output[upto] = fst.outputs.add(output[upto-1], arc.output); @@ -185,7 +185,7 @@ } else if (low == arc.numArcs) { // Dead end arc.arcIdx = arc.numArcs-2; - fst.readNextRealArc(arc); + fst.readNextRealArc(arc, in); assert arc.isLast(); // Dead end (target is after the last arc); // rollback to last fork then push @@ -205,7 +205,7 @@ } } else { arc.arcIdx = (low > high ? low : high)-1; - fst.readNextRealArc(arc); + fst.readNextRealArc(arc, in); assert arc.label > targetLabel; pushFirst(); return; @@ -309,7 +309,7 @@ // Match -- recurse //System.out.println(" match! arcIdx=" + mid); arc.arcIdx = mid-1; - fst.readNextRealArc(arc); + fst.readNextRealArc(arc, in); assert arc.arcIdx == mid; assert arc.label == targetLabel: "arc.label=" + arc.label + " vs targetLabel=" + targetLabel + " mid=" + mid; output[upto] = fst.outputs.add(output[upto-1], arc.output); @@ -352,7 +352,7 @@ // There is a floor arc: arc.arcIdx = (low > high ? 
high : low)-1; //System.out.println(" hasFloor arcIdx=" + (arc.arcIdx+1)); - fst.readNextRealArc(arc); + fst.readNextRealArc(arc, in); assert arc.isLast() || fst.readNextArcLabel(arc) > targetLabel; assert arc.label < targetLabel: "arc.label=" + arc.label + " vs targetLabel=" + targetLabel; pushLast(); Index: lucene/src/java/org/apache/lucene/util/fst/NodeHash.java --- lucene/src/java/org/apache/lucene/util/fst/NodeHash.java Thu Jul 07 06:12:14 2011 -0400 +++ lucene/src/java/org/apache/lucene/util/fst/NodeHash.java Thu Jul 07 09:06:52 2011 -0400 @@ -35,6 +35,7 @@ } private boolean nodesEqual(Builder.UnCompiledNode node, int address) throws IOException { + final FST.BytesReader in = fst.getBytesReader(0); fst.readFirstRealArc(address, scratchArc); if (scratchArc.bytesPerArc != 0 && node.numArcs != scratchArc.numArcs) { return false; @@ -56,7 +57,7 @@ return false; } } - fst.readNextRealArc(scratchArc); + fst.readNextRealArc(scratchArc, in); } return false; @@ -87,6 +88,7 @@ // hash code for a frozen node private int hash(int node) throws IOException { final int PRIME = 31; + final FST.BytesReader in = fst.getBytesReader(0); //System.out.println("hash frozen"); int h = 0; fst.readFirstRealArc(node, scratchArc); @@ -102,7 +104,7 @@ if (scratchArc.isLast()) { break; } - fst.readNextRealArc(scratchArc); + fst.readNextRealArc(scratchArc, in); } //System.out.println(" ret " + (h&Integer.MAX_VALUE)); return h & Integer.MAX_VALUE; Index: lucene/src/test-framework/org/apache/lucene/analysis/BaseTokenStreamTestCase.java --- lucene/src/test-framework/org/apache/lucene/analysis/BaseTokenStreamTestCase.java Thu Jul 07 06:12:14 2011 -0400 +++ lucene/src/test-framework/org/apache/lucene/analysis/BaseTokenStreamTestCase.java Thu Jul 07 09:06:52 2011 -0400 @@ -260,7 +260,11 @@ default: text = _TestUtil.randomUnicodeString(random, maxWordLength); } - + + if (VERBOSE) { + System.out.println("NOTE: BaseTokenStreamTestCase: get first token stream now text=" + text); + } + TokenStream ts = a.reusableTokenStream("dummy", new StringReader(text)); assertTrue("has no CharTermAttribute", ts.hasAttribute(CharTermAttribute.class)); CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class); @@ -286,6 +290,9 @@ ts.close(); // verify reusing is "reproducable" and also get the normal tokenstream sanity checks if (!tokens.isEmpty()) { + if (VERBOSE) { + System.out.println("NOTE: BaseTokenStreamTestCase: re-run analysis"); + } if (typeAtt != null && posIncAtt != null && offsetAtt != null) { // offset + pos + type assertAnalyzesToReuse(a, text, Index: lucene/src/test/org/apache/lucene/index/TestIndexWriterOnDiskFull.java --- lucene/src/test/org/apache/lucene/index/TestIndexWriterOnDiskFull.java Thu Jul 07 06:12:14 2011 -0400 +++ lucene/src/test/org/apache/lucene/index/TestIndexWriterOnDiskFull.java Thu Jul 07 09:06:52 2011 -0400 @@ -23,6 +23,7 @@ import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.IndexWriterConfig.OpenMode; +import org.apache.lucene.index.codecs.CodecProvider; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.TermQuery; @@ -141,6 +142,8 @@ */ public void testAddIndexOnDiskFull() throws IOException { + // nocommit - why? 
ant test -Dtestcase=TestIndexWriterOnDiskFull -Dtestmethod=testAddIndexOnDiskFull -Dtests.seed=-5507459305173761956:2176952332896351574 -Dtests.codec=Memory + assumeFalse("This test cannot run with Memory codec", CodecProvider.getDefault().getDefaultFieldCodec().equals("Memory")); int START_COUNT = 57; int NUM_DIR = TEST_NIGHTLY ? 50 : 5; int END_COUNT = START_COUNT + NUM_DIR* (TEST_NIGHTLY ? 25 : 5); Index: lucene/src/test/org/apache/lucene/util/TestCharsRef.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ lucene/src/test/org/apache/lucene/util/TestCharsRef.java Thu Jul 07 09:06:52 2011 -0400 @@ -0,0 +1,41 @@ +package org.apache.lucene.util; + +import java.util.Arrays; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +public class TestCharsRef extends LuceneTestCase { + public void testUTF16InUTF8Order() { + final int numStrings = atLeast(1000); + BytesRef utf8[] = new BytesRef[numStrings]; + CharsRef utf16[] = new CharsRef[numStrings]; + + for (int i = 0; i < numStrings; i++) { + String s = _TestUtil.randomUnicodeString(random); + utf8[i] = new BytesRef(s); + utf16[i] = new CharsRef(s); + } + + Arrays.sort(utf8); + Arrays.sort(utf16, CharsRef.getUTF16SortedAsUTF8Comparator()); + + for (int i = 0; i < numStrings; i++) { + assertEquals(utf8[i].utf8ToString(), utf16[i].toString()); + } + } +} Index: modules/analysis/common/src/java/org/apache/lucene/analysis/synonym/FSTSynonymFilter.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ modules/analysis/common/src/java/org/apache/lucene/analysis/synonym/FSTSynonymFilter.java Thu Jul 07 09:06:52 2011 -0400 @@ -0,0 +1,564 @@ +package org.apache.lucene.analysis.synonym; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; + +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.analysis.tokenattributes.TypeAttribute; +import org.apache.lucene.store.ByteArrayDataInput; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.AttributeSource; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.CharsRef; +import org.apache.lucene.util.RamUsageEstimator; +import org.apache.lucene.util.UnicodeUtil; +import org.apache.lucene.util.fst.FST; + +// nocommit -- need better name? we may not always use FST +// going forward (eg Aho/Corasick) + +// Maybe call this one SynonymFilter! + +/** + * Matches single or multi word synonyms in a token stream. + * This token stream cannot properly handle position + * increments != 1, ie, you should place this filter before + * filtering out stop words. + * + *

Note that with the current implementation, parsing is + * greedy, so whenever multiple parses would apply, the rule + * starting the earliest and parsing the most tokens wins. + * For example, if you have these rules: + * + *

+ *   a -> x
+ *   a b -> y
+ *   b c d -> z
+ * 
+ * + * Then input a b c d e parses to y b c + * d, ie the 2nd rule "wins" because it started + * earliest and matched the most input tokens of other rules + * starting at that point.

+ * + *
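(A minimal sketch of how those three rules could be wired up, using the FSTSynonymMapBuilder, FSTSynonymMap and FSTSynonymFilter classes added later in this patch; the MockTokenizer/StringReader plumbing mirrors the tests below and is illustrative only. Words inside a multi-word rule are joined with the \u0000 word separator:)

    FSTSynonymMapBuilder builder = new FSTSynonymMapBuilder();
    builder.add(new CharsRef("a"), new CharsRef("x"), true);                 // a -> x
    builder.add(new CharsRef("a\u0000b"), new CharsRef("y"), true);          // a b -> y
    builder.add(new CharsRef("b\u0000c\u0000d"), new CharsRef("z"), true);   // b c d -> z
    FSTSynonymMap map = builder.build();                                     // throws IOException

    TokenStream syns = new FSTSynonymFilter(
        new MockTokenizer(new StringReader("a b c d e"), MockTokenizer.WHITESPACE, true),
        map, true);
    // Greedy matching: "a b" is consumed by the second rule, so the third rule
    // ("b c d") never gets a chance to match; "c", "d" and "e" pass through.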

A future improvement to this filter could allow + * non-greedy parsing, such that the 3rd rule would win, and + * also separately allow multiple parses, such that all 3 + * rules would match, perhaps even on a rule by rule + * basis.

+ * + *

NOTE: when a match occurs, the output tokens + * associated with the matching rule are "stacked" on top of + * the input stream (if the rule had + * keepOrig=true) and also on top of another + * matched rule's output tokens. This is not a correct + * solution, as really the output should be an arbitrary + * graph/lattice. For example, with the above match, you + * would expect an exact PhraseQuery "y b + * c" to match the parsed tokens, but it will fail to + * do so. This limitation is necessary because Lucene's + * TokenStream (and index) cannot yet represent an arbitrary + * graph.
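(The stacking is easiest to see in the expectations of TestFSTSynonymMapFilter.testBasic() later in this patch, where the verify() helper joins tokens that share a position with '/':)

    verify("a b c", "a/bar b/fee c");   // rule "a b -> bar fee", keepOrig=true: outputs stack over the originals
    verify("e f", "foo/baz bar/bee");   // two rules for "e f": both outputs stack at each position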

+ * + *

NOTE: If multiple incoming tokens arrive at the + * same position, only the first token at that position is + * used for parsing. Subsequent tokens simply pass through + * and are not parsed. A future improvement would be to + * allow these tokens to also be matched.

+ */ + +// TODO: maybe we should resolve token -> wordID then run +// FST on wordIDs, for better perf? + +// TODO: a more efficient approach would be Aho/Corasick's +// algorithm +// http://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_string_matching_algorithm +// It improves over the current approach here +// because it does not fully re-start matching at every +// token. For exampl,e if one pattern is "a b c x" +// and another is "b c d" and the input is "a b c d", on +// trying to parse "a b c x" but failing when you got to x, +// rather than starting over again your really should +// immediately recognize that "b c d" matches at the next +// input. I suspect this won't matter that much in +// practice, but it's possible on some set of synonyms it +// will. We'd have to modify Aho/Corasick to enforce our +// conflict resolving (eg greedy matching) because that algo +// finds all matches. + +public final class FSTSynonymFilter extends TokenFilter { + + public static final String TYPE_SYNONYM = "SYNONYM"; + + private final FSTSynonymMap synonyms; + + private final boolean ignoreCase; + private final int rollBufferSize; + + private int captureCount; + + private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); + private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class); + private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class); + + // How many future input tokens have already been matched + // to a synonym; because the matching is "greedy" we don't + // try to do any more matching for such tokens: + private int inputSkipCount; + + // Hold all buffered (read ahead) stacked input tokens for + // a future position. When multiple tokens are at the + // same position, we only store (and match against) the + // term for the first token at the position, but capture + // state for (and enumerate) all other tokens at this + // position: + private static class PendingInput { + final CharsRef term = new CharsRef(); + AttributeSource.State state; + boolean keepOrig; + boolean consumed = true; + + public void reset() { + state = null; + consumed = true; + keepOrig = false; + } + }; + + // Rolling buffer, holding pending input tokens we had to + // clone because we needed to look ahead, indexed by + // position: + private final PendingInput[] futureInputs; + + // Holds pending output synonyms for one future position: + private static class PendingOutputs { + CharsRef[] outputs; + int upto; + int count; + int posIncr = 1; + + public PendingOutputs() { + outputs = new CharsRef[1]; + } + + public void reset() { + upto = count = 0; + posIncr = 1; + } + + public CharsRef pullNext() { + assert upto < count; + final CharsRef result = outputs[upto++]; + posIncr = 0; + if (upto == count) { + reset(); + } + return result; + } + + public void add(char[] output, int offset, int len) { + if (count == outputs.length) { + final CharsRef[] next = new CharsRef[ArrayUtil.oversize(1+count, RamUsageEstimator.NUM_BYTES_OBJECT_REF)]; + System.arraycopy(outputs, 0, next, 0, count); + outputs = next; + } + if (outputs[count] == null) { + outputs[count] = new CharsRef(); + } + outputs[count].copy(output, offset, len); + count++; + } + }; + + private final ByteArrayDataInput bytesReader = new ByteArrayDataInput(); + + // Rolling buffer, holding stack of pending synonym + // outputs, indexed by position: + private final PendingOutputs[] futureOutputs; + + // Where (in rolling buffers) to write next input saved state: + private int nextWrite; + + // 
Where (in rolling buffers) to read next input saved state: + private int nextRead; + + // True once we've read last token + private boolean finished; + + private final FST.Arc scratchArc; + + private final FST fst; + + private final BytesRef scratchBytes = new BytesRef(); + private final CharsRef scratchChars = new CharsRef(); + + /** + * @param input input tokenstream + * @param synonyms synonym map + * @param ignoreCase case-folds input for matching with {@link Character#toLowerCase(int)}. + * Note, if you set this to true, its your responsibility to lowercase + * the input entries when you create the {@link FSTSynonymMap} + */ + + public FSTSynonymFilter(TokenStream input, FSTSynonymMap synonyms, boolean ignoreCase) { + super(input); + this.synonyms = synonyms; + this.ignoreCase = ignoreCase; + this.fst = synonyms.fst; + + if (fst == null) { + throw new IllegalArgumentException("fst must be non-null"); + } + + // Must be 1+ so that when roll buffer is at full + // lookahead we can distinguish this full buffer from + // the empty buffer: + rollBufferSize = 1+synonyms.maxHorizontalContext; + + futureInputs = new PendingInput[rollBufferSize]; + futureOutputs = new PendingOutputs[rollBufferSize]; + for(int pos=0;pos> */ + public final FST fst; + /** map */ + public final BytesRefHash words; + /** maxHorizontalContext: maximum context we need on the tokenstream */ + public final int maxHorizontalContext; + /** maxVerticalContext: maximum number of synonym entries for a single input */ + public final int maxVerticalContext; + + public FSTSynonymMap(FST fst, BytesRefHash words, int maxHorizontalContext, int maxVerticalContext) { + this.fst = fst; + this.words = words; + this.maxHorizontalContext = maxHorizontalContext; + this.maxVerticalContext = maxVerticalContext; + } +} Index: modules/analysis/common/src/java/org/apache/lucene/analysis/synonym/FSTSynonymMapBuilder.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ modules/analysis/common/src/java/org/apache/lucene/analysis/synonym/FSTSynonymMapBuilder.java Thu Jul 07 09:06:52 2011 -0400 @@ -0,0 +1,267 @@ +package org.apache.lucene.analysis.synonym; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Set; + +import org.apache.lucene.store.ByteArrayDataOutput; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.BytesRefHash; +import org.apache.lucene.util.CharsRef; +import org.apache.lucene.util.UnicodeUtil; +import org.apache.lucene.util.fst.Builder; +import org.apache.lucene.util.fst.ByteSequenceOutputs; +import org.apache.lucene.util.fst.FST; + +/** + * Builds an FSTSynonymMap. + *

+ * Call add() until you have added all the mappings, then call build() to get an FSTSynonymMap + * @lucene.experimental + */ +public class FSTSynonymMapBuilder { + private final HashMap workingSet = new HashMap(); + private final BytesRefHash words = new BytesRefHash(); + private final BytesRef utf8Scratch = new BytesRef(8); + private int maxHorizontalContext; + private int maxVerticalContext; + private final boolean dedup; + + public FSTSynonymMapBuilder() { + this(true); + } + + /** If dedup is true then identical rules (same input, + * same output) will be added only once. */ + public FSTSynonymMapBuilder(boolean dedup) { + this.dedup = dedup; + } + + private static class MapEntry { + boolean includeOrig; + // we could sort for better sharing ultimately, but it could confuse people + ArrayList ords = new ArrayList(); + } + + /** Sugar: just joins the provided terms with {@link + * FSTSynonymMap#WORD_SEPARATOR}. reuse and its chars + * must not be null. */ + public static CharsRef join(String[] words, CharsRef reuse) { + int upto = 0; + char[] buffer = reuse.chars; + for(String word : words) { + if (upto > 0) { + if (upto >= buffer.length) { + reuse.grow(upto); + buffer = reuse.chars; + } + buffer[upto++] = FSTSynonymMap.WORD_SEPARATOR; + } + + final int wordLen = word.length(); + final int needed = upto + wordLen; + if (needed > buffer.length) { + reuse.grow(needed); + buffer = reuse.chars; + } + + word.getChars(0, wordLen, buffer, upto); + upto += wordLen; + } + + return reuse; + } + + private boolean hasHoles(CharsRef chars) { + final int end = chars.offset + chars.length; + for(int idx=chars.offset+1;idxphrase synonym mapping. + * Phrases are character sequences where words are + * separated with character zero (\u0000). Empty words + * (two \u0000s in a row) are not allowed in the input nor + * the output! 
+ * + * @param input input phrase + * @param numInputWords number of input words in the input phrase + * @param output output phrase + * @param includeOrig true if the original should be included + */ + public void add(CharsRef input, int numInputWords, CharsRef output, int numOutputWords, boolean includeOrig) { + // first convert to UTF-8 + if (numInputWords <= 0) { + throw new IllegalArgumentException("numInputWords must be > 0 (got " + numInputWords + ")"); + } + if (input.length <= 0) { + throw new IllegalArgumentException("input.length must be > 0 (got " + input.length + ")"); + } + if (numOutputWords <= 0) { + throw new IllegalArgumentException("numOutputWords must be > 0 (got " + numOutputWords + ")"); + } + if (output.length <= 0) { + throw new IllegalArgumentException("output.length must be > 0 (got " + output.length + ")"); + } + + assert !hasHoles(input): "input has holes: " + input; + assert !hasHoles(output): "output has holes: " + output; + + //System.out.println("fmap.add input=" + input + " numInputWords=" + numInputWords + " output=" + output + " numOutputWords=" + numOutputWords); + final int hashCode = UnicodeUtil.UTF16toUTF8WithHash(output.chars, output.offset, output.length, utf8Scratch); + // lookup in hash + int ord = words.add(utf8Scratch, hashCode); + if (ord < 0) { + // already exists in our hash + ord = (-ord)-1; + //System.out.println(" output=" + output + " old ord=" + ord); + } else { + //System.out.println(" output=" + output + " new ord=" + ord); + } + + MapEntry e = workingSet.get(input); + if (e == null) { + e = new MapEntry(); + workingSet.put(new CharsRef(input), e); // make a copy, since we will keep around in our map + } + + e.ords.add(ord); + e.includeOrig |= includeOrig; + maxVerticalContext = Math.max(maxVerticalContext, e.ords.size()); + maxHorizontalContext = Math.max(maxHorizontalContext, numInputWords); + maxHorizontalContext = Math.max(maxHorizontalContext, numOutputWords); + } + + private int countWords(CharsRef chars) { + int wordCount = 1; + int upto = chars.offset; + final int limit = chars.offset + chars.length; + while(upto < limit) { + final int codePoint = Character.codePointAt(chars.chars, upto, limit); + if (codePoint == FSTSynonymMap.WORD_SEPARATOR) { + wordCount++; + } + upto += Character.charCount(codePoint); + } + return wordCount; + } + + /** + * Helper for {@link #add(CharsRef, int, CharsRef, boolean)}, except it counts + * the words in the input phrase for you. + *

+ * Chances are your parser is/can likely count this itself so it should just + * use the other method if so. + */ + public void add(CharsRef input, CharsRef output, boolean includeOrig) { + add(input, countWords(input), output, countWords(output), includeOrig); + } + + /** + * Builds an {@link FSTSynonymMap} and returns it. + */ + public FSTSynonymMap build() throws IOException { + ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton(); + // TODO: are we using the best sharing options? + Builder builder = new Builder(FST.INPUT_TYPE.BYTE4, 0, 0, true, outputs); + + BytesRef scratch = new BytesRef(64); + ByteArrayDataOutput scratchOutput = new ByteArrayDataOutput(); + + final Set dedupSet; + + if (dedup) { + dedupSet = new HashSet(); + } else { + dedupSet = null; + } + + final byte[] spare = new byte[5]; + + Set keys = workingSet.keySet(); + CharsRef sortedKeys[] = keys.toArray(new CharsRef[keys.size()]); + Arrays.sort(sortedKeys, CharsRef.getUTF16SortedAsUTF8Comparator()); + + //System.out.println("fmap.build"); + for (int keyIdx = 0; keyIdx < sortedKeys.length; keyIdx++) { + CharsRef input = sortedKeys[keyIdx]; + MapEntry output = workingSet.get(input); + + int numEntries = output.ords.size(); + // output size, assume the worst case + int estimatedSize = 5 + numEntries * 5; // numEntries + one ord for each entry + + scratch.grow(estimatedSize); + scratchOutput.reset(scratch.bytes, scratch.offset, scratch.bytes.length); + assert scratch.offset == 0; + + // now write our output data: + int count = 0; + for (int i = 0; i < numEntries; i++) { + if (dedupSet != null) { + // box once + final Integer ent = output.ords.get(i); + if (dedupSet.contains(ent)) { + continue; + } + dedupSet.add(ent); + } + scratchOutput.writeVInt(output.ords.get(i)); + count++; + } + + final int pos = scratchOutput.getPosition(); + scratchOutput.writeVInt(count << 1 | (output.includeOrig ? 0 : 1)); + final int pos2 = scratchOutput.getPosition(); + final int vIntLen = pos2-pos; + + // Move the count + includeOrig to the front of the byte[]: + System.arraycopy(scratch.bytes, pos, spare, 0, vIntLen); + System.arraycopy(scratch.bytes, 0, scratch.bytes, vIntLen, pos); + System.arraycopy(spare, 0, scratch.bytes, 0, vIntLen); + + if (dedupSet != null) { + dedupSet.clear(); + } + + scratch.length = scratchOutput.getPosition() - scratch.offset; + //System.out.println(" add input=" + input + " output=" + scratch + " offset=" + scratch.offset + " length=" + scratch.length + " count=" + count); + builder.add(input, new BytesRef(scratch)); + } + + FST fst = builder.finish(); + return new FSTSynonymMap(fst, words, maxHorizontalContext, maxVerticalContext); + } +} Index: modules/analysis/common/src/java/org/apache/lucene/analysis/synonym/SolrSynonymsParser.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ modules/analysis/common/src/java/org/apache/lucene/analysis/synonym/SolrSynonymsParser.java Thu Jul 07 09:06:52 2011 -0400 @@ -0,0 +1,223 @@ +package org.apache.lucene.analysis.synonym; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.LineNumberReader; +import java.io.Reader; +import java.io.StringReader; +import java.text.ParseException; +import java.util.ArrayList; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.util.CharsRef; + +/** + * Parser for the Solr synonyms format. + *

    + *
  1. Blank lines and lines starting with '#' are comments. + *
  2. Explicit mappings match any token sequence on the LHS of "=>" + * and replace with all alternatives on the RHS. These types of mappings + * ignore the expand parameter in the constructor. + * Example: + *
    i-pod, i pod => ipod
    + *
  3. Equivalent synonyms may be separated with commas and give + * no explicit mapping. In this case the mapping behavior will + * be taken from the expand parameter in the constructor. This allows + * the same synonym file to be used in different synonym handling strategies. + * Example: + *
    ipod, i-pod, i pod
    + * + *
  4. Multiple synonym mapping entries are merged. + * Example: + *
    + * foo => foo bar
    + * foo => baz

    + * is equivalent to

    + * foo => foo bar, baz + *
    + *
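(A short usage sketch; "analyzer" is whatever Analyzer you want applied to each entry — the tests below use MockAnalyzer — and "tokenizer" stands in for your index-time tokenizer:)

    SolrSynonymsParser parser = new SolrSynonymsParser(true, analyzer);   // expand=true
    parser.addSynonyms(new StringReader("i-pod, i pod => ipod\n" +
                                        "foo => foo bar\n" +
                                        "foo => baz"));
    FSTSynonymMap map = parser.build();
    TokenStream ts = new FSTSynonymFilter(tokenizer, map, true);          // ignoreCase=true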
+ * @lucene.experimental + */ +public class SolrSynonymsParser { + private final boolean expand; + private final Analyzer analyzer; + private final FSTSynonymMapBuilder builder; + + public SolrSynonymsParser(boolean expand, Analyzer analyzer) { + this.expand = expand; + this.analyzer = analyzer; + this.builder = new FSTSynonymMapBuilder(); + } + + public void addSynonyms(Reader in) throws IOException, ParseException { + LineNumberReader br = new LineNumberReader(in); + try { + addInternal(br); + } catch (IllegalArgumentException e) { + ParseException ex = new ParseException("Invalid synonym rule at line " + br.getLineNumber(), 0); + ex.initCause(e); + throw ex; + } finally { + br.close(); + } + } + + private void addInternal(BufferedReader in) throws IOException { + String line = null; + while ((line = in.readLine()) != null) { + if (line.length() == 0 || line.charAt(0) == '#') { + continue; // ignore empty lines and comments + } + + CharsRef inputs[]; + CharsRef outputs[]; + + // TODO: we could process this more efficiently. + + String sides[] = split(line, "=>"); + if (sides.length > 1) { // explicit mapping + if (sides.length != 2) { + throw new IllegalArgumentException("more than one explicit mapping specified on the same line"); + } + String inputStrings[] = split(sides[0], ","); + inputs = new CharsRef[inputStrings.length]; + for (int i = 0; i < inputs.length; i++) { + inputs[i] = analyze(inputStrings[i]); + } + + String outputStrings[] = split(sides[1], ","); + outputs = new CharsRef[outputStrings.length]; + for (int i = 0; i < outputs.length; i++) { + outputs[i] = analyze(outputStrings[i]); + } + } else { + String inputStrings[] = split(line, ","); + inputs = new CharsRef[inputStrings.length]; + for (int i = 0; i < inputs.length; i++) { + inputs[i] = analyze(inputStrings[i]); + } + if (expand) { + outputs = inputs; + } else { + outputs = new CharsRef[1]; + outputs[0] = inputs[0]; + } + } + + // currently we include the term itself in the map, + // and use includeOrig = false always. 
+ // this is how the existing filter does it, but its actually a bug, + // especially if combined with ignoreCase = true + for (int i = 0; i < inputs.length; i++) { + for (int j = 0; j < outputs.length; j++) { + //System.out.println("add: " + inputs[i] + " -> " + outputs[j]); + builder.add(inputs[i], outputs[j], false); + } + } + } + } + + private CharsRef analyze(String text) throws IOException { + CharsRef out = new CharsRef(); + // nocommit: this unescaping/trimming is ugly + text = unescape(text.trim()); + TokenStream ts = analyzer.reusableTokenStream("", new StringReader(text)); + CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class); + PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class); + ts.reset(); + while (ts.incrementToken()) { + int length = termAtt.length(); + if (length == 0) { + throw new IllegalArgumentException("term: " + text + " analyzed to a zero-length token"); + } + if (posIncAtt.getPositionIncrement() != 1) { + throw new IllegalArgumentException("term: " + text + " analyzed to a token with posinc != 1"); + } + out.grow(out.length + length + 1 /* one for the separator */); + int end = out.offset + out.length; + if (out.length > 0) { + out.chars[end++] = FSTSynonymMap.WORD_SEPARATOR; + out.length++; + } + System.arraycopy(termAtt.buffer(), 0, out.chars, end, length); + out.length += length; + } + ts.end(); + ts.close(); + if (out.length == 0) { + throw new IllegalArgumentException("term: " + text + " was completely eliminated by analyzer"); + } + return out; + } + + private static String[] split(String s, String separator) { + ArrayList list = new ArrayList(2); + StringBuilder sb = new StringBuilder(); + int pos=0, end=s.length(); + while (pos < end) { + if (s.startsWith(separator,pos)) { + if (sb.length() > 0) { + list.add(sb.toString()); + sb=new StringBuilder(); + } + pos+=separator.length(); + continue; + } + + char ch = s.charAt(pos++); + if (ch=='\\') { + sb.append(ch); + if (pos>=end) break; // ERROR, or let it go? + ch = s.charAt(pos++); + } + + sb.append(ch); + } + + if (sb.length() > 0) { + list.add(sb.toString()); + } + + return list.toArray(new String[list.size()]); + } + + private String unescape(String s) { + if (s.indexOf("\\") >= 0) { + StringBuilder sb = new StringBuilder(); + for (int i = 0; i < s.length(); i++) { + char ch = s.charAt(i); + if (ch == '\\' && i < s.length() - 1) { + sb.append(s.charAt(++i)); + } else { + sb.append(ch); + } + } + return sb.toString(); + } + return s; + } + + public FSTSynonymMap build() throws IOException { + return builder.build(); + } +} Index: modules/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestFSTSynonymMapFilter.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ modules/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestFSTSynonymMapFilter.java Thu Jul 07 09:06:52 2011 -0400 @@ -0,0 +1,388 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.analysis.synonym; + +import java.io.Reader; +import java.io.StringReader; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.tokenattributes.*; +import org.apache.lucene.analysis.util.ReusableAnalyzerBase; +import org.apache.lucene.util.CharsRef; +import org.apache.lucene.util._TestUtil; + +public class TestFSTSynonymMapFilter extends BaseTokenStreamTestCase { + + private FSTSynonymMapBuilder b; + private Tokenizer tokensIn; + private FSTSynonymFilter tokensOut; + private CharTermAttribute termAtt; + private PositionIncrementAttribute posIncrAtt; + + private void add(String input, String output, boolean keepOrig) { + b.add(new CharsRef(input.replaceAll(" +", "\u0000")), + new CharsRef(output.replaceAll(" +", "\u0000")), + keepOrig); + } + + private void assertEquals(CharTermAttribute term, String expected) { + assertEquals(expected.length(), term.length()); + final char[] buffer = term.buffer(); + for(int chIDX=0;chIDX 0) { + assertTrue(tokensOut.incrementToken()); + if (VERBOSE) { + System.out.println(" incr token=" + termAtt.toString() + " posIncr=" + posIncrAtt.getPositionIncrement()); + } + } + assertEquals(termAtt, expectedAtPos[atPos]); + assertEquals(atPos == 0 ? 
1 : 0, + posIncrAtt.getPositionIncrement()); + } + } + tokensOut.end(); + tokensOut.close(); + if (VERBOSE) { + System.out.println(" incr: END"); + } + assertEquals(expectedUpto, expected.length); + } + + public void testBasic() throws Exception { + b = new FSTSynonymMapBuilder(); + add("a", "foo", true); + add("a b", "bar fee", true); + add("b c", "dog collar", true); + add("c d", "dog harness holder extras", true); + add("m c e", "dog barks loudly", false); + + add("e f", "foo bar", false); + add("e f", "baz bee", false); + + add("z", "boo", false); + add("y", "bee", true); + + tokensIn = new MockTokenizer(new StringReader("a"), + MockTokenizer.WHITESPACE, + true); + tokensIn.reset(); + assertTrue(tokensIn.incrementToken()); + assertFalse(tokensIn.incrementToken()); + tokensIn.end(); + tokensIn.close(); + + tokensOut = new FSTSynonymFilter(tokensIn, + b.build(), + true); + termAtt = tokensOut.addAttribute(CharTermAttribute.class); + posIncrAtt = tokensOut.addAttribute(PositionIncrementAttribute.class); + + verify("a b c", "a/bar b/fee c"); + + // syn output extends beyond input tokens + verify("x a b c d", "x a/bar b/fee c/dog d/harness holder extras"); + + verify("a b a", "a/bar b/fee a/foo"); + + // outputs that add to one another: + verify("c d c d", "c/dog d/harness c/holder/dog d/extras/harness holder extras"); + + // two outputs for same input + verify("e f", "foo/baz bar/bee"); + + // mixed keepOrig true/false: + verify("a m c e x", "a/foo dog barks loudly x"); + verify("c d m c e x", "c/dog d/harness m/holder/dog c/extras/barks loudly x"); + assertTrue(tokensOut.getCaptureCount() > 0); + + // no captureStates when no syns matched + verify("p q r s t", "p q r s t"); + assertEquals(0, tokensOut.getCaptureCount()); + + // no captureStates when only single-input syns, w/ no + // lookahead needed, matched + verify("p q z y t", "p q boo y/bee t"); + assertEquals(0, tokensOut.getCaptureCount()); + } + + private String getRandomString(char start, int alphabetSize, int length) { + assert alphabetSize <= 26; + char[] s = new char[2*length]; + for(int charIDX=0;charIDX out; + boolean keepOrig; + } + + public String slowSynMatcher(String doc, List syns, int maxOutputLength) { + assertTrue(doc.length() % 2 == 0); + final int numInputs = doc.length()/2; + boolean[] keepOrigs = new boolean[numInputs]; + Arrays.fill(keepOrigs, false); + String[] outputs = new String[numInputs + maxOutputLength]; + OneSyn[] matches = new OneSyn[numInputs]; + for(OneSyn syn : syns) { + int idx = -1; + while(true) { + idx = doc.indexOf(syn.in, 1+idx); + if (idx == -1) { + break; + } + assertTrue(idx % 2 == 0); + final int matchIDX = idx/2; + assertTrue(syn.in.length() % 2 == 1); + if (matches[matchIDX] == null) { + matches[matchIDX] = syn; + } else if (syn.in.length() > matches[matchIDX].in.length()) { + // Greedy conflict resolution: longer match wins: + matches[matchIDX] = syn; + } else { + assertTrue(syn.in.length() < matches[matchIDX].in.length()); + } + } + } + + // Greedy conflict resolution: if syn matches a range of inputs, + // it prevents other syns from matching that range + for(int inputIDX=0;inputIDX= numInputs && outputs[inputIDX] == null) { + break; + } + if (inputIDX < numInputs && (outputs[inputIDX] == null || keepOrigs[inputIDX])) { + sb.append(inputTokens[inputIDX]); + posHasOutput = true; + } + + if (outputs[inputIDX] != null) { + if (posHasOutput) { + sb.append('/'); + } + sb.append(outputs[inputIDX]); + } + if (inputIDX < limit-1) { + sb.append(' '); + } + } + + return sb.toString(); + } + + 
public void testRandom() throws Exception { + + final int alphabetSize = _TestUtil.nextInt(random, 2, 7); + + final int docLen = atLeast(3000); + //final int docLen = 50; + + final String document = getRandomString('a', alphabetSize, docLen); + + if (VERBOSE) { + System.out.println("TEST: doc=" + document); + } + + final int numSyn = atLeast(5); + //final int numSyn = 2; + + final Map synMap = new HashMap(); + final List syns = new ArrayList(); + final boolean dedup = random.nextBoolean(); + if (VERBOSE) { + System.out.println(" dedup=" + dedup); + } + b = new FSTSynonymMapBuilder(dedup); + for(int synIDX=0;synIDX(); + synMap.put(synIn, s); + s.keepOrig = random.nextBoolean(); + } + final String synOut = getRandomString('0', 10, _TestUtil.nextInt(random, 1, 5)).trim(); + s.out.add(synOut); + add(synIn, synOut, s.keepOrig); + if (VERBOSE) { + System.out.println(" syns[" + synIDX + "] = " + s.in + " -> " + s.out + " keepOrig=" + s.keepOrig); + } + } + + tokensIn = new MockTokenizer(new StringReader("a"), + MockTokenizer.WHITESPACE, + true); + tokensIn.reset(); + assertTrue(tokensIn.incrementToken()); + assertFalse(tokensIn.incrementToken()); + tokensIn.end(); + tokensIn.close(); + + tokensOut = new FSTSynonymFilter(tokensIn, + b.build(), + true); + termAtt = tokensOut.addAttribute(CharTermAttribute.class); + posIncrAtt = tokensOut.addAttribute(PositionIncrementAttribute.class); + + if (dedup) { + pruneDups(syns); + } + + final String expected = slowSynMatcher(document, syns, 5); + + if (VERBOSE) { + System.out.println("TEST: expected=" + expected); + } + + verify(document, expected); + } + + private void pruneDups(List syns) { + Set seen = new HashSet(); + for(OneSyn syn : syns) { + int idx = 0; + while(idx < syn.out.size()) { + String out = syn.out.get(idx); + if (!seen.contains(out)) { + seen.add(out); + idx++; + } else { + syn.out.remove(idx); + } + } + seen.clear(); + } + } + + private String randomNonEmptyString() { + while(true) { + final String s = _TestUtil.randomUnicodeString(random).trim(); + //final String s = _TestUtil.randomSimpleString(random).trim(); + if (s.length() != 0 && s.indexOf('\u0000') == -1) { + return s; + } + } + } + + /** simple random test, doesn't verify correctness. + * does verify it doesnt throw exceptions, or that the stream doesn't misbehave + */ + public void testRandom2() throws Exception { + final int numIters = atLeast(10); + for (int i = 0; i < numIters; i++) { + b = new FSTSynonymMapBuilder(random.nextBoolean()); + final int numEntries = atLeast(10); + //final int numEntries = atLeast(10); + for (int j = 0; j < numEntries; j++) { + // nocommit: better random strings here (e.g. lots of spaces and ascii?) 
+ add(randomNonEmptyString(), randomNonEmptyString(), random.nextBoolean()); + } + final FSTSynonymMap map = b.build(); + final boolean ignoreCase = random.nextBoolean(); + + final Analyzer analyzer = new ReusableAnalyzerBase() { + @Override + protected TokenStreamComponents createComponents(String fieldName, Reader reader) { + Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.SIMPLE, true); + return new TokenStreamComponents(tokenizer, new FSTSynonymFilter(tokenizer, map, ignoreCase)); + } + }; + + checkRandomData(random, analyzer, 1000*RANDOM_MULTIPLIER); + //checkRandomData(random, analyzer, 10*RANDOM_MULTIPLIER); + } + } +} Index: modules/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSolrSynonymParser.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ modules/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSolrSynonymParser.java Thu Jul 07 09:06:52 2011 -0400 @@ -0,0 +1,144 @@ +package org.apache.lucene.analysis.synonym; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.Reader; +import java.io.StringReader; +import java.text.ParseException; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.MockAnalyzer; +import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.en.EnglishAnalyzer; +import org.apache.lucene.analysis.util.ReusableAnalyzerBase; +import org.junit.Test; + +/** + * Tests parser for the Solr synonyms format + * @lucene.experimental + */ +public class TestSolrSynonymParser extends BaseTokenStreamTestCase { + + /** Tests some simple examples from the solr wiki */ + public void testSimple() throws Exception { + String testFile = + "i-pod, ipod, ipoooood\n" + + "foo => foo bar\n" + + "foo => baz\n" + + "this test, that testing"; + + SolrSynonymsParser parser = new SolrSynonymsParser(true, new MockAnalyzer(random)); + parser.addSynonyms(new StringReader(testFile)); + final FSTSynonymMap map = parser.build(); + + Analyzer analyzer = new ReusableAnalyzerBase() { + @Override + protected TokenStreamComponents createComponents(String fieldName, Reader reader) { + Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, true); + return new TokenStreamComponents(tokenizer, new FSTSynonymFilter(tokenizer, map, true)); + } + }; + + assertAnalyzesTo(analyzer, "ball", + new String[] { "ball" }, + new int[] { 1 }); + + assertAnalyzesTo(analyzer, "i-pod", + new String[] { "i-pod", "ipod", "ipoooood" }, + new int[] { 1, 0, 0 }); + + assertAnalyzesTo(analyzer, "foo", + new String[] { "foo", "baz", "bar" }, + new int[] { 1, 0, 1 }); + + assertAnalyzesTo(analyzer, "this test", + new String[] { "this", "that", "test", "testing" }, + new int[] { 1, 0, 1, 0 }); + } + + /** parse a syn file with bad syntax */ + @Test(expected=ParseException.class) + public void testInvalidDoubleMap() throws Exception { + String testFile = "a => b => c"; + SolrSynonymsParser parser = new SolrSynonymsParser(true, new MockAnalyzer(random)); + parser.addSynonyms(new StringReader(testFile)); + } + + /** parse a syn file with bad syntax */ + @Test(expected=ParseException.class) + public void testInvalidAnalyzesToNothingOutput() throws Exception { + String testFile = "a => 1"; + SolrSynonymsParser parser = new SolrSynonymsParser(true, new MockAnalyzer(random, MockTokenizer.SIMPLE, false)); + parser.addSynonyms(new StringReader(testFile)); + } + + /** parse a syn file with bad syntax */ + @Test(expected=ParseException.class) + public void testInvalidAnalyzesToNothingInput() throws Exception { + String testFile = "1 => a"; + SolrSynonymsParser parser = new SolrSynonymsParser(true, new MockAnalyzer(random, MockTokenizer.SIMPLE, false)); + parser.addSynonyms(new StringReader(testFile)); + } + + /** parse a syn file with bad syntax */ + @Test(expected=ParseException.class) + public void testInvalidPositionsInput() throws Exception { + String testFile = "testola => the test"; + SolrSynonymsParser parser = new SolrSynonymsParser(true, new EnglishAnalyzer(TEST_VERSION_CURRENT)); + parser.addSynonyms(new StringReader(testFile)); + } + + /** parse a syn file with bad syntax */ + @Test(expected=ParseException.class) + public void testInvalidPositionsOutput() throws Exception { + String testFile = "the test => testola"; + SolrSynonymsParser parser = new SolrSynonymsParser(true, new EnglishAnalyzer(TEST_VERSION_CURRENT)); + parser.addSynonyms(new StringReader(testFile)); + } + + /** parse a syn file with 
some escaped syntax chars */ + public void testEscapedStuff() throws Exception { + String testFile = + "a\\=>a => b\\=>b\n" + + "a\\,a => b\\,b"; + SolrSynonymsParser parser = new SolrSynonymsParser(true, new MockAnalyzer(random, MockTokenizer.KEYWORD, false)); + parser.addSynonyms(new StringReader(testFile)); + final FSTSynonymMap map = parser.build(); + Analyzer analyzer = new ReusableAnalyzerBase() { + @Override + protected TokenStreamComponents createComponents(String fieldName, Reader reader) { + Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.KEYWORD, false); + return new TokenStreamComponents(tokenizer, new FSTSynonymFilter(tokenizer, map, false)); + } + }; + + assertAnalyzesTo(analyzer, "ball", + new String[] { "ball" }, + new int[] { 1 }); + + assertAnalyzesTo(analyzer, "a=>a", + new String[] { "b=>b" }, + new int[] { 1 }); + + assertAnalyzesTo(analyzer, "a,a", + new String[] { "b,b" }, + new int[] { 1 }); + } +} Index: solr/build.xml --- solr/build.xml Thu Jul 07 06:12:14 2011 -0400 +++ solr/build.xml Thu Jul 07 09:06:52 2011 -0400 @@ -469,10 +469,10 @@ - + Index: solr/src/java/org/apache/solr/analysis/FSTSynonymFilterFactory.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ solr/src/java/org/apache/solr/analysis/FSTSynonymFilterFactory.java Thu Jul 07 09:06:52 2011 -0400 @@ -0,0 +1,128 @@ +package org.apache.solr.analysis; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.File; +import java.io.IOException; +import java.io.InputStreamReader; +import java.io.Reader; +import java.nio.charset.Charset; +import java.nio.charset.CharsetDecoder; +import java.nio.charset.CodingErrorAction; +import java.text.ParseException; +import java.util.List; +import java.util.Map; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.core.LowerCaseFilter; +import org.apache.lucene.analysis.core.WhitespaceTokenizer; +import org.apache.lucene.analysis.synonym.FSTSynonymFilter; +import org.apache.lucene.analysis.synonym.FSTSynonymMap; +import org.apache.lucene.analysis.synonym.SolrSynonymsParser; +import org.apache.lucene.analysis.util.ReusableAnalyzerBase; +import org.apache.lucene.util.Version; +import org.apache.solr.common.ResourceLoader; +import org.apache.solr.common.SolrException; +import org.apache.solr.common.util.StrUtils; +import org.apache.solr.util.plugin.ResourceLoaderAware; + +/** + * Factory for {@link FSTSynonymFilter}. + *
+ * <fieldType name="text_synonym" class="solr.TextField" positionIncrementGap="100">
+ *   <analyzer>
+ *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ *     <filter class="solr.FSTSynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="false"
+ *             expand="true" tokenizerFactory="solr.WhitespaceTokenizerFactory"/>
+ *   </analyzer>
+ * </fieldType>
+ * + */ +public class FSTSynonymFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware { + private FSTSynonymMap map; + private boolean ignoreCase; + + @Override + public TokenStream create(TokenStream input) { + return new FSTSynonymFilter(input, map, ignoreCase); + } + + @Override + public void inform(ResourceLoader loader) { + final boolean ignoreCase = getBoolean("ignoreCase", false); + this.ignoreCase = ignoreCase; + + String tf = args.get("tokenizerFactory"); + + final TokenizerFactory factory = tf == null ? null : loadTokenizerFactory(loader, tf, args); + + Analyzer analyzer = new ReusableAnalyzerBase() { + @Override + protected TokenStreamComponents createComponents(String fieldName, Reader reader) { + Tokenizer tokenizer = factory == null ? new WhitespaceTokenizer(Version.LUCENE_31, reader) : factory.create(reader); + TokenStream stream = ignoreCase ? new LowerCaseFilter(Version.LUCENE_31, tokenizer) : tokenizer; + return new TokenStreamComponents(tokenizer, stream); + } + }; + + String format = args.get("format"); + try { + if (format == null || format.equals("solr")) { + map = loadSolrSynonyms(loader, analyzer); + } + } catch (Exception e) { + throw new RuntimeException(e); + } + } + + /** + * Load synonyms from the solr format, "format=solr". + */ + private FSTSynonymMap loadSolrSynonyms(ResourceLoader loader, Analyzer analyzer) throws IOException, ParseException { + final boolean expand = getBoolean("expand", true); + String synonyms = args.get("synonyms"); + if (synonyms == null) + throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Missing required argument 'synonyms'."); + + CharsetDecoder decoder = Charset.forName("UTF-8").newDecoder() + .onMalformedInput(CodingErrorAction.REPORT) + .onUnmappableCharacter(CodingErrorAction.REPORT); + + SolrSynonymsParser parser = new SolrSynonymsParser(expand, analyzer); + File synonymFile = new File(synonyms); + if (synonymFile.exists()) { + decoder.reset(); + parser.addSynonyms(new InputStreamReader(loader.openResource(synonyms), decoder)); + } else { + List<String> files = StrUtils.splitFileNames(synonyms); + for (String file : files) { + decoder.reset(); + parser.addSynonyms(new InputStreamReader(loader.openResource(file), decoder)); + } + } + return parser.build(); + } + + private static TokenizerFactory loadTokenizerFactory(ResourceLoader loader, String cname, Map<String,String> args){ + TokenizerFactory tokFactory = (TokenizerFactory) loader.newInstance(cname); + tokFactory.init(args); + return tokFactory; + } +} Index: solr/src/test/org/apache/solr/analysis/TestFSTSynonymFilterFactory.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ solr/src/test/org/apache/solr/analysis/TestFSTSynonymFilterFactory.java Thu Jul 07 09:06:52 2011 -0400 @@ -0,0 +1,144 @@ +package org.apache.solr.analysis; + +import java.io.Reader; +import java.io.StringReader; +import java.util.Collections; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.core.WhitespaceTokenizer; +import org.apache.lucene.analysis.util.ReusableAnalyzerBase; +import org.apache.solr.core.SolrResourceLoader; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +public class TestFSTSynonymFilterFactory extends BaseTokenTestCase { + /** stupid test that we can parse the solr syn file */ + public void testSolrSynonymsfile() throws Exception { + FSTSynonymFilterFactory fst = new FSTSynonymFilterFactory(); + fst.init(Collections.singletonMap("synonyms", "synonyms.txt")); + fst.inform(new SolrResourceLoader(null, null)); + } + /** run me with no -ea for benchmarking */ + public void testBenchmarkBig() throws Exception { + String testFile = "/home/rmuir/synonyms.txt"; + for (int i = 0; i < 3; i++) { + long ms = System.currentTimeMillis(); + doBenchmarkLoadOld(testFile); + long endMs = System.currentTimeMillis(); + System.out.println("old=" + (endMs - ms)); + ms = System.currentTimeMillis(); + doBenchmarkLoadNew(testFile); + endMs = System.currentTimeMillis(); + System.out.println("new=" + (endMs - ms)); + } + } + + /* + public void testBenchmarkBigRAM() throws Exception { + String testFile = "/home/rmuir/synonyms.txt"; + SynonymFilterFactory factory = doBenchmarkLoadOld(testFile); + final Runtime runtime = Runtime.getRuntime(); + long usedMem1 = runtime.totalMemory() - runtime.freeMemory(); + long usedMem2 = Long.MAX_VALUE; + for(int iter=0;iter<10;iter++) { + runtime.runFinalization(); + runtime.gc(); + Thread.currentThread().yield(); + Thread.sleep(1000); + usedMem2 = usedMem1; + usedMem1 = runtime.totalMemory() - runtime.freeMemory(); + } + System.out.println("done: ram used: " + usedMem1); + System.out.flush(); + } + */ + + private SynonymFilterFactory doBenchmarkLoadOld(String file) throws Exception { + SynonymFilterFactory old = new SynonymFilterFactory(); + old.init(Collections.singletonMap("synonyms", file)); + old.inform(new SolrResourceLoader(null, null)); + return old; + } + + private FSTSynonymFilterFactory doBenchmarkLoadNew(String file) throws Exception { + FSTSynonymFilterFactory fst = new FSTSynonymFilterFactory(); + fst.init(Collections.singletonMap("synonyms", file)); + fst.inform(new SolrResourceLoader(null, null)); + return fst; + } + + public void testBenchmarkDefaultSolrSyns() throws Exception { + final FSTSynonymFilterFactory fst = new FSTSynonymFilterFactory(); + fst.init(Collections.singletonMap("synonyms", "synonyms.txt")); + fst.inform(new SolrResourceLoader(null, null)); + Analyzer analyzer = new ReusableAnalyzerBase() { + @Override + protected TokenStreamComponents createComponents(String fieldName, Reader reader) { + Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader); + return new TokenStreamComponents(tokenizer, fst.create(tokenizer)); + } + }; + + /* + final SynonymFilterFactory old = new SynonymFilterFactory(); + old.init(Collections.singletonMap("synonyms", "synonyms.txt")); + old.inform(new SolrResourceLoader(null, null)); + Analyzer oldAnalyzer = new ReusableAnalyzerBase() { + @Override + protected TokenStreamComponents createComponents(String fieldName, Reader reader) { + Tokenizer tokenizer = new 
WhitespaceTokenizer(TEST_VERSION_CURRENT, reader); + return new TokenStreamComponents(tokenizer, old.create(tokenizer)); + } + }; + */ + + for (int i = 0; i < 3; i++) { + System.out.println("round: " + i); + //doBenchmarkAnalysisSpeed(oldAnalyzer); + doBenchmarkAnalysisSpeed(analyzer); + } + } + + public void doBenchmarkAnalysisSpeed(Analyzer analyzer) throws Exception { + String text = "this is a test of the emergency broadcasting system. this is only a test. Please do not pass go, do not collect $200"; + for (int i = 0; i < 20000; i++) { + TokenStream ts = analyzer.reusableTokenStream("foo", new StringReader(text)); + ts.reset(); + while (ts.incrementToken()) { + ; + } + ts.end(); + ts.close(); + } + + long ms = System.currentTimeMillis(); + for (int i = 0; i < 1000000; i++) { + TokenStream ts = analyzer.reusableTokenStream("foo", new StringReader(text)); + ts.reset(); + while (ts.incrementToken()) { + ; + } + ts.end(); + ts.close(); + } + long endMs = System.currentTimeMillis(); + System.out.println("time: " + (endMs - ms)); + } +}
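
For context, the pieces added above compose outside of Solr roughly as follows. This is a minimal sketch, not part of the patch: WhitespaceAnalyzer, WhitespaceTokenizer and the class name FSTSynonymUsageSketch are illustrative choices only; the SolrSynonymsParser/FSTSynonymMap/FSTSynonymFilter calls mirror the tests above, and, as in FSTSynonymFilterFactory, the map is built once and then reused for every TokenStream.

// Minimal usage sketch (not part of the patch): SolrSynonymsParser builds an
// FSTSynonymMap from Solr-format rules, and FSTSynonymFilter applies that map
// at analysis time.
import java.io.Reader;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.synonym.FSTSynonymFilter;
import org.apache.lucene.analysis.synonym.FSTSynonymMap;
import org.apache.lucene.analysis.synonym.SolrSynonymsParser;
import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
import org.apache.lucene.util.Version;

public class FSTSynonymUsageSketch {

  public static Analyzer buildAnalyzer() throws Exception {
    // Solr-format rules: comma-separated equivalences and "=>" mappings.
    String rules = "i-pod, ipod\nfoo => foo bar";

    // expand=true corresponds to the factory's "expand" attribute: members of an
    // equivalence group map to every member, not just the first one.
    SolrSynonymsParser parser = new SolrSynonymsParser(true, new WhitespaceAnalyzer(Version.LUCENE_31));
    parser.addSynonyms(new StringReader(rules));
    final FSTSynonymMap map = parser.build();   // built once, reused below

    return new ReusableAnalyzerBase() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer tokenizer = new WhitespaceTokenizer(Version.LUCENE_31, reader);
        // ignoreCase=false here; the factory exposes this as its "ignoreCase" attribute.
        return new TokenStreamComponents(tokenizer, new FSTSynonymFilter(tokenizer, map, false));
      }
    };
  }
}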