Index: lucene/contrib/CHANGES.txt
--- lucene/contrib/CHANGES.txt Tue Jul 05 14:00:43 2011 -0400
+++ lucene/contrib/CHANGES.txt Tue Jul 05 15:46:39 2011 -0400
@@ -78,6 +78,10 @@
documents must be indexed as a document block, using
IndexWriter.add/UpdateDocuments (Mark Harwood, Mike McCandless)
+ * LUCENE-3233: FSTSynonymFilter for applying multi-word synonyms
+ during indexing, using far less RAM than the current
+ SynonymFilter. (Robert Muir, Mike McCandless)
+
API Changes
Bug Fixes
Index: lucene/src/java/org/apache/lucene/store/ByteArrayDataOutput.java
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ lucene/src/java/org/apache/lucene/store/ByteArrayDataOutput.java Tue Jul 05 15:46:39 2011 -0400
@@ -0,0 +1,52 @@
+package org.apache.lucene.store;
+
+import org.apache.lucene.util.BytesRef;
+
+/**
+ * @lucene.experimental
+ */
+public class ByteArrayDataOutput extends DataOutput {
+  private byte[] bytes;
+
+  private int pos;
+  private int limit;
+
+  public ByteArrayDataOutput(byte[] bytes) {
+    reset(bytes);
+  }
+
+  public ByteArrayDataOutput(byte[] bytes, int offset, int len) {
+    reset(bytes, offset, len);
+  }
+
+  public ByteArrayDataOutput() {
+    reset(BytesRef.EMPTY_BYTES);
+  }
+
+  public void reset(byte[] bytes) {
+    reset(bytes, 0, bytes.length);
+  }
+
+  public void reset(byte[] bytes, int offset, int len) {
+    this.bytes = bytes;
+    pos = offset;
+    limit = offset + len;
+  }
+
+  public int getPosition() {
+    return pos;
+  }
+
+  @Override
+  public void writeByte(byte b) {
+    assert pos < limit;
+    bytes[pos++] = b;
+  }
+
+  @Override
+  public void writeBytes(byte[] b, int offset, int length) {
+    assert pos + length <= limit;
+    System.arraycopy(b, offset, bytes, pos, length);
+    pos += length;
+  }
+}
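
A minimal usage sketch of the new class (not part of the patch): ByteArrayDataOutput
writes into a caller-supplied, fixed-size array and guards only with assertions rather
than bounds checks, so the caller must size the buffer up front. writeVInt is the
variable-length int encoding inherited from DataOutput.

    byte[] scratch = new byte[16];
    ByteArrayDataOutput out = new ByteArrayDataOutput(scratch);
    out.writeVInt(1234);       // vInt encoding inherited from DataOutput
    out.writeByte((byte) 7);
    int numBytes = out.getPosition();
    // scratch[0..numBytes) now holds the written bytes; with -ea enabled,
    // writing past the limit trips the asserts in writeByte/writeBytes
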
Index: lucene/src/java/org/apache/lucene/util/CharsRef.java
--- lucene/src/java/org/apache/lucene/util/CharsRef.java Tue Jul 05 14:00:43 2011 -0400
+++ lucene/src/java/org/apache/lucene/util/CharsRef.java Tue Jul 05 15:46:39 2011 -0400
@@ -1,5 +1,7 @@
package org.apache.lucene.util;
+import java.util.Comparator;
+
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@@ -167,7 +169,11 @@
* the {@link CharsRef} to copy
*/
public void copy(CharsRef other) {
-    chars = ArrayUtil.grow(chars, other.length);
+    if (chars == null) {
+      chars = new char[other.length];
+    } else {
+      chars = ArrayUtil.grow(chars, other.length);
+    }
System.arraycopy(other.chars, other.offset, chars, 0, other.length);
length = other.length;
offset = 0;
@@ -213,4 +219,56 @@
public CharSequence subSequence(int start, int end) {
return new CharsRef(chars, offset + start, offset + end - 1);
}
+
+  private final static Comparator

+ * Note that with the current implementation, parsing is
+ * greedy, so whenever multiple parses would apply, the rule
+ * starting the earliest and parsing the most tokens wins.
+ * For example, if you have these rules:
+ *
+ *
+ * a -> x
+ * a b -> y
+ * b c d -> z
+ *
+ *
+ * Then input a b c d e parses to y b c
+ * d, i.e. the 2nd rule "wins" because it started
+ * earliest and matched more input tokens than the other
+ * rules starting at that point.
+ *
+ * A future improvement to this filter could allow
+ * non-greedy parsing, such that the 3rd rule would win, and
+ * also separately allow multiple parses, such that all 3
+ * rules would match, perhaps even on a rule by rule
+ * basis.
+ *
+ * NOTE: when a match occurs, the output tokens
+ * associated with the matching rule are "stacked" on top of
+ * the input stream (if the rule had
+ * keepOrig=true) and also on top of another
+ * matched rule's output tokens. This is not a correct
+ * solution, as really the output should be an arbitrary
+ * graph/lattice. For example, with the above match, you
+ * would expect an exact PhraseQuery "y b
+ * c" to match the parsed tokens, but it will fail to
+ * do so. This limitation is necessary because Lucene's
+ * TokenStream (and index) cannot yet represent an arbitrary
+ * graph.
+ *
+ * NOTE: If multiple incoming tokens arrive on the
+ * same position, only the first token at that position is
+ * used for parsing. Subsequent tokens simply pass through
+ * and are not parsed. A future improvement would be to
+ * allow these tokens to also be matched.
+ */
+
+// TODO: maybe we should resolve token -> wordID then run
+// FST on wordIDs, for better perf?
+
+// TODO: a more efficient approach would be Aho/Corasick's
+// algorithm
+// http://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_string_matching_algorithm
+// It improves over the current approach here
+// because it does not fully re-start matching at every
+// token. For example, if one pattern is "a b c x"
+// and another is "b c d" and the input is "a b c d", on
+// trying to parse "a b c x" but failing when you got to x,
+// rather than starting over again you really should
+// immediately recognize that "b c d" matches at the next
+// input. I suspect this won't matter that much in
+// practice, but it's possible on some set of synonyms it
+// will. We'd have to modify Aho/Corasick to enforce our
+// conflict resolving (e.g. greedy matching) because that algo
+// finds all matches.
+
+public final class FSTSynonymFilter extends TokenFilter {
+  private final FSTSynonymMap synonyms;
+
+  private final boolean ignoreCase;
+  private final int rollBufferSize;
+
+  private int captureCount;
+
+  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+  private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
+
+  // How many future input tokens have already been matched
+  // to a synonym; because the matching is "greedy" we don't
+  // try to do any more matching for such tokens:
+  private int inputSkipCount;
+
+  // Hold all buffered (read ahead) stacked input tokens for
+  // a future position. When multiple tokens are at the
+  // same position, we only store (and match against) the
+  // term for the first token at the position, but capture
+  // state for (and enumerate) all other tokens at this
+  // position:
+  private static class PendingInput {
+    final CharsRef term = new CharsRef();
+    AttributeSource.State state;
+    boolean keepOrig;
+    boolean consumed = true;
+
+    public void reset() {
+      state = null;
+      consumed = true;
+      keepOrig = false;
+    }
+  };
+
+  // Rolling buffer, holding pending input tokens we had to
+  // clone because we needed to look ahead, indexed by
+  // position:
+  private final PendingInput[] futureInputs;
+
+  // Holds pending output synonyms for one future position:
+  private static class PendingOutputs {
+    CharsRef[] outputs;
+    int upto;
+    int count;
+    int posIncr = 1;
+
+    public PendingOutputs() {
+      outputs = new CharsRef[1];
+    }
+
+    public void reset() {
+      upto = count = 0;
+      posIncr = 1;
+    }
+
+    public CharsRef pullNext() {
+      assert upto < count;
+      final CharsRef result = outputs[upto++];
+      posIncr = 0;
+      if (upto == count) {
+        reset();
+      }
+      return result;
+    }
+
+    public void add(char[] output, int offset, int len) {
+      if (count == outputs.length) {
+        final CharsRef[] next = new CharsRef[ArrayUtil.oversize(1+count, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
+        System.arraycopy(outputs, 0, next, 0, count);
+        outputs = next;
+      }
+      if (outputs[count] == null) {
+        outputs[count] = new CharsRef();
+      }
+      outputs[count].copy(output, offset, len);
+      count++;
+    }
+  };
+
+  private final ByteArrayDataInput bytesReader = new ByteArrayDataInput();
+
+  // Rolling buffer, holding stack of pending synonym
+  // outputs, indexed by position:
+  private final PendingOutputs[] futureOutputs;
+
+  // Where (in rolling buffers) to write next input saved state:
+  private int nextWrite;
+
+  // Where (in rolling buffers) to read next input saved state:
+  private int nextRead;
+
+  // True once we've read last token
+  private boolean finished;
+
+  private final FST.Arc
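
A sketch of how the greedy matching described in the javadoc plays out (not part of
the patch; the constructor signature is assumed from the synonyms/ignoreCase fields
above, since the actual constructor is cut off in this fragment):

    TokenStream ts = new WhitespaceTokenizer(Version.LUCENE_40,
                                             new StringReader("a b c d e"));
    ts = new FSTSynonymFilter(ts, synonyms, true);  // assumed ctor: (input, map, ignoreCase)
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncr = ts.addAttribute(PositionIncrementAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      // with rules a->x, a b->y, b c d->z and keepOrig=false this prints
      // y b c d e, each with position increment 1: the "a b"->y rule wins
      // because it starts earliest and matches the most tokens from there
      System.out.println(term + " +" + posIncr.getPositionIncrement());
    }
    ts.end();
    ts.close();
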
+/**
+ * Call add() until you have added all the mappings, then call build() to get an FSTSynonymMap
+ * @lucene.experimental
+ */
+public class FSTSynonymMapBuilder {
+  private final TreeMap

+   * Chances are your parser is/can likely count this itself so it should just
+   * use the other method if so.
+   */
+  public void add(CharsRef input, CharsRef output, boolean includeOrig) {
+    add(input, countWords(input), output, countWords(output), includeOrig);
+  }
+
+  /**
+   * Builds an {@link FSTSynonymMap} and returns it.
+   */
+  public FSTSynonymMap build() throws IOException {
+    ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
+    // TODO: are we using the best sharing options?
+    Builder
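
A usage sketch for the builder API above (not part of the patch): the no-arg
constructor is assumed, and the '\u0000' word separator inside multi-word phrases
is a guess implied by the countWords() call, not confirmed by the visible fragment.

    FSTSynonymMapBuilder builder = new FSTSynonymMapBuilder();  // ctor assumed
    // "a b" -> "y", also keeping the original tokens (includeOrig=true):
    builder.add(new CharsRef("a\u0000b"), new CharsRef("y"), true);
    // "a" -> "x", replacing the original token:
    builder.add(new CharsRef("a"), new CharsRef("x"), false);
    FSTSynonymMap synonyms = builder.build();  // freezes all rules into an FST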