fst, BytesRefHash words, int maxHorizontalContext, int maxVerticalContext) {
+ this.fst = fst;
+ this.words = words;
+ this.maxHorizontalContext = maxHorizontalContext;
+ this.maxVerticalContext = maxVerticalContext;
+ }
+}
Index: modules/analysis/common/src/java/org/apache/lucene/analysis/synonym/FSTSynonymMapBuilder.java
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ modules/analysis/common/src/java/org/apache/lucene/analysis/synonym/FSTSynonymMapBuilder.java Tue Jul 05 17:45:59 2011 -0400
@@ -0,0 +1,264 @@
+package org.apache.lucene.analysis.synonym;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+import java.util.TreeMap;
+
+import org.apache.lucene.store.ByteArrayDataOutput;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.BytesRefHash;
+import org.apache.lucene.util.CharsRef;
+import org.apache.lucene.util.UnicodeUtil;
+import org.apache.lucene.util.fst.Builder;
+import org.apache.lucene.util.fst.ByteSequenceOutputs;
+import org.apache.lucene.util.fst.FST;
+
+/**
+ * Builds an FSTSynonymMap.
+ *
+ * Call add() until you have added all the mappings, then call build() to get an FSTSynonymMap.
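+ *
+ * <p>A minimal usage sketch (the rule below is hypothetical):
+ * <pre>
+ *   FSTSynonymMapBuilder b = new FSTSynonymMapBuilder();
+ *   // map "wifi" to the two-word phrase "wireless network", keeping the original token
+ *   b.add(new CharsRef("wifi"),
+ *         FSTSynonymMapBuilder.join(new String[] {"wireless", "network"}, new CharsRef(32)),
+ *         true);
+ *   FSTSynonymMap map = b.build();
+ * </pre>
+ *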
+ * @lucene.experimental
+ */
+public class FSTSynonymMapBuilder {
+  private final TreeMap<CharsRef,MapEntry> workingSet = new TreeMap<CharsRef,MapEntry>(CharsRef.getUTF16SortedAsUTF8Comparator());
+ private final BytesRefHash words = new BytesRefHash();
+ private final BytesRef utf8Scratch = new BytesRef(8);
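+  // longest phrase, in words, over all inputs and outputs added so far: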
+ private int maxHorizontalContext;
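+  // most output phrases mapped from any single input: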
+ private int maxVerticalContext;
+ private final boolean dedup;
+
+ public FSTSynonymMapBuilder() {
+ this(true);
+ }
+
+ /** If dedup is true then identical rules (same input,
+ * same output) will be added only once. */
+ public FSTSynonymMapBuilder(boolean dedup) {
+ this.dedup = dedup;
+ }
+
+ private static class MapEntry {
+ boolean includeOrig;
+ // we could sort for better sharing ultimately, but it could confuse people
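+    // ords of this input's output phrases in the words BytesRefHash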
+    ArrayList<Integer> ords = new ArrayList<Integer>();
+ }
+
+  /** Sugar: joins the provided terms with {@link
+   * FSTSynonymMap#WORD_SEPARATOR}; e.g. join(new String[] {"wi", "fi"}, reuse)
+   * yields the single phrase "wi" + WORD_SEPARATOR + "fi".
+   * reuse and its chars must not be null. */
+ public static CharsRef join(String[] words, CharsRef reuse) {
+ int upto = 0;
+ char[] buffer = reuse.chars;
+ for(String word : words) {
+ if (upto > 0) {
+        if (upto >= buffer.length) {
+          reuse.grow(upto+1); // need room for the separator we write at buffer[upto]
+ buffer = reuse.chars;
+ }
+ buffer[upto++] = FSTSynonymMap.WORD_SEPARATOR;
+ }
+
+ final int wordLen = word.length();
+ final int needed = upto + wordLen;
+ if (needed > buffer.length) {
+ reuse.grow(needed);
+ buffer = reuse.chars;
+ }
+
+ word.getChars(0, wordLen, buffer, upto);
+ upto += wordLen;
+    }
+
+    reuse.length = upto;
+    return reuse;
+ }
+
+ private boolean hasHoles(CharsRef chars) {
+ final int end = chars.offset + chars.length;
+    for(int idx=chars.offset+1;idx<end;idx++) {
+      if (chars.chars[idx] == FSTSynonymMap.WORD_SEPARATOR && chars.chars[idx-1] == FSTSynonymMap.WORD_SEPARATOR) {
+        return true;
+      }
+    }
+    if (chars.chars[chars.offset] == FSTSynonymMap.WORD_SEPARATOR) {
+      return true;
+    }
+    if (chars.chars[chars.offset + chars.length - 1] == FSTSynonymMap.WORD_SEPARATOR) {
+      return true;
+    }
+    return false;
+  }
+
+  /**
+   * Add a phrase->phrase synonym mapping.
+ * Phrases are character sequences where words are
+   * separated with character zero (U+0000). Empty words
+   * (two U+0000s in a row) are not allowed in the input nor
+ * the output!
+ *
+ * @param input input phrase
+ * @param numInputWords number of input words in the input phrase
+   * @param output output phrase
+   * @param numOutputWords number of output words in the output phrase
+ * @param includeOrig true if the original should be included
+ */
+ public void add(CharsRef input, int numInputWords, CharsRef output, int numOutputWords, boolean includeOrig) {
+ // first convert to UTF-8
+ if (numInputWords <= 0) {
+ throw new IllegalArgumentException("numInputWords must be > 0 (got " + numInputWords + ")");
+ }
+ if (input.length <= 0) {
+ throw new IllegalArgumentException("input.length must be > 0 (got " + input.length + ")");
+ }
+ if (numOutputWords <= 0) {
+ throw new IllegalArgumentException("numOutputWords must be > 0 (got " + numOutputWords + ")");
+ }
+ if (output.length <= 0) {
+ throw new IllegalArgumentException("output.length must be > 0 (got " + output.length + ")");
+ }
+
+ assert !hasHoles(input): "input has holes: " + input;
+ assert !hasHoles(output): "output has holes: " + output;
+
+ //System.out.println("fmap.add input=" + input + " numInputWords=" + numInputWords + " output=" + output + " numOutputWords=" + numOutputWords);
+ final int hashCode = UnicodeUtil.UTF16toUTF8WithHash(output.chars, output.offset, output.length, utf8Scratch);
+ // lookup in hash
+ int ord = words.add(utf8Scratch, hashCode);
+ if (ord < 0) {
+ // already exists in our hash
+ ord = (-ord)-1;
+ //System.out.println(" output=" + output + " old ord=" + ord);
+ } else {
+ //System.out.println(" output=" + output + " new ord=" + ord);
+ }
+
+ MapEntry e = workingSet.get(input);
+ if (e == null) {
+ e = new MapEntry();
+ workingSet.put(new CharsRef(input), e); // make a copy, since we will keep around in our map
+ }
+
+ e.ords.add(ord);
+ e.includeOrig |= includeOrig;
+ maxVerticalContext = Math.max(maxVerticalContext, e.ords.size());
+ maxHorizontalContext = Math.max(maxHorizontalContext, numInputWords);
+ maxHorizontalContext = Math.max(maxHorizontalContext, numOutputWords);
+ }
+
+ private int countWords(CharsRef chars) {
+ int wordCount = 1;
+ int upto = chars.offset;
+ final int limit = chars.offset + chars.length;
+ while(upto < limit) {
+ final int codePoint = Character.codePointAt(chars.chars, upto, limit);
+ if (codePoint == FSTSynonymMap.WORD_SEPARATOR) {
+ wordCount++;
+ }
+ upto += Character.charCount(codePoint);
+ }
+ return wordCount;
+ }
+
+  /**
+   * Helper for {@link #add(CharsRef, int, CharsRef, int, boolean)} that
+   * counts the words in the input and output phrases for you.
+   *
+   * If your parser already knows the word counts, prefer the other
+   * method and pass them in directly.
+   */
+ public void add(CharsRef input, CharsRef output, boolean includeOrig) {
+ add(input, countWords(input), output, countWords(output), includeOrig);
+ }
+
+ /**
+ * Builds an {@link FSTSynonymMap} and returns it.
+ */
+ public FSTSynonymMap build() throws IOException {
+ ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
+ // TODO: are we using the best sharing options?
+    Builder<BytesRef> builder = new Builder<BytesRef>(FST.INPUT_TYPE.BYTE4, 0, 0, true, outputs);
+
+ BytesRef scratch = new BytesRef(64);
+ ByteArrayDataOutput scratchOutput = new ByteArrayDataOutput();
+
+    final Set<Integer> dedupSet;
+
+ if (dedup) {
+      dedupSet = new HashSet<Integer>();
+ } else {
+ dedupSet = null;
+ }
+
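+    // a vInt occupies at most 5 bytes; spare buffers the header during the rotate below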
+ final byte[] spare = new byte[5];
+
+ //System.out.println("fmap.build");
+    for (Map.Entry<CharsRef,MapEntry> e : workingSet.entrySet()) {
+ CharsRef input = e.getKey();
+ MapEntry output = e.getValue();
+
+ int numEntries = output.ords.size();
+
+ // output size, assume the worst case
+      int estimatedSize = 5 + numEntries * 5; // worst case: 5-byte vInt header + up to 5 bytes per ord
+
+ scratch.grow(estimatedSize);
+ scratchOutput.reset(scratch.bytes, scratch.offset, scratch.bytes.length);
+ assert scratch.offset == 0;
+
+ // now write our output data:
+ int count = 0;
+ for (int i = 0; i < numEntries; i++) {
+ if (dedupSet != null) {
+ // box once
+ final Integer ent = output.ords.get(i);
+ if (dedupSet.contains(ent)) {
+ continue;
+ }
+ dedupSet.add(ent);
+ }
+ scratchOutput.writeVInt(output.ords.get(i));
+ count++;
+ }
+
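+      // Header layout: one vInt holding (count << 1) | (includeOrig ? 0 : 1);
+      // e.g. 2 ords with includeOrig=true encodes (2<<1)|0 = 4. It is written
+      // after the ords (count is only known then) and rotated to the front
+      // below so a decoder can read it first.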
+ final int pos = scratchOutput.getPosition();
+ scratchOutput.writeVInt(count << 1 | (output.includeOrig ? 0 : 1));
+ final int pos2 = scratchOutput.getPosition();
+ final int vIntLen = pos2-pos;
+
+ // Move the count + includeOrig to the front of the byte[]:
+ System.arraycopy(scratch.bytes, pos, spare, 0, vIntLen);
+ System.arraycopy(scratch.bytes, 0, scratch.bytes, vIntLen, pos);
+ System.arraycopy(spare, 0, scratch.bytes, 0, vIntLen);
+
+ if (dedupSet != null) {
+ dedupSet.clear();
+ }
+
+ scratch.length = scratchOutput.getPosition() - scratch.offset;
+ //System.out.println(" add input=" + input + " output=" + scratch + " offset=" + scratch.offset);
+ builder.add(input, new BytesRef(scratch));
+ }
+
+    FST<BytesRef> fst = builder.finish();
+ return new FSTSynonymMap(fst, words, maxHorizontalContext, maxVerticalContext);
+ }
+}
Index: modules/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestFSTSynonymMapFilter.java
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ modules/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestFSTSynonymMapFilter.java Tue Jul 05 17:45:59 2011 -0400
@@ -0,0 +1,388 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.synonym;
+
+import java.io.Reader;
+import java.io.StringReader;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.tokenattributes.*;
+import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
+import org.apache.lucene.util.CharsRef;
+import org.apache.lucene.util._TestUtil;
+
+public class TestFSTSynonymMapFilter extends BaseTokenStreamTestCase {
+
+ private FSTSynonymMapBuilder b;
+ private Tokenizer tokensIn;
+ private FSTSynonymFilter tokensOut;
+ private CharTermAttribute termAtt;
+ private PositionIncrementAttribute posIncrAtt;
+
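+  // test sugar: rules are written with spaces between words; convert them to
+  // the U+0000 word separator the builder expects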
+ private void add(String input, String output, boolean keepOrig) {
+ b.add(new CharsRef(input.replaceAll(" +", "\u0000")),
+ new CharsRef(output.replaceAll(" +", "\u0000")),
+ keepOrig);
+ }
+
+ private void assertEquals(CharTermAttribute term, String expected) {
+ assertEquals(expected.length(), term.length());
+ final char[] buffer = term.buffer();
+    for(int chIDX=0;chIDX<expected.length();chIDX++) {
+      assertEquals(expected.charAt(chIDX), buffer[chIDX]);
+    }
+  }
+
+  private void verify(String input, String output) throws Exception {
+    if (VERBOSE) {
+      System.out.println("TEST: verify input=" + input + " expectedOutput=" + output);
+    }
+
+    tokensIn.reset(new StringReader(input));
+    tokensOut.reset();
+    final String[] expected = output.split(" ");
+    int expectedUpto = 0;
+    while(tokensOut.incrementToken()) {
+      if (VERBOSE) {
+        System.out.println("  incr token=" + termAtt.toString() + " posIncr=" + posIncrAtt.getPositionIncrement());
+      }
+      assertTrue(expectedUpto < expected.length);
+      final String[] expectedAtPos = expected[expectedUpto++].split("/");
+      for(int atPos=0;atPos<expectedAtPos.length;atPos++) {
+        if (atPos > 0) {
+ assertTrue(tokensOut.incrementToken());
+ if (VERBOSE) {
+ System.out.println(" incr token=" + termAtt.toString() + " posIncr=" + posIncrAtt.getPositionIncrement());
+ }
+ }
+ assertEquals(termAtt, expectedAtPos[atPos]);
+ assertEquals(atPos == 0 ? 1 : 0,
+ posIncrAtt.getPositionIncrement());
+ }
+ }
+ tokensOut.end();
+ tokensOut.close();
+ if (VERBOSE) {
+ System.out.println(" incr: END");
+ }
+ assertEquals(expectedUpto, expected.length);
+ }
+
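+  // In the expected strings below a space advances the position and '/'
+  // stacks tokens at the same position, e.g. "a/bar b/fee c" means positions
+  // 0..2 hold {a,bar}, {b,fee} and {c}.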
+ public void testBasic() throws Exception {
+ b = new FSTSynonymMapBuilder();
+ add("a", "foo", true);
+ add("a b", "bar fee", true);
+ add("b c", "dog collar", true);
+ add("c d", "dog harness holder extras", true);
+ add("m c e", "dog barks loudly", false);
+
+ add("e f", "foo bar", false);
+ add("e f", "baz bee", false);
+
+ add("z", "boo", false);
+ add("y", "bee", true);
+
+ tokensIn = new MockTokenizer(new StringReader("a"),
+ MockTokenizer.WHITESPACE,
+ true);
+ tokensIn.reset();
+ assertTrue(tokensIn.incrementToken());
+ assertFalse(tokensIn.incrementToken());
+ tokensIn.end();
+ tokensIn.close();
+
+ tokensOut = new FSTSynonymFilter(tokensIn,
+ b.build(),
+ true);
+ termAtt = tokensOut.addAttribute(CharTermAttribute.class);
+ posIncrAtt = tokensOut.addAttribute(PositionIncrementAttribute.class);
+
+ verify("a b c", "a/bar b/fee c");
+
+ // syn output extends beyond input tokens
+ verify("x a b c d", "x a/bar b/fee c/dog d/harness holder extras");
+
+ verify("a b a", "a/bar b/fee a/foo");
+
+ // outputs that add to one another:
+ verify("c d c d", "c/dog d/harness c/holder/dog d/extras/harness holder extras");
+
+ // two outputs for same input
+ verify("e f", "foo/baz bar/bee");
+
+ // mixed keepOrig true/false:
+ verify("a m c e x", "a/foo dog barks loudly x");
+ verify("c d m c e x", "c/dog d/harness m/holder/dog c/extras/barks loudly x");
+ assertTrue(tokensOut.getCaptureCount() > 0);
+
+ // no captureStates when no syns matched
+ verify("p q r s t", "p q r s t");
+ assertEquals(0, tokensOut.getCaptureCount());
+
+ // no captureStates when only single-input syns, w/ no
+ // lookahead needed, matched
+ verify("p q z y t", "p q boo y/bee t");
+ assertEquals(0, tokensOut.getCaptureCount());
+ }
+
+ private String getRandomString(char start, int alphabetSize, int length) {
+ assert alphabetSize <= 26;
+ char[] s = new char[2*length];
+    for(int charIDX=0;charIDX<length;charIDX++) {
+      s[2*charIDX] = (char) (start + random.nextInt(alphabetSize));
+      s[2*charIDX+1] = ' ';
+    }
+    return new String(s);
+  }
+
+  private static class OneSyn {
+    String in;
+    List<String> out;
+ boolean keepOrig;
+ }
+
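+  // Brute-force reference matcher: greedily pick the longest rule starting at
+  // each position, merge rule outputs into a position-parallel array, then
+  // render the result in the same "a/b c" syntax that verify() expects.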
+  public String slowSynMatcher(String doc, List<OneSyn> syns, int maxOutputLength) {
+ assertTrue(doc.length() % 2 == 0);
+ final int numInputs = doc.length()/2;
+ boolean[] keepOrigs = new boolean[numInputs];
+ Arrays.fill(keepOrigs, false);
+ String[] outputs = new String[numInputs + maxOutputLength];
+ OneSyn[] matches = new OneSyn[numInputs];
+ for(OneSyn syn : syns) {
+ int idx = -1;
+ while(true) {
+ idx = doc.indexOf(syn.in, 1+idx);
+ if (idx == -1) {
+ break;
+ }
+ assertTrue(idx % 2 == 0);
+ final int matchIDX = idx/2;
+ assertTrue(syn.in.length() % 2 == 1);
+ if (matches[matchIDX] == null) {
+ matches[matchIDX] = syn;
+ } else if (syn.in.length() > matches[matchIDX].in.length()) {
+ // Greedy conflict resolution: longer match wins:
+ matches[matchIDX] = syn;
+ } else {
+ assertTrue(syn.in.length() < matches[matchIDX].in.length());
+ }
+ }
+ }
+
+ // Greedy conflict resolution: if syn matches a range of inputs,
+ // it prevents other syns from matching that range
+    for(int inputIDX=0;inputIDX<numInputs;inputIDX++) {
+      final OneSyn match = matches[inputIDX];
+      if (match != null) {
+        final int synInLength = (1+match.in.length())/2;
+        for(int nextInputIDX=inputIDX+1;nextInputIDX<Math.min(numInputs, inputIDX+synInLength);nextInputIDX++) {
+          matches[nextInputIDX] = null;
+        }
+      }
+    }
+
+    // Fill overlapping outputs:
+    for(int inputIDX=0;inputIDX<numInputs;inputIDX++) {
+      final OneSyn syn = matches[inputIDX];
+      if (syn == null) {
+        continue;
+      }
+      for(int idx=0;idx<(1+syn.in.length())/2;idx++) {
+        keepOrigs[inputIDX+idx] |= syn.keepOrig;
+      }
+      for(String synOut : syn.out) {
+        final String[] synOutputs = synOut.split(" ");
+        assertEquals(synOutputs.length, (1+synOut.length())/2);
+        final int matchEnd = inputIDX + synOutputs.length;
+        int synUpto = 0;
+        for(int matchIDX=inputIDX;matchIDX<matchEnd;matchIDX++) {
+          if (outputs[matchIDX] == null) {
+            outputs[matchIDX] = synOutputs[synUpto++];
+          } else {
+            outputs[matchIDX] = outputs[matchIDX] + "/" + synOutputs[synUpto++];
+          }
+        }
+      }
+    }
+
+    StringBuilder sb = new StringBuilder();
+    String[] inputTokens = doc.split(" ");
+    final int limit = inputTokens.length + maxOutputLength;
+    for(int inputIDX=0;inputIDX<limit;inputIDX++) {
+      boolean posHasOutput = false;
+      if (inputIDX >= numInputs && outputs[inputIDX] == null) {
+ break;
+ }
+ if (inputIDX < numInputs && (outputs[inputIDX] == null || keepOrigs[inputIDX])) {
+ sb.append(inputTokens[inputIDX]);
+ posHasOutput = true;
+ }
+
+ if (outputs[inputIDX] != null) {
+ if (posHasOutput) {
+ sb.append('/');
+ }
+ sb.append(outputs[inputIDX]);
+ }
+ if (inputIDX < limit-1) {
+ sb.append(' ');
+ }
+ }
+
+ return sb.toString();
+ }
+
+ public void testRandom() throws Exception {
+
+ final int alphabetSize = _TestUtil.nextInt(random, 2, 7);
+
+ final int docLen = atLeast(3000);
+ //final int docLen = 50;
+
+ final String document = getRandomString('a', alphabetSize, docLen);
+
+ if (VERBOSE) {
+ System.out.println("TEST: doc=" + document);
+ }
+
+ final int numSyn = atLeast(5);
+ //final int numSyn = 2;
+
+    final Map<String,OneSyn> synMap = new HashMap<String,OneSyn>();
+    final List<OneSyn> syns = new ArrayList<OneSyn>();
+ final boolean dedup = random.nextBoolean();
+ if (VERBOSE) {
+ System.out.println(" dedup=" + dedup);
+ }
+ b = new FSTSynonymMapBuilder(dedup);
+    for(int synIDX=0;synIDX<numSyn;synIDX++) {
+      final String synIn = getRandomString('a', alphabetSize, _TestUtil.nextInt(random, 1, 5)).trim();
+      OneSyn s = synMap.get(synIn);
+      if (s == null) {
+        s = new OneSyn();
+        s.in = synIn;
+        syns.add(s);
+        s.out = new ArrayList<String>();
+ synMap.put(synIn, s);
+ s.keepOrig = random.nextBoolean();
+ }
+ final String synOut = getRandomString('0', 10, _TestUtil.nextInt(random, 1, 5)).trim();
+ s.out.add(synOut);
+ add(synIn, synOut, s.keepOrig);
+ if (VERBOSE) {
+ System.out.println(" syns[" + synIDX + "] = " + s.in + " -> " + s.out + " keepOrig=" + s.keepOrig);
+ }
+ }
+
+ tokensIn = new MockTokenizer(new StringReader("a"),
+ MockTokenizer.WHITESPACE,
+ true);
+ tokensIn.reset();
+ assertTrue(tokensIn.incrementToken());
+ assertFalse(tokensIn.incrementToken());
+ tokensIn.end();
+ tokensIn.close();
+
+ tokensOut = new FSTSynonymFilter(tokensIn,
+ b.build(),
+ true);
+ termAtt = tokensOut.addAttribute(CharTermAttribute.class);
+ posIncrAtt = tokensOut.addAttribute(PositionIncrementAttribute.class);
+
+ if (dedup) {
+ pruneDups(syns);
+ }
+
+ final String expected = slowSynMatcher(document, syns, 5);
+
+ if (VERBOSE) {
+ System.out.println("TEST: expected=" + expected);
+ }
+
+ verify(document, expected);
+ }
+
+ private void pruneDups(List syns) {
+    Set<String> seen = new HashSet<String>();
+ for(OneSyn syn : syns) {
+ int idx = 0;
+ while(idx < syn.out.size()) {
+ String out = syn.out.get(idx);
+ if (!seen.contains(out)) {
+ seen.add(out);
+ idx++;
+ } else {
+ syn.out.remove(idx);
+ }
+ }
+ seen.clear();
+ }
+ }
+
+ private String randomNonEmptyString() {
+ while(true) {
+ final String s = _TestUtil.randomUnicodeString(random).trim();
+ //final String s = _TestUtil.randomSimpleString(random).trim();
+ if (s.length() != 0 && s.indexOf('\u0000') == -1) {
+ return s;
+ }
+ }
+ }
+
+  /** simple random test, doesn't verify correctness.
+   *  does verify it doesn't throw exceptions, and that the stream doesn't misbehave
+   */
+ public void testRandom2() throws Exception {
+ final int numIters = atLeast(10);
+ for (int i = 0; i < numIters; i++) {
+ b = new FSTSynonymMapBuilder(random.nextBoolean());
+ final int numEntries = atLeast(10);
+ for (int j = 0; j < numEntries; j++) {
+ // nocommit: better random strings here (e.g. lots of spaces and ascii?)
+ add(randomNonEmptyString(), randomNonEmptyString(), random.nextBoolean());
+ }
+ final FSTSynonymMap map = b.build();
+ final boolean ignoreCase = random.nextBoolean();
+
+ final Analyzer analyzer = new ReusableAnalyzerBase() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+ Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.SIMPLE, true);
+ return new TokenStreamComponents(tokenizer, new FSTSynonymFilter(tokenizer, map, ignoreCase));
+ }
+ };
+
+ checkRandomData(random, analyzer, 1000*RANDOM_MULTIPLIER);
+ //checkRandomData(random, analyzer, 10*RANDOM_MULTIPLIER);
+ }
+ }
+}