+  public FSTSynonymMap(FST<BytesRef> fst, BytesRefHash words, int maxHorizontalContext, int maxVerticalContext) {
+    this.fst = fst;
+    this.words = words;
+    this.maxHorizontalContext = maxHorizontalContext;
+    this.maxVerticalContext = maxVerticalContext;
+  }
+}
Index: modules/analysis/common/src/java/org/apache/lucene/analysis/synonym/FSTSynonymMapBuilder.java
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ modules/analysis/common/src/java/org/apache/lucene/analysis/synonym/FSTSynonymMapBuilder.java Thu Jul 07 09:06:52 2011 -0400
@@ -0,0 +1,267 @@
+package org.apache.lucene.analysis.synonym;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.lucene.store.ByteArrayDataOutput;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.BytesRefHash;
+import org.apache.lucene.util.CharsRef;
+import org.apache.lucene.util.UnicodeUtil;
+import org.apache.lucene.util.fst.Builder;
+import org.apache.lucene.util.fst.ByteSequenceOutputs;
+import org.apache.lucene.util.fst.FST;
+
+/**
+ * Builds an FSTSynonymMap.
+ *
+ * Call add() until you have added all the mappings, then call build() to get an FSTSynonymMap.
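+ *
+ * For example, a minimal usage sketch (the rule itself is made up):
+ *
+ *   FSTSynonymMapBuilder b = new FSTSynonymMapBuilder();
+ *   b.add(new CharsRef("wifi"), new CharsRef("wireless"), true);
+ *   FSTSynonymMap map = b.build();
+ *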
+ * @lucene.experimental
+ */
+public class FSTSynonymMapBuilder {
+  private final HashMap<CharsRef,MapEntry> workingSet = new HashMap<CharsRef,MapEntry>();
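+  // interns each output phrase as UTF-8; the ords written into the FST
+  // index into this hash: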
+ private final BytesRefHash words = new BytesRefHash();
+ private final BytesRef utf8Scratch = new BytesRef(8);
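+  // longest phrase, in words, on the input or output side of any rule: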
+ private int maxHorizontalContext;
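+  // most outputs mapped from any single input: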
+ private int maxVerticalContext;
+ private final boolean dedup;
+
+ public FSTSynonymMapBuilder() {
+ this(true);
+ }
+
+ /** If dedup is true then identical rules (same input,
+ * same output) will be added only once. */
+ public FSTSynonymMapBuilder(boolean dedup) {
+ this.dedup = dedup;
+ }
+
+ private static class MapEntry {
+ boolean includeOrig;
+ // we could sort for better sharing ultimately, but it could confuse people
+    ArrayList<Integer> ords = new ArrayList<Integer>();
+ }
+
+ /** Sugar: just joins the provided terms with {@link
+ * FSTSynonymMap#WORD_SEPARATOR}. reuse and its chars
+ * must not be null. */
+ public static CharsRef join(String[] words, CharsRef reuse) {
+ int upto = 0;
+ char[] buffer = reuse.chars;
+ for(String word : words) {
+ if (upto > 0) {
+ if (upto >= buffer.length) {
+ reuse.grow(upto);
+ buffer = reuse.chars;
+ }
+ buffer[upto++] = FSTSynonymMap.WORD_SEPARATOR;
+ }
+
+ final int wordLen = word.length();
+ final int needed = upto + wordLen;
+ if (needed > buffer.length) {
+ reuse.grow(needed);
+ buffer = reuse.chars;
+ }
+
+ word.getChars(0, wordLen, buffer, upto);
+ upto += wordLen;
+ }
+
+ return reuse;
+ }
+
+  private boolean hasHoles(CharsRef chars) {
+    final int end = chars.offset + chars.length;
+    for(int idx=chars.offset+1;idx<end;idx++) {
+      if (chars.chars[idx] == FSTSynonymMap.WORD_SEPARATOR && chars.chars[idx-1] == FSTSynonymMap.WORD_SEPARATOR) {
+        return true;
+      }
+    }
+    if (chars.chars[chars.offset] == FSTSynonymMap.WORD_SEPARATOR) {
+      return true;
+    }
+    if (chars.chars[chars.offset + chars.length - 1] == FSTSynonymMap.WORD_SEPARATOR) {
+      return true;
+    }
+    return false;
+  }
+
+  /**
+   * Add a phrase->phrase synonym mapping.
+   * Phrases are character sequences where words are
+   * separated with character zero (U+0000). Empty words
+   * (two U+0000s in a row) are not allowed in the input nor
+   * the output!
+   *
+   * @param input input phrase
+   * @param numInputWords number of input words in the input phrase
+   * @param output output phrase
+   * @param numOutputWords number of output words in the output phrase
+   * @param includeOrig true if the original should be included
+   */
+ public void add(CharsRef input, int numInputWords, CharsRef output, int numOutputWords, boolean includeOrig) {
+ // first convert to UTF-8
+ if (numInputWords <= 0) {
+ throw new IllegalArgumentException("numInputWords must be > 0 (got " + numInputWords + ")");
+ }
+ if (input.length <= 0) {
+ throw new IllegalArgumentException("input.length must be > 0 (got " + input.length + ")");
+ }
+ if (numOutputWords <= 0) {
+ throw new IllegalArgumentException("numOutputWords must be > 0 (got " + numOutputWords + ")");
+ }
+ if (output.length <= 0) {
+ throw new IllegalArgumentException("output.length must be > 0 (got " + output.length + ")");
+ }
+
+ assert !hasHoles(input): "input has holes: " + input;
+ assert !hasHoles(output): "output has holes: " + output;
+
+ //System.out.println("fmap.add input=" + input + " numInputWords=" + numInputWords + " output=" + output + " numOutputWords=" + numOutputWords);
+ final int hashCode = UnicodeUtil.UTF16toUTF8WithHash(output.chars, output.offset, output.length, utf8Scratch);
+ // lookup in hash
+ int ord = words.add(utf8Scratch, hashCode);
+ if (ord < 0) {
+ // already exists in our hash
+ ord = (-ord)-1;
+ //System.out.println(" output=" + output + " old ord=" + ord);
+ } else {
+ //System.out.println(" output=" + output + " new ord=" + ord);
+ }
+
+ MapEntry e = workingSet.get(input);
+ if (e == null) {
+ e = new MapEntry();
+ workingSet.put(new CharsRef(input), e); // make a copy, since we will keep around in our map
+ }
+
+ e.ords.add(ord);
+ e.includeOrig |= includeOrig;
+ maxVerticalContext = Math.max(maxVerticalContext, e.ords.size());
+ maxHorizontalContext = Math.max(maxHorizontalContext, numInputWords);
+ maxHorizontalContext = Math.max(maxHorizontalContext, numOutputWords);
+ }
+
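+  /** Returns the number of words in the phrase: one more than the
+   *  number of {@link FSTSynonymMap#WORD_SEPARATOR} occurrences. */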
+ private int countWords(CharsRef chars) {
+ int wordCount = 1;
+ int upto = chars.offset;
+ final int limit = chars.offset + chars.length;
+ while(upto < limit) {
+ final int codePoint = Character.codePointAt(chars.chars, upto, limit);
+ if (codePoint == FSTSynonymMap.WORD_SEPARATOR) {
+ wordCount++;
+ }
+ upto += Character.charCount(codePoint);
+ }
+ return wordCount;
+ }
+
+  /**
+   * Helper for {@link #add(CharsRef, int, CharsRef, int, boolean)} that
+   * counts the words in the input and output phrases for you.
+   *
+   * If your parser already knows the word counts, prefer calling the
+   * other method directly and skip the extra counting pass.
+   */
+ public void add(CharsRef input, CharsRef output, boolean includeOrig) {
+ add(input, countWords(input), output, countWords(output), includeOrig);
+ }
+
+ /**
+ * Builds an {@link FSTSynonymMap} and returns it.
+ */
+ public FSTSynonymMap build() throws IOException {
+ ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
+ // TODO: are we using the best sharing options?
+    Builder<BytesRef> builder = new Builder<BytesRef>(FST.INPUT_TYPE.BYTE4, 0, 0, true, outputs);
+
+ BytesRef scratch = new BytesRef(64);
+ ByteArrayDataOutput scratchOutput = new ByteArrayDataOutput();
+
+    final Set<Integer> dedupSet;
+
+ if (dedup) {
+      dedupSet = new HashSet<Integer>();
+ } else {
+ dedupSet = null;
+ }
+
+ final byte[] spare = new byte[5];
+
+    Set<CharsRef> keys = workingSet.keySet();
+ CharsRef sortedKeys[] = keys.toArray(new CharsRef[keys.size()]);
+ Arrays.sort(sortedKeys, CharsRef.getUTF16SortedAsUTF8Comparator());
+
+ //System.out.println("fmap.build");
+ for (int keyIdx = 0; keyIdx < sortedKeys.length; keyIdx++) {
+ CharsRef input = sortedKeys[keyIdx];
+ MapEntry output = workingSet.get(input);
+
+ int numEntries = output.ords.size();
+ // output size, assume the worst case
+ int estimatedSize = 5 + numEntries * 5; // numEntries + one ord for each entry
+
+ scratch.grow(estimatedSize);
+ scratchOutput.reset(scratch.bytes, scratch.offset, scratch.bytes.length);
+ assert scratch.offset == 0;
+
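+      // Format: vInt(count << 1 | (includeOrig ? 0 : 1)) followed by count
+      // vInt ords.  The ords are written first; the header vInt is written
+      // after them and then spliced to the front once count is known: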
+ // now write our output data:
+ int count = 0;
+ for (int i = 0; i < numEntries; i++) {
+ if (dedupSet != null) {
+ // box once
+ final Integer ent = output.ords.get(i);
+ if (dedupSet.contains(ent)) {
+ continue;
+ }
+ dedupSet.add(ent);
+ }
+ scratchOutput.writeVInt(output.ords.get(i));
+ count++;
+ }
+
+ final int pos = scratchOutput.getPosition();
+ scratchOutput.writeVInt(count << 1 | (output.includeOrig ? 0 : 1));
+ final int pos2 = scratchOutput.getPosition();
+ final int vIntLen = pos2-pos;
+
+ // Move the count + includeOrig to the front of the byte[]:
+ System.arraycopy(scratch.bytes, pos, spare, 0, vIntLen);
+ System.arraycopy(scratch.bytes, 0, scratch.bytes, vIntLen, pos);
+ System.arraycopy(spare, 0, scratch.bytes, 0, vIntLen);
+
+ if (dedupSet != null) {
+ dedupSet.clear();
+ }
+
+ scratch.length = scratchOutput.getPosition() - scratch.offset;
+ //System.out.println(" add input=" + input + " output=" + scratch + " offset=" + scratch.offset + " length=" + scratch.length + " count=" + count);
+ builder.add(input, new BytesRef(scratch));
+ }
+
+    FST<BytesRef> fst = builder.finish();
+ return new FSTSynonymMap(fst, words, maxHorizontalContext, maxVerticalContext);
+ }
+}
Index: modules/analysis/common/src/java/org/apache/lucene/analysis/synonym/SolrSynonymsParser.java
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ modules/analysis/common/src/java/org/apache/lucene/analysis/synonym/SolrSynonymsParser.java Thu Jul 07 09:06:52 2011 -0400
@@ -0,0 +1,223 @@
+package org.apache.lucene.analysis.synonym;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.LineNumberReader;
+import java.io.Reader;
+import java.io.StringReader;
+import java.text.ParseException;
+import java.util.ArrayList;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.util.CharsRef;
+
+/**
+ * Parser for the Solr synonyms format.
+ *
+ * - Blank lines and lines starting with '#' are comments.
+ *
+ * - Explicit mappings match any token sequence on the LHS of "=>"
+ *   and replace with all alternatives on the RHS.  These types of mappings
+ *   ignore the expand parameter in the constructor.
+ *   Example:
+ *
+ *     i-pod, i pod => ipod
+ *
+ * - Equivalent synonyms may be separated with commas and give
+ *   no explicit mapping.  In this case the mapping behavior will
+ *   be taken from the expand parameter in the constructor.  This allows
+ *   the same synonym file to be used in different synonym handling strategies.
+ *   Example:
+ *
+ *     ipod, i-pod, i pod
+ *
+ * - Multiple synonym mapping entries are merged.
+ *   Example:
+ *
+ *     foo => foo bar
+ *     foo => baz
+ *
+ *   is equivalent to
+ *
+ *     foo => foo bar, baz
+ *
+ * @lucene.experimental
+ */
+public class SolrSynonymsParser {
+ private final boolean expand;
+ private final Analyzer analyzer;
+ private final FSTSynonymMapBuilder builder;
+
+ public SolrSynonymsParser(boolean expand, Analyzer analyzer) {
+ this.expand = expand;
+ this.analyzer = analyzer;
+ this.builder = new FSTSynonymMapBuilder();
+ }
+
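+  /** Parses the given input and adds its rules to the builder; may be
+   *  called multiple times to merge several files.  An invalid rule is
+   *  reported as a ParseException carrying the offending line number. */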
+ public void addSynonyms(Reader in) throws IOException, ParseException {
+ LineNumberReader br = new LineNumberReader(in);
+ try {
+ addInternal(br);
+ } catch (IllegalArgumentException e) {
+ ParseException ex = new ParseException("Invalid synonym rule at line " + br.getLineNumber(), 0);
+ ex.initCause(e);
+ throw ex;
+ } finally {
+ br.close();
+ }
+ }
+
+ private void addInternal(BufferedReader in) throws IOException {
+ String line = null;
+ while ((line = in.readLine()) != null) {
+ if (line.length() == 0 || line.charAt(0) == '#') {
+ continue; // ignore empty lines and comments
+ }
+
+ CharsRef inputs[];
+ CharsRef outputs[];
+
+ // TODO: we could process this more efficiently.
+
+ String sides[] = split(line, "=>");
+ if (sides.length > 1) { // explicit mapping
+ if (sides.length != 2) {
+ throw new IllegalArgumentException("more than one explicit mapping specified on the same line");
+ }
+ String inputStrings[] = split(sides[0], ",");
+ inputs = new CharsRef[inputStrings.length];
+ for (int i = 0; i < inputs.length; i++) {
+ inputs[i] = analyze(inputStrings[i]);
+ }
+
+ String outputStrings[] = split(sides[1], ",");
+ outputs = new CharsRef[outputStrings.length];
+ for (int i = 0; i < outputs.length; i++) {
+ outputs[i] = analyze(outputStrings[i]);
+ }
+ } else {
+ String inputStrings[] = split(line, ",");
+ inputs = new CharsRef[inputStrings.length];
+ for (int i = 0; i < inputs.length; i++) {
+ inputs[i] = analyze(inputStrings[i]);
+ }
+ if (expand) {
+ outputs = inputs;
+ } else {
+ outputs = new CharsRef[1];
+ outputs[0] = inputs[0];
+ }
+ }
+
+ // currently we include the term itself in the map,
+ // and use includeOrig = false always.
+ // this is how the existing filter does it, but its actually a bug,
+ // especially if combined with ignoreCase = true
+ for (int i = 0; i < inputs.length; i++) {
+ for (int j = 0; j < outputs.length; j++) {
+ //System.out.println("add: " + inputs[i] + " -> " + outputs[j]);
+ builder.add(inputs[i], outputs[j], false);
+ }
+ }
+ }
+ }
+
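+  /** Runs the analyzer over the text and concatenates the resulting
+   *  tokens with WORD_SEPARATOR.  Zero-length tokens and tokens with
+   *  posInc != 1 (e.g. a removed stopword) are rejected. */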
+ private CharsRef analyze(String text) throws IOException {
+ CharsRef out = new CharsRef();
+ // nocommit: this unescaping/trimming is ugly
+ text = unescape(text.trim());
+ TokenStream ts = analyzer.reusableTokenStream("", new StringReader(text));
+ CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
+ PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
+ ts.reset();
+ while (ts.incrementToken()) {
+ int length = termAtt.length();
+ if (length == 0) {
+ throw new IllegalArgumentException("term: " + text + " analyzed to a zero-length token");
+ }
+ if (posIncAtt.getPositionIncrement() != 1) {
+ throw new IllegalArgumentException("term: " + text + " analyzed to a token with posinc != 1");
+ }
+ out.grow(out.length + length + 1 /* one for the separator */);
+ int end = out.offset + out.length;
+ if (out.length > 0) {
+ out.chars[end++] = FSTSynonymMap.WORD_SEPARATOR;
+ out.length++;
+ }
+ System.arraycopy(termAtt.buffer(), 0, out.chars, end, length);
+ out.length += length;
+ }
+ ts.end();
+ ts.close();
+ if (out.length == 0) {
+ throw new IllegalArgumentException("term: " + text + " was completely eliminated by analyzer");
+ }
+ return out;
+ }
+
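+  /** Splits on the separator string, except where the separator is
+   *  escaped by a backslash; the backslash is kept so {@link #unescape}
+   *  can strip it afterwards. */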
+ private static String[] split(String s, String separator) {
+    ArrayList<String> list = new ArrayList<String>(2);
+ StringBuilder sb = new StringBuilder();
+ int pos=0, end=s.length();
+ while (pos < end) {
+ if (s.startsWith(separator,pos)) {
+ if (sb.length() > 0) {
+ list.add(sb.toString());
+ sb=new StringBuilder();
+ }
+ pos+=separator.length();
+ continue;
+ }
+
+ char ch = s.charAt(pos++);
+ if (ch=='\\') {
+ sb.append(ch);
+ if (pos>=end) break; // ERROR, or let it go?
+ ch = s.charAt(pos++);
+ }
+
+ sb.append(ch);
+ }
+
+ if (sb.length() > 0) {
+ list.add(sb.toString());
+ }
+
+ return list.toArray(new String[list.size()]);
+ }
+
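+  /** Removes one level of backslash escaping: "a\\,a" becomes "a,a". */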
+ private String unescape(String s) {
+ if (s.indexOf("\\") >= 0) {
+ StringBuilder sb = new StringBuilder();
+ for (int i = 0; i < s.length(); i++) {
+ char ch = s.charAt(i);
+ if (ch == '\\' && i < s.length() - 1) {
+ sb.append(s.charAt(++i));
+ } else {
+ sb.append(ch);
+ }
+ }
+ return sb.toString();
+ }
+ return s;
+ }
+
+ public FSTSynonymMap build() throws IOException {
+ return builder.build();
+ }
+}
Index: modules/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestFSTSynonymMapFilter.java
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ modules/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestFSTSynonymMapFilter.java Thu Jul 07 09:06:52 2011 -0400
@@ -0,0 +1,388 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.synonym;
+
+import java.io.Reader;
+import java.io.StringReader;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.tokenattributes.*;
+import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
+import org.apache.lucene.util.CharsRef;
+import org.apache.lucene.util._TestUtil;
+
+public class TestFSTSynonymMapFilter extends BaseTokenStreamTestCase {
+
+ private FSTSynonymMapBuilder b;
+ private Tokenizer tokensIn;
+ private FSTSynonymFilter tokensOut;
+ private CharTermAttribute termAtt;
+ private PositionIncrementAttribute posIncrAtt;
+
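+  // In the test's rule strings a space (or run of spaces) stands for
+  // the WORD_SEPARATOR (char 0) between words of a phrase: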
+ private void add(String input, String output, boolean keepOrig) {
+ b.add(new CharsRef(input.replaceAll(" +", "\u0000")),
+ new CharsRef(output.replaceAll(" +", "\u0000")),
+ keepOrig);
+ }
+
+  private void assertEquals(CharTermAttribute term, String expected) {
+    assertEquals(expected.length(), term.length());
+    final char[] buffer = term.buffer();
+    for(int chIDX=0;chIDX<expected.length();chIDX++) {
+      assertEquals(expected.charAt(chIDX), buffer[chIDX]);
+    }
+  }
+
+  private void verify(String input, String output) throws Exception {
+    if (VERBOSE) {
+      System.out.println("TEST: verify input=" + input + " expectedOutput=" + output);
+    }
+
+    tokensIn.reset(new StringReader(input));
+    tokensOut.reset();
+    final String[] expected = output.split(" ");
+    int expectedUpto = 0;
+    while (expectedUpto < expected.length) {
+      assertTrue(tokensOut.incrementToken());
+      if (VERBOSE) {
+        System.out.println("  incr token=" + termAtt.toString() + " posIncr=" + posIncrAtt.getPositionIncrement());
+      }
+      final String[] expectedAtPos = expected[expectedUpto++].split("/");
+      for(int atPos=0;atPos<expectedAtPos.length;atPos++) {
+        if (atPos > 0) {
+          assertTrue(tokensOut.incrementToken());
+          if (VERBOSE) {
+            System.out.println("  incr token=" + termAtt.toString() + " posIncr=" + posIncrAtt.getPositionIncrement());
+          }
+        }
+        assertEquals(termAtt, expectedAtPos[atPos]);
+        assertEquals(atPos == 0 ? 1 : 0,
+                     posIncrAtt.getPositionIncrement());
+      }
+    }
+    tokensOut.end();
+    tokensOut.close();
+    if (VERBOSE) {
+      System.out.println("  incr: END");
+    }
+    assertEquals(expectedUpto, expected.length);
+  }
+
+ public void testBasic() throws Exception {
+ b = new FSTSynonymMapBuilder();
+ add("a", "foo", true);
+ add("a b", "bar fee", true);
+ add("b c", "dog collar", true);
+ add("c d", "dog harness holder extras", true);
+ add("m c e", "dog barks loudly", false);
+
+ add("e f", "foo bar", false);
+ add("e f", "baz bee", false);
+
+ add("z", "boo", false);
+ add("y", "bee", true);
+
+ tokensIn = new MockTokenizer(new StringReader("a"),
+ MockTokenizer.WHITESPACE,
+ true);
+ tokensIn.reset();
+ assertTrue(tokensIn.incrementToken());
+ assertFalse(tokensIn.incrementToken());
+ tokensIn.end();
+ tokensIn.close();
+
+ tokensOut = new FSTSynonymFilter(tokensIn,
+ b.build(),
+ true);
+ termAtt = tokensOut.addAttribute(CharTermAttribute.class);
+ posIncrAtt = tokensOut.addAttribute(PositionIncrementAttribute.class);
+
+ verify("a b c", "a/bar b/fee c");
+
+ // syn output extends beyond input tokens
+ verify("x a b c d", "x a/bar b/fee c/dog d/harness holder extras");
+
+ verify("a b a", "a/bar b/fee a/foo");
+
+ // outputs that add to one another:
+ verify("c d c d", "c/dog d/harness c/holder/dog d/extras/harness holder extras");
+
+ // two outputs for same input
+ verify("e f", "foo/baz bar/bee");
+
+ // mixed keepOrig true/false:
+ verify("a m c e x", "a/foo dog barks loudly x");
+ verify("c d m c e x", "c/dog d/harness m/holder/dog c/extras/barks loudly x");
+ assertTrue(tokensOut.getCaptureCount() > 0);
+
+ // no captureStates when no syns matched
+ verify("p q r s t", "p q r s t");
+ assertEquals(0, tokensOut.getCaptureCount());
+
+ // no captureStates when only single-input syns, w/ no
+ // lookahead needed, matched
+ verify("p q z y t", "p q boo y/bee t");
+ assertEquals(0, tokensOut.getCaptureCount());
+ }
+
+  private String getRandomString(char start, int alphabetSize, int length) {
+    assert alphabetSize <= 26;
+    char[] s = new char[2*length];
+    for(int charIDX=0;charIDX<length;charIDX++) {
+      s[2*charIDX] = (char) (start + random.nextInt(alphabetSize));
+      s[2*charIDX+1] = ' ';
+    }
+    return new String(s);
+  }
+
+  private static class OneSyn {
+    String in;
+    List<String> out;
+    boolean keepOrig;
+  }
+
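+  /** Slow but simple reference matcher: applies the syns to doc
+   *  greedily (longest match wins) and renders each position as
+   *  "token" or "token/stackedToken", with positions separated by
+   *  spaces, for cross-checking the filter's output. */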
+  public String slowSynMatcher(String doc, List<OneSyn> syns, int maxOutputLength) {
+ assertTrue(doc.length() % 2 == 0);
+ final int numInputs = doc.length()/2;
+ boolean[] keepOrigs = new boolean[numInputs];
+ Arrays.fill(keepOrigs, false);
+ String[] outputs = new String[numInputs + maxOutputLength];
+ OneSyn[] matches = new OneSyn[numInputs];
+ for(OneSyn syn : syns) {
+ int idx = -1;
+ while(true) {
+ idx = doc.indexOf(syn.in, 1+idx);
+ if (idx == -1) {
+ break;
+ }
+ assertTrue(idx % 2 == 0);
+ final int matchIDX = idx/2;
+ assertTrue(syn.in.length() % 2 == 1);
+ if (matches[matchIDX] == null) {
+ matches[matchIDX] = syn;
+ } else if (syn.in.length() > matches[matchIDX].in.length()) {
+ // Greedy conflict resolution: longer match wins:
+ matches[matchIDX] = syn;
+ } else {
+ assertTrue(syn.in.length() < matches[matchIDX].in.length());
+ }
+ }
+ }
+
+ // Greedy conflict resolution: if syn matches a range of inputs,
+ // it prevents other syns from matching that range
+    for(int inputIDX=0;inputIDX<numInputs;inputIDX++) {
+      final OneSyn match = matches[inputIDX];
+      if (match != null) {
+        final int synInLength = (1+match.in.length())/2;
+        for(int nextInputIDX=inputIDX+1;nextInputIDX<numInputs && nextInputIDX<(inputIDX+synInLength);nextInputIDX++) {
+          matches[nextInputIDX] = null;
+        }
+      }
+    }
+
+    // Fill overlapping outputs:
+    for(int inputIDX=0;inputIDX<numInputs;inputIDX++) {
+      final OneSyn syn = matches[inputIDX];
+      if (syn == null) {
+        continue;
+      }
+      for(int idx=0;idx<(1+syn.in.length())/2;idx++) {
+        keepOrigs[inputIDX+idx] |= syn.keepOrig;
+      }
+      for(String synOut : syn.out) {
+        final String[] synOutputs = synOut.split(" ");
+        assertEquals(synOutputs.length, (1+synOut.length())/2);
+        final int matchEnd = inputIDX + synOutputs.length;
+        int synUpto = 0;
+        for(int matchIDX=inputIDX;matchIDX<matchEnd;matchIDX++) {
+          if (outputs[matchIDX] == null) {
+            outputs[matchIDX] = synOutputs[synUpto++];
+          } else {
+            outputs[matchIDX] = outputs[matchIDX] + "/" + synOutputs[synUpto++];
+          }
+        }
+      }
+    }
+
+    StringBuilder sb = new StringBuilder();
+    final String[] inputTokens = doc.split(" ");
+    final int limit = inputTokens.length + maxOutputLength;
+    for(int inputIDX=0;inputIDX<limit;inputIDX++) {
+      boolean posHasOutput = false;
+      if (inputIDX >= numInputs && outputs[inputIDX] == null) {
+        break;
+      }
+ if (inputIDX < numInputs && (outputs[inputIDX] == null || keepOrigs[inputIDX])) {
+ sb.append(inputTokens[inputIDX]);
+ posHasOutput = true;
+ }
+
+ if (outputs[inputIDX] != null) {
+ if (posHasOutput) {
+ sb.append('/');
+ }
+ sb.append(outputs[inputIDX]);
+ }
+ if (inputIDX < limit-1) {
+ sb.append(' ');
+ }
+ }
+
+ return sb.toString();
+ }
+
+ public void testRandom() throws Exception {
+
+ final int alphabetSize = _TestUtil.nextInt(random, 2, 7);
+
+ final int docLen = atLeast(3000);
+ //final int docLen = 50;
+
+ final String document = getRandomString('a', alphabetSize, docLen);
+
+ if (VERBOSE) {
+ System.out.println("TEST: doc=" + document);
+ }
+
+ final int numSyn = atLeast(5);
+ //final int numSyn = 2;
+
+    final Map<String,OneSyn> synMap = new HashMap<String,OneSyn>();
+    final List<OneSyn> syns = new ArrayList<OneSyn>();
+ final boolean dedup = random.nextBoolean();
+ if (VERBOSE) {
+ System.out.println(" dedup=" + dedup);
+ }
+ b = new FSTSynonymMapBuilder(dedup);
+    for(int synIDX=0;synIDX<numSyn;synIDX++) {
+      final String synIn = getRandomString('a', alphabetSize, _TestUtil.nextInt(random, 1, 5)).trim();
+      OneSyn s = synMap.get(synIn);
+      if (s == null) {
+        s = new OneSyn();
+        s.in = synIn;
+        syns.add(s);
+        s.out = new ArrayList<String>();
+        synMap.put(synIn, s);
+        s.keepOrig = random.nextBoolean();
+      }
+ final String synOut = getRandomString('0', 10, _TestUtil.nextInt(random, 1, 5)).trim();
+ s.out.add(synOut);
+ add(synIn, synOut, s.keepOrig);
+ if (VERBOSE) {
+ System.out.println(" syns[" + synIDX + "] = " + s.in + " -> " + s.out + " keepOrig=" + s.keepOrig);
+ }
+ }
+
+ tokensIn = new MockTokenizer(new StringReader("a"),
+ MockTokenizer.WHITESPACE,
+ true);
+ tokensIn.reset();
+ assertTrue(tokensIn.incrementToken());
+ assertFalse(tokensIn.incrementToken());
+ tokensIn.end();
+ tokensIn.close();
+
+ tokensOut = new FSTSynonymFilter(tokensIn,
+ b.build(),
+ true);
+ termAtt = tokensOut.addAttribute(CharTermAttribute.class);
+ posIncrAtt = tokensOut.addAttribute(PositionIncrementAttribute.class);
+
+ if (dedup) {
+ pruneDups(syns);
+ }
+
+ final String expected = slowSynMatcher(document, syns, 5);
+
+ if (VERBOSE) {
+ System.out.println("TEST: expected=" + expected);
+ }
+
+ verify(document, expected);
+ }
+
+  private void pruneDups(List<OneSyn> syns) {
+    Set<String> seen = new HashSet<String>();
+ for(OneSyn syn : syns) {
+ int idx = 0;
+ while(idx < syn.out.size()) {
+ String out = syn.out.get(idx);
+ if (!seen.contains(out)) {
+ seen.add(out);
+ idx++;
+ } else {
+ syn.out.remove(idx);
+ }
+ }
+ seen.clear();
+ }
+ }
+
+ private String randomNonEmptyString() {
+ while(true) {
+ final String s = _TestUtil.randomUnicodeString(random).trim();
+ //final String s = _TestUtil.randomSimpleString(random).trim();
+ if (s.length() != 0 && s.indexOf('\u0000') == -1) {
+ return s;
+ }
+ }
+ }
+
+  /** simple random test: doesn't verify correctness, just that the
+   *  filter doesn't throw exceptions and the stream doesn't misbehave.
+   */
+ public void testRandom2() throws Exception {
+ final int numIters = atLeast(10);
+ for (int i = 0; i < numIters; i++) {
+ b = new FSTSynonymMapBuilder(random.nextBoolean());
+ final int numEntries = atLeast(10);
+ for (int j = 0; j < numEntries; j++) {
+ // nocommit: better random strings here (e.g. lots of spaces and ascii?)
+ add(randomNonEmptyString(), randomNonEmptyString(), random.nextBoolean());
+ }
+ final FSTSynonymMap map = b.build();
+ final boolean ignoreCase = random.nextBoolean();
+
+ final Analyzer analyzer = new ReusableAnalyzerBase() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+ Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.SIMPLE, true);
+ return new TokenStreamComponents(tokenizer, new FSTSynonymFilter(tokenizer, map, ignoreCase));
+ }
+ };
+
+ checkRandomData(random, analyzer, 1000*RANDOM_MULTIPLIER);
+ //checkRandomData(random, analyzer, 10*RANDOM_MULTIPLIER);
+ }
+ }
+}
Index: modules/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSolrSynonymParser.java
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ modules/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSolrSynonymParser.java Thu Jul 07 09:06:52 2011 -0400
@@ -0,0 +1,144 @@
+package org.apache.lucene.analysis.synonym;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.Reader;
+import java.io.StringReader;
+import java.text.ParseException;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.MockAnalyzer;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.en.EnglishAnalyzer;
+import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
+import org.junit.Test;
+
+/**
+ * Tests parser for the Solr synonyms format
+ * @lucene.experimental
+ */
+public class TestSolrSynonymParser extends BaseTokenStreamTestCase {
+
+ /** Tests some simple examples from the solr wiki */
+ public void testSimple() throws Exception {
+ String testFile =
+ "i-pod, ipod, ipoooood\n" +
+ "foo => foo bar\n" +
+ "foo => baz\n" +
+ "this test, that testing";
+
+ SolrSynonymsParser parser = new SolrSynonymsParser(true, new MockAnalyzer(random));
+ parser.addSynonyms(new StringReader(testFile));
+ final FSTSynonymMap map = parser.build();
+
+ Analyzer analyzer = new ReusableAnalyzerBase() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+ Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, true);
+ return new TokenStreamComponents(tokenizer, new FSTSynonymFilter(tokenizer, map, true));
+ }
+ };
+
+ assertAnalyzesTo(analyzer, "ball",
+ new String[] { "ball" },
+ new int[] { 1 });
+
+ assertAnalyzesTo(analyzer, "i-pod",
+ new String[] { "i-pod", "ipod", "ipoooood" },
+ new int[] { 1, 0, 0 });
+
+ assertAnalyzesTo(analyzer, "foo",
+ new String[] { "foo", "baz", "bar" },
+ new int[] { 1, 0, 1 });
+
+ assertAnalyzesTo(analyzer, "this test",
+ new String[] { "this", "that", "test", "testing" },
+ new int[] { 1, 0, 1, 0 });
+ }
+
+ /** parse a syn file with bad syntax */
+ @Test(expected=ParseException.class)
+ public void testInvalidDoubleMap() throws Exception {
+ String testFile = "a => b => c";
+ SolrSynonymsParser parser = new SolrSynonymsParser(true, new MockAnalyzer(random));
+ parser.addSynonyms(new StringReader(testFile));
+ }
+
+ /** parse a syn file with bad syntax */
+ @Test(expected=ParseException.class)
+ public void testInvalidAnalyzesToNothingOutput() throws Exception {
+ String testFile = "a => 1";
+ SolrSynonymsParser parser = new SolrSynonymsParser(true, new MockAnalyzer(random, MockTokenizer.SIMPLE, false));
+ parser.addSynonyms(new StringReader(testFile));
+ }
+
+ /** parse a syn file with bad syntax */
+ @Test(expected=ParseException.class)
+ public void testInvalidAnalyzesToNothingInput() throws Exception {
+ String testFile = "1 => a";
+ SolrSynonymsParser parser = new SolrSynonymsParser(true, new MockAnalyzer(random, MockTokenizer.SIMPLE, false));
+ parser.addSynonyms(new StringReader(testFile));
+ }
+
+ /** parse a syn file with bad syntax */
+ @Test(expected=ParseException.class)
+ public void testInvalidPositionsInput() throws Exception {
+ String testFile = "testola => the test";
+ SolrSynonymsParser parser = new SolrSynonymsParser(true, new EnglishAnalyzer(TEST_VERSION_CURRENT));
+ parser.addSynonyms(new StringReader(testFile));
+ }
+
+ /** parse a syn file with bad syntax */
+ @Test(expected=ParseException.class)
+ public void testInvalidPositionsOutput() throws Exception {
+ String testFile = "the test => testola";
+ SolrSynonymsParser parser = new SolrSynonymsParser(true, new EnglishAnalyzer(TEST_VERSION_CURRENT));
+ parser.addSynonyms(new StringReader(testFile));
+ }
+
+ /** parse a syn file with some escaped syntax chars */
+ public void testEscapedStuff() throws Exception {
+ String testFile =
+ "a\\=>a => b\\=>b\n" +
+ "a\\,a => b\\,b";
+ SolrSynonymsParser parser = new SolrSynonymsParser(true, new MockAnalyzer(random, MockTokenizer.KEYWORD, false));
+ parser.addSynonyms(new StringReader(testFile));
+ final FSTSynonymMap map = parser.build();
+ Analyzer analyzer = new ReusableAnalyzerBase() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+ Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.KEYWORD, false);
+ return new TokenStreamComponents(tokenizer, new FSTSynonymFilter(tokenizer, map, false));
+ }
+ };
+
+ assertAnalyzesTo(analyzer, "ball",
+ new String[] { "ball" },
+ new int[] { 1 });
+
+ assertAnalyzesTo(analyzer, "a=>a",
+ new String[] { "b=>b" },
+ new int[] { 1 });
+
+ assertAnalyzesTo(analyzer, "a,a",
+ new String[] { "b,b" },
+ new int[] { 1 });
+ }
+}
Index: solr/build.xml
--- solr/build.xml Thu Jul 07 06:12:14 2011 -0400
+++ solr/build.xml Thu Jul 07 09:06:52 2011 -0400
@@ -469,10 +469,10 @@
-
+
Index: solr/src/java/org/apache/solr/analysis/FSTSynonymFilterFactory.java
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ solr/src/java/org/apache/solr/analysis/FSTSynonymFilterFactory.java Thu Jul 07 09:06:52 2011 -0400
@@ -0,0 +1,128 @@
+package org.apache.solr.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.io.Reader;
+import java.nio.charset.Charset;
+import java.nio.charset.CharsetDecoder;
+import java.nio.charset.CodingErrorAction;
+import java.text.ParseException;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.LowerCaseFilter;
+import org.apache.lucene.analysis.core.WhitespaceTokenizer;
+import org.apache.lucene.analysis.synonym.FSTSynonymFilter;
+import org.apache.lucene.analysis.synonym.FSTSynonymMap;
+import org.apache.lucene.analysis.synonym.SolrSynonymsParser;
+import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
+import org.apache.lucene.util.Version;
+import org.apache.solr.common.ResourceLoader;
+import org.apache.solr.common.SolrException;
+import org.apache.solr.common.util.StrUtils;
+import org.apache.solr.util.plugin.ResourceLoaderAware;
+
+/**
+ * Factory for {@link FSTSynonymFilter}.
+ *
+ * <fieldType name="text_synonym" class="solr.TextField" positionIncrementGap="100">
+ * <analyzer>
+ * <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ * <filter class="solr.FSTSynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="false"
+ * expand="true" tokenizerFactory="solr.WhitespaceTokenizerFactory"/>
+ * </analyzer>
+ * </fieldType>
+ *
+ */
+public class FSTSynonymFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware {
+ private FSTSynonymMap map;
+ private boolean ignoreCase;
+
+ @Override
+ public TokenStream create(TokenStream input) {
+ return new FSTSynonymFilter(input, map, ignoreCase);
+ }
+
+ @Override
+ public void inform(ResourceLoader loader) {
+ final boolean ignoreCase = getBoolean("ignoreCase", false);
+ this.ignoreCase = ignoreCase;
+
+ String tf = args.get("tokenizerFactory");
+
+ final TokenizerFactory factory = tf == null ? null : loadTokenizerFactory(loader, tf, args);
+
+ Analyzer analyzer = new ReusableAnalyzerBase() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+ Tokenizer tokenizer = factory == null ? new WhitespaceTokenizer(Version.LUCENE_31, reader) : factory.create(reader);
+ TokenStream stream = ignoreCase ? new LowerCaseFilter(Version.LUCENE_31, tokenizer) : tokenizer;
+ return new TokenStreamComponents(tokenizer, stream);
+ }
+ };
+
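+    // only the solr format is supported so far; it is also the default: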
+ String format = args.get("format");
+ try {
+ if (format == null || format.equals("solr")) {
+ map = loadSolrSynonyms(loader, analyzer);
+ }
+ } catch (Exception e) {
+ throw new RuntimeException(e);
+ }
+ }
+
+ /**
+ * Load synonyms from the solr format, "format=solr".
+ */
+ private FSTSynonymMap loadSolrSynonyms(ResourceLoader loader, Analyzer analyzer) throws IOException, ParseException {
+ final boolean expand = getBoolean("expand", true);
+ String synonyms = args.get("synonyms");
+ if (synonyms == null)
+ throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Missing required argument 'synonyms'.");
+
+ CharsetDecoder decoder = Charset.forName("UTF-8").newDecoder()
+ .onMalformedInput(CodingErrorAction.REPORT)
+ .onUnmappableCharacter(CodingErrorAction.REPORT);
+
+ SolrSynonymsParser parser = new SolrSynonymsParser(expand, analyzer);
+ File synonymFile = new File(synonyms);
+ if (synonymFile.exists()) {
+ decoder.reset();
+ parser.addSynonyms(new InputStreamReader(loader.openResource(synonyms), decoder));
+ } else {
+ List files = StrUtils.splitFileNames(synonyms);
+ for (String file : files) {
+ decoder.reset();
+ parser.addSynonyms(new InputStreamReader(loader.openResource(file), decoder));
+ }
+ }
+ return parser.build();
+ }
+
+ private static TokenizerFactory loadTokenizerFactory(ResourceLoader loader, String cname, Map args){
+ TokenizerFactory tokFactory = (TokenizerFactory) loader.newInstance(cname);
+ tokFactory.init(args);
+ return tokFactory;
+ }
+}
Index: solr/src/test/org/apache/solr/analysis/TestFSTSynonymFilterFactory.java
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ solr/src/test/org/apache/solr/analysis/TestFSTSynonymFilterFactory.java Thu Jul 07 09:06:52 2011 -0400
@@ -0,0 +1,144 @@
+package org.apache.solr.analysis;
+
+import java.io.Reader;
+import java.io.StringReader;
+import java.util.Collections;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.WhitespaceTokenizer;
+import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
+import org.apache.solr.core.SolrResourceLoader;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+public class TestFSTSynonymFilterFactory extends BaseTokenTestCase {
+ /** stupid test that we can parse the solr syn file */
+ public void testSolrSynonymsfile() throws Exception {
+ FSTSynonymFilterFactory fst = new FSTSynonymFilterFactory();
+ fst.init(Collections.singletonMap("synonyms", "synonyms.txt"));
+ fst.inform(new SolrResourceLoader(null, null));
+ }
+
+  /** run with assertions disabled (no -ea) for benchmarking */
+ public void testBenchmarkBig() throws Exception {
+ String testFile = "/home/rmuir/synonyms.txt";
+ for (int i = 0; i < 3; i++) {
+ long ms = System.currentTimeMillis();
+ doBenchmarkLoadOld(testFile);
+ long endMs = System.currentTimeMillis();
+ System.out.println("old=" + (endMs - ms));
+ ms = System.currentTimeMillis();
+ doBenchmarkLoadNew(testFile);
+ endMs = System.currentTimeMillis();
+ System.out.println("new=" + (endMs - ms));
+ }
+ }
+
+ /*
+ public void testBenchmarkBigRAM() throws Exception {
+ String testFile = "/home/rmuir/synonyms.txt";
+ SynonymFilterFactory factory = doBenchmarkLoadOld(testFile);
+ final Runtime runtime = Runtime.getRuntime();
+ long usedMem1 = runtime.totalMemory() - runtime.freeMemory();
+ long usedMem2 = Long.MAX_VALUE;
+ for(int iter=0;iter<10;iter++) {
+ runtime.runFinalization();
+ runtime.gc();
+ Thread.currentThread().yield();
+ Thread.sleep(1000);
+ usedMem2 = usedMem1;
+ usedMem1 = runtime.totalMemory() - runtime.freeMemory();
+ }
+ System.out.println("done: ram used: " + usedMem1);
+ System.out.flush();
+ }
+ */
+
+ private SynonymFilterFactory doBenchmarkLoadOld(String file) throws Exception {
+ SynonymFilterFactory old = new SynonymFilterFactory();
+ old.init(Collections.singletonMap("synonyms", file));
+ old.inform(new SolrResourceLoader(null, null));
+ return old;
+ }
+
+ private FSTSynonymFilterFactory doBenchmarkLoadNew(String file) throws Exception {
+ FSTSynonymFilterFactory fst = new FSTSynonymFilterFactory();
+ fst.init(Collections.singletonMap("synonyms", file));
+ fst.inform(new SolrResourceLoader(null, null));
+ return fst;
+ }
+
+ public void testBenchmarkDefaultSolrSyns() throws Exception {
+ final FSTSynonymFilterFactory fst = new FSTSynonymFilterFactory();
+ fst.init(Collections.singletonMap("synonyms", "synonyms.txt"));
+ fst.inform(new SolrResourceLoader(null, null));
+ Analyzer analyzer = new ReusableAnalyzerBase() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+ Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader);
+ return new TokenStreamComponents(tokenizer, fst.create(tokenizer));
+ }
+ };
+
+ /*
+ final SynonymFilterFactory old = new SynonymFilterFactory();
+ old.init(Collections.singletonMap("synonyms", "synonyms.txt"));
+ old.inform(new SolrResourceLoader(null, null));
+ Analyzer oldAnalyzer = new ReusableAnalyzerBase() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+ Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader);
+ return new TokenStreamComponents(tokenizer, old.create(tokenizer));
+ }
+ };
+ */
+
+ for (int i = 0; i < 3; i++) {
+ System.out.println("round: " + i);
+ //doBenchmarkAnalysisSpeed(oldAnalyzer);
+ doBenchmarkAnalysisSpeed(analyzer);
+ }
+ }
+
+ public void doBenchmarkAnalysisSpeed(Analyzer analyzer) throws Exception {
+ String text = "this is a test of the emergency broadcasting system. this is only a test. Please do not pass go, do not collect $200";
+ for (int i = 0; i < 20000; i++) {
+ TokenStream ts = analyzer.reusableTokenStream("foo", new StringReader(text));
+ ts.reset();
+ while (ts.incrementToken()) {
+ ;
+ }
+ ts.end();
+ ts.close();
+ }
+
+ long ms = System.currentTimeMillis();
+ for (int i = 0; i < 1000000; i++) {
+ TokenStream ts = analyzer.reusableTokenStream("foo", new StringReader(text));
+ ts.reset();
+ while (ts.incrementToken()) {
+ ;
+ }
+ ts.end();
+ ts.close();
+ }
+ long endMs = System.currentTimeMillis();
+ System.out.println("time: " + (endMs - ms));
+ }
+}