Index: lucene/contrib/CHANGES.txt --- lucene/contrib/CHANGES.txt Mon Jul 04 13:30:26 2011 -0400 +++ lucene/contrib/CHANGES.txt Mon Jul 04 13:40:54 2011 -0400 @@ -78,6 +78,10 @@ documents must be indexed as a document block, using IndexWriter.add/UpdateDocuments (Mark Harwood, Mike McCandless) + * LUCENE-3233: FSTSynonymFilter for applying multi-word synonyms + during indexing, using far less RAM than the current + SynonymFilter. (Robert Muir, Mike McCandless) + API Changes Bug Fixes Index: lucene/src/java/org/apache/lucene/store/ByteArrayDataOutput.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ lucene/src/java/org/apache/lucene/store/ByteArrayDataOutput.java Mon Jul 04 13:40:54 2011 -0400 @@ -0,0 +1,52 @@ +package org.apache.lucene.store; + +import org.apache.lucene.util.BytesRef; + +/** + * @lucene.experimental + */ +public class ByteArrayDataOutput extends DataOutput { + private byte[] bytes; + + private int pos; + private int limit; + + public ByteArrayDataOutput(byte[] bytes) { + reset(bytes); + } + + public ByteArrayDataOutput(byte[] bytes, int offset, int len) { + reset(bytes, offset, len); + } + + public ByteArrayDataOutput() { + reset(BytesRef.EMPTY_BYTES); + } + + public void reset(byte[] bytes) { + reset(bytes, 0, bytes.length); + } + + public void reset(byte[] bytes, int offset, int len) { + this.bytes = bytes; + pos = offset; + limit = offset + len; + } + + public int getPosition() { + return pos; + } + + @Override + public void writeByte(byte b) { + assert pos < limit; + bytes[pos++] = b; + } + + @Override + public void writeBytes(byte[] b, int offset, int length) { + assert pos + length <= limit; + System.arraycopy(b, offset, bytes, pos, length); + pos += length; + } +} Index: lucene/src/java/org/apache/lucene/util/CharsRef.java --- lucene/src/java/org/apache/lucene/util/CharsRef.java Mon Jul 04 13:30:26 2011 -0400 +++ lucene/src/java/org/apache/lucene/util/CharsRef.java Mon Jul 04 13:40:54 2011 -0400 @@ -1,5 +1,7 @@ package org.apache.lucene.util; +import java.util.Comparator; + /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. 
See the NOTICE file distributed with @@ -167,7 +169,11 @@ * the {@link CharsRef} to copy */ public void copy(CharsRef other) { - chars = ArrayUtil.grow(chars, other.length); + if (chars == null) { + chars = new char[other.length]; + } else { + chars = ArrayUtil.grow(chars, other.length); + } System.arraycopy(other.chars, other.offset, chars, 0, other.length); length = other.length; offset = 0; @@ -213,4 +219,56 @@ public CharSequence subSequence(int start, int end) { return new CharsRef(chars, offset + start, offset + end - 1); } + + private final static Comparator utf16SortedAsUTF8SortOrder = new UTF16SortedAsUTF8Comparator(); + + public static Comparator getUTF16SortedAsUTF8Comparator() { + return utf16SortedAsUTF8SortOrder; + } + + private static class UTF16SortedAsUTF8Comparator implements Comparator { + // Only singleton + private UTF16SortedAsUTF8Comparator() {}; + + public int compare(CharsRef a, CharsRef b) { + if (a == b) + return 0; + + final char[] aChars = a.chars; + int aUpto = a.offset; + final char[] bChars = b.chars; + int bUpto = b.offset; + + final int aStop = aUpto + Math.min(a.length, b.length); + + while (aUpto < aStop) { + char aChar = aChars[aUpto++]; + char bChar = bChars[bUpto++]; + if (aChar != bChar) { + // http://icu-project.org/docs/papers/utf16_code_point_order.html + + /* aChar != bChar, fix up each one if they're both in or above the surrogate range, then compare them */ + if (aChar >= 0xd800 && bChar >= 0xd800) { + if (aChar >= 0xe000) { + aChar -= 0x800; + } else { + aChar += 0x2000; + } + + if (bChar >= 0xe000) { + bChar -= 0x800; + } else { + bChar += 0x2000; + } + } + + /* now aChar and bChar are in code point order */ + return (int)aChar - (int)bChar; /* int must be 32 bits wide */ + } + } + + // One is a prefix of the other, or, they are equal: + return a.length - b.length; + } + } } \ No newline at end of file Index: lucene/src/java/org/apache/lucene/util/fst/FST.java --- lucene/src/java/org/apache/lucene/util/fst/FST.java Mon Jul 04 13:30:26 2011 -0400 +++ lucene/src/java/org/apache/lucene/util/fst/FST.java Mon Jul 04 13:40:54 2011 -0400 @@ -113,7 +113,7 @@ int target; byte flags; - T nextFinalOutput; + public T nextFinalOutput; int nextArc; // This is non-zero if current arcs are fixed array: @@ -754,6 +754,9 @@ // Linear scan readFirstTargetArc(follow, arc); while(true) { + // TODO: we should fix this code to not have to create + // object for the output of every arc we scan... only + // for the matching arc, if found if (arc.label == labelToMatch) { return arc; } else if (arc.label > labelToMatch) { Index: lucene/src/test/org/apache/lucene/util/TestCharsRef.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ lucene/src/test/org/apache/lucene/util/TestCharsRef.java Mon Jul 04 13:40:54 2011 -0400 @@ -0,0 +1,41 @@ +package org.apache.lucene.util; + +import java.util.Arrays; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +public class TestCharsRef extends LuceneTestCase { + public void testUTF16InUTF8Order() { + final int numStrings = atLeast(1000); + BytesRef utf8[] = new BytesRef[numStrings]; + CharsRef utf16[] = new CharsRef[numStrings]; + + for (int i = 0; i < numStrings; i++) { + String s = _TestUtil.randomUnicodeString(random); + utf8[i] = new BytesRef(s); + utf16[i] = new CharsRef(s); + } + + Arrays.sort(utf8); + Arrays.sort(utf16, CharsRef.getUTF16SortedAsUTF8Comparator()); + + for (int i = 0; i < numStrings; i++) { + assertEquals(utf8[i].utf8ToString(), utf16[i].toString()); + } + } +} Index: modules/analysis/common/src/java/org/apache/lucene/analysis/synonym/FSTSynonymFilter.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ modules/analysis/common/src/java/org/apache/lucene/analysis/synonym/FSTSynonymFilter.java Mon Jul 04 13:40:54 2011 -0400 @@ -0,0 +1,542 @@ +package org.apache.lucene.analysis.synonym; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.store.ByteArrayDataInput; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.AttributeSource; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.CharsRef; +import org.apache.lucene.util.RamUsageEstimator; +import org.apache.lucene.util.UnicodeUtil; +import org.apache.lucene.util.fst.FST; + +// nocommit -- need better name? we may not always use FST +// going forward (eg Aho/Corasick) + +/** + * Matches single or multi word synonyms in a token stream. + * This token stream cannot properly handle position + * increments != 1, ie, you should place this filter before + * filtering out stop words. + * + *

<p>Note that with the current implementation, parsing is + * greedy, so whenever multiple parses would apply, the rule + * starting the earliest and parsing the most tokens wins. + * For example if you have these rules: + * + * <pre>
+ *   a -> x
+ *   a b -> y
+ *   b c d -> z
+ * </pre>
+ * + * Then input <code>a b c d e</code> parses to <code>y b c + * d</code>, ie the 2nd rule "wins" because it started + * earliest and matched the most input tokens of other rules + * starting at that point.</p>
+ * + * <p>A future improvement to this filter could allow + * non-greedy parsing, such that the 3rd rule would win, and + * also separately allow multiple parses, such that all 3 + * rules would match, perhaps even on a rule by rule + * basis.</p>
+ * + * <p><b>NOTE</b>: when a match occurs, the output tokens + * associated with the matching rule are "stacked" on top of + * the input stream (if the rule had + * <code>keepOrig=true</code>) and also on top of another + * matched rule's output tokens. This is not a correct + * solution, as really the output should be an arbitrary + * graph/lattice. For example, with the above match, you + * would expect an exact <code>PhraseQuery</code> <code>"y b + * c"</code> to match the parsed tokens, but it will fail to + * do so. This limitation is necessary because Lucene's + * TokenStream (and index) cannot yet represent an arbitrary + * graph.</p>
+ * + * <p><b>NOTE</b>: If multiple incoming tokens arrive on the + * same position, only the first token at that position is + * used for parsing. Subsequent tokens simply pass through + * and are not parsed. A future improvement would be to + * allow these tokens to also be matched.</p>
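+ * + * <p>A minimal usage sketch (the builder, map and filter names are from this patch; the <code>tokenizer</code> feeding the filter is assumed to exist): + * + * <pre>
+ *   FSTSynonymMapBuilder builder = new FSTSynonymMapBuilder();
+ *   // words in a multi-word entry are joined with WORD_SEPARATOR (\u0000):
+ *   builder.add(new CharsRef("a\u0000b"), new CharsRef("y"), true);
+ *   FSTSynonymMap map = builder.build();
+ *   TokenStream syns = new FSTSynonymFilter(tokenizer, map, false);
+ * </pre>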

+ */ + +// TODO: maybe we should resolve token -> wordID then run +// FST on wordIDs, for better perf? + +// TODO: maybe outputs should be by reference, so many terms +// w/ same output are definitely shared (FST likely won't +// share outputs well) + +// TODO: a more efficient approach would be Aho/Corasick's +// algorithm +// http://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_string_matching_algorithm +// It improves over the current approach here +// because it does not fully re-start matching at every +// token. For exampl,e if one pattern is "a b c x" +// and another is "b c d" and the input is "a b c d", on +// trying to parse "a b c x" but failing when you got to x, +// rather than starting over again your really should +// immediately recognize that "b c d" matches at the next +// input. I suspect this won't matter that much in +// practice, but it's possible on some set of synonyms it +// will. We'd have to modify Aho/Corasick to enforce our +// conflict resolving (eg greedy matching) because that algo +// finds all matches. + +public final class FSTSynonymFilter extends TokenFilter { + private final FSTSynonymMap synonyms; + + // nocommit -- unused now + private final boolean ignoreCase; + private final int rollBufferSize; + + private int captureCount; + + private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); + private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class); + + // How many future input tokens have already been matched + // to a synonym; because the matching is "greedy" we don't + // try to do any more matching for such tokens: + private int inputSkipCount; + + // Hold all buffered (read ahead) stacked input tokens for + // a future position. When multiple tokens are at the + // same position, we only store (and match against) the + // term for the first token at the position, but capture + // state for (and enumerate) all other tokens at this + // position: + private static class PendingInput { + final CharsRef term = new CharsRef(); + AttributeSource.State state; + boolean keepOrig; + boolean consumed; + + public void reset() { + state = null; + consumed = true; + keepOrig = false; + } + }; + + // Rolling buffer, holding pending input tokens we had to + // clone because we needed to look ahead, indexed by + // position: + private final PendingInput[] futureInputs; + + // Holds pending output synonyms for one future position: + private static class PendingOutputs { + CharsRef[] outputs; + int upto; + int count; + int posIncr = 1; + + public PendingOutputs() { + outputs = new CharsRef[1]; + } + + public void reset() { + upto = count = 0; + posIncr = 1; + } + + public CharsRef pullNext() { + assert upto < count; + final CharsRef result = outputs[upto++]; + posIncr = 0; + if (upto == count) { + reset(); + } + return result; + } + + public void add(char[] output, int offset, int len) { + if (upto == count) { + final CharsRef[] next = new CharsRef[ArrayUtil.oversize(1+count, RamUsageEstimator.NUM_BYTES_OBJECT_REF)]; + System.arraycopy(outputs, 0, next, 0, count); + outputs = next; + } + if (outputs[count] == null) { + outputs[count] = new CharsRef(); + } + outputs[count].copy(output, offset, len); + count++; + } + }; + + private final ByteArrayDataInput bytesReader = new ByteArrayDataInput(); + + // Rolling buffer, holding stack of pending synonym + // outputs, indexed by position: + private final PendingOutputs[] futureOutputs; + + // Where (in rolling buffers) to write next input saved state: + private 
int nextWrite; + + // Where (in rolling buffers) to read next input saved state: + private int nextRead; + + // True once we've read last token + private boolean finished; + + private final FST.Arc scratchArc; + + private final FST fst; + + private final BytesRef scratchBytes = new BytesRef(); + private final CharsRef scratchChars = new CharsRef(); + + /** + * @param input input tokenstream + * @param synonyms synonym map + * @param ignoreCase case-folds input for matching with {@link Character#toLowerCase(int)}. + * Note, if you set this to true, its your responsibility to lowercase + * the input entries when you create the {@link FSTSynonymMap} + */ + + public FSTSynonymFilter(TokenStream input, FSTSynonymMap synonyms, boolean ignoreCase) { + super(input); + this.synonyms = synonyms; + this.ignoreCase = ignoreCase; + this.fst = synonyms.fst; + + if (fst == null) { + throw new IllegalArgumentException("fst must be non-null"); + } + + // Must be 1+ so that when roll buffer is at full + // lookahead we can distinguish this full buffer from + // the empty buffer: + rollBufferSize = 1+synonyms.maxHorizontalContext; + + futureInputs = new PendingInput[rollBufferSize]; + futureOutputs = new PendingOutputs[rollBufferSize]; + for(int pos=0;pos> */ + public final FST fst; + /** map */ + public final BytesRefHash words; + /** maxHorizontalContext: maximum context we need on the tokenstream */ + public final int maxHorizontalContext; + /** maxVerticalContext: maximum number of synonym entries for a single input */ + public final int maxVerticalContext; + + public FSTSynonymMap(FST fst, BytesRefHash words, int maxHorizontalContext, int maxVerticalContext) { + this.fst = fst; + this.words = words; + this.maxHorizontalContext = maxHorizontalContext; + this.maxVerticalContext = maxVerticalContext; + } +} Index: modules/analysis/common/src/java/org/apache/lucene/analysis/synonym/FSTSynonymMapBuilder.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ modules/analysis/common/src/java/org/apache/lucene/analysis/synonym/FSTSynonymMapBuilder.java Mon Jul 04 13:40:54 2011 -0400 @@ -0,0 +1,181 @@ +package org.apache.lucene.analysis.synonym; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Map; +import java.util.TreeMap; + +import org.apache.lucene.store.ByteArrayDataOutput; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.BytesRefHash; +import org.apache.lucene.util.CharsRef; +import org.apache.lucene.util.UnicodeUtil; +import org.apache.lucene.util.fst.Builder; +import org.apache.lucene.util.fst.ByteSequenceOutputs; +import org.apache.lucene.util.fst.FST; + +/** + * Builds an FSTSynonymMap. + *
<p>
+ * Call add() until you have added all the mappings, then call build() to get an FSTSynonymMap + * @lucene.experimental + */ +public class FSTSynonymMapBuilder { + private final TreeMap workingSet = new TreeMap(CharsRef.getUTF16SortedAsUTF8Comparator()); + private final BytesRefHash words = new BytesRefHash(); + private final BytesRef utf8Scratch = new BytesRef(8); + private int maxHorizontalContext; + private int maxVerticalContext; + + private static class MapEntry { + boolean includeOrig; + // we could sort for better sharing ultimately, but it could confuse people + ArrayList ords = new ArrayList(); + } + + /** Sugar: just joins the provided terms with {@link + * FSTSynonymMap#WORD_SEPARATOR}. reuse and its chars + * must not be null. */ + public static CharsRef join(String[] words, CharsRef reuse) { + int upto = 0; + char[] buffer = reuse.chars; + for(String word : words) { + if (upto > 0) { + if (upto >= buffer.length) { + reuse.grow(upto); + buffer = reuse.chars; + } + buffer[upto++] = FSTSynonymMap.WORD_SEPARATOR; + } + + final int wordLen = word.length(); + final int needed = upto + wordLen; + if (needed > buffer.length) { + reuse.grow(needed); + buffer = reuse.chars; + } + + word.getChars(0, wordLen, buffer, upto); + upto += wordLen; + } + + return reuse; + } + + /** + * Add a phrase->phrase synonym mapping. + * Phrases are character sequences where words are separated with character zero (\u0000) + * + * @param input input phrase + * @param numInputWords number of input words in the input phrase + * @param output output phrase + * @param includeOrig true if the original should be included + */ + public void add(CharsRef input, int numInputWords, CharsRef output, int numOutputWords, boolean includeOrig) { + // first convert to UTF-8 + System.out.println("fmap.add input=" + input + " numInputWords=" + numInputWords + " output=" + output + " numOutputWords=" + numOutputWords); + final int hashCode = UnicodeUtil.UTF16toUTF8WithHash(output.chars, output.offset, output.length, utf8Scratch); + // lookup in hash + int ord = words.add(utf8Scratch, hashCode); + if (ord < 0) { + // already exists in our hash + ord = (-ord)-1; + System.out.println(" output=" + output + " old ord=" + ord); + } else { + System.out.println(" output=" + output + " new ord=" + ord); + } + + MapEntry e = workingSet.get(input); + if (e == null) { + e = new MapEntry(); + workingSet.put(new CharsRef(input), e); // make a copy, since we will keep around in our map + } + + e.ords.add(ord); + e.includeOrig |= includeOrig; + maxVerticalContext = Math.max(maxVerticalContext, e.ords.size()); + maxHorizontalContext = Math.max(maxHorizontalContext, numInputWords); + maxHorizontalContext = Math.max(maxHorizontalContext, numOutputWords); + } + + private int countWords(CharsRef chars) { + int wordCount = 1; + int upto = chars.offset; + final int limit = chars.offset + chars.length; + while(upto < limit) { + final int codePoint = Character.codePointAt(chars.chars, upto, limit); + if (codePoint == FSTSynonymMap.WORD_SEPARATOR) { + wordCount++; + } + upto += Character.charCount(codePoint); + } + return wordCount; + } + + /** + * Helper for {@link #add(CharsRef, int, CharsRef, boolean)}, except it counts + * the words in the input phrase for you. + *
<p>
+ * Chances are your parser is/can likely count this itself so it should just + * use the other method if so. + */ + public void add(CharsRef input, CharsRef output, boolean includeOrig) { + add(input, countWords(input), output, countWords(output), includeOrig); + } + + /** + * Builds an {@link FSTSynonymMap} and returns it. + */ + public FSTSynonymMap build() throws IOException { + ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton(); + // TODO: are we using the best sharing options? + Builder builder = new Builder(FST.INPUT_TYPE.BYTE4, 0, 0, true, outputs); + + BytesRef scratch = new BytesRef(64); + ByteArrayDataOutput scratchOutput = new ByteArrayDataOutput(); + + System.out.println("fmap.build"); + for (Map.Entry e : workingSet.entrySet()) { + CharsRef input = e.getKey(); + MapEntry output = e.getValue(); + + int numEntries = output.ords.size(); + + // output size, assume the worst case + int estimatedSize = 5 + numEntries * 5; // numEntries + one ord for each entry + + scratch.grow(estimatedSize); + scratchOutput.reset(scratch.bytes, scratch.offset, scratch.bytes.length); + + // now write our output data: + scratchOutput.writeVInt(numEntries << 1 | (output.includeOrig ? 0 : 1)); + for (int i = 0; i < numEntries; i++) { + scratchOutput.writeVInt(output.ords.get(i)); + } + + scratch.length = scratchOutput.getPosition() - scratch.offset; + System.out.println(" add input=" + input + " output=" + scratch + " offset=" + scratch.offset); + builder.add(input, new BytesRef(scratch)); + } + + FST fst = builder.finish(); + return new FSTSynonymMap(fst, words, maxHorizontalContext, maxVerticalContext); + } +} Index: modules/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestFSTSynonymMapFilter.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ modules/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestFSTSynonymMapFilter.java Mon Jul 04 13:40:54 2011 -0400 @@ -0,0 +1,145 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.lucene.analysis.synonym; + +import java.io.StringReader; + +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.tokenattributes.*; +import org.apache.lucene.util.CharsRef; + +// nocommit -- need randomized stress test, checking results +// against slowSimpleButCorrectParser + +public class TestFSTSynonymMapFilter extends BaseTokenStreamTestCase { + + private FSTSynonymMapBuilder b; + private Tokenizer tokensIn; + private FSTSynonymFilter tokensOut; + private CharTermAttribute termAtt; + private PositionIncrementAttribute posIncrAtt; + + private void add(String input, String output, boolean keepOrig) { + b.add(new CharsRef(input.replace(' ', '\u0000')), + new CharsRef(output.replace(' ', '\u0000')), + keepOrig); + } + + private void assertEquals(CharTermAttribute term, String expected) { + assertEquals(expected.length(), term.length()); + final char[] buffer = term.buffer(); + for(int chIDX=0;chIDX 0); + + // no captureStates when no syns matched + verify("p q r s t", "p q r s t"); + assertEquals(0, tokensOut.getCaptureCount()); + + // no captureStates when only single-input syns, w/ no + // lookahead needed, matched + verify("p q z y t", "p q boo y/bee t"); + assertEquals(0, tokensOut.getCaptureCount()); + } +} Index: modules/queries/src/test/org/apache/lucene/queries/function/FunctionTestSetup.java --- modules/queries/src/test/org/apache/lucene/queries/function/FunctionTestSetup.java Mon Jul 04 13:30:26 2011 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,178 +0,0 @@ -package org.apache.lucene.queries.function; - -import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.MockAnalyzer; -import org.apache.lucene.document.Document; -import org.apache.lucene.document.Field; -import org.apache.lucene.document.Fieldable; -import org.apache.lucene.index.IndexWriterConfig; -import org.apache.lucene.index.RandomIndexWriter; -import org.apache.lucene.queries.function.valuesource.ByteFieldSource; -import org.apache.lucene.queries.function.valuesource.FloatFieldSource; -import org.apache.lucene.queries.function.valuesource.IntFieldSource; -import org.apache.lucene.queries.function.valuesource.ShortFieldSource; -import org.apache.lucene.search.cache.*; -import org.apache.lucene.store.Directory; -import org.apache.lucene.util.LuceneTestCase; -import org.apache.lucene.util._TestUtil; -import org.junit.AfterClass; -import org.junit.Ignore; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -/** - * Setup for function tests - */ -@Ignore -public abstract class FunctionTestSetup extends LuceneTestCase { - - /** - * Actual score computation order is slightly different than assumptios - * this allows for a small amount of variation - */ - protected static float TEST_SCORE_TOLERANCE_DELTA = 0.001f; - - protected static final int N_DOCS = 17; // select a primary number > 2 - - protected static final String ID_FIELD = "id"; - protected static final String TEXT_FIELD = "text"; - protected static final String INT_FIELD = "iii"; - protected static final String FLOAT_FIELD = "fff"; - - private static final int CREATOR_FLAGS = CachedArrayCreator.CACHE_VALUES_AND_BITS; - - protected ValueSource BYTE_VALUESOURCE = new ByteFieldSource(new ByteValuesCreator(INT_FIELD, null, CREATOR_FLAGS)); - protected ValueSource SHORT_VALUESOURCE = new ShortFieldSource(new ShortValuesCreator(INT_FIELD, null, CREATOR_FLAGS)); - protected ValueSource INT_VALUESOURCE = new IntFieldSource(new IntValuesCreator(INT_FIELD, null, CREATOR_FLAGS)); - protected ValueSource INT_AS_FLOAT_VALUESOURCE = new FloatFieldSource(new FloatValuesCreator(INT_FIELD, null, CREATOR_FLAGS)); - protected ValueSource FLOAT_VALUESOURCE = new FloatFieldSource(new FloatValuesCreator(FLOAT_FIELD, null, CREATOR_FLAGS)); - - private static final String DOC_TEXT_LINES[] = { - "Well, this is just some plain text we use for creating the ", - "test documents. It used to be a text from an online collection ", - "devoted to first aid, but if there was there an (online) lawyers ", - "first aid collection with legal advices, \"it\" might have quite ", - "probably advised one not to include \"it\"'s text or the text of ", - "any other online collection in one's code, unless one has money ", - "that one don't need and one is happy to donate for lawyers ", - "charity. Anyhow at some point, rechecking the usage of this text, ", - "it became uncertain that this text is free to use, because ", - "the web site in the disclaimer of he eBook containing that text ", - "was not responding anymore, and at the same time, in projGut, ", - "searching for first aid no longer found that eBook as well. ", - "So here we are, with a perhaps much less interesting ", - "text for the test, but oh much much safer. ", - }; - - protected static Directory dir; - protected static Analyzer anlzr; - - @AfterClass - public static void afterClassFunctionTestSetup() throws Exception { - dir.close(); - dir = null; - anlzr = null; - } - - protected static void createIndex(boolean doMultiSegment) throws Exception { - if (VERBOSE) { - System.out.println("TEST: setUp"); - } - // prepare a small index with just a few documents. - dir = newDirectory(); - anlzr = new MockAnalyzer(random); - IndexWriterConfig iwc = newIndexWriterConfig( TEST_VERSION_CURRENT, anlzr).setMergePolicy(newLogMergePolicy()); - if (doMultiSegment) { - iwc.setMaxBufferedDocs(_TestUtil.nextInt(random, 2, 7)); - } - RandomIndexWriter iw = new RandomIndexWriter(random, dir, iwc); - iw.w.setInfoStream(VERBOSE ? 
System.out : null); - // add docs not exactly in natural ID order, to verify we do check the order of docs by scores - int remaining = N_DOCS; - boolean done[] = new boolean[N_DOCS]; - int i = 0; - while (remaining > 0) { - if (done[i]) { - throw new Exception("to set this test correctly N_DOCS=" + N_DOCS + " must be primary and greater than 2!"); - } - addDoc(iw, i); - done[i] = true; - i = (i + 4) % N_DOCS; - remaining --; - } - if (!doMultiSegment) { - if (VERBOSE) { - System.out.println("TEST: setUp optimize"); - } - iw.optimize(); - } - iw.close(); - if (VERBOSE) { - System.out.println("TEST: setUp done close"); - } - } - - private static void addDoc(RandomIndexWriter iw, int i) throws Exception { - Document d = new Document(); - Fieldable f; - int scoreAndID = i + 1; - - f = newField(ID_FIELD, id2String(scoreAndID), Field.Store.YES, Field.Index.NOT_ANALYZED); // for debug purposes - f.setOmitNorms(true); - d.add(f); - - f = newField(TEXT_FIELD, "text of doc" + scoreAndID + textLine(i), Field.Store.NO, Field.Index.ANALYZED); // for regular search - f.setOmitNorms(true); - d.add(f); - - f = newField(INT_FIELD, "" + scoreAndID, Field.Store.NO, Field.Index.NOT_ANALYZED); // for function scoring - f.setOmitNorms(true); - d.add(f); - - f = newField(FLOAT_FIELD, scoreAndID + ".000", Field.Store.NO, Field.Index.NOT_ANALYZED); // for function scoring - f.setOmitNorms(true); - d.add(f); - - iw.addDocument(d); - log("added: " + d); - } - - // 17 --> ID00017 - protected static String id2String(int scoreAndID) { - String s = "000000000" + scoreAndID; - int n = ("" + N_DOCS).length() + 3; - int k = s.length() - n; - return "ID" + s.substring(k); - } - - // some text line for regular search - private static String textLine(int docNum) { - return DOC_TEXT_LINES[docNum % DOC_TEXT_LINES.length]; - } - - // extract expected doc score from its ID Field: "ID7" --> 7.0 - protected static float expectedFieldScore(String docIDFieldVal) { - return Float.parseFloat(docIDFieldVal.substring(2)); - } - - // debug messages (change DBG to true for anything to print) - protected static void log(Object o) { - if (VERBOSE) { - System.out.println(o.toString()); - } - } -}
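
For reviewers, a self-contained sketch of how the pieces added by this patch fit together. The demo class name is made up, MockTokenizer is simply the test framework's tokenizer (any Tokenizer works), and the printed order/stacking of tokens is illustrative, not asserted:

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.synonym.FSTSynonymFilter;
import org.apache.lucene.analysis.synonym.FSTSynonymMap;
import org.apache.lucene.analysis.synonym.FSTSynonymMapBuilder;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.CharsRef;

public class FSTSynonymFilterDemo {
  public static void main(String[] args) throws IOException {
    // Build the map; multi-word entries join their words with
    // FSTSynonymMap.WORD_SEPARATOR (\u0000):
    FSTSynonymMapBuilder builder = new FSTSynonymMapBuilder();
    builder.add(new CharsRef("a\u0000b"), new CharsRef("y"), true); // keepOrig=true

    FSTSynonymMap map = builder.build();

    // Run the filter over a simple whitespace-tokenized stream:
    MockTokenizer tokenizer = new MockTokenizer(new StringReader("a b c"), MockTokenizer.WHITESPACE, false);
    TokenStream stream = new FSTSynonymFilter(tokenizer, map, false);
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncAtt = stream.addAttribute(PositionIncrementAttribute.class);

    stream.reset();
    while (stream.incrementToken()) {
      // Stacked synonym tokens arrive with position increment 0:
      System.out.println(termAtt + " (posInc=" + posIncAtt.getPositionIncrement() + ")");
    }
    stream.end();
    stream.close();
  }
}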