Index: modules/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymMapFilter.java =================================================================== --- modules/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymMapFilter.java (revision 1227129) +++ modules/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymMapFilter.java (working copy) @@ -59,7 +59,12 @@ } } - // todo: we should probably refactor this guy to use/take analyzer, + // For the output string: separate positions with a space, + // and separate multiple tokens at each position with a + // /. If a token should have end offset != the input + // token's end offset then add :X to it: + + // TODO: we should probably refactor this guy to use/take analyzer, // the tests are a little messy private void verify(String input, String output) throws Exception { if (VERBOSE) { @@ -73,7 +78,7 @@ while(tokensOut.incrementToken()) { if (VERBOSE) { - System.out.println(" incr token=" + termAtt.toString() + " posIncr=" + posIncrAtt.getPositionIncrement()); + System.out.println(" incr token=" + termAtt.toString() + " posIncr=" + posIncrAtt.getPositionIncrement() + " startOff=" + offsetAtt.startOffset() + " endOff=" + offsetAtt.endOffset()); } assertTrue(expectedUpto < expected.length); @@ -85,16 +90,26 @@ if (atPos > 0) { assertTrue(tokensOut.incrementToken()); if (VERBOSE) { - System.out.println(" incr token=" + termAtt.toString() + " posIncr=" + posIncrAtt.getPositionIncrement()); + System.out.println(" incr token=" + termAtt.toString() + " posIncr=" + posIncrAtt.getPositionIncrement() + " startOff=" + offsetAtt.startOffset() + " endOff=" + offsetAtt.endOffset()); } } - assertEquals(termAtt, expectedAtPos[atPos]); + final int colonIndex = expectedAtPos[atPos].indexOf(':'); + final String expectedToken; + final int expectedEndOffset; + if (colonIndex != -1) { + expectedToken = expectedAtPos[atPos].substring(0, colonIndex); + expectedEndOffset = Integer.parseInt(expectedAtPos[atPos].substring(1+colonIndex)); + } else { + expectedToken = expectedAtPos[atPos]; + expectedEndOffset = endOffset; + } + assertEquals(expectedToken, termAtt.toString()); assertEquals(atPos == 0 ? 1 : 0, posIncrAtt.getPositionIncrement()); // start/end offset of all tokens at same pos should // be the same: assertEquals(startOffset, offsetAtt.startOffset()); - assertEquals(endOffset, offsetAtt.endOffset()); + assertEquals(expectedEndOffset, offsetAtt.endOffset()); } } tokensOut.end(); @@ -112,6 +127,7 @@ add("b c", "dog collar", true); add("c d", "dog harness holder extras", true); add("m c e", "dog barks loudly", false); + add("i j k", "feep", true); add("e f", "foo bar", false); add("e f", "baz bee", false); @@ -148,6 +164,9 @@ // two outputs for same input verify("e f", "foo/baz bar/bee"); + // verify multi-word / single-output offsets: + verify("g i j k g", "g i/feep:7 j k g"); + // mixed keepOrig true/false: verify("a m c e x", "a/foo dog barks loudly x"); verify("c d m c e x", "c/dog d/harness holder/dog extras/barks loudly x"); @@ -241,6 +260,10 @@ } else { outputs[matchIDX] = outputs[matchIDX] + "/" + synOutputs[synUpto++]; } + if (synOutputs.length == 1) { + // Add endOffset + outputs[matchIDX] = outputs[matchIDX] + ":" + ((inputIDX*2) + syn.in.length()); + } } } } @@ -663,4 +686,24 @@ new String[] { "zoo", "zoo", "zoo", "$", "zoo", "zoo", "zoo" }, new int[] { 1, 0, 1, 1, 1, 0, 1 }); } + + public void testMultiwordOffsets() throws Exception { + b = new SynonymMap.Builder(true); + final boolean keepOrig = true; + add("national hockey league", "nhl", keepOrig); + final SynonymMap map = b.build(); + Analyzer a = new Analyzer() { + @Override + protected TokenStreamComponents createComponents(String fieldName, Reader reader) { + Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); + return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, true)); + } + }; + + assertAnalyzesTo(a, "national hockey league", + new String[] { "national", "nhl", "hockey", "league" }, + new int[] { 0, 0, 9, 16 }, + new int[] { 8, 22, 15, 22 }, + new int[] { 1, 0, 1, 1 }); + } } Index: modules/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilter.java =================================================================== --- modules/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilter.java (revision 1227129) +++ modules/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilter.java (working copy) @@ -153,12 +153,15 @@ // Holds pending output synonyms for one future position: private static class PendingOutputs { CharsRef[] outputs; + int[] endOffsets; int upto; int count; int posIncr = 1; + int lastEndOffset; public PendingOutputs() { outputs = new CharsRef[1]; + endOffsets = new int[1]; } public void reset() { @@ -168,6 +171,7 @@ public CharsRef pullNext() { assert upto < count; + lastEndOffset = endOffsets[upto]; final CharsRef result = outputs[upto++]; posIncr = 0; if (upto == count) { @@ -176,16 +180,29 @@ return result; } - public void add(char[] output, int offset, int len) { + public int getLastEndOffset() { + return lastEndOffset; + } + + public void add(char[] output, int offset, int len, int endOffset) { if (count == outputs.length) { final CharsRef[] next = new CharsRef[ArrayUtil.oversize(1+count, RamUsageEstimator.NUM_BYTES_OBJECT_REF)]; System.arraycopy(outputs, 0, next, 0, count); outputs = next; } + if (count == endOffsets.length) { + final int[] next = new int[ArrayUtil.oversize(1+count, RamUsageEstimator.NUM_BYTES_INT)]; + System.arraycopy(endOffsets, 0, next, 0, count); + endOffsets = next; + } if (outputs[count] == null) { outputs[count] = new CharsRef(); } outputs[count].copyChars(output, offset, len); + // endOffset can be -1, in which case we should simply + // use the endOffset of the input token, or X >= 0, in + // which case we use X as the endOffset for this output + endOffsets[count] = endOffset; count++; } }; @@ -281,6 +298,7 @@ // Holds the longest match we've seen so far: BytesRef matchOutput = null; int matchInputLength = 0; + int matchEndOffset = -1; BytesRef pendingOutput = fst.outputs.getNoOutput(); fst.getFirstArc(scratchArc); @@ -297,6 +315,8 @@ final int bufferLen; //System.out.println(" cycle nextRead=" + curNextRead + " nextWrite=" + nextWrite); + int inputEndOffset = 0; + if (curNextRead == nextWrite) { // We used up our lookahead buffer of input tokens @@ -317,6 +337,7 @@ final PendingInput input = futureInputs[nextWrite]; input.startOffset = offsetAtt.startOffset(); input.endOffset = offsetAtt.endOffset(); + inputEndOffset = input.endOffset; //System.out.println(" new token=" + new String(buffer, 0, bufferLen)); if (nextRead != nextWrite) { capture(); @@ -335,6 +356,7 @@ // Still in our lookahead buffer = futureInputs[curNextRead].term.chars; bufferLen = futureInputs[curNextRead].term.length; + inputEndOffset = futureInputs[curNextRead].endOffset; //System.out.println(" old token=" + new String(buffer, 0, bufferLen)); } @@ -360,6 +382,7 @@ if (scratchArc.isFinal()) { matchOutput = fst.outputs.add(pendingOutput, scratchArc.nextFinalOutput); matchInputLength = tokenCount; + matchEndOffset = inputEndOffset; //System.out.println(" found matchLength=" + matchInputLength + " output=" + matchOutput); } @@ -390,7 +413,7 @@ if (matchOutput != null) { //System.out.println(" add matchLength=" + matchInputLength + " output=" + matchOutput); inputSkipCount = matchInputLength; - addOutput(matchOutput, matchInputLength); + addOutput(matchOutput, matchInputLength, matchEndOffset); } else if (nextRead != nextWrite) { // Even though we had no match here, we set to 1 // because we need to skip current input token before @@ -404,7 +427,7 @@ } // Interleaves all output tokens onto the futureOutputs: - private void addOutput(BytesRef bytes, int matchInputLength) { + private void addOutput(BytesRef bytes, int matchInputLength, int matchEndOffset) { bytesReader.reset(bytes.bytes, bytes.offset, bytes.length); final int code = bytesReader.readVInt(); @@ -425,7 +448,21 @@ // Caller is not allowed to have empty string in // the output: assert outputLen > 0: "output contains empty string: " + scratchChars; - futureOutputs[outputUpto].add(scratchChars.chars, lastStart, outputLen); + final int endOffset; + if (chIDX == chEnd && lastStart == scratchChars.offset) { + // This rule had a single output token, so, we set + // this output's endOffset to the current + // endOffset (ie, endOffset of the last input + // token it matched): + endOffset = matchEndOffset; + } else { + // This rule has more than one output token; we + // can't pick any particular endOffset for this + // case, so, we inherit the endOffset for the + // input token which this output overlaps: + endOffset = -1; + } + futureOutputs[outputUpto].add(scratchChars.chars, lastStart, outputLen, endOffset); //System.out.println(" " + new String(scratchChars.chars, lastStart, outputLen) + " outputUpto=" + outputUpto); lastStart = 1+chIDX; //System.out.println(" slot=" + outputUpto + " keepOrig=" + keepOrig); @@ -507,7 +544,11 @@ clearAttributes(); termAtt.copyBuffer(output.chars, output.offset, output.length); typeAtt.setType(TYPE_SYNONYM); - offsetAtt.setOffset(input.startOffset, input.endOffset); + int endOffset = outputs.getLastEndOffset(); + if (endOffset == -1) { + endOffset = input.endOffset; + } + offsetAtt.setOffset(input.startOffset, endOffset); posIncrAtt.setPositionIncrement(posIncr); if (outputs.count == 0) { // Done with the buffered input and all outputs at Index: lucene/contrib/CHANGES.txt =================================================================== --- lucene/contrib/CHANGES.txt (revision 1227129) +++ lucene/contrib/CHANGES.txt (working copy) @@ -136,6 +136,14 @@ * LUCENE-3609: Fix regression in BooleanFilter, introduced in Lucene 3.5, to correctly handle minShouldMatch behaviour of previous versions. (Shay Banon, Uwe Schindler) + + * LUCENE-3668: For a multi-token synonym mapping to a single token, + SynonymFilter will now set the start offset of the synonym token to + the start offset of the first matched token, and the end offset of + the synonym token to the end offset of the last matched token. + This way if the synonym token is used for highlighting, it will + cover all tokens it had matched. (Koji Sekiguchi, Robert Muir, + Mike McCandless) Documentation Index: solr/core/src/test/org/apache/solr/analysis/TestSynonymFilterFactory.java =================================================================== --- solr/core/src/test/org/apache/solr/analysis/TestSynonymFilterFactory.java (revision 1227129) +++ solr/core/src/test/org/apache/solr/analysis/TestSynonymFilterFactory.java (working copy) @@ -21,6 +21,7 @@ import java.io.IOException; import java.io.InputStream; import java.io.StringReader; +import java.util.Arrays; import java.util.HashMap; import java.util.List; import java.util.Map; @@ -65,6 +66,25 @@ new int[] { 1, 0, 0, 0 }); } + /** test multiword offsets with the old impl + * @deprecated Remove this test in Lucene 5.0 */ + @Deprecated + public void testMultiwordOffsetsOld() throws Exception { + SynonymFilterFactory factory = new SynonymFilterFactory(); + Map args = new HashMap(); + args.put("luceneMatchVersion", Version.LUCENE_33.toString()); + args.put("synonyms", "synonyms.txt"); + factory.init(args); + factory.inform(new StringMockSolrResourceLoader("national hockey league, nhl")); + TokenStream ts = factory.create(new MockTokenizer(new StringReader("national hockey league"), MockTokenizer.WHITESPACE, false)); + // WTF? + assertTokenStreamContents(ts, + new String[] { "national", "nhl", "hockey", "league" }, + new int[] { 0, 0, 0, 0 }, + new int[] { 22, 22, 22, 22 }, + new int[] { 1, 0, 1, 1 }); + } + /** if the synonyms are completely empty, test that we still analyze correctly */ public void testEmptySynonyms() throws Exception { SynonymFilterFactory factory = new SynonymFilterFactory(); @@ -85,7 +105,7 @@ } public List getLines(String resource) throws IOException { - return null; + return Arrays.asList(text.split("\n")); } public Object newInstance(String cname, String... subpackages) {