Index: modules/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymMapFilter.java =================================================================== --- modules/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymMapFilter.java (revision 1157868) +++ modules/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymMapFilter.java (working copy) @@ -29,10 +29,12 @@ import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.analysis.MockTokenizer; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.tokenattributes.*; import org.apache.lucene.analysis.util.ReusableAnalyzerBase; +import org.apache.lucene.analysis.util.ReusableAnalyzerBase.TokenStreamComponents; import org.apache.lucene.util.CharsRef; import org.apache.lucene.util._TestUtil; @@ -148,7 +150,7 @@ // mixed keepOrig true/false: verify("a m c e x", "a/foo dog barks loudly x"); - verify("c d m c e x", "c/dog d/harness m/holder/dog c/extras/barks loudly x"); + verify("c d m c e x", "c/dog d/harness holder/dog extras/barks loudly x"); assertTrue(tokensOut.getCaptureCount() > 0); // no captureStates when no syns matched @@ -181,6 +183,7 @@ assertTrue(doc.length() % 2 == 0); final int numInputs = doc.length()/2; boolean[] keepOrigs = new boolean[numInputs]; + boolean[] hasMatch = new boolean[numInputs]; Arrays.fill(keepOrigs, false); String[] outputs = new String[numInputs + maxOutputLength]; OneSyn[] matches = new OneSyn[numInputs]; @@ -223,6 +226,10 @@ if (syn == null) { continue; } + for(int idx=0;idx<(1+syn.in.length())/2;idx++) { + hasMatch[inputIDX+idx] = true; + keepOrigs[inputIDX+idx] |= syn.keepOrig; + } for(String synOut : syn.out) { final String[] synOutputs = synOut.split(" "); assertEquals(synOutputs.length, (1+synOut.length())/2); @@ -234,9 +241,6 @@ } else { outputs[matchIDX] = outputs[matchIDX] + "/" + synOutputs[synUpto++]; } - if (matchIDX < numInputs) { - keepOrigs[matchIDX] |= syn.keepOrig; - } } } } @@ -249,7 +253,8 @@ if (inputIDX >= numInputs && outputs[inputIDX] == null) { break; } - if (inputIDX < numInputs && (outputs[inputIDX] == null || keepOrigs[inputIDX])) { + if (inputIDX < numInputs && (!hasMatch[inputIDX] || keepOrigs[inputIDX])) { + assertTrue(inputTokens[inputIDX].length() != 0); sb.append(inputTokens[inputIDX]); posHasOutput = true; } @@ -259,6 +264,8 @@ sb.append('/'); } sb.append(outputs[inputIDX]); + } else if (!posHasOutput) { + continue; } if (inputIDX < limit-1) { sb.append(' '); @@ -390,4 +397,62 @@ checkRandomData(random, analyzer, 1000*RANDOM_MULTIPLIER); } } + + // LUCENE-3375 + public void testVanishingTerms() throws Exception { + String testFile = + "aaa => aaaa1 aaaa2 aaaa3\n" + + "bbb => bbbb1 bbbb2\n"; + + SolrSynonymParser parser = new SolrSynonymParser(true, true, new MockAnalyzer(random)); + parser.add(new StringReader(testFile)); + final SynonymMap map = parser.build(); + + Analyzer analyzer = new ReusableAnalyzerBase() { + @Override + protected TokenStreamComponents createComponents(String fieldName, Reader reader) { + Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, true); + return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, true)); + } + }; + + // where did my pot go?! + assertAnalyzesTo(analyzer, "xyzzy bbb pot of gold", + new String[] { "xyzzy", "bbbb1", "pot", "bbbb2", "of", "gold" }); + + // this one nukes 'pot' and 'of' + // xyzzy aaa pot of gold -> xyzzy aaaa1 aaaa2 aaaa3 gold + assertAnalyzesTo(analyzer, "xyzzy aaa pot of gold", + new String[] { "xyzzy", "aaaa1", "pot", "aaaa2", "of", "aaaa3", "gold" }); + } + + public void testBasic2() throws Exception { + b = new SynonymMap.Builder(true); + final boolean keepOrig = false; + add("aaa", "aaaa1 aaaa2 aaaa3", keepOrig); + add("bbb", "bbbb1 bbbb2", keepOrig); + tokensIn = new MockTokenizer(new StringReader("a"), + MockTokenizer.WHITESPACE, + true); + tokensIn.reset(); + assertTrue(tokensIn.incrementToken()); + assertFalse(tokensIn.incrementToken()); + tokensIn.end(); + tokensIn.close(); + + tokensOut = new SynonymFilter(tokensIn, + b.build(), + true); + termAtt = tokensOut.addAttribute(CharTermAttribute.class); + posIncrAtt = tokensOut.addAttribute(PositionIncrementAttribute.class); + offsetAtt = tokensOut.addAttribute(OffsetAttribute.class); + + if (keepOrig) { + verify("xyzzy bbb pot of gold", "xyzzy bbb/bbbb1 pot/bbbb2 of gold"); + verify("xyzzy aaa pot of gold", "xyzzy aaa/aaaa1 pot/aaaa2 of/aaaa3 gold"); + } else { + verify("xyzzy bbb pot of gold", "xyzzy bbbb1 pot/bbbb2 of gold"); + verify("xyzzy aaa pot of gold", "xyzzy aaaa1 pot/aaaa2 of/aaaa3 gold"); + } + } } Index: modules/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilter.java =================================================================== --- modules/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilter.java (revision 1157868) +++ modules/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilter.java (working copy) @@ -132,6 +132,7 @@ final CharsRef term = new CharsRef(); AttributeSource.State state; boolean keepOrig; + boolean matched; boolean consumed = true; int startOffset; int endOffset; @@ -140,6 +141,7 @@ state = null; consumed = true; keepOrig = false; + matched = false; } }; @@ -388,7 +390,7 @@ if (matchOutput != null) { //System.out.println(" add matchLength=" + matchInputLength + " output=" + matchOutput); inputSkipCount = matchInputLength; - addOutput(matchOutput); + addOutput(matchOutput, matchInputLength); } else if (nextRead != nextWrite) { // Even though we had no match here, we set to 1 // because we need to skip current input token before @@ -402,7 +404,7 @@ } // Interleaves all output tokens onto the futureOutputs: - private void addOutput(BytesRef bytes) { + private void addOutput(BytesRef bytes, int matchInputLength) { bytesReader.reset(bytes.bytes, bytes.offset, bytes.length); final int code = bytesReader.readVInt(); @@ -426,13 +428,19 @@ futureOutputs[outputUpto].add(scratchChars.chars, lastStart, outputLen); //System.out.println(" " + new String(scratchChars.chars, lastStart, outputLen) + " outputUpto=" + outputUpto); lastStart = 1+chIDX; - futureInputs[outputUpto].keepOrig |= keepOrig; //System.out.println(" slot=" + outputUpto + " keepOrig=" + keepOrig); outputUpto = rollIncr(outputUpto); assert futureOutputs[outputUpto].posIncr == 1: "outputUpto=" + outputUpto + " vs nextWrite=" + nextWrite; } } } + + int upto = nextRead; + for(int idx=0;idx