diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleGraphFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleGraphFilter.java
new file mode 100644
index 0000000000..52d300162d
--- /dev/null
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleGraphFilter.java
@@ -0,0 +1,302 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.shingle;
+
+import java.io.IOException;
+import java.util.LinkedList;
+import java.util.List;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+import org.apache.lucene.util.AttributeSource;
+
+/**
+ * A graph-aware shingle filter: concatenates runs of adjacent tokens into
+ * single tokens, following every path through an incoming token graph (for
+ * example one produced by SynonymGraphFilter).  Positions removed from the
+ * stream, such as stopwords, are represented by a configurable filler token.
+ */
+public final class ShingleGraphFilter extends TokenFilter {
+
+  // recycled Token instances, so we don't re-allocate AttributeSources
+  private final List<Token> tokenPool = new LinkedList<>();
+
+  private final int minShingleSize;
+  private final int maxShingleSize;
+  private final boolean emitUnigrams;
+  private final String tokenSeparator;
+  private final Token GAP_TOKEN = new Token(new AttributeSource());
+  private final Token END_TOKEN = new Token(new AttributeSource());
+
+  private final PositionLengthAttribute lenAtt = addAttribute(PositionLengthAttribute.class);
+  private final PositionIncrementAttribute incAtt = addAttribute(PositionIncrementAttribute.class);
+  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
+  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+  private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
+
+  private Token[] currentShingleTokens;
+  private int shingleSize;
+  private boolean unigramDone;
+
+  public ShingleGraphFilter(TokenStream input, int minShingleSize, int maxShingleSize, boolean emitUnigrams) {
+    this(input, minShingleSize, maxShingleSize, emitUnigrams, " ", "_");
+  }
+
+  public ShingleGraphFilter(TokenStream input, int minShingleSize, int maxShingleSize, boolean emitUnigrams, String tokenSeparator, String fillerToken) {
+    super(input);
+    this.minShingleSize = minShingleSize;
+    this.maxShingleSize = maxShingleSize;
+    this.emitUnigrams = emitUnigrams;
+    this.tokenSeparator = tokenSeparator;
+
+    this.GAP_TOKEN.termAtt.setEmpty().append(fillerToken);
+
+    this.currentShingleTokens = new Token[maxShingleSize];
+  }
+
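+  // Emission order for each root token: the longest shingle first, then
+  // successively shorter shingles down to minShingleSize, then any alternate
+  // paths through the graph, then (optionally) the unigram itself.
+  // nextShingle() drives this; advanceStack() switches between parallel paths.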
+  @Override
+  public boolean incrementToken() throws IOException {
+    int posInc = 0;
+    if (nextShingle() == false) {
+      Token nextRoot = nextTokenInStream(currentShingleTokens[0]);
+      if (nextRoot == END_TOKEN)
+        return false;
+      recycleToken(currentShingleTokens[0]);
+      resetShingleRoot(nextRoot);
+      posInc = currentShingleTokens[0].posInc();
+    }
+    clearAttributes();
+    lenAtt.setPositionLength(shingleLength());
+    incAtt.setPositionIncrement(posInc);
+    offsetAtt.setOffset(currentShingleTokens[0].startOffset(), lastTokenInShingle().endOffset());
+    termAtt.setEmpty();
+    termAtt.append(currentShingleTokens[0].term());
+    typeAtt.setType(shingleSize > 1 ? "shingle" : currentShingleTokens[0].type());
+    for (int i = 1; i < shingleSize; i++) {
+      termAtt.append(tokenSeparator).append(currentShingleTokens[i].term());
+    }
+    return true;
+  }
+
+  @Override
+  public void reset() throws IOException {
+    super.reset();
+    this.currentShingleTokens[0] = null;
+  }
+
+  private Token lastTokenInShingle() {
+    int lastTokenIndex = shingleSize - 1;
+    while (currentShingleTokens[lastTokenIndex] == GAP_TOKEN) {
+      lastTokenIndex--;
+    }
+    return currentShingleTokens[lastTokenIndex];
+  }
+
+  private void resetShingleRoot(Token token) throws IOException {
+    this.currentShingleTokens[0] = token;
+    this.shingleSize = maxShingleSize;
+    this.unigramDone = !emitUnigrams;
+    token: for (int i = 1; i < maxShingleSize; i++) {
+      Token current = nextTokenInGraph(this.currentShingleTokens[i - 1]);
+      if (current == END_TOKEN) {
+        this.shingleSize = i + END_TOKEN.posInc();
+        if (this.shingleSize > 1) {
+          // fill in any trailing gaps
+          for (int j = 1; j < shingleSize; j++) {
+            this.currentShingleTokens[i] = GAP_TOKEN;
+            i++;
+            if (i >= maxShingleSize) {
+              break token;
+            }
+          }
+        }
+        return;
+      }
+      if (current.posInc() > 1) {
+        // insert gaps into the shingle list
+        for (int j = 1; j < current.posInc(); j++) {
+          this.currentShingleTokens[i] = GAP_TOKEN;
+          i++;
+          if (i >= maxShingleSize) {
+            break token;
+          }
+        }
+      }
+      this.currentShingleTokens[i] = current;
+    }
+  }
+
+  private boolean nextShingle() throws IOException {
+    if (currentShingleTokens[0] == null)
+      return false;
+    if (shingleSize <= minShingleSize) {
+      if (advanceStack()) {
+        return true;
+      }
+      if (unigramDone || shingleSize == 1) {
+        return false;
+      }
+      unigramDone = true;
+      this.shingleSize = 1;
+      return true;
+    }
+    shingleSize--;
+    return true;
+  }
+
+  private int shingleLength() {
+    int len = 0;
+    for (int i = 0; i < shingleSize; i++) {
+      len += currentShingleTokens[i].length();
+    }
+    return len;
+  }
+
+  // check if the next token in the tokenstream is at the same position as this one
+  private boolean lastInStack(Token token) throws IOException {
+    Token next = nextTokenInStream(token);
+    return next == END_TOKEN || next.posInc() != 0;
+  }
+
+  private boolean advanceStack() throws IOException {
+    for (int i = maxShingleSize - 1; i >= 1; i--) {
+      if (lastInStack(currentShingleTokens[i]) == false) {
+        currentShingleTokens[i] = nextTokenInStream(currentShingleTokens[i]);
+        for (int j = i + 1; j < maxShingleSize; j++) {
+          currentShingleTokens[j] = nextTokenInGraph(currentShingleTokens[j - 1]);
+        }
+        return true;
+      }
+    }
+    return false;
+  }
+
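+  // Tokens are taken from a pool and recycled once the shingle root moves
+  // past them, so the filter only ever allocates as many Token instances
+  // (and cloned AttributeSources) as its widest lookahead window needs.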
+  private Token newToken() {
+    Token token = tokenPool.size() == 0 ? new Token(this.cloneAttributes()) : tokenPool.remove(0);
+    token.reset(this);
+    return token;
+  }
+
+  private void recycleToken(Token token) {
+    if (token == null)
+      return;
+    token.nextToken = null;
+    tokenPool.add(token);
+  }
+
+  // for testing
+  int instantiatedTokenCount() {
+    int tokenCount = tokenPool.size() + 1;
+    if (currentShingleTokens[0] == END_TOKEN || currentShingleTokens[0] == null)
+      return tokenCount;
+    for (Token t = currentShingleTokens[0]; t != END_TOKEN && t != null; t = t.nextToken) {
+      tokenCount++;
+    }
+    return tokenCount;
+  }
+
+  private Token nextTokenInGraph(Token token) throws IOException {
+    int length = token.length();
+    while (length > 0) {
+      token = nextTokenInStream(token);
+      if (token == END_TOKEN)
+        return END_TOKEN;
+      length -= token.posInc();
+    }
+    return token;
+  }
+
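+  // Tokens read from the underlying stream form a linked list (via
+  // Token.nextToken), so the same stretch of input can be walked repeatedly
+  // while different paths through the graph are shingled.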
+  private Token nextTokenInStream(Token token) throws IOException {
+    if (token == null) { // first call
+      if (input.incrementToken() == false) {
+        input.end();
+        // check for gaps at the end of the tokenstream
+        END_TOKEN.posIncAtt.setPositionIncrement(this.incAtt.getPositionIncrement());
+        END_TOKEN.offsetAtt.setOffset(END_TOKEN.offsetAtt.startOffset(), this.offsetAtt.endOffset());
+        return END_TOKEN;
+      }
+      return newToken();
+    }
+    if (token.nextToken == null) { // end of cache, advance the underlying tokenstream
+      if (input.incrementToken() == false) {
+        input.end();
+        // check for gaps at the end of the tokenstream
+        END_TOKEN.posIncAtt.setPositionIncrement(this.incAtt.getPositionIncrement());
+        END_TOKEN.offsetAtt.setOffset(END_TOKEN.offsetAtt.startOffset(), this.offsetAtt.endOffset());
+        token.nextToken = END_TOKEN;
+      }
+      else {
+        token.nextToken = newToken();
+      }
+    }
+    return token.nextToken;
+  }
+
+  private static class Token {
+    final AttributeSource attSource;
+    final PositionLengthAttribute posLenAtt;
+    final PositionIncrementAttribute posIncAtt;
+    final CharTermAttribute termAtt;
+    final OffsetAttribute offsetAtt;
+    final TypeAttribute typeAtt;
+
+    Token nextToken;
+
+    Token(AttributeSource attSource) {
+      this.attSource = attSource;
+      this.posIncAtt = attSource.addAttribute(PositionIncrementAttribute.class);
+      this.posLenAtt = attSource.addAttribute(PositionLengthAttribute.class);
+      this.termAtt = attSource.addAttribute(CharTermAttribute.class);
+      this.offsetAtt = attSource.addAttribute(OffsetAttribute.class);
+      this.typeAtt = attSource.addAttribute(TypeAttribute.class);
+    }
+
+    int length() {
+      return this.posLenAtt.getPositionLength();
+    }
+
+    int posInc() {
+      return this.posIncAtt.getPositionIncrement();
+    }
+
+    CharSequence term() {
+      return this.termAtt;
+    }
+
+    String type() {
+      return this.typeAtt.type();
+    }
+
+    int startOffset() {
+      return this.offsetAtt.startOffset();
+    }
+
+    int endOffset() {
+      return this.offsetAtt.endOffset();
+    }
+
+    void reset(AttributeSource attSource) {
+      this.attSource.restoreState(attSource.captureState());
+      this.nextToken = null;
+    }
+
+    @Override
+    public String toString() {
+      return term() + "(" + startOffset() + "," + endOffset() + ") " + posInc() + "," + length();
+    }
+  }
+
+}
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleGraphFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleGraphFilterFactory.java
new file mode 100644
index 0000000000..b59348e9bd
--- /dev/null
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleGraphFilterFactory.java
@@ -0,0 +1,46 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.shingle;
+
+import java.util.Map;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.util.TokenFilterFactory;
+
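+/**
+ * Factory for {@link ShingleGraphFilter}.
+ *
+ * Supported parameters (defaults in brackets): minShingleSize [1],
+ * maxShingleSize [2], outputUnigrams [true], tokenSeparator [" "],
+ * fillerToken ["_"].
+ */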
+public class ShingleGraphFilterFactory extends TokenFilterFactory {
+
+  private final int minShingleSize;
+  private final int maxShingleSize;
+  private final boolean outputUnigrams;
+  private final String tokenSeparator;
+  private final String fillerToken;
+
+  public ShingleGraphFilterFactory(Map<String, String> args) {
+    super(args);
+    this.maxShingleSize = getInt(args, "maxShingleSize", 2);
+    this.minShingleSize = getInt(args, "minShingleSize", 1);
+    this.outputUnigrams = getBoolean(args, "outputUnigrams", true);
+    this.tokenSeparator = get(args, "tokenSeparator", " ");
+    this.fillerToken = get(args, "fillerToken", "_");
+  }
+
+  @Override
+  public TokenStream create(TokenStream input) {
+    return new ShingleGraphFilter(input, minShingleSize, maxShingleSize, outputUnigrams, tokenSeparator, fillerToken);
+  }
+}
diff --git a/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory b/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory
index 6dcc81ce04..6c1a8d68ab 100644
--- a/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory
+++ b/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory
@@ -101,6 +101,7 @@ org.apache.lucene.analysis.pt.PortugueseStemFilterFactory
 org.apache.lucene.analysis.reverse.ReverseStringFilterFactory
 org.apache.lucene.analysis.ru.RussianLightStemFilterFactory
 org.apache.lucene.analysis.shingle.ShingleFilterFactory
+org.apache.lucene.analysis.shingle.ShingleGraphFilterFactory
 org.apache.lucene.analysis.snowball.SnowballPorterFilterFactory
 org.apache.lucene.analysis.sr.SerbianNormalizationFilterFactory
 org.apache.lucene.analysis.standard.ClassicFilterFactory
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/shingle/ShingleGraphFilterTest.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/shingle/ShingleGraphFilterTest.java
new file mode 100644
index 0000000000..5beed40809
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/shingle/ShingleGraphFilterTest.java
@@ -0,0 +1,322 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.shingle;
+
+import java.io.IOException;
+import java.io.StringReader;
+import java.text.ParseException;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.MockAnalyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.WhitespaceTokenizer;
+import org.apache.lucene.analysis.custom.CustomAnalyzer;
+import org.apache.lucene.analysis.synonym.SolrSynonymParser;
+import org.apache.lucene.analysis.synonym.SynonymGraphFilter;
+import org.apache.lucene.analysis.synonym.SynonymMap;
+import org.apache.lucene.util.CharsRef;
+
+public class ShingleGraphFilterTest extends BaseTokenStreamTestCase {
+
+  public void testBiGramFilter() throws IOException {
+
+    Analyzer analyzer = CustomAnalyzer.builder()
+        .withTokenizer("whitespace")
+        .addTokenFilter("shinglegraph", "maxShingleSize", "2", "minShingleSize", "2")
+        .build();
+
+    try (TokenStream ts = analyzer.tokenStream("field", "please divide this sentence into shingles")) {
+      assertTokenStreamContents(ts,
+          new String[] { "please divide", "please", "divide this", "divide", "this sentence", "this", "sentence into", "sentence", "into shingles", "into", "shingles" },
+          new int[] { 0, 0, 7, 7, 14, 14, 19, 19, 28, 28, 33 },
+          new int[] { 13, 6, 18, 13, 27, 18, 32, 27, 41, 32, 41 },
+          new String[] { "shingle", "word", "shingle", "word", "shingle", "word", "shingle", "word", "shingle", "word", "word" },
+          new int[] { 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 },
+          new int[] { 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 1 });
+    }
+
+  }
+
+  public void testBiGramFilterWithAltSeparator() throws IOException {
+
+    Analyzer analyzer = CustomAnalyzer.builder()
+        .withTokenizer("whitespace")
+        .addTokenFilter("shinglegraph", "maxShingleSize", "2", "minShingleSize", "2", "outputUnigrams", "false",
+            "tokenSeparator", "")
+        .build();
+
+    try (TokenStream ts = analyzer.tokenStream("field", "please divide this sentence into shingles")) {
+      assertTokenStreamContents(ts,
+          new String[] { "pleasedivide", "dividethis", "thissentence", "sentenceinto", "intoshingles", "shingles" },
+          new int[] { 0, 7, 14, 19, 28, 33 },
+          new int[] { 13, 18, 27, 32, 41, 41 },
+          new String[] { "shingle", "shingle", "shingle", "shingle", "shingle", "word" },
+          new int[] { 1, 1, 1, 1, 1, 1 },
+          new int[] { 2, 2, 2, 2, 2, 1 });
+    }
+
+  }
+
+  public void testBiGramNoUnigrams() throws IOException {
+
+    Analyzer analyzer = CustomAnalyzer.builder()
+        .withTokenizer("whitespace")
+        .addTokenFilter("shinglegraph", "maxShingleSize", "2", "minShingleSize", "2", "outputUnigrams", "false")
+        .build();
+
+    try (TokenStream ts = analyzer.tokenStream("field", "please divide this sentence into shingles")) {
+      assertTokenStreamContents(ts,
+          new String[] { "please divide", "divide this", "this sentence", "sentence into", "into shingles", "shingles" },
+          new int[] { 0, 7, 14, 19, 28, 33 },
+          new int[] { 13, 18, 27, 32, 41, 41 },
+          new String[] { "shingle", "shingle", "shingle", "shingle", "shingle", "word" },
+          new int[] { 1, 1, 1, 1, 1, 1 },
+          new int[] { 2, 2, 2, 2, 2, 1 });
+    }
+
+  }
+
+  public void testTriGramFilter() throws IOException {
+
+    Analyzer analyzer = CustomAnalyzer.builder()
+        .withTokenizer("whitespace")
+        .addTokenFilter("shinglegraph", "maxShingleSize", "3", "minShingleSize", "2")
+        .build();
+
+    try (TokenStream ts = analyzer.tokenStream("field", "please divide this sentence into shingles")) {
+      assertTokenStreamContents(ts,
+          new String[] { "please divide this", "please divide", "please", "divide this sentence", "divide this", "divide",
+              "this sentence into", "this sentence", "this", "sentence into shingles", "sentence into", "sentence",
+              "into shingles", "into", "shingles" },
+          new int[] { 0, 0, 0, 7, 7, 7,
+              14, 14, 14, 19, 19, 19,
+              28, 28, 33 },
+          new int[] { 18, 13, 6, 27, 18, 13,
+              32, 27, 18, 41, 32, 27,
+              41, 32, 41 },
+          new String[] { "shingle", "shingle", "word", "shingle", "shingle", "word",
+              "shingle", "shingle", "word", "shingle", "shingle", "word",
+              "shingle", "word", "word" },
+          new int[] { 1, 0, 0, 1, 0, 0,
+              1, 0, 0, 1, 0, 0,
+              1, 0, 1 },
+          new int[] { 3, 2, 1, 3, 2, 1,
+              3, 2, 1, 3, 2, 1,
+              2, 1, 1 });
+    }
+
+  }
+
+  public void testTriGramNoUnigrams() throws IOException {
+
+    Analyzer analyzer = CustomAnalyzer.builder()
+        .withTokenizer("whitespace")
+        .addTokenFilter("shinglegraph", "maxShingleSize", "3", "minShingleSize", "2", "outputUnigrams", "false")
+        .build();
+
+    try (TokenStream ts = analyzer.tokenStream("field", "please divide this sentence into shingles")) {
+      assertTokenStreamContents(ts,
+          new String[] { "please divide this", "please divide", "divide this sentence", "divide this",
+              "this sentence into", "this sentence", "sentence into shingles", "sentence into",
+              "into shingles", "shingles" },
+          new int[] { 0, 0, 7, 7,
+              14, 14, 19, 19,
+              28, 33 },
+          new int[] { 18, 13, 27, 18,
+              32, 27, 41, 32,
+              41, 41 },
+          new String[] { "shingle", "shingle", "shingle", "shingle",
+              "shingle", "shingle", "shingle", "shingle",
+              "shingle", "word" },
+          new int[] { 1, 0, 1, 0,
+              1, 0, 1, 0,
+              1, 1 },
+          new int[] { 3, 2, 3, 2,
+              3, 2, 3, 2,
+              2, 1 });
+    }
+
+  }
+
+  public void testWithStopwords() throws IOException {
+
+    Analyzer analyzer = CustomAnalyzer.builder()
+        .withTokenizer("whitespace")
+        .addTokenFilter("stop")
+        .addTokenFilter("shinglegraph", "maxShingleSize", "3", "minShingleSize", "2")
+        .build();
+
+    try (TokenStream ts = analyzer.tokenStream("field", "please divide this sentence into shingles")) {
+      assertTokenStreamContents(ts,
+          new String[] { "please divide _", "please divide", "please", "divide _ sentence", "divide _", "divide",
+              "sentence _ shingles", "sentence _", "sentence", "shingles" },
+          new int[] { 0, 0, 0, 7, 7, 7,
+              19, 19, 19, 33 },
+          new int[] { 13, 13, 6, 27, 13, 13,
+              41, 27, 27, 41 },
+          new String[] { "shingle", "shingle", "word", "shingle", "shingle", "word",
+              "shingle", "shingle", "word", "word" },
+          new int[] { 1, 0, 0, 1, 0, 0,
+              2, 0, 0, 2 },
+          new int[] { 3, 2, 1, 3, 2, 1,
+              3, 2, 1, 1 });
+    }
+
+  }
+
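+  // Removed stopwords leave gaps in the token stream; the filter pads these
+  // with the filler token, both mid-stream and (via the end-of-stream
+  // position increment) at the end of the input.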
+  public void testConsecutiveStopwords() throws IOException {
+
+    Analyzer analyzer = CustomAnalyzer.builder()
+        .withTokenizer("whitespace")
+        .addTokenFilter("stop")
+        .addTokenFilter("shinglegraph", "maxShingleSize", "4", "minShingleSize", "4", "outputUnigrams", "false")
+        .build();
+
+    try (TokenStream ts = analyzer.tokenStream("field", "a b c d a a b c")) {
+      assertTokenStreamContents(ts,
+          new String[] { "b c d _", "c d _ _", "d _ _ b", "b c", "c" },
+          new int[] { 2, 4, 6, 12, 14 },
+          new int[] { 7, 7, 13, 15, 15 },
+          new int[] { 2, 1, 1, 3, 1 },
+          new int[] { 4, 4, 4, 2, 1 },
+          null);
+    }
+
+  }
+
+  public void testTrailingStopwords() throws IOException {
+
+    Analyzer analyzer = CustomAnalyzer.builder()
+        .withTokenizer("whitespace")
+        .addTokenFilter("stop")
+        .addTokenFilter("shinglegraph", "maxShingleSize", "4", "minShingleSize", "4", "outputUnigrams", "false")
+        .build();
+
+    try (TokenStream ts = analyzer.tokenStream("field", "b c d a")) {
+      assertTokenStreamContents(ts,
+          new String[] { "b c d _", "c d _", "d _" },
+          new int[] { 0, 2, 4 },
+          new int[] { 5, 5, 5 },
+          new int[] { 1, 1, 1 },
+          new int[] { 4, 3, 2 },
+          null);
+    }
+
+  }
+
+  public void testIncomingGraphs() throws IOException {
+
+    SynonymMap.Builder synonymBuilder = new SynonymMap.Builder();
+    synonymBuilder.add(new CharsRef("a"), new CharsRef("b"), true);
+    SynonymMap synonymMap = synonymBuilder.build();
+
+    Analyzer analyzer = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName) {
+        Tokenizer source = new WhitespaceTokenizer();
+        TokenStream sink
+            = new ShingleGraphFilter(new SynonymGraphFilter(source, synonymMap, true), 2, 2, false);
+        return new TokenStreamComponents(source, sink);
+      }
+    };
+
+    try (TokenStream ts = analyzer.tokenStream("field", "a c a d")) {
+      assertTokenStreamContents(ts,
+          new String[] { "b c", "a c", "c b", "c a", "b d", "a d", "d" },
+          new int[] { 0, 0, 2, 2, 4, 4, 6 },
+          new int[] { 3, 3, 5, 5, 7, 7, 7 },
+          new int[] { 1, 0, 1, 0, 1, 0, 1 },
+          new int[] { 2, 2, 2, 2, 2, 2, 1 },
+          null);
+    }
+
+  }
+
+  public void testShinglesSpanningGraphs() throws IOException {
+
+    SynonymMap.Builder synonymBuilder = new SynonymMap.Builder();
+    synonymBuilder.add(new CharsRef("a"), new CharsRef("b"), true);
+    SynonymMap synonymMap = synonymBuilder.build();
+
+    Analyzer analyzer = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName) {
+        Tokenizer source = new WhitespaceTokenizer();
+        TokenStream sink
+            = new ShingleGraphFilter(new SynonymGraphFilter(source, synonymMap, true), 3, 3, false);
+        return new TokenStreamComponents(source, sink);
+      }
+    };
+
+    try (TokenStream ts = analyzer.tokenStream("field", "a c a d")) {
+      assertTokenStreamContents(ts,
+          new String[] { "b c b", "b c a", "a c b", "a c a", "c b d", "c a d", "b d", "a d", "d" },
+          new int[] { 0, 0, 0, 0, 2, 2, 4, 4, 6 },
+          new int[] { 5, 5, 5, 5, 7, 7, 7, 7, 7 },
+          new int[] { 1, 0, 0, 0, 1, 0, 1, 0, 1 },
+          new int[] { 3, 3, 3, 3, 3, 3, 2, 2, 1 },
+          null);
+    }
+
+  }
+
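+  // Multi-word synonyms ("usa" vs "united states of america") produce paths
+  // of different lengths through the graph; the shingles must follow each
+  // path, and carry position lengths covering the positions they span.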
america is", "america is big", "is big", "big" }, + new int[]{ 0, 0, 0, 4, 4, + 4, 4, 4, 4, 4, 8, 11 }, + new int[]{ 7, 7, 10, 10, 7, + 14, 14, 7, 10, 14, 14, 14 }, + new int[]{ 1, 0, 0, 1, 0, + 0, 1, 1, 1, 1, 1, 1 }, + new int[]{ 6, 4, 7, 6, 4, + 7, 6, 3, 3, 3, 2, 1 }, + null); + } + + } + +} diff --git a/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java index 3e1e375eb0..8c0a2952c5 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java @@ -341,6 +341,10 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase { assertTokenStreamContents(ts, output, startOffsets, endOffsets, types, posIncrements, null, null); } + public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], int[] posLengths) throws IOException { + assertTokenStreamContents(ts, output, startOffsets, endOffsets, types, posIncrements, posLengths, null); + } + public static void assertTokenStreamContents(TokenStream ts, String[] output) throws IOException { assertTokenStreamContents(ts, output, null, null, null, null, null, null); }