package org.apache.lucene.analysis.miscellaneous;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import junit.framework.TestCase;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;

import java.io.IOException;
import java.io.StringReader;

/**
 * Verifies that {@link PrefixAwareTokenFilter} emits all prefix tokens
 * followed by all suffix tokens, shifting the offsets of the suffix tokens
 * so they start where the prefix stream ended.
 */
public class TestPrefixAwareTokenFilter extends TestCase {

  public void test() throws IOException {

    PrefixAwareTokenFilter ts;

    // two single-token streams: the suffix token "b" must be re-offset
    // to follow the prefix token "a".
    ts = new PrefixAwareTokenFilter(
        new SingleTokenTokenStream(new Token("a", 0, 1)),
        new SingleTokenTokenStream(new Token("b", 0, 1)));
    assertNext(ts, "a", 0, 1);
    assertNext(ts, "b", 1, 2);
    assertNull(ts.next());

    // prefix and suffix using 2x prefix

    ts = new PrefixAwareTokenFilter(
        new SingleTokenTokenStream(new Token("^", 0, 0)),
        new WhitespaceTokenizer(new StringReader("hello world")));
    ts = new PrefixAwareTokenFilter(ts, new SingleTokenTokenStream(new Token("$", 0, 0)));

    assertNext(ts, "^", 0, 0);
    assertNext(ts, "hello", 0, 5);
    assertNext(ts, "world", 6, 11);
    assertNext(ts, "$", 11, 11);
    assertNull(ts.next());
  }

  /**
   * Consumes the next token and asserts its term text and offsets.
   *
   * @return the consumed token, for further inspection by the caller
   */
  private Token assertNext(TokenStream ts, String text, int startOffset, int endOffset) throws IOException {
    Token token = ts.next();
    assertNotNull(token);
    assertEquals(text, new String(token.termBuffer(), 0, token.termLength()));
    assertEquals(startOffset, token.startOffset());
    assertEquals(endOffset, token.endOffset());
    return token;
  }

}
package org.apache.lucene.analysis.miscellaneous;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import junit.framework.TestCase;

import java.io.IOException;

import org.apache.lucene.analysis.Token;

/**
 * Verifies that {@link SingleTokenTokenStream} yields its single token
 * exactly once and then reports exhaustion.
 *
 * NOTE(review): the class is named ...TokenFilter while the class under test
 * is a TokenStream; kept as-is because the file name must match.
 */
public class TestSingleTokenTokenFilter extends TestCase {

  public void test() throws IOException {

    Token token = new Token();
    SingleTokenTokenStream ts = new SingleTokenTokenStream(token);

    // first call returns the wrapped token, second call signals end of stream
    assertEquals(token, ts.next());
    assertNull(ts.next());

  }

}
package org.apache.lucene.analysis.miscellaneous;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import junit.framework.TestCase;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;

import java.io.IOException;
import java.io.StringReader;

/**
 * Verifies that {@link PrefixAndSuffixAwareTokenFilter} wraps an input stream
 * with one prefix token and one suffix token, re-offsetting the suffix to
 * follow the last input token.
 */
public class TestPrefixAndSuffixAwareTokenFilter extends TestCase {

  public void test() throws IOException {

    // "^" is prepended, "$" is appended after "hello world";
    // the suffix token's offsets are moved to the end of the input (11, 11).
    PrefixAndSuffixAwareTokenFilter ts = new PrefixAndSuffixAwareTokenFilter(
        new SingleTokenTokenStream(new Token("^", 0, 0)),
        new WhitespaceTokenizer(new StringReader("hello world")),
        new SingleTokenTokenStream(new Token("$", 0, 0)));

    assertNext(ts, "^", 0, 0);
    assertNext(ts, "hello", 0, 5);
    assertNext(ts, "world", 6, 11);
    assertNext(ts, "$", 11, 11);
    assertNull(ts.next());
  }

  /**
   * Consumes the next token and asserts its term text and offsets.
   *
   * @return the consumed token, for further inspection by the caller
   */
  private Token assertNext(TokenStream ts, String text, int startOffset, int endOffset) throws IOException {
    Token token = ts.next();
    assertNotNull(token);
    assertEquals(text, new String(token.termBuffer(), 0, token.termLength()));
    assertEquals(startOffset, token.startOffset());
    assertEquals(endOffset, token.endOffset());
    return token;
  }

}
package org.apache.lucene.analysis.shingle;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import junit.framework.TestCase;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.PrefixAndSuffixAwareTokenFilter;
import org.apache.lucene.analysis.miscellaneous.SingleTokenTokenStream;
import org.apache.lucene.analysis.payloads.PayloadHelper;
import org.apache.lucene.analysis.shingle.ShingleMatrixFilter.Matrix;
import org.apache.lucene.analysis.shingle.ShingleMatrixFilter.Matrix.Column;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Iterator;
import java.util.LinkedList;

/**
 * Tests {@link ShingleMatrixFilter}: shingle (token n-gram) creation from
 * plain token streams, from streams carrying synonym rows, and from a
 * pre-assembled {@link Matrix}.
 */
public class TestShingleMatrixFilter extends TestCase {

  /**
   * Extracts a matrix from a token stream and asserts the produced shingles
   * for several filter configurations.
   *
   * @throws IOException
   */
  public void testTokenStream() throws IOException {

    TokenStream ts;
    TokenListStream tls;
    LinkedList tokens;

    // test a plain old token stream with synonyms translated to rows.
    // tokens with position increment 0 are synonyms of the preceding token.

    tokens = new LinkedList();
    tokens.add(tokenFactory("hello", 1, 0, 4));
    tokens.add(tokenFactory("greetings", 0, 0, 4));
    tokens.add(tokenFactory("world", 1, 5, 10));
    tokens.add(tokenFactory("earth", 0, 5, 10));
    tokens.add(tokenFactory("tellus", 0, 5, 10));

    tls = new TokenListStream(tokens);

    ts = new ShingleMatrixFilter.SynonymToNewRowFilter(tls, ShingleMatrixFilter.defaultSettingsCodec);
    tls = new TokenListStream(ts);

    // bi-grams

    ts = new ShingleMatrixFilter(tls, 2, 2);

    assertNext(ts, "hello_world");
    assertNext(ts, "greetings_world");
    assertNext(ts, "hello_earth");
    assertNext(ts, "greetings_earth");
    assertNext(ts, "hello_tellus");
    assertNext(ts, "greetings_tellus");
    assertNull(ts.next());

    // bi-grams with no spacer character, start offset, end offset

    tls.reset();
    ts = tls;
    ts = new ShingleMatrixFilter(ts, 2, 2, null);

    assertNext(ts, "helloworld", 0, 10);
    assertNext(ts, "greetingsworld", 0, 10);
    assertNext(ts, "helloearth", 0, 10);
    assertNext(ts, "greetingsearth", 0, 10);
    assertNext(ts, "hellotellus", 0, 10);
    assertNext(ts, "greetingstellus", 0, 10);
    assertNull(ts.next());

    // add ^_prefix_and_suffix_$

    tls.reset();
    ts = new PrefixAndSuffixAwareTokenFilter(
        new SingleTokenTokenStream(tokenFactory("^", 1, 100f, 0, 0)),
        tls,
        new SingleTokenTokenStream(tokenFactory("$", 1, 50f, 0, 0)));
    ts = new ShingleMatrixFilter.SynonymToNewRowFilter(ts, ShingleMatrixFilter.defaultSettingsCodec);
    tls = new TokenListStream(ts);

    // bi-grams, position increment, weight, start offset, end offset

    ts = new ShingleMatrixFilter(tls, 2, 2);

    assertNext(ts, "^_hello", 1, 10.049875f, 0, 4);
    assertNext(ts, "^_greetings", 1, 10.049875f, 0, 4);
    assertNext(ts, "hello_world", 1, 1.4142135f, 0, 10);
    assertNext(ts, "greetings_world", 1, 1.4142135f, 0, 10);
    assertNext(ts, "hello_earth", 1, 1.4142135f, 0, 10);
    assertNext(ts, "greetings_earth", 1, 1.4142135f, 0, 10);
    assertNext(ts, "hello_tellus", 1, 1.4142135f, 0, 10);
    assertNext(ts, "greetings_tellus", 1, 1.4142135f, 0, 10);
    assertNext(ts, "world_$", 1, 7.1414285f, 5, 10);
    assertNext(ts, "earth_$", 1, 7.1414285f, 5, 10);
    assertNext(ts, "tellus_$", 1, 7.1414285f, 5, 10);
    assertNull(ts.next());

    // test unlimited size and allow single boundary token as shingle

    tls.reset();
    ts = new ShingleMatrixFilter(tls, 1, Integer.MAX_VALUE, '_', false);

    assertNext(ts, "^", 1, 10.0f, 0, 0);
    assertNext(ts, "^_hello", 1, 10.049875f, 0, 4);
    assertNext(ts, "^_hello_world", 1, 10.099504f, 0, 10);
    assertNext(ts, "^_hello_world_$", 1, 12.328828f, 0, 10);
    assertNext(ts, "hello", 1, 1.0f, 0, 4);
    assertNext(ts, "hello_world", 1, 1.4142135f, 0, 10);
    assertNext(ts, "hello_world_$", 1, 7.2111025f, 0, 10);
    assertNext(ts, "world", 1, 1.0f, 5, 10);
    assertNext(ts, "world_$", 1, 7.1414285f, 5, 10);
    assertNext(ts, "$", 1, 7.071068f, 10, 10);
    assertNext(ts, "^_greetings", 1, 10.049875f, 0, 4);
    assertNext(ts, "^_greetings_world", 1, 10.099504f, 0, 10);
    assertNext(ts, "^_greetings_world_$", 1, 12.328828f, 0, 10);
    assertNext(ts, "greetings", 1, 1.0f, 0, 4);
    assertNext(ts, "greetings_world", 1, 1.4142135f, 0, 10);
    assertNext(ts, "greetings_world_$", 1, 7.2111025f, 0, 10);
    assertNext(ts, "^_hello_earth", 1, 10.099504f, 0, 10);
    assertNext(ts, "^_hello_earth_$", 1, 12.328828f, 0, 10);
    assertNext(ts, "hello_earth", 1, 1.4142135f, 0, 10);
    assertNext(ts, "hello_earth_$", 1, 7.2111025f, 0, 10);
    assertNext(ts, "earth", 1, 1.0f, 5, 10);
    assertNext(ts, "earth_$", 1, 7.1414285f, 5, 10);
    assertNext(ts, "^_greetings_earth", 1, 10.099504f, 0, 10);
    assertNext(ts, "^_greetings_earth_$", 1, 12.328828f, 0, 10);
    assertNext(ts, "greetings_earth", 1, 1.4142135f, 0, 10);
    assertNext(ts, "greetings_earth_$", 1, 7.2111025f, 0, 10);
    assertNext(ts, "^_hello_tellus", 1, 10.099504f, 0, 10);
    assertNext(ts, "^_hello_tellus_$", 1, 12.328828f, 0, 10);
    assertNext(ts, "hello_tellus", 1, 1.4142135f, 0, 10);
    assertNext(ts, "hello_tellus_$", 1, 7.2111025f, 0, 10);
    assertNext(ts, "tellus", 1, 1.0f, 5, 10);
    assertNext(ts, "tellus_$", 1, 7.1414285f, 5, 10);
    assertNext(ts, "^_greetings_tellus", 1, 10.099504f, 0, 10);
    assertNext(ts, "^_greetings_tellus_$", 1, 12.328828f, 0, 10);
    assertNext(ts, "greetings_tellus", 1, 1.4142135f, 0, 10);
    assertNext(ts, "greetings_tellus_$", 1, 7.2111025f, 0, 10);

    assertNull(ts.next());

    // test unlimited size but don't allow single boundary token as shingle

    tls.reset();
    ts = new ShingleMatrixFilter(tls, 1, Integer.MAX_VALUE, '_', true);

    assertNext(ts, "^_hello", 1, 10.049875f, 0, 4);
    assertNext(ts, "^_hello_world", 1, 10.099504f, 0, 10);
    assertNext(ts, "^_hello_world_$", 1, 12.328828f, 0, 10);
    assertNext(ts, "hello", 1, 1.0f, 0, 4);
    assertNext(ts, "hello_world", 1, 1.4142135f, 0, 10);
    assertNext(ts, "hello_world_$", 1, 7.2111025f, 0, 10);
    assertNext(ts, "world", 1, 1.0f, 5, 10);
    assertNext(ts, "world_$", 1, 7.1414285f, 5, 10);
    assertNext(ts, "^_greetings", 1, 10.049875f, 0, 4);
    assertNext(ts, "^_greetings_world", 1, 10.099504f, 0, 10);
    assertNext(ts, "^_greetings_world_$", 1, 12.328828f, 0, 10);
    assertNext(ts, "greetings", 1, 1.0f, 0, 4);
    assertNext(ts, "greetings_world", 1, 1.4142135f, 0, 10);
    assertNext(ts, "greetings_world_$", 1, 7.2111025f, 0, 10);
    assertNext(ts, "^_hello_earth", 1, 10.099504f, 0, 10);
    assertNext(ts, "^_hello_earth_$", 1, 12.328828f, 0, 10);
    assertNext(ts, "hello_earth", 1, 1.4142135f, 0, 10);
    assertNext(ts, "hello_earth_$", 1, 7.2111025f, 0, 10);
    assertNext(ts, "earth", 1, 1.0f, 5, 10);
    assertNext(ts, "earth_$", 1, 7.1414285f, 5, 10);
    assertNext(ts, "^_greetings_earth", 1, 10.099504f, 0, 10);
    assertNext(ts, "^_greetings_earth_$", 1, 12.328828f, 0, 10);
    assertNext(ts, "greetings_earth", 1, 1.4142135f, 0, 10);
    assertNext(ts, "greetings_earth_$", 1, 7.2111025f, 0, 10);
    assertNext(ts, "^_hello_tellus", 1, 10.099504f, 0, 10);
    assertNext(ts, "^_hello_tellus_$", 1, 12.328828f, 0, 10);
    assertNext(ts, "hello_tellus", 1, 1.4142135f, 0, 10);
    assertNext(ts, "hello_tellus_$", 1, 7.2111025f, 0, 10);
    assertNext(ts, "tellus", 1, 1.0f, 5, 10);
    assertNext(ts, "tellus_$", 1, 7.1414285f, 5, 10);
    assertNext(ts, "^_greetings_tellus", 1, 10.099504f, 0, 10);
    assertNext(ts, "^_greetings_tellus_$", 1, 12.328828f, 0, 10);
    assertNext(ts, "greetings_tellus", 1, 1.4142135f, 0, 10);
    assertNext(ts, "greetings_tellus_$", 1, 7.2111025f, 0, 10);

    assertNull(ts.next());

    // multi-token synonyms
    //
    // Token[][][] {
    //   {{hello}, {greetings, and, salutations},
    //   {{world}, {earth}, {tellus}}
    // }
    //

    tokens = new LinkedList();
    tokens.add(tokenFactory("hello", 1, 1f, 0, 4, ShingleMatrixFilter.TokenPositioner.newColumn));
    tokens.add(tokenFactory("greetings", 1, 1f, 0, 4, ShingleMatrixFilter.TokenPositioner.newRow));
    tokens.add(tokenFactory("and", 1, 1f, 0, 4, ShingleMatrixFilter.TokenPositioner.sameRow));
    tokens.add(tokenFactory("salutations", 1, 1f, 0, 4, ShingleMatrixFilter.TokenPositioner.sameRow));
    tokens.add(tokenFactory("world", 1, 1f, 5, 10, ShingleMatrixFilter.TokenPositioner.newColumn));
    tokens.add(tokenFactory("earth", 1, 1f, 5, 10, ShingleMatrixFilter.TokenPositioner.newRow));
    tokens.add(tokenFactory("tellus", 1, 1f, 5, 10, ShingleMatrixFilter.TokenPositioner.newRow));

    tls = new TokenListStream(tokens);

    // bi-grams

    ts = new ShingleMatrixFilter(tls, 2, 3);

    // shingle, position increment, weight, start offset, end offset

    assertNext(ts, "hello_world", 1, 1.4142135f, 0, 10);
    assertNext(ts, "greetings_and", 1, 1.4142135f, 0, 4);
    assertNext(ts, "greetings_and_salutations", 1, 1.7320508f, 0, 4);
    assertNext(ts, "and_salutations", 1, 1.4142135f, 0, 4);
    assertNext(ts, "and_salutations_world", 1, 1.7320508f, 0, 10);
    assertNext(ts, "salutations_world", 1, 1.4142135f, 0, 10);
    assertNext(ts, "hello_earth", 1, 1.4142135f, 0, 10);
    assertNext(ts, "and_salutations_earth", 1, 1.7320508f, 0, 10);
    assertNext(ts, "salutations_earth", 1, 1.4142135f, 0, 10);
    assertNext(ts, "hello_tellus", 1, 1.4142135f, 0, 10);
    assertNext(ts, "and_salutations_tellus", 1, 1.7320508f, 0, 10);
    assertNext(ts, "salutations_tellus", 1, 1.4142135f, 0, 10);

    assertNull(ts.next());

  }

  /**
   * Tests creating shingles from a pre-assembled matrix.
   *
   * Tests the row token z-axis, multi token synonyms.
   *
   * @throws IOException
   */
  public void testMatrix() throws IOException {

    Matrix matrix = new Matrix();

    matrix.new Column(tokenFactory("no", 1));
    matrix.new Column(tokenFactory("surprise", 1));
    matrix.new Column(tokenFactory("to", 1));
    matrix.new Column(tokenFactory("see", 1));
    matrix.new Column(tokenFactory("england", 1));
    matrix.new Column(tokenFactory("manager", 1));

    Column col = matrix.new Column();

    // sven göran eriksson is a multi token synonym to svennis
    col.new Row().getTokens().add(tokenFactory("svennis", 1));

    Column.Row row = col.new Row();
    row.getTokens().add(tokenFactory("sven", 1));
    row.getTokens().add(tokenFactory("göran", 1));
    row.getTokens().add(tokenFactory("eriksson", 1));

    matrix.new Column(tokenFactory("in", 1));
    matrix.new Column(tokenFactory("the", 1));
    matrix.new Column(tokenFactory("croud", 1));

    TokenStream ts = new ShingleMatrixFilter(matrix, 2, 4, '_', true, ShingleMatrixFilter.defaultSettingsCodec);

    assertNext(ts, "no_surprise", 1, 1.4142135f, 0, 0);
    assertNext(ts, "no_surprise_to", 1, 1.7320508f, 0, 0);
    assertNext(ts, "no_surprise_to_see", 1, 2.0f, 0, 0);
    assertNext(ts, "surprise_to", 1, 1.4142135f, 0, 0);
    assertNext(ts, "surprise_to_see", 1, 1.7320508f, 0, 0);
    assertNext(ts, "surprise_to_see_england", 1, 2.0f, 0, 0);
    assertNext(ts, "to_see", 1, 1.4142135f, 0, 0);
    assertNext(ts, "to_see_england", 1, 1.7320508f, 0, 0);
    assertNext(ts, "to_see_england_manager", 1, 2.0f, 0, 0);
    assertNext(ts, "see_england", 1, 1.4142135f, 0, 0);
    assertNext(ts, "see_england_manager", 1, 1.7320508f, 0, 0);
    assertNext(ts, "see_england_manager_svennis", 1, 2.0f, 0, 0);
    assertNext(ts, "england_manager", 1, 1.4142135f, 0, 0);
    assertNext(ts, "england_manager_svennis", 1, 1.7320508f, 0, 0);
    assertNext(ts, "england_manager_svennis_in", 1, 2.0f, 0, 0);
    assertNext(ts, "manager_svennis", 1, 1.4142135f, 0, 0);
    assertNext(ts, "manager_svennis_in", 1, 1.7320508f, 0, 0);
    assertNext(ts, "manager_svennis_in_the", 1, 2.0f, 0, 0);
    assertNext(ts, "svennis_in", 1, 1.4142135f, 0, 0);
    assertNext(ts, "svennis_in_the", 1, 1.7320508f, 0, 0);
    assertNext(ts, "svennis_in_the_croud", 1, 2.0f, 0, 0);
    assertNext(ts, "in_the", 1, 1.4142135f, 0, 0);
    assertNext(ts, "in_the_croud", 1, 1.7320508f, 0, 0);
    assertNext(ts, "the_croud", 1, 1.4142135f, 0, 0);
    assertNext(ts, "see_england_manager_sven", 1, 2.0f, 0, 0);
    assertNext(ts, "england_manager_sven", 1, 1.7320508f, 0, 0);
    assertNext(ts, "england_manager_sven_göran", 1, 2.0f, 0, 0);
    assertNext(ts, "manager_sven", 1, 1.4142135f, 0, 0);
    assertNext(ts, "manager_sven_göran", 1, 1.7320508f, 0, 0);
    assertNext(ts, "manager_sven_göran_eriksson", 1, 2.0f, 0, 0);
    assertNext(ts, "sven_göran", 1, 1.4142135f, 0, 0);
    assertNext(ts, "sven_göran_eriksson", 1, 1.7320508f, 0, 0);
    assertNext(ts, "sven_göran_eriksson_in", 1, 2.0f, 0, 0);
    assertNext(ts, "göran_eriksson", 1, 1.4142135f, 0, 0);
    assertNext(ts, "göran_eriksson_in", 1, 1.7320508f, 0, 0);
    assertNext(ts, "göran_eriksson_in_the", 1, 2.0f, 0, 0);
    assertNext(ts, "eriksson_in", 1, 1.4142135f, 0, 0);
    assertNext(ts, "eriksson_in_the", 1, 1.7320508f, 0, 0);
    assertNext(ts, "eriksson_in_the_croud", 1, 2.0f, 0, 0);

    assertNull(ts.next());

  }

  // token factory helpers start here

  /** Creates a token with position increment 1 and weight 1f. */
  private Token tokenFactory(String text, int startOffset, int endOffset) {
    return tokenFactory(text, 1, 1f, startOffset, endOffset);
  }

  /** Creates a token with weight 1f. */
  private Token tokenFactory(String text, int posIncr, int startOffset, int endOffset) {
    return tokenFactory(text, posIncr, 1f, startOffset, endOffset);
  }

  /** Creates a token with weight 1f and zero offsets. */
  private Token tokenFactory(String text, int posIncr) {
    return tokenFactory(text, posIncr, 1f, 0, 0);
  }

  /** Creates a token with zero offsets. */
  private Token tokenFactory(String text, int posIncr, float weight) {
    return tokenFactory(text, posIncr, weight, 0, 0);
  }

  /** Creates a token, encoding the weight via the default settings codec. */
  private Token tokenFactory(String text, int posIncr, float weight, int startOffset, int endOffset) {
    Token token = new Token();
    token.setTermText(text);
    token.setPositionIncrement(posIncr);
    ShingleMatrixFilter.defaultSettingsCodec.setWeight(token, weight);
    token.setStartOffset(startOffset);
    token.setEndOffset(endOffset);
    return token;
  }

  /** Creates a token, also encoding the matrix positioner via the default settings codec. */
  private Token tokenFactory(String text, int posIncr, float weight, int startOffset, int endOffset, ShingleMatrixFilter.TokenPositioner positioner) {
    Token token = new Token();
    token.setTermText(text);
    token.setPositionIncrement(posIncr);
    ShingleMatrixFilter.defaultSettingsCodec.setWeight(token, weight);
    token.setStartOffset(startOffset);
    token.setEndOffset(endOffset);
    ShingleMatrixFilter.defaultSettingsCodec.setTokenPositioner(token, positioner);
    return token;
  }

  // assert-methods start here

  /** Consumes the next token and asserts its term text. */
  private Token assertNext(TokenStream ts, String text) throws IOException {
    Token token = ts.next(new Token());
    assertNotNull(token);
    assertEquals(text, new String(token.termBuffer(), 0, token.termLength()));
    return token;
  }

  /** Consumes the next token and asserts term text, position increment and payload-encoded weight. */
  private Token assertNext(TokenStream ts, String text, int positionIncrement, float boost) throws IOException {
    Token token = ts.next(new Token());
    assertNotNull(token);
    assertEquals(text, new String(token.termBuffer(), 0, token.termLength()));
    assertEquals(positionIncrement, token.getPositionIncrement());
    // a missing payload means the default weight 1f
    assertEquals(boost, token.getPayload() == null ? 1f : PayloadHelper.decodeFloat(token.getPayload().getData()));
    return token;
  }

  /** Consumes the next token and asserts term text, position increment, weight and offsets. */
  private Token assertNext(TokenStream ts, String text, int positionIncrement, float boost, int startOffset, int endOffset) throws IOException {
    Token token = ts.next(new Token());
    assertNotNull(token);
    assertEquals(text, new String(token.termBuffer(), 0, token.termLength()));
    assertEquals(positionIncrement, token.getPositionIncrement());
    assertEquals(boost, token.getPayload() == null ? 1f : PayloadHelper.decodeFloat(token.getPayload().getData()));
    assertEquals(startOffset, token.startOffset());
    assertEquals(endOffset, token.endOffset());
    return token;
  }

  /** Consumes the next token and asserts term text and offsets. */
  private Token assertNext(TokenStream ts, String text, int startOffset, int endOffset) throws IOException {
    Token token = ts.next(new Token());
    assertNotNull(token);
    assertEquals(text, new String(token.termBuffer(), 0, token.termLength()));
    assertEquals(startOffset, token.startOffset());
    assertEquals(endOffset, token.endOffset());
    return token;
  }

  /**
   * A resettable token stream backed by a token collection, either supplied
   * directly or drained from another stream.
   */
  public static class TokenListStream extends TokenStream {

    private Collection tokens;

    /** Drains the given stream into an internal list. */
    public TokenListStream(TokenStream ts) throws IOException {
      tokens = new ArrayList();
      Token token;
      while ((token = ts.next(new Token())) != null) {
        tokens.add(token);
      }
    }

    public TokenListStream(Collection tokens) {
      this.tokens = tokens;
    }

    private Iterator iterator;

    public Token next() throws IOException {
      if (iterator == null) {
        iterator = tokens.iterator();
      }
      if (!iterator.hasNext()) {
        return null;
      }
      // cast required: the iterator is raw, next() returns Object
      return (Token) iterator.next();
    }

    public void reset() throws IOException {
      // a fresh iterator is created lazily on the next call to next()
      iterator = null;
    }
  }

}

A ShingleFilter constructs shingles (token n-grams) from a token stream. + * In other words, it creates combinations of tokens as a single token. + * + *

For example, the sentence "please divide this sentence into shingles" + * might be tokenized into shingles "please divide", "divide this", + * "this sentence", "sentence into", and "into shingles". + * + *

This filter is backed by a three dimensional column oriented matrix + * used to create permutations of shingles in synonym space. The third dimension, + * the z-axis, is used for multi token synonyms. + * + * + * Token[column][row][z-axis]{ + * {{hello}, {greetings, and, salutations}}, + * {{world}, {earth}, {tellus}} + * }; + * + * + * The matrix above would produce the following 2-3 gram sized shingles: + * + * "hello_world" + * "greetings_and" + * "greetings_and_salutations" + * "and_salutations" + * "and_salutations_world" + * "salutations_world" + * "hello_earth" + * "and_salutations_earth" + * "salutations_earth" + * "hello_tellus" + * "and_salutations_tellus" + * "salutations_tellus" + * + */ +public class ShingleMatrixFilter extends TokenStream { + + public static Character defaultspacerCharacter = '_'; + public static TokenSettingsCodec defaultSettingsCodec = new DefaultTokenSettingsCodec(); + public static boolean ignoringSinglePrefixOrSuffixShingleByDefault = false; + + + private TokenSettingsCodec settingsCodec; + + private int minimumShingleSize; + private int maximumShingleSize; + + private boolean ignoringSinglePrefixOrSuffixShingle = false; + + private Character spacerCharacter = '_'; + + private TokenStream input; + + /** + * Used to describe how a {@link org.apache.lucene.analysis.Token} is to be inserted to a {@link org.apache.lucene.analysis.shingle.ShingleMatrixFilter.Matrix}. 
+ * @see org.apache.lucene.analysis.shingle.ShingleMatrixFilter.TokenSettingsCodec#getTokenPositioner(org.apache.lucene.analysis.Token) + * @see org.apache.lucene.analysis.shingle.ShingleMatrixFilter.TokenSettingsCodec#setTokenPositioner(org.apache.lucene.analysis.Token,org.apache.lucene.analysis.shingle.ShingleMatrixFilter.TokenPositioner) + */ + public static enum TokenPositioner { + newColumn(0), newRow(1), sameRow(2); + + private final int index; + + private TokenPositioner(int index) { + this.index = index; + } + + public int getIndex() { + return index; + } + } + + + public ShingleMatrixFilter(Matrix matrix, int minimumShingleSize, int maximumShingleSize, Character spacerCharacter, boolean ignoringSinglePrefixOrSuffixShingle, TokenSettingsCodec settingsCodec) { + this.matrix = matrix; + this.minimumShingleSize = minimumShingleSize; + this.maximumShingleSize = maximumShingleSize; + this.spacerCharacter = spacerCharacter; + this.ignoringSinglePrefixOrSuffixShingle = ignoringSinglePrefixOrSuffixShingle; + this.settingsCodec = settingsCodec; + + this.input = new TokenStream() { + public Token next(Token result) throws IOException { + return null; + } + }; + } + + public ShingleMatrixFilter(TokenStream input, int minimumShingleSize, int maximumShingleSize) { + this(input, minimumShingleSize, maximumShingleSize, defaultspacerCharacter); + } + + + public ShingleMatrixFilter(TokenStream input, int minimumShingleSize, int maximumShingleSize, Character spacerCharacter) { + this(input, minimumShingleSize, maximumShingleSize, spacerCharacter, ignoringSinglePrefixOrSuffixShingleByDefault); + } + + + public ShingleMatrixFilter(TokenStream input, int minimumShingleSize, int maximumShingleSize, Character spacerCharacter, boolean ignoringSinglePrefixOrSuffixShingle) { + this(input, minimumShingleSize, maximumShingleSize, spacerCharacter, ignoringSinglePrefixOrSuffixShingle, defaultSettingsCodec); + } + + public ShingleMatrixFilter(TokenStream input, int minimumShingleSize, 
int maximumShingleSize, Character spacerCharacter, boolean ignoringSinglePrefixOrSuffixShingle, TokenSettingsCodec settingsCodec) { + this.input = input; + this.minimumShingleSize = minimumShingleSize; + this.maximumShingleSize = maximumShingleSize; + this.spacerCharacter = spacerCharacter; + this.ignoringSinglePrefixOrSuffixShingle = ignoringSinglePrefixOrSuffixShingle; + this.settingsCodec = settingsCodec; + } + + + /** + * Strategy used to get and set various {@link org.apache.lucene.analysis.Token} meta data. + * + * It is seperate from the {@link org.apache.lucene.analysis.shingle.ShingleMatrixFilter} + * mainly to make it easier for {@link org.apache.lucene.analysis.TokenStream}s that + * produce the tokens to be consumed by the {@link org.apache.lucene.analysis.shingle.ShingleMatrixFilter}. + */ + public static abstract class TokenSettingsCodec { + public abstract TokenPositioner getTokenPositioner(Token token) throws IOException; + + public abstract void setTokenPositioner(Token token, ShingleMatrixFilter.TokenPositioner tokenPositioner); + + public abstract float getWeight(Token token); + + public abstract void setWeight(Token token, float weight); + } + + + private Iterator permutations; + + /** the current permutation of tokens used to produce shingles */ + private List currentPermuationTokens; + /** index to what row a token in currentShingleTokens represents*/ + private List currentPermutationRows; + + private int currentPermutationTokensStartOffset; + private int currentShingleLength; + + /** + * a set containing shingles that has been the result of a call to next(Token), + * used to avoid producing the same shingle more than once. 
+ */ + private Set> shinglesSeen = new HashSet>(); + + + public void reset() throws IOException { + permutations = null; + shinglesSeen.clear(); + } + + private Matrix matrix; + + + public Token next(Token token) throws IOException { + if (matrix == null) { + matrix = new Matrix(); + // fill matrix with maximumShingleSize columns + while (matrix.columns.size() < maximumShingleSize && readColumn()) { + // this loop looks ugly + } + } + + if (currentPermuationTokens != null) { + currentShingleLength++; + + if (currentShingleLength + currentPermutationTokensStartOffset <= currentPermuationTokens.size() + && currentShingleLength <= maximumShingleSize) { + + // it is possible to create at least one more shingle of the current matrix permutation + + if (ignoringSinglePrefixOrSuffixShingle + && currentShingleLength == 1 + && (currentPermutationRows.get(currentPermutationTokensStartOffset).getColumn().isFirst() || currentPermutationRows.get(currentPermutationTokensStartOffset).getColumn().isLast())) { + return next(token); + } + + int termLength = 0; + + List shingle = new ArrayList(); + + for (int i = 0; i < currentShingleLength; i++) { + Token shingleToken = currentPermuationTokens.get(i + currentPermutationTokensStartOffset); + termLength += shingleToken.termLength(); + shingle.add(shingleToken); + } + if (spacerCharacter != null) { + termLength += currentShingleLength - 1; + } + + // only produce shingles that not already has been created + if (!shinglesSeen.add(shingle)) { + return next(token); + } + + // shingle token factory + StringBuilder sb = new StringBuilder(termLength + 10); // paranormal abillity to forsay the future. 
+ for (Token shingleToken : shingle) { + if (spacerCharacter != null && sb.length() > 0) { + sb.append(spacerCharacter); + } + sb.append(shingleToken.termBuffer(), 0, shingleToken.termLength()); + } + token.setTermText(sb.toString()); + updateToken(token, shingle, currentPermutationTokensStartOffset, currentPermutationRows, currentPermuationTokens); + + return token; + + } else { + + // it is NOT possible to create one more shingle of the current matrix permutation + + if (currentPermutationTokensStartOffset < currentPermuationTokens.size() - 1) { + // reset shingle size and move one step to the right in the current tokens permutation + currentPermutationTokensStartOffset++; + currentShingleLength = minimumShingleSize - 1; + return next(token); + } + + + if (permutations == null) { + // todo does this ever occur? + return null; + } + + + if (!permutations.hasNext()) { + + // load more data (if available) to the matrix + + if (input != null && readColumn()) { + // don't really care, we just read it. + } + + // delete the first column in the matrix + Matrix.Column deletedColumn = matrix.columns.remove(0); + + // remove all shingles seen that include any of the tokens from the deleted column. 
+ List deletedColumnTokens = new ArrayList(); + for (Matrix.Column.Row row : deletedColumn.getRows()) { + for (Token shingleToken : row.getTokens()) { + deletedColumnTokens.add(shingleToken); + } + } + for (Iterator> shinglesSeenIterator = shinglesSeen.iterator(); shinglesSeenIterator.hasNext();) { + List shingle = shinglesSeenIterator.next(); + for (Token deletedColumnToken : deletedColumnTokens) { + if (shingle.contains(deletedColumnToken)) { + shinglesSeenIterator.remove(); + break; + } + } + } + + + if (matrix.columns.size() < minimumShingleSize) { + // exhausted + return null; + } + + // create permutations of the matrix it now looks + permutations = matrix.permutationIterator(); + } + + nextTokensPermutation(); + return next(token); + + } + } + + if (permutations == null) { + permutations = matrix.permutationIterator(); + } + + if (!permutations.hasNext()) { + return null; + } + + nextTokensPermutation(); + + return next(token); + } + + /** + * get next permutation of row combinations, + * creates list of all tokens in the row and + * an index from each such token to what row they exist in. + * finally resets the current (next) shingle size and offset. + */ + private void nextTokensPermutation() { + Matrix.Column.Row[] rowsPermutation = permutations.next(); + List currentPermutationRows = new ArrayList(); + List currentPermuationTokens = new ArrayList(); + for (Matrix.Column.Row row : rowsPermutation) { + for (Token shingleToken : row.getTokens()) { + currentPermuationTokens.add(shingleToken); + currentPermutationRows.add(row); + } + } + this.currentPermuationTokens = currentPermuationTokens; + this.currentPermutationRows = currentPermutationRows; + + currentPermutationTokensStartOffset = 0; + currentShingleLength = minimumShingleSize - 1; + + } + + /** + * Set shingle token meta data as the last thing before the shingle token is beeing passed on to the consumer. + * @param token Shingle token + * @param shingle Tokens used to produce the shingle token. 
+ * @param currentPermutationStartOffset Start offset in parameter currentPermutationTokens + * @param currentPermutationRows index to Matrix.Column.Row from the position of tokens in parameter currentPermutationTokens + * @param currentPermuationTokens tokens of the current permutation of rows in the matrix. + */ + public void updateToken(Token token, List shingle, int currentPermutationStartOffset, List currentPermutationRows, List currentPermuationTokens) { + token.setType(ShingleMatrixFilter.class.getSimpleName()); + token.setFlags(0); + token.setPositionIncrement(1); + token.setStartOffset(shingle.get(0).startOffset()); + token.setEndOffset(shingle.get(shingle.size() - 1).endOffset()); + settingsCodec.setWeight(token, calculateShingleWeight(token, shingle, currentPermutationStartOffset, currentPermutationRows, currentPermuationTokens)); + } + + + /** + * for (shingle part token in shingle) + * weight += shingle part token weight * (1 / srq(all shingle part token weights summed)) + * + * @param token + * @param shingle + * @param currentPermutationStartOffset + * @param currentPermutationRows + * @param currentPermuationTokens + * @return + */ + public float calculateShingleWeight(Token token, List shingle, int currentPermutationStartOffset, List currentPermutationRows, List currentPermuationTokens) { + double topWeight = 0f; + double sourceWeightSum = 0f; + + for (Token shingleToken : shingle) { + + double tmp = settingsCodec.getWeight(shingleToken); + if (tmp > topWeight) { + topWeight = tmp; + } + sourceWeightSum += tmp; + } + + double factor = 1d / Math.sqrt(sourceWeightSum); + + double result = 0d; + for (Token shingleToken : shingle) { + result += settingsCodec.getWeight(shingleToken) * factor; + } + + return (float) result; + } + + + private Token readColumnBuf; + + /** + * Loads one column from the token stream. 
+ * + * When the last token is read from the token stream it will column.setLast(true); + * + * @return true if it manage to read one more column from the input token stream + * @throws IOException if the matrix source input stream throws an exception + */ + private boolean readColumn() throws IOException { + + Token token; + if (readColumnBuf != null) { + token = readColumnBuf; + readColumnBuf = null; + } else { + token = input.next(new Token()); + } + + if (token == null) { + return false; + } + + Matrix.Column currentReaderColumn = matrix.new Column(); + Matrix.Column.Row currentReaderRow = currentReaderColumn.new Row(); + + currentReaderRow.getTokens().add(token); + TokenPositioner tokenPositioner; + while ((readColumnBuf = input.next(new Token())) != null + && (tokenPositioner = settingsCodec.getTokenPositioner(readColumnBuf)) != TokenPositioner.newColumn) { + + if (tokenPositioner == TokenPositioner.sameRow) { + currentReaderRow.getTokens().add(readColumnBuf); + } else /*if (tokenPositioner == TokenPositioner.newRow)*/ { + currentReaderRow = currentReaderColumn.new Row(); + currentReaderRow.getTokens().add(readColumnBuf); + } + readColumnBuf = null; + + } + + if (readColumnBuf == null) { + readColumnBuf = input.next(new Token()); + if (readColumnBuf == null) { + currentReaderColumn.setLast(true); + } + } + + + return true; + + } + + + /** + * A column focused matrix in three dimensions: + * + * Token[column][row][z-axis] { + * {{hello}, {greetings, and, salutations}}, + * {{world}, {earth}, {tellus}} + * }; + * + * todo consider row groups + * to indicate that shingles is only to contain permutations with texts in that same row group. 
+ * + */ + public static class Matrix { + + private boolean columnsHasBeenCreated = false; + + private List columns = new ArrayList(); + + public List getColumns() { + return columns; + } + + public class Column { + + private boolean last; + private boolean first; + + public Matrix getMatrix() { + return Matrix.this; + } + + public Column(Token token) { + this(); + Row row = new Row(); + row.getTokens().add(token); + } + + public Column() { + synchronized (Matrix.this) { + if (!columnsHasBeenCreated) { + this.setFirst(true); + columnsHasBeenCreated = true; + } + } + Matrix.this.columns.add(this); + } + + private List rows = new ArrayList(); + + public List getRows() { + return rows; + } + + + public int getIndex() { + return Matrix.this.columns.indexOf(this); + } + + public String toString() { + return "Column{" + + "first=" + first + + ", last=" + last + + ", rows=" + rows + + '}'; + } + + public boolean isFirst() { + return first; + } + + public void setFirst(boolean first) { + this.first = first; + } + + public void setLast(boolean last) { + this.last = last; + } + + public boolean isLast() { + return last; + } + + public class Row { + + public Column getColumn() { + return Column.this; + } + + private List tokens = new LinkedList(); + + public Row() { + Column.this.rows.add(this); + } + + public int getIndex() { + return Column.this.rows.indexOf(this); + } + + public List getTokens() { + return tokens; + } + + public void setTokens(List tokens) { + this.tokens = tokens; + } + +// public int getStartOffset() { +// int ret = tokens[0].startOffset(); +// if (getIndex() > 0 && ret == 0) { +// ret = Column.this.rows.get(0).getStartOffset(); +// } +// return ret; +// } +// +// public int getEndOffset() { +// int ret = tokens[tokens.length - 1].endOffset(); +// if (getIndex() > 0 && ret == 0) { +// ret = Column.this.rows.get(0).getEndOffset(); +// } +// return ret; +// } + + public String toString() { + return "Row{" + + "index=" + getIndex() + + ", tokens=" + (tokens 
== null ? null : Arrays.asList(tokens)) + + '}'; + } + } + + } + + + public Iterator permutationIterator() { + + return new Iterator() { + + private int[] columnRowCounters = new int[columns.size()]; + + public void remove() { + throw new IllegalStateException("not implemented"); + } + + public boolean hasNext() { + int s = columnRowCounters.length; + return columnRowCounters[s - 1] < columns.get(s - 1).getRows().size(); + } + + public Column.Row[] next() { + if (!hasNext()) { + throw new NoSuchElementException("no more elements"); + } + + Column.Row[] rows = new Column.Row[columnRowCounters.length]; + + for (int i = 0; i < columnRowCounters.length; i++) { + rows[i] = columns.get(i).rows.get(columnRowCounters[i]); + } + incrementColumnRowCounters(); + + return rows; + } + + private void incrementColumnRowCounters() { + for (int i = 0; i < columnRowCounters.length; i++) { + columnRowCounters[i]++; + if (columnRowCounters[i] == columns.get(i).rows.size() && + i < columnRowCounters.length - 1) { + columnRowCounters[i] = 0; + } else { + break; + } + } + } + }; + } + + public String toString() { + return "Matrix{" + + "columns=" + columns + + '}'; + } + } + + + public int getMinimumShingleSize() { + return minimumShingleSize; + } + + public void setMinimumShingleSize(int minimumShingleSize) { + this.minimumShingleSize = minimumShingleSize; + } + + public int getMaximumShingleSize() { + return maximumShingleSize; + } + + public void setMaximumShingleSize(int maximumShingleSize) { + this.maximumShingleSize = maximumShingleSize; + } + + + public Matrix getMatrix() { + return matrix; + } + + public void setMatrix(Matrix matrix) { + this.matrix = matrix; + } + + public Character getSpacerCharacter() { + return spacerCharacter; + } + + public void setSpacerCharacter(Character spacerCharacter) { + this.spacerCharacter = spacerCharacter; + } + + public boolean isIgnoringSinglePrefixOrSuffixShingle() { + return ignoringSinglePrefixOrSuffixShingle; + } + + public void 
setIgnoringSinglePrefixOrSuffixShingle(boolean ignoringSinglePrefixOrSuffixShingle) { + this.ignoringSinglePrefixOrSuffixShingle = ignoringSinglePrefixOrSuffixShingle; + } + + + /** + * A very simple codec not to be used for something serious + */ + public static class DefaultTokenSettingsCodec extends TokenSettingsCodec { + public TokenPositioner getTokenPositioner(Token token) throws IOException { + switch (token.getFlags()) { + case 0: + return TokenPositioner.newColumn; + case 1: + return TokenPositioner.newRow; + case 2: + return TokenPositioner.sameRow; + } + throw new IOException("Unknown matrix positioning of token " + token); + } + + public void setTokenPositioner(Token token, TokenPositioner tokenPositioner) { + token.setFlags(tokenPositioner.getIndex()); + } + + public float getWeight(Token token) { + if (token.getPayload() == null || token.getPayload().getData() == null) { + return 1f; + } else { + return PayloadHelper.decodeFloat(token.getPayload().getData()); + } + } + + public void setWeight(Token token, float weight) { + if (weight == 1f) { + token.setPayload(null); + } else { + token.setPayload(new Payload(PayloadHelper.encodeFloat(weight))); + } + } + + } + + + /** + * Adds synonyms as rows in the current column in the matrix. 
+ */ + public static class SynonymToNewRowFilter extends TokenFilter { + + private TokenSettingsCodec codec; + + public SynonymToNewRowFilter(TokenStream input, TokenSettingsCodec shingleMatrixFilter) { + super(input); + this.codec = shingleMatrixFilter; + } + + public Token next(Token result) throws IOException { + result = input.next(result); + if (result == null) { + return null; + } + if (result.getPositionIncrement() == 0) { + codec.setTokenPositioner(result, TokenPositioner.newRow); + } else { + codec.setTokenPositioner(result, TokenPositioner.newColumn); + } + return result; + } + + + public void setCodec(TokenSettingsCodec codec) { + this.codec = codec; + } + + public TokenSettingsCodec getCodec() { + return codec; + } + } + + +} Index: contrib/analyzers/src/java/org/apache/lucene/analysis/miscellaneous/SingleTokenTokenStream.java =================================================================== --- contrib/analyzers/src/java/org/apache/lucene/analysis/miscellaneous/SingleTokenTokenStream.java (revision 0) +++ contrib/analyzers/src/java/org/apache/lucene/analysis/miscellaneous/SingleTokenTokenStream.java (revision 0) @@ -0,0 +1,59 @@ +package org.apache.lucene.analysis.miscellaneous; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.TokenStream; + +import java.io.IOException; + +/** + * A token stream containing a single token. + */ +public class SingleTokenTokenStream extends TokenStream { + + private boolean exhausted = false; + private Token token; + + + public SingleTokenTokenStream(Token token) { + this.token = token; + } + + + public Token next(Token result) throws IOException { + if (exhausted) { + return null; + } + exhausted = true; + return token; + } + + + public void reset() throws IOException { + exhausted = false; + } + + public Token getToken() { + return token; + } + + public void setToken(Token token) { + this.token = token; + } +} Index: contrib/analyzers/src/java/org/apache/lucene/analysis/miscellaneous/PrefixAwareTokenFilter.java =================================================================== --- contrib/analyzers/src/java/org/apache/lucene/analysis/miscellaneous/PrefixAwareTokenFilter.java (revision 0) +++ contrib/analyzers/src/java/org/apache/lucene/analysis/miscellaneous/PrefixAwareTokenFilter.java (revision 0) @@ -0,0 +1,148 @@ +package org.apache.lucene.analysis.miscellaneous; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.index.Payload;
+
+import java.io.IOException;
+
+
/**
 * Joins two token streams and leaves the last token of the first stream available
 * to be used when updating the token values in the second stream based on that token.
 *
 * The default implementation adds last prefix token end offset to the suffix token start and end offsets.
 */
public class PrefixAwareTokenFilter extends TokenStream {

  private TokenStream prefix;
  private TokenStream suffix;

  public PrefixAwareTokenFilter(TokenStream prefix, TokenStream suffix) {
    this.suffix = suffix;
    this.prefix = prefix;
    prefixExhausted = false;
  }

  // snapshot of the most recent prefix token, copied so it survives token reuse
  private CopyableToken previousPrefixToken = new CopyableToken();

  // true once the prefix stream has returned null; then tokens come from suffix
  private boolean prefixExhausted;

  /**
   * Drains the prefix stream first (remembering each token in
   * previousPrefixToken), then produces suffix tokens adjusted via
   * updateSuffixToken(). Returns null when both streams are exhausted.
   *
   * NOTE(review): if the prefix stream is empty, previousPrefixToken is a
   * default-constructed token (endOffset presumably 0) — confirm that suffix
   * offsets are then left unchanged as intended.
   */
  public Token next(Token result) throws IOException {

    // keep the caller's reusable token; result may be swapped by prefix.next()
    Token buf = result;

    if (!prefixExhausted) {
      result = prefix.next(result);
      if (result == null) {
        prefixExhausted = true;
      } else {
        previousPrefixToken.copyFrom(result);
        return result;
      }
    }

    result = suffix.next(buf);
    if (result == null) {
      return null;
    }

    return updateSuffixToken(result, previousPrefixToken);
  }

  /**
   * The default implementation adds last prefix token end offset to the suffix token start and end offsets.
   *
   * @param suffixToken a token from the suffix stream
   * @param lastPrefixToken the last token from the prefix stream
   * @return consumer token
   */
  public Token updateSuffixToken(Token suffixToken, Token lastPrefixToken) {
    suffixToken.setStartOffset(lastPrefixToken.endOffset() + suffixToken.startOffset());
    suffixToken.setEndOffset(lastPrefixToken.endOffset() + suffixToken.endOffset());
    return suffixToken;
  }

  /** Closes both underlying streams. */
  public void close() throws IOException {
    prefix.close();
    suffix.close();
  }

  /** Resets both underlying streams and re-arms the prefix phase. */
  public void reset() throws IOException {
    super.reset();
    if (prefix != null) {
      prefixExhausted = false;
      prefix.reset();
    }
    if (suffix != null) {
      suffix.reset();
    }
  }

  public TokenStream getPrefix() {
    return prefix;
  }

  public void setPrefix(TokenStream prefix) {
    this.prefix = prefix;
  }

  public TokenStream getSuffix() {
    return suffix;
  }

  public void setSuffix(TokenStream suffix) {
    this.suffix = suffix;
  }

  /**
   * A Token that can take a deep copy of another token, including its payload.
   * The payload bytes are copied into a reused internal buffer to avoid
   * allocating on every copy.
   */
  public static class CopyableToken extends Token {

    // reused payload buffer; grown on demand, shared across copyFrom() calls
    private Payload buf = new Payload();

    /** Deep-copies all fields of source into this token. */
    public void copyFrom(Token source) {
      if (source.termBuffer() != null) {
        setTermBuffer(source.termBuffer(), 0, source.termLength());
      } else {
        // no term buffer on the source: clear our term state
        setTermText(null);
        setTermLength(0);
      }

      setPositionIncrement(source.getPositionIncrement());
      setFlags(source.getFlags());
      setStartOffset(source.startOffset());
      setEndOffset(source.endOffset());
      setType(source.type());
      if (source.getPayload() == null) {
        setPayload(null);
      } else {
        // copy payload bytes into the shared buffer, growing it if needed
        setPayload(buf);
        if (buf.getData() == null || buf.getData().length < source.getPayload().length()) {
          buf.setData(new byte[source.getPayload().length()]);
        }
        source.getPayload().copyTo(buf.getData(), 0);
        buf.setData(buf.getData(), 0, source.getPayload().length());
      }
    }
  }
}
Index: contrib/analyzers/src/java/org/apache/lucene/analysis/miscellaneous/PrefixAndSuffixAwareTokenFilter.java
===================================================================
---
contrib/analyzers/src/java/org/apache/lucene/analysis/miscellaneous/PrefixAndSuffixAwareTokenFilter.java (revision 0) +++ contrib/analyzers/src/java/org/apache/lucene/analysis/miscellaneous/PrefixAndSuffixAwareTokenFilter.java (revision 0) @@ -0,0 +1,54 @@ +package org.apache.lucene.analysis.miscellaneous; + +import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.TokenStream; + +import java.io.IOException; + +/** + * Links two PrefixAwareTokenFilter + */ +public class PrefixAndSuffixAwareTokenFilter extends TokenStream { + + private PrefixAwareTokenFilter suffix; + + public PrefixAndSuffixAwareTokenFilter(TokenStream prefix, TokenStream input, TokenStream suffix) { + prefix = new PrefixAwareTokenFilter(prefix, input) { + public Token updateSuffixToken(Token suffixToken, Token lastInputToken) { + return PrefixAndSuffixAwareTokenFilter.this.updateInputToken(suffixToken, lastInputToken); + } + }; + this.suffix = new PrefixAwareTokenFilter(prefix, suffix) { + public Token updateSuffixToken(Token suffixToken, Token lastInputToken) { + return PrefixAndSuffixAwareTokenFilter.this.updateSuffixToken(suffixToken, lastInputToken); + } + }; + } + + public Token updateInputToken(Token inputToken, Token lastPrefixToken) { + inputToken.setStartOffset(lastPrefixToken.endOffset() + inputToken.startOffset()); + inputToken.setEndOffset(lastPrefixToken.endOffset() + inputToken.endOffset()); + return inputToken; + } + + public Token updateSuffixToken(Token suffixToken, Token lastInputToken) { + suffixToken.setStartOffset(lastInputToken.endOffset() + suffixToken.startOffset()); + suffixToken.setEndOffset(lastInputToken.endOffset() + suffixToken.endOffset()); + return suffixToken; + } + + + public Token next(Token result) throws IOException { + return suffix.next(result); + } + + + public void reset() throws IOException { + suffix.reset(); + } + + + public void close() throws IOException { + suffix.close(); + } +}