package org.apache.lucene.search.suggest.analyzing;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.File;
import java.util.List;

import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.search.suggest.Lookup.LookupResult;
import org.apache.lucene.search.suggest.TermFreqPayload;
import org.apache.lucene.search.suggest.TermFreqPayloadArrayIterator;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase.SuppressCodecs;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util._TestUtil;

// Test requires postings offsets (the suggester indexes with
// DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS for PostingsHighlighter):
@SuppressCodecs({"Lucene3x","MockFixedIntBlock","MockVariableIntBlock","MockSep","MockRandom"})
public class AnalyzingInfixSuggesterTest extends LuceneTestCase {

  /**
   * Builds a two-entry suggester on a temp directory and verifies that
   * infix lookups return the expected suggestions, sorted by descending
   * weight, with the payloads that were indexed alongside them.
   */
  public void testBasic() throws Exception {
    // Java-style array declaration (was C-style `TermFreqPayload keys[]`):
    TermFreqPayload[] keys = new TermFreqPayload[] {
      new TermFreqPayload("lend me your ear", 8, new BytesRef("foobar")),
      new TermFreqPayload("a penny saved is a penny earned", 10, new BytesRef("foobaz")),
    };

    File tempDir = _TestUtil.getTempDir("AnalyzingInfixSuggesterTest");

    AnalyzingInfixSuggester suggester = new AnalyzingInfixSuggester(TEST_VERSION_CURRENT, tempDir, new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false));
    suggester.build(new TermFreqPayloadArrayIterator(keys));

    // "ear" (no trailing space) is treated as a prefix token, so it matches
    // both "ear" and "earned"; results come back ordered by weight (10 first).
    // NOTE: declared as List<LookupResult> (the raw `List` in the original
    // made `results.get(0).key` a field access on Object, which cannot compile).
    List<LookupResult> results = suggester.lookup(_TestUtil.stringToCharSequence("ear", random()), false, 10);
    assertEquals(2, results.size());
    assertEquals("a penny saved is a penny earned", results.get(0).key);
    assertEquals(10, results.get(0).value);
    assertEquals(new BytesRef("foobaz"), results.get(0).payload);
    assertEquals("lend me your ear", results.get(1).key);
    assertEquals(8, results.get(1).value);
    assertEquals(new BytesRef("foobar"), results.get(1).payload);

    // Trailing space: "ear" is a completed token and must match exactly,
    // so "earned" no longer qualifies:
    results = suggester.lookup(_TestUtil.stringToCharSequence("ear ", random()), false, 10);
    assertEquals(1, results.size());
    assertEquals("lend me your ear", results.get(0).key);
    assertEquals(8, results.get(0).value);
    assertEquals(new BytesRef("foobar"), results.get(0).payload);

    // Pure prefix query matching mid-suggestion tokens ("penny"):
    results = suggester.lookup(_TestUtil.stringToCharSequence("pen", random()), false, 10);
    assertEquals(1, results.size());
    assertEquals("a penny saved is a penny earned", results.get(0).key);
    assertEquals(10, results.get(0).value);
    assertEquals(new BytesRef("foobaz"), results.get(0).payload);

    // The suggester holds an open IndexSearcher/Directory; release it so the
    // test framework does not flag a resource leak:
    suggester.close();
  }

  // nocommit test showing analysis effect
}
+ */ + +import java.io.Closeable; +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.io.StringReader; +import java.util.ArrayList; +import java.util.List; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.document.BinaryDocValuesField; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.FieldType; +import org.apache.lucene.document.NumericDocValuesField; +import org.apache.lucene.document.TextField; +import org.apache.lucene.index.BinaryDocValues; +import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.FieldInfo.IndexOptions; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.index.MultiDocValues; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.BooleanClause; +import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.FieldDoc; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.MultiTermQuery; +import org.apache.lucene.search.PrefixQuery; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.Sort; +import org.apache.lucene.search.SortField; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.search.TopFieldDocs; +import org.apache.lucene.search.postingshighlight.PassageFormatter; +import org.apache.lucene.search.postingshighlight.PassageScorer; +import org.apache.lucene.search.postingshighlight.PostingsHighlighter; +import org.apache.lucene.search.spell.TermFreqIterator; +import org.apache.lucene.search.spell.TermFreqPayloadIterator; +import org.apache.lucene.search.suggest.Lookup; +import 
org.apache.lucene.store.Directory; +import org.apache.lucene.store.FSDirectory; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.Version; + +// TODO: test this on imdb suggestions! + +/** Analyzes the input text and then suggests matches based + * on matches to any matching tokens query text against the + * suggestions. This also highlights the tokens that + * match. + * + *

This just uses an ordinary Lucene index, with {@link + * PostingsHighlighter} to do the highlighting. It + * supports payloads, and records these as a + * {@link BinaryDocValues} field. Matches are sorted only + * by the suggest weight ... would be nice to supported + * blended score + weight sort in the future. This means + * this suggester is really only appropriate in cases + * where there is a strong apriori ranking of all the + * suggestions. */ + +public class AnalyzingInfixSuggester extends Lookup implements Closeable { + + protected final static String TEXT_FIELD_NAME = "text"; + + private final Analyzer queryAnalyzer; + private final Analyzer indexAnalyzer; + private final Directory dir; + private final PostingsHighlighter highlighter; + private final Version matchVersion; + + protected IndexSearcher searcher; + + /** null if payloads were not indexed: */ + private BinaryDocValues payloadsDV; + + public AnalyzingInfixSuggester(Version matchVersion, File indexPath, Analyzer analyzer) throws IOException { + this(matchVersion, indexPath, analyzer, analyzer); + } + + // TODO: this could support NRT additions to the suggester + // ... be sure to remove that forceMerge(1) if we do!!! + public AnalyzingInfixSuggester(Version matchVersion, File indexPath, Analyzer indexAnalyzer, Analyzer queryAnalyzer) throws IOException { + + this.queryAnalyzer = queryAnalyzer; + this.indexAnalyzer = indexAnalyzer; + this.matchVersion = matchVersion; + + highlighter = createHighlighter(); + dir = FSDirectory.open(indexPath); + if (DirectoryReader.indexExists(dir)) { + searcher = new IndexSearcher(DirectoryReader.open(dir)); + payloadsDV = MultiDocValues.getBinaryValues(searcher.getIndexReader(), "payloads"); + } + } + + /** Override this to control how each suggestion is + * highlighted. 
*/ + protected PostingsHighlighter createHighlighter() { + return new PostingsHighlighter(PostingsHighlighter.DEFAULT_MAX_LENGTH, + null, + new PassageScorer(), + new PassageFormatter()); + } + + /** Override this to customize index settings, e.g. which + * codec to use. */ + protected IndexWriterConfig getIndexWriterConfig(Version matchVersion, Analyzer indexAnalyzer) { + IndexWriterConfig iwc = new IndexWriterConfig(matchVersion, indexAnalyzer); + iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE); + return iwc; + } + + @Override + public void build(TermFreqIterator iter) throws IOException { + + TermFreqPayloadIterator payloads; + if (iter instanceof TermFreqPayloadIterator) { + payloads = (TermFreqPayloadIterator) iter; + } else { + payloads = null; + } + IndexWriter w = new IndexWriter(dir, + getIndexWriterConfig(matchVersion, indexAnalyzer)); + boolean success = false; + try { + BytesRef text; + Document doc = new Document(); + // nocommit I'd like to do NOT_STORED and put text into DV field but PostingsHighlighter can't pull from DV ...? + FieldType ft = new FieldType(TextField.TYPE_STORED); + ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); + Field textField = new Field(TEXT_FIELD_NAME, "", ft); + doc.add(textField); + + // TODO: use threads...? 
+ Field weightField = new NumericDocValuesField("weight", 0); + doc.add(weightField); + + Field payloadField; + if (payloads != null) { + payloadField = new BinaryDocValuesField("payloads", new BytesRef()); + doc.add(payloadField); + } else { + payloadField = null; + } + + while ((text = iter.next()) != null) { + textField.setStringValue(text.utf8ToString()); + weightField.setLongValue(iter.weight()); + if (payloads != null) { + payloadField.setBytesValue(payloads.payload()); + } + w.addDocument(doc); + } + // TODO: if we ever support adding more suggestsions + // over time then nuke this: + w.forceMerge(1); + searcher = new IndexSearcher(DirectoryReader.open(w, false)); + payloadsDV = MultiDocValues.getBinaryValues(searcher.getIndexReader(), "payloads"); + success = true; + } finally { + if (success) { + w.close(); + } else { + w.rollback(); + } + } + } + + @Override + public List lookup(CharSequence key, boolean onlyMorePopular, int num) { + return lookup(key, num, true, -1.0f); + } + + /** This is called if the last token isn't ended + * (e.g. user did not type a space after it). Return an + * appropriate Query clause to add to the BooleanQuery. */ + protected Query getLastTokenQuery(String token) throws IOException { + if (token.length() < 2) { + return null; + } + + // We manually rewrite to top terms so that highlighter + // "works" ... but this is quite hacky because if there + // are more than 100 terms in the index matching this + // prefix then some are lost! Might be better to + // re-analyze each match and search for prefixes to + // hilight ourselves? 
+ PrefixQuery query = new PrefixQuery(new Term(TEXT_FIELD_NAME, token)); + query.setRewriteMethod(new MultiTermQuery.TopTermsScoringBooleanQueryRewrite(100)); + return searcher.rewrite(query); + } + + public List lookup(CharSequence key, int num, boolean allTermsRequired, final float scoreWeight) { + + final BooleanClause.Occur occur; + if (allTermsRequired) { + occur = BooleanClause.Occur.MUST; + } else { + occur = BooleanClause.Occur.SHOULD; + } + + try { + //long t0 = System.currentTimeMillis(); + TokenStream ts = queryAnalyzer.tokenStream("", new StringReader(key.toString())); + ts.reset(); + final CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class); + final OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class); + String lastToken = null; + BooleanQuery query = new BooleanQuery(); + int maxEndOffset = -1; + while (ts.incrementToken()) { + if (lastToken != null) { + query.add(new TermQuery(new Term(TEXT_FIELD_NAME, lastToken)), occur); + } + lastToken = termAtt.toString(); + if (lastToken != null) { + maxEndOffset = Math.max(maxEndOffset, offsetAtt.endOffset()); + } + } + ts.end(); + + if (lastToken != null) { + Query lastQuery; + if (maxEndOffset == offsetAtt.endOffset()) { + lastQuery = getLastTokenQuery(lastToken); + } else { + lastQuery = new TermQuery(new Term(TEXT_FIELD_NAME, lastToken)); + } + if (lastQuery != null) { + query.add(lastQuery, occur); + } + } + ts.close(); + + // TODO: we could allow blended sort here, combining + // weight w/ score. Now we ignore score and sort only + // by weight: + Sort sort = new Sort(new SortField("weight", SortField.Type.LONG, true)); + TopFieldDocs hits = searcher.search(query, null, num, sort); + + // nocommit would be nice to have PH pull from DV + // source instead of stored fields: + String[] highlights = highlighter.highlight(TEXT_FIELD_NAME, + query, + searcher, + hits, 1); + + List results = new ArrayList(); + for (int i=0;i + + + + + + + + +