Index: lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/StopKeywordFilter.java
===================================================================
--- lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/StopKeywordFilter.java (revision 0)
+++ lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/StopKeywordFilter.java (working copy)
@@ -0,0 +1,131 @@
+package org.apache.lucene.search.suggest.analyzing;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.Arrays;
+import java.util.List;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
+import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.lucene.analysis.util.FilteringTokenFilter;
+import org.apache.lucene.util.Version;
+
+/**
+ * Removes stop words from a token stream; if the
+ * {@link KeywordAttribute} is set, the word is not removed.
+ *
+ * You must specify the required {@link Version}
+ * compatibility when creating StopFilter:
+ *
+ * This just uses an ordinary Lucene index. It
+ * supports payloads, and records these as a
+ * {@link BinaryDocValues} field. Matches are sorted only
+ * by the suggest weight; it would be nice to support
+ * blended score + weight sort in the future. This means
+ * this suggester best applies when there is a strong
+ * a priori ranking of all the suggestions. */
+
+public class AnalyzingInfixSuggester extends Lookup implements Closeable {
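+  // Usage sketch (editorial illustration, not part of this patch): the analyzer,
+  // the on-disk path and the lookup() signature are assumptions based on the
+  // suggest module's Lookup API, not confirmed by the lines shown in this hunk.
+  //
+  //   Analyzer a = new StandardAnalyzer(matchVersion);
+  //   AnalyzingInfixSuggester suggester =
+  //       new AnalyzingInfixSuggester(matchVersion, new File("/tmp/suggest"), a);
+  //   suggester.build(iter);                                        // any TermFreqIterator
+  //   List<LookupResult> hits = suggester.lookup("bos", false, 5);  // top 5 by weight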
+
+ protected final static String TEXT_FIELD_NAME = "text";
+
+ private final Analyzer queryAnalyzer;
+ private final Analyzer indexAnalyzer;
+ private final Directory dir;
+ private final Version matchVersion;
+ private final File indexPath;
+ private final int minPrefixChars;
+
+ protected IndexSearcher searcher;
+
+ /** null if payloads were not indexed: */
+ private BinaryDocValues payloadsDV;
+ private BinaryDocValues textDV;
+ private NumericDocValues weightsDV;
+
+ /** Default minimum number of leading characters before
+ * PrefixQuery is used (4). */
+ public static final int DEFAULT_MIN_PREFIX_CHARS = 4;
+
+ /** Create a new instance, loading from a previously built
+ * directory, if it exists. */
+ public AnalyzingInfixSuggester(Version matchVersion, File indexPath, Analyzer analyzer) throws IOException {
+ this(matchVersion, indexPath, analyzer, analyzer, DEFAULT_MIN_PREFIX_CHARS);
+ }
+
+ /** Create a new instance, loading from a previously built
+ * directory, if it exists.
+ *
+ * @param minPrefixChars Minimum number of leading characters
+ * before PrefixQuery is used (default 4).
+ * Prefixes shorter than this are indexed as character
+ * ngrams (increasing index size but making lookups
+ * faster).
+ */
+ public AnalyzingInfixSuggester(Version matchVersion, File indexPath, Analyzer indexAnalyzer, Analyzer queryAnalyzer, int minPrefixChars) throws IOException {
+
+ if (minPrefixChars < 0) {
+ throw new IllegalArgumentException("minPrefixChars must be >= 0; got: " + minPrefixChars);
+ }
+
+ this.queryAnalyzer = queryAnalyzer;
+ this.indexAnalyzer = indexAnalyzer;
+ this.matchVersion = matchVersion;
+ this.indexPath = indexPath;
+ this.minPrefixChars = minPrefixChars;
+ dir = FSDirectory.open(indexPath);
+
+ if (DirectoryReader.indexExists(dir)) {
+ // Already built; open it:
+ searcher = new IndexSearcher(DirectoryReader.open(dir));
+ // This will just be null if app didn't pass payloads to build():
+ // TODO: maybe just stored fields? they compress...
+ payloadsDV = MultiDocValues.getBinaryValues(searcher.getIndexReader(), "payloads");
+ weightsDV = MultiDocValues.getNumericValues(searcher.getIndexReader(), "weight");
+ textDV = MultiDocValues.getBinaryValues(searcher.getIndexReader(), TEXT_FIELD_NAME);
+ assert textDV != null;
+ }
+ }
+
+ /** Override this to customize index settings, e.g. which
+ * codec to use. */
+ protected IndexWriterConfig getIndexWriterConfig(Version matchVersion, Analyzer indexAnalyzer) {
+ IndexWriterConfig iwc = new IndexWriterConfig(matchVersion, indexAnalyzer);
+ iwc.setCodec(new Lucene42Codec());
+ iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
+ return iwc;
+ }
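+
+  // For example (hypothetical subclass, editorial illustration only), an
+  // application could tweak the config while keeping the defaults above:
+  //
+  //   @Override
+  //   protected IndexWriterConfig getIndexWriterConfig(Version matchVersion, Analyzer indexAnalyzer) {
+  //     IndexWriterConfig iwc = super.getIndexWriterConfig(matchVersion, indexAnalyzer);
+  //     iwc.setRAMBufferSizeMB(256.0);  // assumption: larger RAM buffer for big suggestion sets
+  //     return iwc;
+  //   }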
+
+ @Override
+ public void build(TermFreqIterator iter) throws IOException {
+
+ TermFreqPayloadIterator payloads;
+ if (iter instanceof TermFreqPayloadIterator) {
+ payloads = (TermFreqPayloadIterator) iter;
+ } else {
+ payloads = null;
+ }
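+    // If the incoming iterator also supplies payloads, they are indexed below in
+    // a separate "payloads" BinaryDocValues field; otherwise that field is simply
+    // never added to the document.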
+ Directory dirTmp = FSDirectory.open(new File(indexPath.toString() + ".tmp"));
+
+ Analyzer gramAnalyzer = new AnalyzerWrapper() {
+ @Override
+ protected Analyzer getWrappedAnalyzer(String fieldName) {
+ return indexAnalyzer;
+ }
+
+ @Override
+ protected TokenStreamComponents wrapComponents(String fieldName, TokenStreamComponents components) {
+ if (fieldName.equals("textgrams") && minPrefixChars > 0) {
+ return new TokenStreamComponents(components.getTokenizer(),
+ new EdgeNGramTokenFilter(matchVersion,
+ components.getTokenStream(),
+ 1, minPrefixChars));
+ } else {
+ return components;
+ }
+ }
+ };
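+    // Illustration (assuming the default minPrefixChars=4): for the "textgrams"
+    // field the EdgeNGramTokenFilter above also indexes the leading grams of each
+    // token, e.g. "boston" -> "b", "bo", "bos", "bost", so short prefixes can be
+    // matched with exact term lookups instead of a PrefixQuery at query time.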
+
+ IndexWriter w = new IndexWriter(dirTmp,
+ getIndexWriterConfig(matchVersion, gramAnalyzer));
+ IndexWriter w2 = null;
+ AtomicReader r = null;
+ boolean success = false;
+ try {
+
+ BytesRef text;
+ Document doc = new Document();
+ FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
+ ft.setIndexOptions(IndexOptions.DOCS_ONLY);
+ ft.setOmitNorms(true);
+ Field textField = new Field(TEXT_FIELD_NAME, "", ft);
+ doc.add(textField);
+
+ Field textGramField = new Field("textgrams", "", ft);
+ doc.add(textGramField);
+
+ Field textDVField = new BinaryDocValuesField(TEXT_FIELD_NAME, new BytesRef());
+ doc.add(textDVField);
+
+ // TODO: use threads...?
+ Field weightField = new NumericDocValuesField("weight", 0);
+ doc.add(weightField);
+
+ Field payloadField;
+ if (payloads != null) {
+ payloadField = new BinaryDocValuesField("payloads", new BytesRef());
+ doc.add(payloadField);
+ } else {
+ payloadField = null;
+ }
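+
+      // Note: one Document and its Field instances are reused for every entry in
+      // the loop below; only the field values change per suggestion, which avoids
+      // per-document allocations while building.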
+
+ //long t0 = System.nanoTime();
+ while ((text = iter.next()) != null) {
+ String textString = text.utf8ToString();
+ textField.setStringValue(textString);
+ textGramField.setStringValue(textString);
+ textDVField.setBytesValue(text);
+ weightField.setLongValue(iter.weight());
+ if (payloads != null) {
+ payloadField.setBytesValue(payloads.payload());
+ }
+ w.addDocument(doc);
+ }
+ //System.out.println("initial indexing time: " + ((System.nanoTime()-t0)/1000000) + " msec");
+
+ r = new SlowCompositeReaderWrapper(DirectoryReader.open(w, false));
+ //long t1 = System.nanoTime();
+ w.rollback();
+
+ final int maxDoc = r.maxDoc();
+
+ final NumericDocValues weights = r.getNumericDocValues("weight");
+
+ final Sorter.DocComparator comparator = new Sorter.DocComparator() {
+ @Override
+ public int compare(int docID1, int docID2) {
+ final long v1 = weights.get(docID1);
+ final long v2 = weights.get(docID2);
+ // Reverse sort (highest weight first);
+ // java7 only:
+ //return Long.compare(v2, v1);
+ if (v1 > v2) {
+ return -1;
+ } else if (v1 < v2) {
+ return 1;
+ } else {
+ return 0;
+ }
+ }
+ };
+
+ r = SortingAtomicReader.wrap(r, new Sorter() {
+ @Override
+ public Sorter.DocMap sort(AtomicReader reader) throws IOException {
+ //long t0 = System.nanoTime();
+ try {
+ return Sorter.sort(maxDoc, comparator);
+ } finally {
+ //System.out.println("Sort took " + ((System.nanoTime() - t0)/1000000.) + " msec");
+ }
+ }
+
+ @Override
+ public String getID() {
+ return "Weight";
+ }
+ });
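+
+      // The wrapped reader presents the temp index in descending-weight order, so
+      // after addIndexes below the permanent index has docID order == weight
+      // order; presumably this lets lookup() visit hits in docID order and stop
+      // early once enough suggestions are collected (editorial note: the lookup
+      // code is not part of this hunk).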
+
+ w2 = new IndexWriter(dir,
+ getIndexWriterConfig(matchVersion, indexAnalyzer));
+ w2.addIndexes(new IndexReader[] {r});
+ r.close();
+
+ //System.out.println("sort time: " + ((System.nanoTime()-t1)/1000000) + " msec");
+
+ searcher = new IndexSearcher(DirectoryReader.open(w2, false));
+ w2.close();
+
+ payloadsDV = MultiDocValues.getBinaryValues(searcher.getIndexReader(), "payloads");
+ weightsDV = MultiDocValues.getNumericValues(searcher.getIndexReader(), "weight");
+ textDV = MultiDocValues.getBinaryValues(searcher.getIndexReader(), TEXT_FIELD_NAME);
+ assert textDV != null;
+ success = true;
+ } finally {
+ if (success) {
+ IOUtils.close(w, w2, r);
+ } else {
+ IOUtils.closeWhileHandlingException(w, w2, r);
+ }
+ }
+ }
+
+ @Override
+ public List