Index: solr/core/src/java/org/apache/solr/spelling/AbstractLuceneSpellChecker.java =================================================================== --- solr/core/src/java/org/apache/solr/spelling/AbstractLuceneSpellChecker.java (revision 1304701) +++ solr/core/src/java/org/apache/solr/spelling/AbstractLuceneSpellChecker.java (working copy) @@ -32,6 +32,7 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.core.WhitespaceAnalyzer; import org.apache.lucene.index.IndexReader; @@ -69,6 +70,7 @@ public static final String ACCURACY = "accuracy"; public static final String STRING_DISTANCE = "distanceMeasure"; public static final String COMPARATOR_CLASS = "comparatorClass"; + public static final String ANALYZER_FIELD_TYPE = "spellcheckerAnalyzerFieldType"; public static final String SCORE_COMP = "score"; public static final String FREQ_COMP = "freq"; @@ -121,12 +123,22 @@ } else { sd = new LevensteinDistance(); } + String analyzerFieldType = (String) config.get(ANALYZER_FIELD_TYPE); + Analyzer analyzer = null; + if (analyzerFieldType != null && + core.getSchema().getFieldTypes().containsKey(analyzerFieldType)) { + FieldType fieldType = core.getSchema().getFieldTypes().get(analyzerFieldType); + analyzer = fieldType.getQueryAnalyzer(); + } try { initIndex(); spellChecker = new SpellChecker(index, sd, comp); } catch (IOException e) { throw new RuntimeException(e); } + if (analyzer != null) { + spellChecker.setAnalyzer(analyzer); + } if (accuracy != null) { try { this.accuracy = Float.parseFloat(accuracy); Index: modules/suggest/src/test/org/apache/lucene/search/spell/TestSpellChecker.java =================================================================== --- modules/suggest/src/test/org/apache/lucene/search/spell/TestSpellChecker.java (revision 1304701) +++ modules/suggest/src/test/org/apache/lucene/search/spell/TestSpellChecker.java (working 
copy) @@ -18,6 +18,7 @@ */ import java.io.IOException; +import java.io.Reader; import java.util.ArrayList; import java.util.Collections; import java.util.Comparator; @@ -26,7 +27,12 @@ import java.util.concurrent.Executors; import java.util.concurrent.TimeUnit; +import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.MockAnalyzer; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.Analyzer.TokenStreamComponents; +import org.apache.lucene.analysis.core.LowerCaseFilter; +import org.apache.lucene.analysis.core.WhitespaceTokenizer; import org.apache.lucene.document.Document; import org.apache.lucene.document.TextField; import org.apache.lucene.index.CorruptIndexException; @@ -85,6 +91,13 @@ // twice writer.addDocument(doc); } + { + Document doc = new Document(); + doc.add(newField("field1", "APPLE", TextField.TYPE_STORED)); // an "APPLE" + // in the + // index + writer.addDocument(doc); + } writer.close(); searchers = Collections.synchronizedList(new ArrayList()); @@ -230,6 +243,29 @@ } r.close(); } + + public void testSurfaceAnalyzer() throws Exception { + IndexReader r = IndexReader.open(userindex); + Analyzer a = new Analyzer() { + @Override + protected TokenStreamComponents createComponents(String fieldName, Reader reader) { + Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader); + return new TokenStreamComponents(tokenizer, new LowerCaseFilter(TEST_VERSION_CURRENT, tokenizer)); + } + }; + spellChecker.setAnalyzer(a); + spellChecker.clearIndex(); + addwords(r, spellChecker, "field1"); + + { + String[] similar = spellChecker.suggestSimilar("appli", 1, r, "field1", + SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX); + assertEquals(1, similar.length); + assertEquals("APPLE", similar[0]); + } + r.close(); + } + private void checkCommonSuggestions(IndexReader r) throws IOException { String[] similar = spellChecker.suggestSimilar("fvie", 2); assertTrue(similar.length > 0); Index: 
modules/suggest/src/java/org/apache/lucene/search/spell/SpellChecker.java =================================================================== --- modules/suggest/src/java/org/apache/lucene/search/spell/SpellChecker.java (revision 1304701) +++ modules/suggest/src/java/org/apache/lucene/search/spell/SpellChecker.java (working copy) @@ -18,15 +18,18 @@ */ import java.io.IOException; +import java.io.StringReader; import java.util.ArrayList; import java.util.Comparator; -import java.util.Iterator; import java.util.List; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.FieldType; -import org.apache.lucene.index.FieldInfo.IndexOptions; import org.apache.lucene.document.StringField; import org.apache.lucene.index.AtomicReader; import org.apache.lucene.index.DirectoryReader; @@ -34,9 +37,10 @@ import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.Term; -import org.apache.lucene.index.IndexWriterConfig.OpenMode; import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.index.FieldInfo.IndexOptions; +import org.apache.lucene.index.IndexWriterConfig.OpenMode; import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.IndexSearcher; @@ -58,7 +62,7 @@ * *

Example Usage: * - *

+ * 
  *  SpellChecker spellchecker = new SpellChecker(spellIndexDirectory);
  *  // To index a field of a user index:
  *  spellchecker.indexDictionary(new LuceneDictionary(my_lucene_reader, a_field));
@@ -66,7 +70,7 @@
  *  spellchecker.indexDictionary(new PlainTextDictionary(new File("myfile.txt")));
  *  String[] suggestions = spellchecker.suggestSimilar("misspelt", 5);
  * 
- * + *

* */ public class SpellChecker implements java.io.Closeable {
@@ -117,6 +121,8 @@
   private StringDistance sd;
   private Comparator comparator;
 
+  private Analyzer analyzer = null;
+
   /**
    * Use the given directory as a spell checker index. The directory
    * is created if it doesn't exist yet.
@@ -190,6 +196,23 @@
   public Comparator getComparator() {
     return comparator;
   }
+
+  /**
+   * Returns the {@link Analyzer} applied to words before they are n-grammed
+   * or looked up, or <code>null</code> when words are used unchanged.
+   */
+  public Analyzer getAnalyzer() {
+    return analyzer;
+  }
+
+  /**
+   * Sets the {@link Analyzer} applied to words before they are n-grammed or
+   * looked up; <code>null</code> disables analysis. This only stores the
+   * analyzer and does not rebuild an existing spell index.
+   */
+  public void setAnalyzer(Analyzer analyzer) {
+    this.analyzer = analyzer;
+  }
 
   /**
    * Sets the {@link StringDistance} implementation for this
@@ -228,6 +251,36 @@
   public float getAccuracy() {
     return accuracy;
   }
+
+  /**
+   * Runs <code>surface</code> through the configured analyzer and returns the
+   * concatenation of every term whose position increment is greater than zero.
+   * Returns <code>surface</code> unchanged when no analyzer is set.
+   * NOTE: can't do any crazy graphs etc
+   */
+  private String analyzeWord(String surface) throws IOException {
+    if (analyzer == null) {
+      return surface;
+    }
+    StringBuilder sb = new StringBuilder();
+    TokenStream ts = analyzer.tokenStream("bogus", new StringReader(surface));
+    try {
+      CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
+      PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
+      ts.reset();
+      while (ts.incrementToken()) {
+        // skip terms stacked on the previous position (increment of zero)
+        if (posIncAtt.getPositionIncrement() > 0) {
+          sb.append(termAtt);
+        }
+      }
+      ts.end();
+    } finally {
+      // always release the stream, even when analysis throws
+      ts.close();
+    }
+    return sb.toString();
+  }
 
   /**
    * Suggest similar words.
@@ -240,7 +293,7 @@
   *

I.e. if numSug == 1, don't count on that suggestion being the best one. * Thus, you should set this value to at least 5 for a good suggestion. * - * @param word the word you want a spell check done on + * @param surface the word you want a spell check done on * @param numSug the number of suggested words * @throws IOException if the underlying index throws an {@link IOException} * @throws AlreadyClosedException if the Spellchecker is already closed @@ -248,8 +283,8 @@ * * @see #suggestSimilar(String, int, IndexReader, String, SuggestMode, float) */ - public String[] suggestSimilar(String word, int numSug) throws IOException { - return this.suggestSimilar(word, numSug, null, null, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX); + public String[] suggestSimilar(String surface, int numSug) throws IOException { + return this.suggestSimilar(surface, numSug, null, null, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX); } /** @@ -263,7 +298,7 @@ *

I.e. if numSug == 1, don't count on that suggestion being the best one. * Thus, you should set this value to at least 5 for a good suggestion. * - * @param word the word you want a spell check done on + * @param surface the word you want a spell check done on * @param numSug the number of suggested words * @param accuracy The minimum score a suggestion must have in order to qualify for inclusion in the results * @throws IOException if the underlying index throws an {@link IOException} @@ -272,18 +307,18 @@ * * @see #suggestSimilar(String, int, IndexReader, String, SuggestMode, float) */ - public String[] suggestSimilar(String word, int numSug, float accuracy) throws IOException { - return this.suggestSimilar(word, numSug, null, null, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX, accuracy); + public String[] suggestSimilar(String surface, int numSug, float accuracy) throws IOException { + return this.suggestSimilar(surface, numSug, null, null, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX, accuracy); } /** * Calls {@link #suggestSimilar(String, int, IndexReader, String, SuggestMode, float) - * suggestSimilar(word, numSug, ir, suggestMode, field, this.accuracy)} + * suggestSimilar(surface, numSug, ir, field, suggestMode, this.accuracy)} * */ - public String[] suggestSimilar(String word, int numSug, IndexReader ir, + public String[] suggestSimilar(String surface, int numSug, IndexReader ir, String field, SuggestMode suggestMode) throws IOException { - return suggestSimilar(word, numSug, ir, field, suggestMode, this.accuracy); + return suggestSimilar(surface, numSug, ir, field, suggestMode, this.accuracy); } /** @@ -297,7 +332,7 @@ *

I.e. if numSug == 1, don't count on that suggestion being the best one. * Thus, you should set this value to at least 5 for a good suggestion. * - * @param word the word you want a spell check done on + * @param surface the word you want a spell check done on * @param numSug the number of suggested words * @param ir the indexReader of the user index (can be null see field param) * @param field the field of the user index: if field is not null, the suggested @@ -312,8 +347,12 @@ * of the suggest words in the field of the user index * */ - public String[] suggestSimilar(String word, int numSug, IndexReader ir, + public String[] suggestSimilar(String surface, int numSug, IndexReader ir, String field, SuggestMode suggestMode, float accuracy) throws IOException { + + // the analyzed form + String word = analyzeWord(surface); + // obtainSearcher calls ensureOpen final IndexSearcher indexSearcher = obtainSearcher(); try { @@ -327,11 +366,11 @@ final int lengthWord = word.length(); - final int freq = (ir != null && field != null) ? ir.docFreq(new Term(field, word)) : 0; + final int freq = (ir != null && field != null) ? ir.docFreq(new Term(field, surface)) : 0; final int goalFreq = suggestMode==SuggestMode.SUGGEST_MORE_POPULAR ? 
freq : 0; // if the word exists in the real index and we don't care for word frequency, return the word itself if (suggestMode==SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX && freq > 0) { - return new String[] { word }; + return new String[] { surface }; } BooleanQuery query = new BooleanQuery(); @@ -376,12 +415,12 @@ sugWord.string = indexSearcher.doc(hits[i].doc).get(F_WORD); // get orig word // don't suggest a word for itself, that would be silly - if (sugWord.string.equals(word)) { + if (sugWord.string.equals(surface)) { continue; } // edit distance - sugWord.score = sd.getDistance(word,sugWord.string); + sugWord.score = sd.getDistance(surface,sugWord.string); if (sugWord.score < accuracy) { continue; } @@ -516,7 +555,8 @@ terms: while ((currentTerm = iter.next()) != null) { - String word = currentTerm.utf8ToString(); + String surface = currentTerm.utf8ToString(); + String word = analyzeWord(surface); int len = word.length(); if (len < 3) { continue; // too short we bail but "too long" is fine... @@ -531,7 +571,7 @@ } // ok index the word - Document doc = createDocument(word, getMin(len), getMax(len)); + Document doc = createDocument(surface, word, getMin(len), getMax(len)); writer.addDocument(doc); } } finally { @@ -571,13 +611,13 @@ return 2; } - private static Document createDocument(String text, int ng1, int ng2) { + private static Document createDocument(String surface, String analyzed, int ng1, int ng2) { Document doc = new Document(); // the word field is never queried on... its indexed so it can be quickly // checked for rebuild (and stored for retrieval). Doesn't need norms or TF/pos - Field f = new Field(F_WORD, text, StringField.TYPE_STORED); + Field f = new Field(F_WORD, surface, StringField.TYPE_STORED); doc.add(f); // orig term - addGram(text, doc, ng1, ng2); + addGram(analyzed, doc, ng1, ng2); return doc; }